From e2641e09cc0daf44f63f654230f72d22acf3a9af Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 22 Jun 2010 00:07:48 +0200 Subject: redis.c split into many different C files. networking related stuff moved into networking.c moved more code more work on layout of source code SDS instantaneuos memory saving. By Pieter and Salvatore at VMware ;) cleanly compiling again after the first split, now splitting it in more C files moving more things around... work in progress split replication code splitting more Sets split Hash split replication split even more splitting more splitting minor change --- Makefile | 114 - adlist.c | 325 -- adlist.h | 92 - ae.c | 390 -- ae.h | 117 - ae_epoll.c | 91 - ae_kqueue.c | 93 - ae_select.c | 72 - anet.c | 270 - anet.h | 49 - config.h | 45 - dict.c | 727 --- dict.h | 151 - fmacros.h | 15 - linenoise.c | 433 -- linenoise.h | 41 - lzf.h | 100 - lzfP.h | 159 - lzf_c.c | 295 -- lzf_d.c | 150 - mkreleasehdr.sh | 9 - pqsort.c | 197 - pqsort.h | 15 - redis-benchmark.c | 665 --- redis-check-aof.c | 185 - redis-check-dump.c | 671 --- redis-cli.c | 493 -- redis.c | 11621 -------------------------------------------- redis.h | 75 - release.c | 13 - sds.c | 359 -- sds.h | 73 - sha1.c | 276 -- sha1.h | 17 - solarisfixes.h | 21 - src/Makefile | 111 + src/adlist.c | 325 ++ src/adlist.h | 92 + src/ae.c | 390 ++ src/ae.h | 117 + src/ae_epoll.c | 91 + src/ae_kqueue.c | 93 + src/ae_select.c | 72 + src/anet.c | 270 + src/anet.h | 49 + src/aof.c | 694 +++ src/config.c | 438 ++ src/config.h | 45 + src/db.c | 508 ++ src/debug.c | 309 ++ src/dict.c | 727 +++ src/dict.h | 151 + src/fmacros.h | 15 + src/linenoise.c | 433 ++ src/linenoise.h | 41 + src/lzf.h | 100 + src/lzfP.h | 159 + src/lzf_c.c | 295 ++ src/lzf_d.c | 150 + src/mkreleasehdr.sh | 9 + src/multi.c | 266 + src/networking.c | 589 +++ src/object.c | 405 ++ src/pqsort.c | 197 + src/pqsort.h | 15 + src/pubsub.c | 259 + src/rdb.c | 886 ++++ src/redis-benchmark.c | 665 +++ src/redis-check-aof.c | 185 + 
src/redis-check-dump.c | 671 +++ src/redis-cli.c | 493 ++ src/redis.c | 1516 ++++++ src/redis.h | 885 ++++ src/release.c | 13 + src/replication.c | 475 ++ src/sds.c | 384 ++ src/sds.h | 74 + src/sha1.c | 276 ++ src/sha1.h | 17 + src/solarisfixes.h | 21 + src/sort.c | 383 ++ src/t_hash.c | 397 ++ src/t_list.c | 829 ++++ src/t_set.c | 349 ++ src/t_string.c | 251 + src/t_zset.c | 985 ++++ src/util.c | 223 + src/version.h | 1 + src/vm.c | 1126 +++++ src/ziplist.c | 959 ++++ src/ziplist.h | 15 + src/zipmap.c | 455 ++ src/zipmap.h | 48 + src/zmalloc.c | 158 + src/zmalloc.h | 41 + staticsymbols.h | 374 -- tests/integration/aof.tcl | 4 +- tests/support/server.tcl | 4 +- ziplist.c | 959 ---- ziplist.h | 15 - zipmap.c | 455 -- zipmap.h | 48 - zmalloc.c | 158 - zmalloc.h | 41 - 104 files changed, 20200 insertions(+), 20473 deletions(-) delete mode 100644 Makefile delete mode 100644 adlist.c delete mode 100644 adlist.h delete mode 100644 ae.c delete mode 100644 ae.h delete mode 100644 ae_epoll.c delete mode 100644 ae_kqueue.c delete mode 100644 ae_select.c delete mode 100644 anet.c delete mode 100644 anet.h delete mode 100644 config.h delete mode 100644 dict.c delete mode 100644 dict.h delete mode 100644 fmacros.h delete mode 100644 linenoise.c delete mode 100644 linenoise.h delete mode 100644 lzf.h delete mode 100644 lzfP.h delete mode 100644 lzf_c.c delete mode 100644 lzf_d.c delete mode 100755 mkreleasehdr.sh delete mode 100644 pqsort.c delete mode 100644 pqsort.h delete mode 100644 redis-benchmark.c delete mode 100644 redis-check-aof.c delete mode 100644 redis-check-dump.c delete mode 100644 redis-cli.c delete mode 100644 redis.c delete mode 100644 redis.h delete mode 100644 release.c delete mode 100644 sds.c delete mode 100644 sds.h delete mode 100644 sha1.c delete mode 100644 sha1.h delete mode 100644 solarisfixes.h create mode 100644 src/Makefile create mode 100644 src/adlist.c create mode 100644 src/adlist.h create mode 100644 src/ae.c create mode 100644 src/ae.h 
create mode 100644 src/ae_epoll.c create mode 100644 src/ae_kqueue.c create mode 100644 src/ae_select.c create mode 100644 src/anet.c create mode 100644 src/anet.h create mode 100644 src/aof.c create mode 100644 src/config.c create mode 100644 src/config.h create mode 100644 src/db.c create mode 100644 src/debug.c create mode 100644 src/dict.c create mode 100644 src/dict.h create mode 100644 src/fmacros.h create mode 100644 src/linenoise.c create mode 100644 src/linenoise.h create mode 100644 src/lzf.h create mode 100644 src/lzfP.h create mode 100644 src/lzf_c.c create mode 100644 src/lzf_d.c create mode 100755 src/mkreleasehdr.sh create mode 100644 src/multi.c create mode 100644 src/networking.c create mode 100644 src/object.c create mode 100644 src/pqsort.c create mode 100644 src/pqsort.h create mode 100644 src/pubsub.c create mode 100644 src/rdb.c create mode 100644 src/redis-benchmark.c create mode 100644 src/redis-check-aof.c create mode 100644 src/redis-check-dump.c create mode 100644 src/redis-cli.c create mode 100644 src/redis.c create mode 100644 src/redis.h create mode 100644 src/release.c create mode 100644 src/replication.c create mode 100644 src/sds.c create mode 100644 src/sds.h create mode 100644 src/sha1.c create mode 100644 src/sha1.h create mode 100644 src/solarisfixes.h create mode 100644 src/sort.c create mode 100644 src/t_hash.c create mode 100644 src/t_list.c create mode 100644 src/t_set.c create mode 100644 src/t_string.c create mode 100644 src/t_zset.c create mode 100644 src/util.c create mode 100644 src/version.h create mode 100644 src/vm.c create mode 100644 src/ziplist.c create mode 100644 src/ziplist.h create mode 100644 src/zipmap.c create mode 100644 src/zipmap.h create mode 100644 src/zmalloc.c create mode 100644 src/zmalloc.h delete mode 100644 staticsymbols.h delete mode 100644 ziplist.c delete mode 100644 ziplist.h delete mode 100644 zipmap.c delete mode 100644 zipmap.h delete mode 100644 zmalloc.c delete mode 100644 zmalloc.h diff 
--git a/Makefile b/Makefile deleted file mode 100644 index 96dddd69e..000000000 --- a/Makefile +++ /dev/null @@ -1,114 +0,0 @@ -# Redis Makefile -# Copyright (C) 2009 Salvatore Sanfilippo -# This file is released under the BSD license, see the COPYING file - -release_hdr := $(shell sh -c './mkreleasehdr.sh') -uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') -OPTIMIZATION?=-O2 -ifeq ($(uname_S),SunOS) - CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W -D__EXTENSIONS__ -D_XPG6 - CCLINK?= -ldl -lnsl -lsocket -lm -lpthread -else - CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W $(ARCH) $(PROF) - CCLINK?= -lm -pthread -endif -CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF) -DEBUG?= -g -rdynamic -ggdb - -OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o -BENCHOBJ = ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o -CLIOBJ = anet.o sds.o adlist.o redis-cli.o zmalloc.o linenoise.o -CHECKDUMPOBJ = redis-check-dump.o lzf_c.o lzf_d.o -CHECKAOFOBJ = redis-check-aof.o - -PRGNAME = redis-server -BENCHPRGNAME = redis-benchmark -CLIPRGNAME = redis-cli -CHECKDUMPPRGNAME = redis-check-dump -CHECKAOFPRGNAME = redis-check-aof - -all: redis-server redis-benchmark redis-cli redis-check-dump redis-check-aof - -# Deps (use make dep to generate this) -adlist.o: adlist.c adlist.h zmalloc.h -ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c -ae_epoll.o: ae_epoll.c -ae_kqueue.o: ae_kqueue.c -ae_select.o: ae_select.c -anet.o: anet.c fmacros.h anet.h -dict.o: dict.c fmacros.h dict.h zmalloc.h -linenoise.o: linenoise.c fmacros.h -lzf_c.o: lzf_c.c lzfP.h -lzf_d.o: lzf_d.c lzfP.h -pqsort.o: pqsort.c -redis-benchmark.o: redis-benchmark.c fmacros.h ae.h anet.h sds.h adlist.h \ - zmalloc.h -redis-check-aof.o: redis-check-aof.c fmacros.h config.h -redis-check-dump.o: redis-check-dump.c lzf.h -redis-cli.o: redis-cli.c fmacros.h anet.h sds.h adlist.h zmalloc.h \ - linenoise.h -redis.o: redis.c fmacros.h config.h 
redis.h ae.h sds.h anet.h dict.h \ - adlist.h zmalloc.h lzf.h pqsort.h zipmap.h ziplist.h sha1.h staticsymbols.h -release.o: release.c release.h -sds.o: sds.c sds.h zmalloc.h -sha1.o: sha1.c sha1.h -ziplist.o: ziplist.c zmalloc.h ziplist.h -zipmap.o: zipmap.c zmalloc.h -zmalloc.o: zmalloc.c config.h - -redis-server: $(OBJ) - $(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ) - @echo "" - @echo "Hint: To run 'make test' is a good idea ;)" - @echo "" - -redis-benchmark: $(BENCHOBJ) - $(CC) -o $(BENCHPRGNAME) $(CCOPT) $(DEBUG) $(BENCHOBJ) - -redis-cli: $(CLIOBJ) - $(CC) -o $(CLIPRGNAME) $(CCOPT) $(DEBUG) $(CLIOBJ) - -redis-check-dump: $(CHECKDUMPOBJ) - $(CC) -o $(CHECKDUMPPRGNAME) $(CCOPT) $(DEBUG) $(CHECKDUMPOBJ) - -redis-check-aof: $(CHECKAOFOBJ) - $(CC) -o $(CHECKAOFPRGNAME) $(CCOPT) $(DEBUG) $(CHECKAOFOBJ) - -.c.o: - $(CC) -c $(CFLAGS) $(DEBUG) $(COMPILE_TIME) $< - -clean: - rm -rf $(PRGNAME) $(BENCHPRGNAME) $(CLIPRGNAME) $(CHECKDUMPPRGNAME) $(CHECKAOFPRGNAME) *.o *.gcda *.gcno *.gcov - -dep: - $(CC) -MM *.c - -staticsymbols: - tclsh utils/build-static-symbols.tcl > staticsymbols.h - -test: - tclsh8.5 tests/test_helper.tcl --tags "${TAGS}" - -bench: - ./redis-benchmark - -log: - git log '--pretty=format:%ad %s (%cn)' --date=short > Changelog - -32bit: - @echo "" - @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" - @echo "" - make ARCH="-m32" - -gprof: - make PROF="-pg" - -gcov: - make PROF="-fprofile-arcs -ftest-coverage" - -noopt: - make OPTIMIZATION="" - -32bitgprof: - make PROF="-pg" ARCH="-arch i386" diff --git a/adlist.c b/adlist.c deleted file mode 100644 index 015012f5c..000000000 --- a/adlist.c +++ /dev/null @@ -1,325 +0,0 @@ -/* adlist.c - A generic doubly linked list implementation - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#include -#include "adlist.h" -#include "zmalloc.h" - -/* Create a new list. The created list can be freed with - * AlFreeList(), but private value of every node need to be freed - * by the user before to call AlFreeList(). - * - * On error, NULL is returned. Otherwise the pointer to the new list. 
*/ -list *listCreate(void) -{ - struct list *list; - - if ((list = zmalloc(sizeof(*list))) == NULL) - return NULL; - list->head = list->tail = NULL; - list->len = 0; - list->dup = NULL; - list->free = NULL; - list->match = NULL; - return list; -} - -/* Free the whole list. - * - * This function can't fail. */ -void listRelease(list *list) -{ - unsigned int len; - listNode *current, *next; - - current = list->head; - len = list->len; - while(len--) { - next = current->next; - if (list->free) list->free(current->value); - zfree(current); - current = next; - } - zfree(list); -} - -/* Add a new node to the list, to head, contaning the specified 'value' - * pointer as value. - * - * On error, NULL is returned and no operation is performed (i.e. the - * list remains unaltered). - * On success the 'list' pointer you pass to the function is returned. */ -list *listAddNodeHead(list *list, void *value) -{ - listNode *node; - - if ((node = zmalloc(sizeof(*node))) == NULL) - return NULL; - node->value = value; - if (list->len == 0) { - list->head = list->tail = node; - node->prev = node->next = NULL; - } else { - node->prev = NULL; - node->next = list->head; - list->head->prev = node; - list->head = node; - } - list->len++; - return list; -} - -/* Add a new node to the list, to tail, contaning the specified 'value' - * pointer as value. - * - * On error, NULL is returned and no operation is performed (i.e. the - * list remains unaltered). - * On success the 'list' pointer you pass to the function is returned. 
*/ -list *listAddNodeTail(list *list, void *value) -{ - listNode *node; - - if ((node = zmalloc(sizeof(*node))) == NULL) - return NULL; - node->value = value; - if (list->len == 0) { - list->head = list->tail = node; - node->prev = node->next = NULL; - } else { - node->prev = list->tail; - node->next = NULL; - list->tail->next = node; - list->tail = node; - } - list->len++; - return list; -} - -list *listInsertNode(list *list, listNode *old_node, void *value, int after) { - listNode *node; - - if ((node = zmalloc(sizeof(*node))) == NULL) - return NULL; - node->value = value; - if (after) { - node->prev = old_node; - node->next = old_node->next; - if (list->tail == old_node) { - list->tail = node; - } - } else { - node->next = old_node; - node->prev = old_node->prev; - if (list->head == old_node) { - list->head = node; - } - } - if (node->prev != NULL) { - node->prev->next = node; - } - if (node->next != NULL) { - node->next->prev = node; - } - list->len++; - return list; -} - -/* Remove the specified node from the specified list. - * It's up to the caller to free the private value of the node. - * - * This function can't fail. */ -void listDelNode(list *list, listNode *node) -{ - if (node->prev) - node->prev->next = node->next; - else - list->head = node->next; - if (node->next) - node->next->prev = node->prev; - else - list->tail = node->prev; - if (list->free) list->free(node->value); - zfree(node); - list->len--; -} - -/* Returns a list iterator 'iter'. After the initialization every - * call to listNext() will return the next element of the list. - * - * This function can't fail. 
*/ -listIter *listGetIterator(list *list, int direction) -{ - listIter *iter; - - if ((iter = zmalloc(sizeof(*iter))) == NULL) return NULL; - if (direction == AL_START_HEAD) - iter->next = list->head; - else - iter->next = list->tail; - iter->direction = direction; - return iter; -} - -/* Release the iterator memory */ -void listReleaseIterator(listIter *iter) { - zfree(iter); -} - -/* Create an iterator in the list private iterator structure */ -void listRewind(list *list, listIter *li) { - li->next = list->head; - li->direction = AL_START_HEAD; -} - -void listRewindTail(list *list, listIter *li) { - li->next = list->tail; - li->direction = AL_START_TAIL; -} - -/* Return the next element of an iterator. - * It's valid to remove the currently returned element using - * listDelNode(), but not to remove other elements. - * - * The function returns a pointer to the next element of the list, - * or NULL if there are no more elements, so the classical usage patter - * is: - * - * iter = listGetIterator(list,); - * while ((node = listNext(iter)) != NULL) { - * doSomethingWith(listNodeValue(node)); - * } - * - * */ -listNode *listNext(listIter *iter) -{ - listNode *current = iter->next; - - if (current != NULL) { - if (iter->direction == AL_START_HEAD) - iter->next = current->next; - else - iter->next = current->prev; - } - return current; -} - -/* Duplicate the whole list. On out of memory NULL is returned. - * On success a copy of the original list is returned. - * - * The 'Dup' method set with listSetDupMethod() function is used - * to copy the node value. Otherwise the same pointer value of - * the original node is used as value of the copied node. - * - * The original list both on success or error is never modified. 
*/ -list *listDup(list *orig) -{ - list *copy; - listIter *iter; - listNode *node; - - if ((copy = listCreate()) == NULL) - return NULL; - copy->dup = orig->dup; - copy->free = orig->free; - copy->match = orig->match; - iter = listGetIterator(orig, AL_START_HEAD); - while((node = listNext(iter)) != NULL) { - void *value; - - if (copy->dup) { - value = copy->dup(node->value); - if (value == NULL) { - listRelease(copy); - listReleaseIterator(iter); - return NULL; - } - } else - value = node->value; - if (listAddNodeTail(copy, value) == NULL) { - listRelease(copy); - listReleaseIterator(iter); - return NULL; - } - } - listReleaseIterator(iter); - return copy; -} - -/* Search the list for a node matching a given key. - * The match is performed using the 'match' method - * set with listSetMatchMethod(). If no 'match' method - * is set, the 'value' pointer of every node is directly - * compared with the 'key' pointer. - * - * On success the first matching node pointer is returned - * (search starts from head). If no matching node exists - * NULL is returned. */ -listNode *listSearchKey(list *list, void *key) -{ - listIter *iter; - listNode *node; - - iter = listGetIterator(list, AL_START_HEAD); - while((node = listNext(iter)) != NULL) { - if (list->match) { - if (list->match(node->value, key)) { - listReleaseIterator(iter); - return node; - } - } else { - if (key == node->value) { - listReleaseIterator(iter); - return node; - } - } - } - listReleaseIterator(iter); - return NULL; -} - -/* Return the element at the specified zero-based index - * where 0 is the head, 1 is the element next to head - * and so on. Negative integers are used in order to count - * from the tail, -1 is the last element, -2 the penultimante - * and so on. If the index is out of range NULL is returned. 
*/ -listNode *listIndex(list *list, int index) { - listNode *n; - - if (index < 0) { - index = (-index)-1; - n = list->tail; - while(index-- && n) n = n->prev; - } else { - n = list->head; - while(index-- && n) n = n->next; - } - return n; -} diff --git a/adlist.h b/adlist.h deleted file mode 100644 index a1209f62f..000000000 --- a/adlist.h +++ /dev/null @@ -1,92 +0,0 @@ -/* adlist.h - A generic doubly linked list implementation - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ADLIST_H__ -#define __ADLIST_H__ - -/* Node, List, and Iterator are the only data structures used currently. */ - -typedef struct listNode { - struct listNode *prev; - struct listNode *next; - void *value; -} listNode; - -typedef struct listIter { - listNode *next; - int direction; -} listIter; - -typedef struct list { - listNode *head; - listNode *tail; - void *(*dup)(void *ptr); - void (*free)(void *ptr); - int (*match)(void *ptr, void *key); - unsigned int len; -} list; - -/* Functions implemented as macros */ -#define listLength(l) ((l)->len) -#define listFirst(l) ((l)->head) -#define listLast(l) ((l)->tail) -#define listPrevNode(n) ((n)->prev) -#define listNextNode(n) ((n)->next) -#define listNodeValue(n) ((n)->value) - -#define listSetDupMethod(l,m) ((l)->dup = (m)) -#define listSetFreeMethod(l,m) ((l)->free = (m)) -#define listSetMatchMethod(l,m) ((l)->match = (m)) - -#define listGetDupMethod(l) ((l)->dup) -#define listGetFree(l) ((l)->free) -#define listGetMatchMethod(l) ((l)->match) - -/* Prototypes */ -list *listCreate(void); -void listRelease(list *list); -list *listAddNodeHead(list *list, void *value); -list *listAddNodeTail(list *list, void *value); -list *listInsertNode(list *list, listNode *old_node, void *value, int after); -void listDelNode(list *list, listNode *node); -listIter *listGetIterator(list *list, int direction); -listNode *listNext(listIter *iter); -void listReleaseIterator(listIter 
*iter); -list *listDup(list *orig); -listNode *listSearchKey(list *list, void *key); -listNode *listIndex(list *list, int index); -void listRewind(list *list, listIter *li); -void listRewindTail(list *list, listIter *li); - -/* Directions for iterators */ -#define AL_START_HEAD 0 -#define AL_START_TAIL 1 - -#endif /* __ADLIST_H__ */ diff --git a/ae.c b/ae.c deleted file mode 100644 index c7918ee1d..000000000 --- a/ae.c +++ /dev/null @@ -1,390 +0,0 @@ -/* A simple event-driven programming library. Originally I wrote this code - * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated - * it in form of a library for easy reuse. - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include - -#include "ae.h" -#include "zmalloc.h" -#include "config.h" - -/* Include the best multiplexing layer supported by this system. - * The following should be ordered by performances, descending. */ -#ifdef HAVE_EPOLL -#include "ae_epoll.c" -#else - #ifdef HAVE_KQUEUE - #include "ae_kqueue.c" - #else - #include "ae_select.c" - #endif -#endif - -aeEventLoop *aeCreateEventLoop(void) { - aeEventLoop *eventLoop; - int i; - - eventLoop = zmalloc(sizeof(*eventLoop)); - if (!eventLoop) return NULL; - eventLoop->timeEventHead = NULL; - eventLoop->timeEventNextId = 0; - eventLoop->stop = 0; - eventLoop->maxfd = -1; - eventLoop->beforesleep = NULL; - if (aeApiCreate(eventLoop) == -1) { - zfree(eventLoop); - return NULL; - } - /* Events with mask == AE_NONE are not set. So let's initialize the - * vector with it. 
*/ - for (i = 0; i < AE_SETSIZE; i++) - eventLoop->events[i].mask = AE_NONE; - return eventLoop; -} - -void aeDeleteEventLoop(aeEventLoop *eventLoop) { - aeApiFree(eventLoop); - zfree(eventLoop); -} - -void aeStop(aeEventLoop *eventLoop) { - eventLoop->stop = 1; -} - -int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, - aeFileProc *proc, void *clientData) -{ - if (fd >= AE_SETSIZE) return AE_ERR; - aeFileEvent *fe = &eventLoop->events[fd]; - - if (aeApiAddEvent(eventLoop, fd, mask) == -1) - return AE_ERR; - fe->mask |= mask; - if (mask & AE_READABLE) fe->rfileProc = proc; - if (mask & AE_WRITABLE) fe->wfileProc = proc; - fe->clientData = clientData; - if (fd > eventLoop->maxfd) - eventLoop->maxfd = fd; - return AE_OK; -} - -void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask) -{ - if (fd >= AE_SETSIZE) return; - aeFileEvent *fe = &eventLoop->events[fd]; - - if (fe->mask == AE_NONE) return; - fe->mask = fe->mask & (~mask); - if (fd == eventLoop->maxfd && fe->mask == AE_NONE) { - /* Update the max fd */ - int j; - - for (j = eventLoop->maxfd-1; j >= 0; j--) - if (eventLoop->events[j].mask != AE_NONE) break; - eventLoop->maxfd = j; - } - aeApiDelEvent(eventLoop, fd, mask); -} - -static void aeGetTime(long *seconds, long *milliseconds) -{ - struct timeval tv; - - gettimeofday(&tv, NULL); - *seconds = tv.tv_sec; - *milliseconds = tv.tv_usec/1000; -} - -static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) { - long cur_sec, cur_ms, when_sec, when_ms; - - aeGetTime(&cur_sec, &cur_ms); - when_sec = cur_sec + milliseconds/1000; - when_ms = cur_ms + milliseconds%1000; - if (when_ms >= 1000) { - when_sec ++; - when_ms -= 1000; - } - *sec = when_sec; - *ms = when_ms; -} - -long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, - aeTimeProc *proc, void *clientData, - aeEventFinalizerProc *finalizerProc) -{ - long long id = eventLoop->timeEventNextId++; - aeTimeEvent *te; - - te = zmalloc(sizeof(*te)); - if 
(te == NULL) return AE_ERR; - te->id = id; - aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms); - te->timeProc = proc; - te->finalizerProc = finalizerProc; - te->clientData = clientData; - te->next = eventLoop->timeEventHead; - eventLoop->timeEventHead = te; - return id; -} - -int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) -{ - aeTimeEvent *te, *prev = NULL; - - te = eventLoop->timeEventHead; - while(te) { - if (te->id == id) { - if (prev == NULL) - eventLoop->timeEventHead = te->next; - else - prev->next = te->next; - if (te->finalizerProc) - te->finalizerProc(eventLoop, te->clientData); - zfree(te); - return AE_OK; - } - prev = te; - te = te->next; - } - return AE_ERR; /* NO event with the specified ID found */ -} - -/* Search the first timer to fire. - * This operation is useful to know how many time the select can be - * put in sleep without to delay any event. - * If there are no timers NULL is returned. - * - * Note that's O(N) since time events are unsorted. - * Possible optimizations (not needed by Redis so far, but...): - * 1) Insert the event in order, so that the nearest is just the head. - * Much better but still insertion or deletion of timers is O(N). - * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)). 
- */ -static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop) -{ - aeTimeEvent *te = eventLoop->timeEventHead; - aeTimeEvent *nearest = NULL; - - while(te) { - if (!nearest || te->when_sec < nearest->when_sec || - (te->when_sec == nearest->when_sec && - te->when_ms < nearest->when_ms)) - nearest = te; - te = te->next; - } - return nearest; -} - -/* Process time events */ -static int processTimeEvents(aeEventLoop *eventLoop) { - int processed = 0; - aeTimeEvent *te; - long long maxId; - - te = eventLoop->timeEventHead; - maxId = eventLoop->timeEventNextId-1; - while(te) { - long now_sec, now_ms; - long long id; - - if (te->id > maxId) { - te = te->next; - continue; - } - aeGetTime(&now_sec, &now_ms); - if (now_sec > te->when_sec || - (now_sec == te->when_sec && now_ms >= te->when_ms)) - { - int retval; - - id = te->id; - retval = te->timeProc(eventLoop, id, te->clientData); - processed++; - /* After an event is processed our time event list may - * no longer be the same, so we restart from head. - * Still we make sure to don't process events registered - * by event handlers itself in order to don't loop forever. - * To do so we saved the max ID we want to handle. - * - * FUTURE OPTIMIZATIONS: - * Note that this is NOT great algorithmically. Redis uses - * a single time event so it's not a problem but the right - * way to do this is to add the new elements on head, and - * to flag deleted elements in a special way for later - * deletion (putting references to the nodes to delete into - * another linked list). */ - if (retval != AE_NOMORE) { - aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms); - } else { - aeDeleteTimeEvent(eventLoop, id); - } - te = eventLoop->timeEventHead; - } else { - te = te->next; - } - } - return processed; -} - -/* Process every pending time event, then every pending file event - * (that may be registered by time event callbacks just processed). 
- * Without special flags the function sleeps until some file event - * fires, or when the next time event occurrs (if any). - * - * If flags is 0, the function does nothing and returns. - * if flags has AE_ALL_EVENTS set, all the kind of events are processed. - * if flags has AE_FILE_EVENTS set, file events are processed. - * if flags has AE_TIME_EVENTS set, time events are processed. - * if flags has AE_DONT_WAIT set the function returns ASAP until all - * the events that's possible to process without to wait are processed. - * - * The function returns the number of events processed. */ -int aeProcessEvents(aeEventLoop *eventLoop, int flags) -{ - int processed = 0, numevents; - - /* Nothing to do? return ASAP */ - if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0; - - /* Note that we want call select() even if there are no - * file events to process as long as we want to process time - * events, in order to sleep until the next time event is ready - * to fire. */ - if (eventLoop->maxfd != -1 || - ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) { - int j; - aeTimeEvent *shortest = NULL; - struct timeval tv, *tvp; - - if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT)) - shortest = aeSearchNearestTimer(eventLoop); - if (shortest) { - long now_sec, now_ms; - - /* Calculate the time missing for the nearest - * timer to fire. 
*/ - aeGetTime(&now_sec, &now_ms); - tvp = &tv; - tvp->tv_sec = shortest->when_sec - now_sec; - if (shortest->when_ms < now_ms) { - tvp->tv_usec = ((shortest->when_ms+1000) - now_ms)*1000; - tvp->tv_sec --; - } else { - tvp->tv_usec = (shortest->when_ms - now_ms)*1000; - } - if (tvp->tv_sec < 0) tvp->tv_sec = 0; - if (tvp->tv_usec < 0) tvp->tv_usec = 0; - } else { - /* If we have to check for events but need to return - * ASAP because of AE_DONT_WAIT we need to se the timeout - * to zero */ - if (flags & AE_DONT_WAIT) { - tv.tv_sec = tv.tv_usec = 0; - tvp = &tv; - } else { - /* Otherwise we can block */ - tvp = NULL; /* wait forever */ - } - } - - numevents = aeApiPoll(eventLoop, tvp); - for (j = 0; j < numevents; j++) { - aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd]; - int mask = eventLoop->fired[j].mask; - int fd = eventLoop->fired[j].fd; - int rfired = 0; - - /* note the fe->mask & mask & ... code: maybe an already processed - * event removed an element that fired and we still didn't - * processed, so we check if the event is still valid. 
*/ - if (fe->mask & mask & AE_READABLE) { - rfired = 1; - fe->rfileProc(eventLoop,fd,fe->clientData,mask); - } - if (fe->mask & mask & AE_WRITABLE) { - if (!rfired || fe->wfileProc != fe->rfileProc) - fe->wfileProc(eventLoop,fd,fe->clientData,mask); - } - processed++; - } - } - /* Check time events */ - if (flags & AE_TIME_EVENTS) - processed += processTimeEvents(eventLoop); - - return processed; /* return the number of processed file/time events */ -} - -/* Wait for millseconds until the given file descriptor becomes - * writable/readable/exception */ -int aeWait(int fd, int mask, long long milliseconds) { - struct timeval tv; - fd_set rfds, wfds, efds; - int retmask = 0, retval; - - tv.tv_sec = milliseconds/1000; - tv.tv_usec = (milliseconds%1000)*1000; - FD_ZERO(&rfds); - FD_ZERO(&wfds); - FD_ZERO(&efds); - - if (mask & AE_READABLE) FD_SET(fd,&rfds); - if (mask & AE_WRITABLE) FD_SET(fd,&wfds); - if ((retval = select(fd+1, &rfds, &wfds, &efds, &tv)) > 0) { - if (FD_ISSET(fd,&rfds)) retmask |= AE_READABLE; - if (FD_ISSET(fd,&wfds)) retmask |= AE_WRITABLE; - return retmask; - } else { - return retval; - } -} - -void aeMain(aeEventLoop *eventLoop) { - eventLoop->stop = 0; - while (!eventLoop->stop) { - if (eventLoop->beforesleep != NULL) - eventLoop->beforesleep(eventLoop); - aeProcessEvents(eventLoop, AE_ALL_EVENTS); - } -} - -char *aeGetApiName(void) { - return aeApiName(); -} - -void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) { - eventLoop->beforesleep = beforesleep; -} diff --git a/ae.h b/ae.h deleted file mode 100644 index a9db18ed9..000000000 --- a/ae.h +++ /dev/null @@ -1,117 +0,0 @@ -/* A simple event-driven programming library. Originally I wrote this code - * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated - * it in form of a library for easy reuse. - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __AE_H__ -#define __AE_H__ - -#define AE_SETSIZE (1024*10) /* Max number of fd supported */ - -#define AE_OK 0 -#define AE_ERR -1 - -#define AE_NONE 0 -#define AE_READABLE 1 -#define AE_WRITABLE 2 - -#define AE_FILE_EVENTS 1 -#define AE_TIME_EVENTS 2 -#define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS) -#define AE_DONT_WAIT 4 - -#define AE_NOMORE -1 - -/* Macros */ -#define AE_NOTUSED(V) ((void) V) - -struct aeEventLoop; - -/* Types and data structures */ -typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); -typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); -typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); -typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); - -/* File event structure */ -typedef struct aeFileEvent { - int mask; /* one of AE_(READABLE|WRITABLE) */ - aeFileProc *rfileProc; - aeFileProc *wfileProc; - void *clientData; -} aeFileEvent; - -/* Time event structure */ -typedef struct aeTimeEvent { - long long id; /* time event identifier. 
*/ - long when_sec; /* seconds */ - long when_ms; /* milliseconds */ - aeTimeProc *timeProc; - aeEventFinalizerProc *finalizerProc; - void *clientData; - struct aeTimeEvent *next; -} aeTimeEvent; - -/* A fired event */ -typedef struct aeFiredEvent { - int fd; - int mask; -} aeFiredEvent; - -/* State of an event based program */ -typedef struct aeEventLoop { - int maxfd; - long long timeEventNextId; - aeFileEvent events[AE_SETSIZE]; /* Registered events */ - aeFiredEvent fired[AE_SETSIZE]; /* Fired events */ - aeTimeEvent *timeEventHead; - int stop; - void *apidata; /* This is used for polling API specific data */ - aeBeforeSleepProc *beforesleep; -} aeEventLoop; - -/* Prototypes */ -aeEventLoop *aeCreateEventLoop(void); -void aeDeleteEventLoop(aeEventLoop *eventLoop); -void aeStop(aeEventLoop *eventLoop); -int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, - aeFileProc *proc, void *clientData); -void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask); -long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, - aeTimeProc *proc, void *clientData, - aeEventFinalizerProc *finalizerProc); -int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id); -int aeProcessEvents(aeEventLoop *eventLoop, int flags); -int aeWait(int fd, int mask, long long milliseconds); -void aeMain(aeEventLoop *eventLoop); -char *aeGetApiName(void); -void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); - -#endif diff --git a/ae_epoll.c b/ae_epoll.c deleted file mode 100644 index d48977b65..000000000 --- a/ae_epoll.c +++ /dev/null @@ -1,91 +0,0 @@ -/* Linux epoll(2) based ae.c module - * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com - * Released under the BSD license. See the COPYING file for more info. 
*/ - -#include - -typedef struct aeApiState { - int epfd; - struct epoll_event events[AE_SETSIZE]; -} aeApiState; - -static int aeApiCreate(aeEventLoop *eventLoop) { - aeApiState *state = zmalloc(sizeof(aeApiState)); - - if (!state) return -1; - state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */ - if (state->epfd == -1) return -1; - eventLoop->apidata = state; - return 0; -} - -static void aeApiFree(aeEventLoop *eventLoop) { - aeApiState *state = eventLoop->apidata; - - close(state->epfd); - zfree(state); -} - -static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { - aeApiState *state = eventLoop->apidata; - struct epoll_event ee; - /* If the fd was already monitored for some event, we need a MOD - * operation. Otherwise we need an ADD operation. */ - int op = eventLoop->events[fd].mask == AE_NONE ? - EPOLL_CTL_ADD : EPOLL_CTL_MOD; - - ee.events = 0; - mask |= eventLoop->events[fd].mask; /* Merge old events */ - if (mask & AE_READABLE) ee.events |= EPOLLIN; - if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; - ee.data.u64 = 0; /* avoid valgrind warning */ - ee.data.fd = fd; - if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1; - return 0; -} - -static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) { - aeApiState *state = eventLoop->apidata; - struct epoll_event ee; - int mask = eventLoop->events[fd].mask & (~delmask); - - ee.events = 0; - if (mask & AE_READABLE) ee.events |= EPOLLIN; - if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; - ee.data.u64 = 0; /* avoid valgrind warning */ - ee.data.fd = fd; - if (mask != AE_NONE) { - epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee); - } else { - /* Note, Kernel < 2.6.9 requires a non null event pointer even for - * EPOLL_CTL_DEL. 
*/ - epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee); - } -} - -static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { - aeApiState *state = eventLoop->apidata; - int retval, numevents = 0; - - retval = epoll_wait(state->epfd,state->events,AE_SETSIZE, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); - if (retval > 0) { - int j; - - numevents = retval; - for (j = 0; j < numevents; j++) { - int mask = 0; - struct epoll_event *e = state->events+j; - - if (e->events & EPOLLIN) mask |= AE_READABLE; - if (e->events & EPOLLOUT) mask |= AE_WRITABLE; - eventLoop->fired[j].fd = e->data.fd; - eventLoop->fired[j].mask = mask; - } - } - return numevents; -} - -static char *aeApiName(void) { - return "epoll"; -} diff --git a/ae_kqueue.c b/ae_kqueue.c deleted file mode 100644 index 04c3536ba..000000000 --- a/ae_kqueue.c +++ /dev/null @@ -1,93 +0,0 @@ -/* Kqueue(2)-based ae.c module - * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com - * Released under the BSD license. See the COPYING file for more info. 
*/ - -#include -#include -#include - -typedef struct aeApiState { - int kqfd; - struct kevent events[AE_SETSIZE]; -} aeApiState; - -static int aeApiCreate(aeEventLoop *eventLoop) { - aeApiState *state = zmalloc(sizeof(aeApiState)); - - if (!state) return -1; - state->kqfd = kqueue(); - if (state->kqfd == -1) return -1; - eventLoop->apidata = state; - - return 0; -} - -static void aeApiFree(aeEventLoop *eventLoop) { - aeApiState *state = eventLoop->apidata; - - close(state->kqfd); - zfree(state); -} - -static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { - aeApiState *state = eventLoop->apidata; - struct kevent ke; - - if (mask & AE_READABLE) { - EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL); - if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1; - } - if (mask & AE_WRITABLE) { - EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL); - if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1; - } - return 0; -} - -static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { - aeApiState *state = eventLoop->apidata; - struct kevent ke; - - if (mask & AE_READABLE) { - EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); - kevent(state->kqfd, &ke, 1, NULL, 0, NULL); - } - if (mask & AE_WRITABLE) { - EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); - kevent(state->kqfd, &ke, 1, NULL, 0, NULL); - } -} - -static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { - aeApiState *state = eventLoop->apidata; - int retval, numevents = 0; - - if (tvp != NULL) { - struct timespec timeout; - timeout.tv_sec = tvp->tv_sec; - timeout.tv_nsec = tvp->tv_usec * 1000; - retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout); - } else { - retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL); - } - - if (retval > 0) { - int j; - - numevents = retval; - for(j = 0; j < numevents; j++) { - int mask = 0; - struct kevent *e = state->events+j; - - if (e->filter == EVFILT_READ) mask |= AE_READABLE; - 
if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE; - eventLoop->fired[j].fd = e->ident; - eventLoop->fired[j].mask = mask; - } - } - return numevents; -} - -static char *aeApiName(void) { - return "kqueue"; -} diff --git a/ae_select.c b/ae_select.c deleted file mode 100644 index 43f5867f3..000000000 --- a/ae_select.c +++ /dev/null @@ -1,72 +0,0 @@ -/* Select()-based ae.c module - * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com - * Released under the BSD license. See the COPYING file for more info. */ - -#include - -typedef struct aeApiState { - fd_set rfds, wfds; - /* We need to have a copy of the fd sets as it's not safe to reuse - * FD sets after select(). */ - fd_set _rfds, _wfds; -} aeApiState; - -static int aeApiCreate(aeEventLoop *eventLoop) { - aeApiState *state = zmalloc(sizeof(aeApiState)); - - if (!state) return -1; - FD_ZERO(&state->rfds); - FD_ZERO(&state->wfds); - eventLoop->apidata = state; - return 0; -} - -static void aeApiFree(aeEventLoop *eventLoop) { - zfree(eventLoop->apidata); -} - -static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { - aeApiState *state = eventLoop->apidata; - - if (mask & AE_READABLE) FD_SET(fd,&state->rfds); - if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds); - return 0; -} - -static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { - aeApiState *state = eventLoop->apidata; - - if (mask & AE_READABLE) FD_CLR(fd,&state->rfds); - if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds); -} - -static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { - aeApiState *state = eventLoop->apidata; - int retval, j, numevents = 0; - - memcpy(&state->_rfds,&state->rfds,sizeof(fd_set)); - memcpy(&state->_wfds,&state->wfds,sizeof(fd_set)); - - retval = select(eventLoop->maxfd+1, - &state->_rfds,&state->_wfds,NULL,tvp); - if (retval > 0) { - for (j = 0; j <= eventLoop->maxfd; j++) { - int mask = 0; - aeFileEvent *fe = &eventLoop->events[j]; - - if (fe->mask == AE_NONE) continue; - if 
(fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds)) - mask |= AE_READABLE; - if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds)) - mask |= AE_WRITABLE; - eventLoop->fired[numevents].fd = j; - eventLoop->fired[numevents].mask = mask; - numevents++; - } - } - return numevents; -} - -static char *aeApiName(void) { - return "select"; -} diff --git a/anet.c b/anet.c deleted file mode 100644 index 4fe811a11..000000000 --- a/anet.c +++ /dev/null @@ -1,270 +0,0 @@ -/* anet.c -- Basic TCP socket stuff made a bit less boring - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "fmacros.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "anet.h" - -static void anetSetError(char *err, const char *fmt, ...) -{ - va_list ap; - - if (!err) return; - va_start(ap, fmt); - vsnprintf(err, ANET_ERR_LEN, fmt, ap); - va_end(ap); -} - -int anetNonBlock(char *err, int fd) -{ - int flags; - - /* Set the socket nonblocking. - * Note that fcntl(2) for F_GETFL and F_SETFL can't be - * interrupted by a signal. 
*/ - if ((flags = fcntl(fd, F_GETFL)) == -1) { - anetSetError(err, "fcntl(F_GETFL): %s\n", strerror(errno)); - return ANET_ERR; - } - if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { - anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s\n", strerror(errno)); - return ANET_ERR; - } - return ANET_OK; -} - -int anetTcpNoDelay(char *err, int fd) -{ - int yes = 1; - if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes)) == -1) - { - anetSetError(err, "setsockopt TCP_NODELAY: %s\n", strerror(errno)); - return ANET_ERR; - } - return ANET_OK; -} - -int anetSetSendBuffer(char *err, int fd, int buffsize) -{ - if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buffsize, sizeof(buffsize)) == -1) - { - anetSetError(err, "setsockopt SO_SNDBUF: %s\n", strerror(errno)); - return ANET_ERR; - } - return ANET_OK; -} - -int anetTcpKeepAlive(char *err, int fd) -{ - int yes = 1; - if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes)) == -1) { - anetSetError(err, "setsockopt SO_KEEPALIVE: %s\n", strerror(errno)); - return ANET_ERR; - } - return ANET_OK; -} - -int anetResolve(char *err, char *host, char *ipbuf) -{ - struct sockaddr_in sa; - - sa.sin_family = AF_INET; - if (inet_aton(host, &sa.sin_addr) == 0) { - struct hostent *he; - - he = gethostbyname(host); - if (he == NULL) { - anetSetError(err, "can't resolve: %s\n", host); - return ANET_ERR; - } - memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); - } - strcpy(ipbuf,inet_ntoa(sa.sin_addr)); - return ANET_OK; -} - -#define ANET_CONNECT_NONE 0 -#define ANET_CONNECT_NONBLOCK 1 -static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) -{ - int s, on = 1; - struct sockaddr_in sa; - - if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) { - anetSetError(err, "creating socket: %s\n", strerror(errno)); - return ANET_ERR; - } - /* Make sure connection-intensive things like the redis benckmark - * will be able to close/open sockets a zillion of times */ - setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, 
sizeof(on)); - - sa.sin_family = AF_INET; - sa.sin_port = htons(port); - if (inet_aton(addr, &sa.sin_addr) == 0) { - struct hostent *he; - - he = gethostbyname(addr); - if (he == NULL) { - anetSetError(err, "can't resolve: %s\n", addr); - close(s); - return ANET_ERR; - } - memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); - } - if (flags & ANET_CONNECT_NONBLOCK) { - if (anetNonBlock(err,s) != ANET_OK) - return ANET_ERR; - } - if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { - if (errno == EINPROGRESS && - flags & ANET_CONNECT_NONBLOCK) - return s; - - anetSetError(err, "connect: %s\n", strerror(errno)); - close(s); - return ANET_ERR; - } - return s; -} - -int anetTcpConnect(char *err, char *addr, int port) -{ - return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONE); -} - -int anetTcpNonBlockConnect(char *err, char *addr, int port) -{ - return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONBLOCK); -} - -/* Like read(2) but make sure 'count' is read before to return - * (unless error or EOF condition is encountered) */ -int anetRead(int fd, char *buf, int count) -{ - int nread, totlen = 0; - while(totlen != count) { - nread = read(fd,buf,count-totlen); - if (nread == 0) return totlen; - if (nread == -1) return -1; - totlen += nread; - buf += nread; - } - return totlen; -} - -/* Like write(2) but make sure 'count' is read before to return - * (unless error is encountered) */ -int anetWrite(int fd, char *buf, int count) -{ - int nwritten, totlen = 0; - while(totlen != count) { - nwritten = write(fd,buf,count-totlen); - if (nwritten == 0) return totlen; - if (nwritten == -1) return -1; - totlen += nwritten; - buf += nwritten; - } - return totlen; -} - -int anetTcpServer(char *err, int port, char *bindaddr) -{ - int s, on = 1; - struct sockaddr_in sa; - - if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) { - anetSetError(err, "socket: %s\n", strerror(errno)); - return ANET_ERR; - } - if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, 
sizeof(on)) == -1) { - anetSetError(err, "setsockopt SO_REUSEADDR: %s\n", strerror(errno)); - close(s); - return ANET_ERR; - } - memset(&sa,0,sizeof(sa)); - sa.sin_family = AF_INET; - sa.sin_port = htons(port); - sa.sin_addr.s_addr = htonl(INADDR_ANY); - if (bindaddr) { - if (inet_aton(bindaddr, &sa.sin_addr) == 0) { - anetSetError(err, "Invalid bind address\n"); - close(s); - return ANET_ERR; - } - } - if (bind(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { - anetSetError(err, "bind: %s\n", strerror(errno)); - close(s); - return ANET_ERR; - } - if (listen(s, 511) == -1) { /* the magic 511 constant is from nginx */ - anetSetError(err, "listen: %s\n", strerror(errno)); - close(s); - return ANET_ERR; - } - return s; -} - -int anetAccept(char *err, int serversock, char *ip, int *port) -{ - int fd; - struct sockaddr_in sa; - unsigned int saLen; - - while(1) { - saLen = sizeof(sa); - fd = accept(serversock, (struct sockaddr*)&sa, &saLen); - if (fd == -1) { - if (errno == EINTR) - continue; - else { - anetSetError(err, "accept: %s\n", strerror(errno)); - return ANET_ERR; - } - } - break; - } - if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); - if (port) *port = ntohs(sa.sin_port); - return fd; -} diff --git a/anet.h b/anet.h deleted file mode 100644 index ce0f47787..000000000 --- a/anet.h +++ /dev/null @@ -1,49 +0,0 @@ -/* anet.c -- Basic TCP socket stuff made a bit less boring - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef ANET_H -#define ANET_H - -#define ANET_OK 0 -#define ANET_ERR -1 -#define ANET_ERR_LEN 256 - -int anetTcpConnect(char *err, char *addr, int port); -int anetTcpNonBlockConnect(char *err, char *addr, int port); -int anetRead(int fd, char *buf, int count); -int anetResolve(char *err, char *host, char *ipbuf); -int anetTcpServer(char *err, int port, char *bindaddr); -int anetAccept(char *err, int serversock, char *ip, int *port); -int anetWrite(int fd, char *buf, int count); -int anetNonBlock(char *err, int fd); -int anetTcpNoDelay(char *err, int fd); -int anetTcpKeepAlive(char *err, int fd); - -#endif diff --git a/config.h b/config.h deleted file mode 100644 index 6e98fbb2c..000000000 --- a/config.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef __CONFIG_H -#define __CONFIG_H - -#ifdef __APPLE__ -#include -#endif - -/* test for malloc_size() */ -#ifdef __APPLE__ -#include -#define HAVE_MALLOC_SIZE 1 -#define redis_malloc_size(p) malloc_size(p) -#endif - -/* define redis_fstat to fstat or 
fstat64() */ -#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) -#define redis_fstat fstat64 -#define redis_stat stat64 -#else -#define redis_fstat fstat -#define redis_stat stat -#endif - -/* test for backtrace() */ -#if defined(__APPLE__) || defined(__linux__) -#define HAVE_BACKTRACE 1 -#endif - -/* test for polling API */ -#ifdef __linux__ -#define HAVE_EPOLL 1 -#endif - -#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) -#define HAVE_KQUEUE 1 -#endif - -/* define aof_fsync to fdatasync() in Linux and fsync() for all the rest */ -#ifdef __linux__ -#define aof_fsync fdatasync -#else -#define aof_fsync fsync -#endif - -#endif diff --git a/dict.c b/dict.c deleted file mode 100644 index d5010708c..000000000 --- a/dict.c +++ /dev/null @@ -1,727 +0,0 @@ -/* Hash Tables Implementation. - * - * This file implements in memory hash tables with insert/del/replace/find/ - * get-random-element operations. Hash tables will auto resize if needed - * tables of power of two in size are used, collisions are handled by - * chaining. See the source code for more information... :) - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "fmacros.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "dict.h" -#include "zmalloc.h" - -/* Using dictEnableResize() / dictDisableResize() we make possible to - * enable/disable resizing of the hash table as needed. This is very important - * for Redis, as we use copy-on-write and don't want to move too much memory - * around when there is a child performing saving operations. */ -static int dict_can_resize = 1; - -/* ---------------------------- Utility funcitons --------------------------- */ - -static void _dictPanic(const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - fprintf(stderr, "\nDICT LIBRARY PANIC: "); - vfprintf(stderr, fmt, ap); - fprintf(stderr, "\n\n"); - va_end(ap); -} - -/* ------------------------- Heap Management Wrappers------------------------ */ - -static void *_dictAlloc(size_t size) -{ - void *p = zmalloc(size); - if (p == NULL) - _dictPanic("Out of memory"); - return p; -} - -static void _dictFree(void *ptr) { - zfree(ptr); -} - -/* -------------------------- private prototypes ---------------------------- */ - -static int _dictExpandIfNeeded(dict *ht); -static unsigned long _dictNextPower(unsigned long size); -static int _dictKeyIndex(dict *ht, const void *key); -static int _dictInit(dict *ht, dictType *type, void *privDataPtr); - -/* -------------------------- hash functions -------------------------------- */ - -/* Thomas Wang's 32 bit Mix Function */ -unsigned int dictIntHashFunction(unsigned int key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; -} - -/* Identity hash function for integer keys */ -unsigned int dictIdentityHashFunction(unsigned int key) -{ - return key; -} - -/* Generic hash function (a popular one from Bernstein). - * I tested a few and this was the best. */ -unsigned int dictGenHashFunction(const unsigned char *buf, int len) { - unsigned int hash = 5381; - - while (len--) - hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */ - return hash; -} - -/* ----------------------------- API implementation ------------------------- */ - -/* Reset an hashtable already initialized with ht_init(). - * NOTE: This function should only called by ht_destroy(). 
*/ -static void _dictReset(dictht *ht) -{ - ht->table = NULL; - ht->size = 0; - ht->sizemask = 0; - ht->used = 0; -} - -/* Create a new hash table */ -dict *dictCreate(dictType *type, - void *privDataPtr) -{ - dict *d = _dictAlloc(sizeof(*d)); - - _dictInit(d,type,privDataPtr); - return d; -} - -/* Initialize the hash table */ -int _dictInit(dict *d, dictType *type, - void *privDataPtr) -{ - _dictReset(&d->ht[0]); - _dictReset(&d->ht[1]); - d->type = type; - d->privdata = privDataPtr; - d->rehashidx = -1; - d->iterators = 0; - return DICT_OK; -} - -/* Resize the table to the minimal size that contains all the elements, - * but with the invariant of a USER/BUCKETS ration near to <= 1 */ -int dictResize(dict *d) -{ - int minimal; - - if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; - minimal = d->ht[0].used; - if (minimal < DICT_HT_INITIAL_SIZE) - minimal = DICT_HT_INITIAL_SIZE; - return dictExpand(d, minimal); -} - -/* Expand or create the hashtable */ -int dictExpand(dict *d, unsigned long size) -{ - dictht n; /* the new hashtable */ - unsigned long realsize = _dictNextPower(size); - - /* the size is invalid if it is smaller than the number of - * elements already inside the hashtable */ - if (dictIsRehashing(d) || d->ht[0].used > size) - return DICT_ERR; - - n.size = realsize; - n.sizemask = realsize-1; - n.table = _dictAlloc(realsize*sizeof(dictEntry*)); - n.used = 0; - - /* Initialize all the pointers to NULL */ - memset(n.table, 0, realsize*sizeof(dictEntry*)); - - /* Is this the first initialization? If so it's not really a rehashing - * we just set the first hash table so that it can accept keys. */ - if (d->ht[0].table == NULL) { - d->ht[0] = n; - return DICT_OK; - } - - /* Prepare a second hash table for incremental rehashing */ - d->ht[1] = n; - d->rehashidx = 0; - return DICT_OK; -} - -/* Performs N steps of incremental rehashing. Returns 1 if there are still - * keys to move from the old to the new hash table, otherwise 0 is returned. 
- * Note that a rehashing step consists in moving a bucket (that may have more - * thank one key as we use chaining) from the old to the new hash table. */ -int dictRehash(dict *d, int n) { - if (!dictIsRehashing(d)) return 0; - - while(n--) { - dictEntry *de, *nextde; - - /* Check if we already rehashed the whole table... */ - if (d->ht[0].used == 0) { - _dictFree(d->ht[0].table); - d->ht[0] = d->ht[1]; - _dictReset(&d->ht[1]); - d->rehashidx = -1; - return 0; - } - - /* Note that rehashidx can't overflow as we are sure there are more - * elements because ht[0].used != 0 */ - while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++; - de = d->ht[0].table[d->rehashidx]; - /* Move all the keys in this bucket from the old to the new hash HT */ - while(de) { - unsigned int h; - - nextde = de->next; - /* Get the index in the new hash table */ - h = dictHashKey(d, de->key) & d->ht[1].sizemask; - de->next = d->ht[1].table[h]; - d->ht[1].table[h] = de; - d->ht[0].used--; - d->ht[1].used++; - de = nextde; - } - d->ht[0].table[d->rehashidx] = NULL; - d->rehashidx++; - } - return 1; -} - -long long timeInMilliseconds(void) { - struct timeval tv; - - gettimeofday(&tv,NULL); - return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000); -} - -/* Rehash for an amount of time between ms milliseconds and ms+1 milliseconds */ -int dictRehashMilliseconds(dict *d, int ms) { - long long start = timeInMilliseconds(); - int rehashes = 0; - - while(dictRehash(d,100)) { - rehashes += 100; - if (timeInMilliseconds()-start > ms) break; - } - return rehashes; -} - -/* This function performs just a step of rehashing, and only if there are - * not iterators bound to our hash table. When we have iterators in the middle - * of a rehashing we can't mess with the two hash tables otherwise some element - * can be missed or duplicated. 
- * - * This function is called by common lookup or update operations in the - * dictionary so that the hash table automatically migrates from H1 to H2 - * while it is actively used. */ -static void _dictRehashStep(dict *d) { - if (d->iterators == 0) dictRehash(d,1); -} - -/* Add an element to the target hash table */ -int dictAdd(dict *d, void *key, void *val) -{ - int index; - dictEntry *entry; - dictht *ht; - - if (dictIsRehashing(d)) _dictRehashStep(d); - - /* Get the index of the new element, or -1 if - * the element already exists. */ - if ((index = _dictKeyIndex(d, key)) == -1) - return DICT_ERR; - - /* Allocates the memory and stores key */ - ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; - entry = _dictAlloc(sizeof(*entry)); - entry->next = ht->table[index]; - ht->table[index] = entry; - ht->used++; - - /* Set the hash entry fields. */ - dictSetHashKey(d, entry, key); - dictSetHashVal(d, entry, val); - return DICT_OK; -} - -/* Add an element, discarding the old if the key already exists. - * Return 1 if the key was added from scratch, 0 if there was already an - * element with such key and dictReplace() just performed a value update - * operation. */ -int dictReplace(dict *d, void *key, void *val) -{ - dictEntry *entry, auxentry; - - /* Try to add the element. If the key - * does not exists dictAdd will suceed. */ - if (dictAdd(d, key, val) == DICT_OK) - return 1; - /* It already exists, get the entry */ - entry = dictFind(d, key); - /* Free the old value and set the new one */ - /* Set the new value and free the old one. Note that it is important - * to do that in this order, as the value may just be exactly the same - * as the previous one. In this context, think to reference counting, - * you want to increment (set), and then decrement (free), and not the - * reverse. 
*/ - auxentry = *entry; - dictSetHashVal(d, entry, val); - dictFreeEntryVal(d, &auxentry); - return 0; -} - -/* Search and remove an element */ -static int dictGenericDelete(dict *d, const void *key, int nofree) -{ - unsigned int h, idx; - dictEntry *he, *prevHe; - int table; - - if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */ - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); - - for (table = 0; table <= 1; table++) { - idx = h & d->ht[table].sizemask; - he = d->ht[table].table[idx]; - prevHe = NULL; - while(he) { - if (dictCompareHashKeys(d, key, he->key)) { - /* Unlink the element from the list */ - if (prevHe) - prevHe->next = he->next; - else - d->ht[table].table[idx] = he->next; - if (!nofree) { - dictFreeEntryKey(d, he); - dictFreeEntryVal(d, he); - } - _dictFree(he); - d->ht[table].used--; - return DICT_OK; - } - prevHe = he; - he = he->next; - } - if (!dictIsRehashing(d)) break; - } - return DICT_ERR; /* not found */ -} - -int dictDelete(dict *ht, const void *key) { - return dictGenericDelete(ht,key,0); -} - -int dictDeleteNoFree(dict *ht, const void *key) { - return dictGenericDelete(ht,key,1); -} - -/* Destroy an entire dictionary */ -int _dictClear(dict *d, dictht *ht) -{ - unsigned long i; - - /* Free all the elements */ - for (i = 0; i < ht->size && ht->used > 0; i++) { - dictEntry *he, *nextHe; - - if ((he = ht->table[i]) == NULL) continue; - while(he) { - nextHe = he->next; - dictFreeEntryKey(d, he); - dictFreeEntryVal(d, he); - _dictFree(he); - ht->used--; - he = nextHe; - } - } - /* Free the table and the allocated cache structure */ - _dictFree(ht->table); - /* Re-initialize the table */ - _dictReset(ht); - return DICT_OK; /* never fails */ -} - -/* Clear & Release the hash table */ -void dictRelease(dict *d) -{ - _dictClear(d,&d->ht[0]); - _dictClear(d,&d->ht[1]); - _dictFree(d); -} - -dictEntry *dictFind(dict *d, const void *key) -{ - dictEntry *he; - unsigned int h, idx, table; - - if (d->ht[0].size 
== 0) return NULL; /* We don't have a table at all */ - if (dictIsRehashing(d)) _dictRehashStep(d); - h = dictHashKey(d, key); - for (table = 0; table <= 1; table++) { - idx = h & d->ht[table].sizemask; - he = d->ht[table].table[idx]; - while(he) { - if (dictCompareHashKeys(d, key, he->key)) - return he; - he = he->next; - } - if (!dictIsRehashing(d)) return NULL; - } - return NULL; -} - -void *dictFetchValue(dict *d, const void *key) { - dictEntry *he; - - he = dictFind(d,key); - return he ? dictGetEntryVal(he) : NULL; -} - -dictIterator *dictGetIterator(dict *d) -{ - dictIterator *iter = _dictAlloc(sizeof(*iter)); - - iter->d = d; - iter->table = 0; - iter->index = -1; - iter->entry = NULL; - iter->nextEntry = NULL; - return iter; -} - -dictEntry *dictNext(dictIterator *iter) -{ - while (1) { - if (iter->entry == NULL) { - dictht *ht = &iter->d->ht[iter->table]; - if (iter->index == -1 && iter->table == 0) iter->d->iterators++; - iter->index++; - if (iter->index >= (signed) ht->size) { - if (dictIsRehashing(iter->d) && iter->table == 0) { - iter->table++; - iter->index = 0; - ht = &iter->d->ht[1]; - } else { - break; - } - } - iter->entry = ht->table[iter->index]; - } else { - iter->entry = iter->nextEntry; - } - if (iter->entry) { - /* We need to save the 'next' here, the iterator user - * may delete the entry we are returning. */ - iter->nextEntry = iter->entry->next; - return iter->entry; - } - } - return NULL; -} - -void dictReleaseIterator(dictIterator *iter) -{ - if (!(iter->index == -1 && iter->table == 0)) iter->d->iterators--; - _dictFree(iter); -} - -/* Return a random entry from the hash table. Useful to - * implement randomized algorithms */ -dictEntry *dictGetRandomKey(dict *d) -{ - dictEntry *he, *orighe; - unsigned int h; - int listlen, listele; - - if (dictSize(d) == 0) return NULL; - if (dictIsRehashing(d)) _dictRehashStep(d); - if (dictIsRehashing(d)) { - do { - h = random() % (d->ht[0].size+d->ht[1].size); - he = (h >= d->ht[0].size) ? 
d->ht[1].table[h - d->ht[0].size] : - d->ht[0].table[h]; - } while(he == NULL); - } else { - do { - h = random() & d->ht[0].sizemask; - he = d->ht[0].table[h]; - } while(he == NULL); - } - - /* Now we found a non empty bucket, but it is a linked - * list and we need to get a random element from the list. - * The only sane way to do so is counting the elements and - * select a random index. */ - listlen = 0; - orighe = he; - while(he) { - he = he->next; - listlen++; - } - listele = random() % listlen; - he = orighe; - while(listele--) he = he->next; - return he; -} - -/* ------------------------- private functions ------------------------------ */ - -/* Expand the hash table if needed */ -static int _dictExpandIfNeeded(dict *d) -{ - /* If the hash table is empty expand it to the intial size, - * if the table is "full" dobule its size. */ - if (dictIsRehashing(d)) return DICT_OK; - if (d->ht[0].size == 0) - return dictExpand(d, DICT_HT_INITIAL_SIZE); - if (d->ht[0].used >= d->ht[0].size && dict_can_resize) - return dictExpand(d, ((d->ht[0].size > d->ht[0].used) ? - d->ht[0].size : d->ht[0].used)*2); - return DICT_OK; -} - -/* Our hash table capability is a power of two */ -static unsigned long _dictNextPower(unsigned long size) -{ - unsigned long i = DICT_HT_INITIAL_SIZE; - - if (size >= LONG_MAX) return LONG_MAX; - while(1) { - if (i >= size) - return i; - i *= 2; - } -} - -/* Returns the index of a free slot that can be populated with - * an hash entry for the given 'key'. - * If the key already exists, -1 is returned. - * - * Note that if we are in the process of rehashing the hash table, the - * index is always returned in the context of the second (new) hash table. 
*/ -static int _dictKeyIndex(dict *d, const void *key) -{ - unsigned int h, idx, table; - dictEntry *he; - - /* Expand the hashtable if needed */ - if (_dictExpandIfNeeded(d) == DICT_ERR) - return -1; - /* Compute the key hash value */ - h = dictHashKey(d, key); - for (table = 0; table <= 1; table++) { - idx = h & d->ht[table].sizemask; - /* Search if this slot does not already contain the given key */ - he = d->ht[table].table[idx]; - while(he) { - if (dictCompareHashKeys(d, key, he->key)) - return -1; - he = he->next; - } - if (!dictIsRehashing(d)) break; - } - return idx; -} - -void dictEmpty(dict *d) { - _dictClear(d,&d->ht[0]); - _dictClear(d,&d->ht[1]); - d->rehashidx = -1; - d->iterators = 0; -} - -#define DICT_STATS_VECTLEN 50 -static void _dictPrintStatsHt(dictht *ht) { - unsigned long i, slots = 0, chainlen, maxchainlen = 0; - unsigned long totchainlen = 0; - unsigned long clvector[DICT_STATS_VECTLEN]; - - if (ht->used == 0) { - printf("No stats available for empty dictionaries\n"); - return; - } - - for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0; - for (i = 0; i < ht->size; i++) { - dictEntry *he; - - if (ht->table[i] == NULL) { - clvector[0]++; - continue; - } - slots++; - /* For each hash entry on this slot... */ - chainlen = 0; - he = ht->table[i]; - while(he) { - chainlen++; - he = he->next; - } - clvector[(chainlen < DICT_STATS_VECTLEN) ? 
chainlen : (DICT_STATS_VECTLEN-1)]++; - if (chainlen > maxchainlen) maxchainlen = chainlen; - totchainlen += chainlen; - } - printf("Hash table stats:\n"); - printf(" table size: %ld\n", ht->size); - printf(" number of elements: %ld\n", ht->used); - printf(" different slots: %ld\n", slots); - printf(" max chain length: %ld\n", maxchainlen); - printf(" avg chain length (counted): %.02f\n", (float)totchainlen/slots); - printf(" avg chain length (computed): %.02f\n", (float)ht->used/slots); - printf(" Chain length distribution:\n"); - for (i = 0; i < DICT_STATS_VECTLEN-1; i++) { - if (clvector[i] == 0) continue; - printf(" %s%ld: %ld (%.02f%%)\n",(i == DICT_STATS_VECTLEN-1)?">= ":"", i, clvector[i], ((float)clvector[i]/ht->size)*100); - } -} - -void dictPrintStats(dict *d) { - _dictPrintStatsHt(&d->ht[0]); - if (dictIsRehashing(d)) { - printf("-- Rehashing into ht[1]:\n"); - _dictPrintStatsHt(&d->ht[1]); - } -} - -void dictEnableResize(void) { - dict_can_resize = 1; -} - -void dictDisableResize(void) { - dict_can_resize = 0; -} - -/* ----------------------- StringCopy Hash Table Type ------------------------*/ - -static unsigned int _dictStringCopyHTHashFunction(const void *key) -{ - return dictGenHashFunction(key, strlen(key)); -} - -static void *_dictStringCopyHTKeyDup(void *privdata, const void *key) -{ - int len = strlen(key); - char *copy = _dictAlloc(len+1); - DICT_NOTUSED(privdata); - - memcpy(copy, key, len); - copy[len] = '\0'; - return copy; -} - -static void *_dictStringKeyValCopyHTValDup(void *privdata, const void *val) -{ - int len = strlen(val); - char *copy = _dictAlloc(len+1); - DICT_NOTUSED(privdata); - - memcpy(copy, val, len); - copy[len] = '\0'; - return copy; -} - -static int _dictStringCopyHTKeyCompare(void *privdata, const void *key1, - const void *key2) -{ - DICT_NOTUSED(privdata); - - return strcmp(key1, key2) == 0; -} - -static void _dictStringCopyHTKeyDestructor(void *privdata, void *key) -{ - DICT_NOTUSED(privdata); - - 
_dictFree((void*)key); /* ATTENTION: const cast */ -} - -static void _dictStringKeyValCopyHTValDestructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); - - _dictFree((void*)val); /* ATTENTION: const cast */ -} - -dictType dictTypeHeapStringCopyKey = { - _dictStringCopyHTHashFunction, /* hash function */ - _dictStringCopyHTKeyDup, /* key dup */ - NULL, /* val dup */ - _dictStringCopyHTKeyCompare, /* key compare */ - _dictStringCopyHTKeyDestructor, /* key destructor */ - NULL /* val destructor */ -}; - -/* This is like StringCopy but does not auto-duplicate the key. - * It's used for intepreter's shared strings. */ -dictType dictTypeHeapStrings = { - _dictStringCopyHTHashFunction, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - _dictStringCopyHTKeyCompare, /* key compare */ - _dictStringCopyHTKeyDestructor, /* key destructor */ - NULL /* val destructor */ -}; - -/* This is like StringCopy but also automatically handle dynamic - * allocated C strings as values. */ -dictType dictTypeHeapStringCopyKeyValue = { - _dictStringCopyHTHashFunction, /* hash function */ - _dictStringCopyHTKeyDup, /* key dup */ - _dictStringKeyValCopyHTValDup, /* val dup */ - _dictStringCopyHTKeyCompare, /* key compare */ - _dictStringCopyHTKeyDestructor, /* key destructor */ - _dictStringKeyValCopyHTValDestructor, /* val destructor */ -}; diff --git a/dict.h b/dict.h deleted file mode 100644 index 30ace4db7..000000000 --- a/dict.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Hash Tables Implementation. - * - * This file implements in memory hash tables with insert/del/replace/find/ - * get-random-element operations. Hash tables will auto resize if needed - * tables of power of two in size are used, collisions are handled by - * chaining. See the source code for more information... :) - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __DICT_H -#define __DICT_H - -#define DICT_OK 0 -#define DICT_ERR 1 - -/* Unused arguments generate annoying warnings... 
*/ -#define DICT_NOTUSED(V) ((void) V) - -typedef struct dictEntry { - void *key; - void *val; - struct dictEntry *next; -} dictEntry; - -typedef struct dictType { - unsigned int (*hashFunction)(const void *key); - void *(*keyDup)(void *privdata, const void *key); - void *(*valDup)(void *privdata, const void *obj); - int (*keyCompare)(void *privdata, const void *key1, const void *key2); - void (*keyDestructor)(void *privdata, void *key); - void (*valDestructor)(void *privdata, void *obj); -} dictType; - -/* This is our hash table structure. Every dictionary has two of this as we - * implement incremental rehashing, for the old to the new table. */ -typedef struct dictht { - dictEntry **table; - unsigned long size; - unsigned long sizemask; - unsigned long used; -} dictht; - -typedef struct dict { - dictType *type; - void *privdata; - dictht ht[2]; - int rehashidx; /* rehashing not in progress if rehashidx == -1 */ - int iterators; /* number of iterators currently running */ -} dict; - -typedef struct dictIterator { - dict *d; - int table; - int index; - dictEntry *entry, *nextEntry; -} dictIterator; - -/* This is the initial size of every hash table */ -#define DICT_HT_INITIAL_SIZE 4 - -/* ------------------------------- Macros ------------------------------------*/ -#define dictFreeEntryVal(d, entry) \ - if ((d)->type->valDestructor) \ - (d)->type->valDestructor((d)->privdata, (entry)->val) - -#define dictSetHashVal(d, entry, _val_) do { \ - if ((d)->type->valDup) \ - entry->val = (d)->type->valDup((d)->privdata, _val_); \ - else \ - entry->val = (_val_); \ -} while(0) - -#define dictFreeEntryKey(d, entry) \ - if ((d)->type->keyDestructor) \ - (d)->type->keyDestructor((d)->privdata, (entry)->key) - -#define dictSetHashKey(d, entry, _key_) do { \ - if ((d)->type->keyDup) \ - entry->key = (d)->type->keyDup((d)->privdata, _key_); \ - else \ - entry->key = (_key_); \ -} while(0) - -#define dictCompareHashKeys(d, key1, key2) \ - (((d)->type->keyCompare) ? 
\ - (d)->type->keyCompare((d)->privdata, key1, key2) : \ - (key1) == (key2)) - -#define dictHashKey(d, key) (d)->type->hashFunction(key) - -#define dictGetEntryKey(he) ((he)->key) -#define dictGetEntryVal(he) ((he)->val) -#define dictSlots(d) ((d)->ht[0].size+(d)->ht[1].size) -#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used) -#define dictIsRehashing(ht) ((ht)->rehashidx != -1) - -/* API */ -dict *dictCreate(dictType *type, void *privDataPtr); -int dictExpand(dict *d, unsigned long size); -int dictAdd(dict *d, void *key, void *val); -int dictReplace(dict *d, void *key, void *val); -int dictDelete(dict *d, const void *key); -int dictDeleteNoFree(dict *d, const void *key); -void dictRelease(dict *d); -dictEntry * dictFind(dict *d, const void *key); -void *dictFetchValue(dict *d, const void *key); -int dictResize(dict *d); -dictIterator *dictGetIterator(dict *d); -dictEntry *dictNext(dictIterator *iter); -void dictReleaseIterator(dictIterator *iter); -dictEntry *dictGetRandomKey(dict *d); -void dictPrintStats(dict *d); -unsigned int dictGenHashFunction(const unsigned char *buf, int len); -void dictEmpty(dict *d); -void dictEnableResize(void); -void dictDisableResize(void); -int dictRehash(dict *d, int n); -int dictRehashMilliseconds(dict *d, int ms); - -/* Hash table types */ -extern dictType dictTypeHeapStringCopyKey; -extern dictType dictTypeHeapStrings; -extern dictType dictTypeHeapStringCopyKeyValue; - -#endif /* __DICT_H */ diff --git a/fmacros.h b/fmacros.h deleted file mode 100644 index 38f46482a..000000000 --- a/fmacros.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _REDIS_FMACRO_H -#define _REDIS_FMACRO_H - -#define _BSD_SOURCE - -#ifdef __linux__ -#define _XOPEN_SOURCE 700 -#else -#define _XOPEN_SOURCE -#endif - -#define _LARGEFILE_SOURCE -#define _FILE_OFFSET_BITS 64 - -#endif diff --git a/linenoise.c b/linenoise.c deleted file mode 100644 index 0c04d03fb..000000000 --- a/linenoise.c +++ /dev/null @@ -1,433 +0,0 @@ -/* linenoise.c -- guerrilla line editing 
library against the idea that a - * line editing lib needs to be 20,000 lines of C code. - * - * You can find the latest source code at: - * - * http://github.com/antirez/linenoise - * - * Does a number of crazy assumptions that happen to be true in 99.9999% of - * the 2010 UNIX computers around. - * - * Copyright (c) 2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * References: - * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html - * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html - * - * Todo list: - * - Switch to gets() if $TERM is something we can't support. - * - Filter bogus Ctrl+ combinations. - * - Win32 support - * - * Bloat: - * - Completion? - * - History search like Ctrl+r in readline? - * - * List of escape sequences used by this program, we do everything just - * with three sequences. In order to be so cheap we may have some - * flickering effect with some slow terminal, but the lesser sequences - * the more compatible. - * - * CHA (Cursor Horizontal Absolute) - * Sequence: ESC [ n G - * Effect: moves cursor to column n - * - * EL (Erase Line) - * Sequence: ESC [ n K - * Effect: if n is 0 or missing, clear from cursor to end of line - * Effect: if n is 1, clear from beginning of line to cursor - * Effect: if n is 2, clear entire line - * - * CUF (CUrsor Forward) - * Sequence: ESC [ n C - * Effect: moves cursor forward of n chars - * - */ - -#include "fmacros.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define LINENOISE_MAX_LINE 4096 -static char *unsupported_term[] = {"dumb","cons25",NULL}; - -static struct termios orig_termios; /* in order to restore at exit */ -static int rawmode = 0; /* for atexit() function to check if restore is needed*/ -static int atexit_registered = 0; /* register atexit just 1 time */ -static int history_max_len = 100; -static int history_len = 0; -char **history = NULL; - -static void linenoiseAtExit(void); -int linenoiseHistoryAdd(const char *line); - -static int isUnsupportedTerm(void) { - char *term = getenv("TERM"); - int j; - - if (term == NULL) return 0; - for (j = 0; unsupported_term[j]; j++) - if (!strcasecmp(term,unsupported_term[j])) return 1; - return 0; -} - -static void freeHistory(void) { - if (history) { - int j; - - for (j = 0; j < history_len; j++) - free(history[j]); - 
free(history); - } -} - -static int enableRawMode(int fd) { - struct termios raw; - - if (!isatty(STDIN_FILENO)) goto fatal; - if (!atexit_registered) { - atexit(linenoiseAtExit); - atexit_registered = 1; - } - if (tcgetattr(fd,&orig_termios) == -1) goto fatal; - - raw = orig_termios; /* modify the original mode */ - /* input modes: no break, no CR to NL, no parity check, no strip char, - * no start/stop output control. */ - raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); - /* output modes - disable post processing */ - raw.c_oflag &= ~(OPOST); - /* control modes - set 8 bit chars */ - raw.c_cflag |= (CS8); - /* local modes - choing off, canonical off, no extended functions, - * no signal chars (^Z,^C) */ - raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG); - /* control chars - set return condition: min number of bytes and timer. - * We want read to return every single byte, without timeout. */ - raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ - - /* put terminal in raw mode after flushing */ - if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; - rawmode = 1; - return 0; - -fatal: - errno = ENOTTY; - return -1; -} - -static void disableRawMode(int fd) { - /* Don't even check the return value as it's too late. */ - if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) - rawmode = 0; -} - -/* At exit we'll try to fix the terminal to the initial conditions. 
*/ -static void linenoiseAtExit(void) { - disableRawMode(STDIN_FILENO); - freeHistory(); -} - -static int getColumns(void) { - struct winsize ws; - - if (ioctl(1, TIOCGWINSZ, &ws) == -1) return 80; - return ws.ws_col; -} - -static void refreshLine(int fd, const char *prompt, char *buf, size_t len, size_t pos, size_t cols) { - char seq[64]; - size_t plen = strlen(prompt); - - while((plen+pos) >= cols) { - buf++; - len--; - pos--; - } - while (plen+len > cols) { - len--; - } - - /* Cursor to left edge */ - snprintf(seq,64,"\x1b[0G"); - if (write(fd,seq,strlen(seq)) == -1) return; - /* Write the prompt and the current buffer content */ - if (write(fd,prompt,strlen(prompt)) == -1) return; - if (write(fd,buf,len) == -1) return; - /* Erase to right */ - snprintf(seq,64,"\x1b[0K"); - if (write(fd,seq,strlen(seq)) == -1) return; - /* Move cursor to original position. */ - snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen)); - if (write(fd,seq,strlen(seq)) == -1) return; -} - -static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) { - size_t plen = strlen(prompt); - size_t pos = 0; - size_t len = 0; - size_t cols = getColumns(); - int history_index = 0; - - buf[0] = '\0'; - buflen--; /* Make sure there is always space for the nulterm */ - - /* The latest history entry is always our current buffer, that - * initially is just an empty string. */ - linenoiseHistoryAdd(""); - - if (write(fd,prompt,plen) == -1) return -1; - while(1) { - char c; - int nread; - char seq[2]; - - nread = read(fd,&c,1); - if (nread <= 0) return len; - switch(c) { - case 13: /* enter */ - history_len--; - return len; - case 4: /* ctrl-d */ - history_len--; - return (len == 0) ? 
-1 : (int)len; - case 3: /* ctrl-c */ - errno = EAGAIN; - return -1; - case 127: /* backspace */ - case 8: /* ctrl-h */ - if (pos > 0 && len > 0) { - memmove(buf+pos-1,buf+pos,len-pos); - pos--; - len--; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } - break; - case 20: /* ctrl-t */ - if (pos > 0 && pos < len) { - int aux = buf[pos-1]; - buf[pos-1] = buf[pos]; - buf[pos] = aux; - if (pos != len-1) pos++; - refreshLine(fd,prompt,buf,len,pos,cols); - } - break; - case 2: /* ctrl-b */ - goto left_arrow; - case 6: /* ctrl-f */ - goto right_arrow; - case 16: /* ctrl-p */ - seq[1] = 65; - goto up_down_arrow; - case 14: /* ctrl-n */ - seq[1] = 66; - goto up_down_arrow; - break; - case 27: /* escape sequence */ - if (read(fd,seq,2) == -1) break; - if (seq[0] == 91 && seq[1] == 68) { -left_arrow: - /* left arrow */ - if (pos > 0) { - pos--; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else if (seq[0] == 91 && seq[1] == 67) { -right_arrow: - /* right arrow */ - if (pos != len) { - pos++; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else if (seq[0] == 91 && (seq[1] == 65 || seq[1] == 66)) { -up_down_arrow: - /* up and down arrow: history */ - if (history_len > 1) { - /* Update the current history entry before to - * overwrite it with tne next one. */ - free(history[history_len-1-history_index]); - history[history_len-1-history_index] = strdup(buf); - /* Show the new entry */ - history_index += (seq[1] == 65) ? 1 : -1; - if (history_index < 0) { - history_index = 0; - break; - } else if (history_index >= history_len) { - history_index = history_len-1; - break; - } - strncpy(buf,history[history_len-1-history_index],buflen); - buf[buflen] = '\0'; - len = pos = strlen(buf); - refreshLine(fd,prompt,buf,len,pos,cols); - } - } - break; - default: - if (len < buflen) { - if (len == pos) { - buf[pos] = c; - pos++; - len++; - buf[len] = '\0'; - if (plen+len < cols) { - /* Avoid a full update of the line in the - * trivial case. 
*/ - if (write(fd,&c,1) == -1) return -1; - } else { - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else { - memmove(buf+pos+1,buf+pos,len-pos); - buf[pos] = c; - len++; - pos++; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } - break; - case 21: /* Ctrl+u, delete the whole line. */ - buf[0] = '\0'; - pos = len = 0; - refreshLine(fd,prompt,buf,len,pos,cols); - break; - case 11: /* Ctrl+k, delete from current to end of line. */ - buf[pos] = '\0'; - len = pos; - refreshLine(fd,prompt,buf,len,pos,cols); - break; - case 1: /* Ctrl+a, go to the start of the line */ - pos = 0; - refreshLine(fd,prompt,buf,len,pos,cols); - break; - case 5: /* ctrl+e, go to the end of the line */ - pos = len; - refreshLine(fd,prompt,buf,len,pos,cols); - break; - } - } - return len; -} - -static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { - int fd = STDIN_FILENO; - int count; - - if (buflen == 0) { - errno = EINVAL; - return -1; - } - if (!isatty(STDIN_FILENO)) { - if (fgets(buf, buflen, stdin) == NULL) return -1; - count = strlen(buf); - if (count && buf[count-1] == '\n') { - count--; - buf[count] = '\0'; - } - } else { - if (enableRawMode(fd) == -1) return -1; - count = linenoisePrompt(fd, buf, buflen, prompt); - disableRawMode(fd); - printf("\n"); - } - return count; -} - -char *linenoise(const char *prompt) { - char buf[LINENOISE_MAX_LINE]; - int count; - - if (isUnsupportedTerm()) { - size_t len; - - printf("%s",prompt); - fflush(stdout); - if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL; - len = strlen(buf); - while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) { - len--; - buf[len] = '\0'; - } - return strdup(buf); - } else { - count = linenoiseRaw(buf,LINENOISE_MAX_LINE,prompt); - if (count == -1) return NULL; - return strdup(buf); - } -} - -/* Using a circular buffer is smarter, but a bit more complex to handle. 
*/ -int linenoiseHistoryAdd(const char *line) { - char *linecopy; - - if (history_max_len == 0) return 0; - if (history == 0) { - history = malloc(sizeof(char*)*history_max_len); - if (history == NULL) return 0; - memset(history,0,(sizeof(char*)*history_max_len)); - } - linecopy = strdup(line); - if (!linecopy) return 0; - if (history_len == history_max_len) { - memmove(history,history+1,sizeof(char*)*(history_max_len-1)); - history_len--; - } - history[history_len] = linecopy; - history_len++; - return 1; -} - -int linenoiseHistorySetMaxLen(int len) { - char **new; - - if (len < 1) return 0; - if (history) { - int tocopy = history_len; - - new = malloc(sizeof(char*)*len); - if (new == NULL) return 0; - if (len < tocopy) tocopy = len; - memcpy(new,history+(history_max_len-tocopy), sizeof(char*)*tocopy); - free(history); - history = new; - } - history_max_len = len; - if (history_len > history_max_len) - history_len = history_max_len; - return 1; -} diff --git a/linenoise.h b/linenoise.h deleted file mode 100644 index ff45e2c47..000000000 --- a/linenoise.h +++ /dev/null @@ -1,41 +0,0 @@ -/* linenoise.h -- guerrilla line editing library against the idea that a - * line editing lib needs to be 20,000 lines of C code. - * - * See linenoise.c for more information. - * - * Copyright (c) 2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __LINENOISE_H -#define __LINENOISE_H - -char *linenoise(const char *prompt); -int linenoiseHistoryAdd(char *line); -int linenoiseHistorySetMaxLen(int len); - -#endif /* __LINENOISE_H */ diff --git a/lzf.h b/lzf.h deleted file mode 100644 index 919b6e6be..000000000 --- a/lzf.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2000-2008 Marc Alexander Lehmann - * - * Redistribution and use in source and binary forms, with or without modifica- - * tion, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- - * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- - * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- - * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Alternatively, the contents of this file may be used under the terms of - * the GNU General Public License ("GPL") version 2 or any later version, - * in which case the provisions of the GPL are applicable instead of - * the above. If you wish to allow the use of your version of this file - * only under the terms of the GPL and not to allow others to use your - * version of this file under the BSD license, indicate your decision - * by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL. If you do not delete the - * provisions above, a recipient may use your version of this file under - * either the BSD or the GPL. - */ - -#ifndef LZF_H -#define LZF_H - -/*********************************************************************** -** -** lzf -- an extremely fast/free compression/decompression-method -** http://liblzf.plan9.de/ -** -** This algorithm is believed to be patent-free. -** -***********************************************************************/ - -#define LZF_VERSION 0x0105 /* 1.5, API version */ - -/* - * Compress in_len bytes stored at the memory block starting at - * in_data and write the result to out_data, up to a maximum length - * of out_len bytes. 
- * - * If the output buffer is not large enough or any error occurs return 0, - * otherwise return the number of bytes used, which might be considerably - * more than in_len (but less than 104% of the original size), so it - * makes sense to always use out_len == in_len - 1), to ensure _some_ - * compression, and store the data uncompressed otherwise (with a flag, of - * course. - * - * lzf_compress might use different algorithms on different systems and - * even different runs, thus might result in different compressed strings - * depending on the phase of the moon or similar factors. However, all - * these strings are architecture-independent and will result in the - * original data when decompressed using lzf_decompress. - * - * The buffers must not be overlapping. - * - * If the option LZF_STATE_ARG is enabled, an extra argument must be - * supplied which is not reflected in this header file. Refer to lzfP.h - * and lzf_c.c. - * - */ -unsigned int -lzf_compress (const void *const in_data, unsigned int in_len, - void *out_data, unsigned int out_len); - -/* - * Decompress data compressed with some version of the lzf_compress - * function and stored at location in_data and length in_len. The result - * will be stored at out_data up to a maximum of out_len characters. - * - * If the output buffer is not large enough to hold the decompressed - * data, a 0 is returned and errno is set to E2BIG. Otherwise the number - * of decompressed bytes (i.e. the original length of the data) is - * returned. - * - * If an error in the compressed data is detected, a zero is returned and - * errno is set to EINVAL. - * - * This function is very fast, about as fast as a copying loop. 
- */ -unsigned int -lzf_decompress (const void *const in_data, unsigned int in_len, - void *out_data, unsigned int out_len); - -#endif - diff --git a/lzfP.h b/lzfP.h deleted file mode 100644 index d533f1829..000000000 --- a/lzfP.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2000-2007 Marc Alexander Lehmann - * - * Redistribution and use in source and binary forms, with or without modifica- - * tion, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- - * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- - * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- - * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Alternatively, the contents of this file may be used under the terms of - * the GNU General Public License ("GPL") version 2 or any later version, - * in which case the provisions of the GPL are applicable instead of - * the above. 
If you wish to allow the use of your version of this file - * only under the terms of the GPL and not to allow others to use your - * version of this file under the BSD license, indicate your decision - * by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL. If you do not delete the - * provisions above, a recipient may use your version of this file under - * either the BSD or the GPL. - */ - -#ifndef LZFP_h -#define LZFP_h - -#define STANDALONE 1 /* at the moment, this is ok. */ - -#ifndef STANDALONE -# include "lzf.h" -#endif - -/* - * Size of hashtable is (1 << HLOG) * sizeof (char *) - * decompression is independent of the hash table size - * the difference between 15 and 14 is very small - * for small blocks (and 14 is usually a bit faster). - * For a low-memory/faster configuration, use HLOG == 13; - * For best compression, use 15 or 16 (or more, up to 23). - */ -#ifndef HLOG -# define HLOG 16 -#endif - -/* - * Sacrifice very little compression quality in favour of compression speed. - * This gives almost the same compression as the default code, and is - * (very roughly) 15% faster. This is the preferred mode of operation. - */ -#ifndef VERY_FAST -# define VERY_FAST 1 -#endif - -/* - * Sacrifice some more compression quality in favour of compression speed. - * (roughly 1-2% worse compression for large blocks and - * 9-10% for small, redundant, blocks and >>20% better speed in both cases) - * In short: when in need for speed, enable this for binary data, - * possibly disable this for text data. 
- */ -#ifndef ULTRA_FAST -# define ULTRA_FAST 0 -#endif - -/* - * Unconditionally aligning does not cost very much, so do it if unsure - */ -#ifndef STRICT_ALIGN -# define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) -#endif - -/* - * You may choose to pre-set the hash table (might be faster on some - * modern cpus and large (>>64k) blocks, and also makes compression - * deterministic/repeatable when the configuration otherwise is the same). - */ -#ifndef INIT_HTAB -# define INIT_HTAB 0 -#endif - -/* - * Avoid assigning values to errno variable? for some embedding purposes - * (linux kernel for example), this is neccessary. NOTE: this breaks - * the documentation in lzf.h. - */ -#ifndef AVOID_ERRNO -# define AVOID_ERRNO 0 -#endif - -/* - * Wether to pass the LZF_STATE variable as argument, or allocate it - * on the stack. For small-stack environments, define this to 1. - * NOTE: this breaks the prototype in lzf.h. - */ -#ifndef LZF_STATE_ARG -# define LZF_STATE_ARG 0 -#endif - -/* - * Wether to add extra checks for input validity in lzf_decompress - * and return EINVAL if the input stream has been corrupted. This - * only shields against overflowing the input buffer and will not - * detect most corrupted streams. - * This check is not normally noticable on modern hardware - * (<1% slowdown), but might slow down older cpus considerably. - */ -#ifndef CHECK_INPUT -# define CHECK_INPUT 1 -#endif - -/*****************************************************************************/ -/* nothing should be changed below */ - -typedef unsigned char u8; - -typedef const u8 *LZF_STATE[1 << (HLOG)]; - -#if !STRICT_ALIGN -/* for unaligned accesses we need a 16 bit datatype. 
*/ -# include -# if USHRT_MAX == 65535 - typedef unsigned short u16; -# elif UINT_MAX == 65535 - typedef unsigned int u16; -# else -# undef STRICT_ALIGN -# define STRICT_ALIGN 1 -# endif -#endif - -#if ULTRA_FAST -# if defined(VERY_FAST) -# undef VERY_FAST -# endif -#endif - -#if INIT_HTAB -# ifdef __cplusplus -# include -# else -# include -# endif -#endif - -#endif - diff --git a/lzf_c.c b/lzf_c.c deleted file mode 100644 index 99dab091c..000000000 --- a/lzf_c.c +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2000-2008 Marc Alexander Lehmann - * - * Redistribution and use in source and binary forms, with or without modifica- - * tion, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- - * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- - * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- - * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * Alternatively, the contents of this file may be used under the terms of - * the GNU General Public License ("GPL") version 2 or any later version, - * in which case the provisions of the GPL are applicable instead of - * the above. If you wish to allow the use of your version of this file - * only under the terms of the GPL and not to allow others to use your - * version of this file under the BSD license, indicate your decision - * by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL. If you do not delete the - * provisions above, a recipient may use your version of this file under - * either the BSD or the GPL. - */ - -#include "lzfP.h" - -#define HSIZE (1 << (HLOG)) - -/* - * don't play with this unless you benchmark! - * decompression is not dependent on the hash function - * the hashing function might seem strange, just believe me - * it works ;) - */ -#ifndef FRST -# define FRST(p) (((p[0]) << 8) | p[1]) -# define NEXT(v,p) (((v) << 8) | p[2]) -# if ULTRA_FAST -# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1)) -# elif VERY_FAST -# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) -# else -# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) -# endif -#endif -/* - * IDX works because it is very similar to a multiplicative hash, e.g. - * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1)) - * the latter is also quite fast on newer CPUs, and compresses similarly. 
- * - * the next one is also quite good, albeit slow ;) - * (int)(cos(h & 0xffffff) * 1e6) - */ - -#if 0 -/* original lzv-like hash function, much worse and thus slower */ -# define FRST(p) (p[0] << 5) ^ p[1] -# define NEXT(v,p) ((v) << 5) ^ p[2] -# define IDX(h) ((h) & (HSIZE - 1)) -#endif - -#define MAX_LIT (1 << 5) -#define MAX_OFF (1 << 13) -#define MAX_REF ((1 << 8) + (1 << 3)) - -#if __GNUC__ >= 3 -# define expect(expr,value) __builtin_expect ((expr),(value)) -# define inline inline -#else -# define expect(expr,value) (expr) -# define inline static -#endif - -#define expect_false(expr) expect ((expr) != 0, 0) -#define expect_true(expr) expect ((expr) != 0, 1) - -/* - * compressed format - * - * 000LLLLL ; literal - * LLLooooo oooooooo ; backref L - * 111ooooo LLLLLLLL oooooooo ; backref L+7 - * - */ - -unsigned int -lzf_compress (const void *const in_data, unsigned int in_len, - void *out_data, unsigned int out_len -#if LZF_STATE_ARG - , LZF_STATE htab -#endif - ) -{ -#if !LZF_STATE_ARG - LZF_STATE htab; -#endif - const u8 **hslot; - const u8 *ip = (const u8 *)in_data; - u8 *op = (u8 *)out_data; - const u8 *in_end = ip + in_len; - u8 *out_end = op + out_len; - const u8 *ref; - - /* off requires a type wide enough to hold a general pointer difference. - * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only - * works for differences within a single object). We also assume that no - * no bit pattern traps. Since the only platform that is both non-POSIX - * and fails to support both assumptions is windows 64 bit, we make a - * special workaround for it. 
- */ -#if defined (WIN32) && defined (_M_X64) - unsigned _int64 off; /* workaround for missing POSIX compliance */ -#else - unsigned long off; -#endif - unsigned int hval; - int lit; - - if (!in_len || !out_len) - return 0; - -#if INIT_HTAB - memset (htab, 0, sizeof (htab)); -# if 0 - for (hslot = htab; hslot < htab + HSIZE; hslot++) - *hslot++ = ip; -# endif -#endif - - lit = 0; op++; /* start run */ - - hval = FRST (ip); - while (ip < in_end - 2) - { - hval = NEXT (hval, ip); - hslot = htab + IDX (hval); - ref = *hslot; *hslot = ip; - - if (1 -#if INIT_HTAB - && ref < ip /* the next test will actually take care of this, but this is faster */ -#endif - && (off = ip - ref - 1) < MAX_OFF - && ip + 4 < in_end - && ref > (u8 *)in_data -#if STRICT_ALIGN - && ref[0] == ip[0] - && ref[1] == ip[1] - && ref[2] == ip[2] -#else - && *(u16 *)ref == *(u16 *)ip - && ref[2] == ip[2] -#endif - ) - { - /* match found at *ref++ */ - unsigned int len = 2; - unsigned int maxlen = in_end - ip - len; - maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; - - op [- lit - 1] = lit - 1; /* stop run */ - op -= !lit; /* undo run if length is zero */ - - if (expect_false (op + 3 + 1 >= out_end)) - return 0; - - for (;;) - { - if (expect_true (maxlen > 16)) - { - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - len++; if (ref [len] != ip [len]) break; - } - - do - len++; - while (len < maxlen && ref[len] == ip[len]); - - break; - } - - len -= 2; /* len is now #octets - 1 */ - ip++; - - if (len < 7) - { - *op++ = (off >> 8) + (len << 5); - } - else - { - *op++ = (off >> 8) + ( 7 << 5); - *op++ = len - 7; - } - - *op++ = off; - lit = 0; op++; /* start run */ - - ip += len + 1; - - if (expect_false (ip >= in_end - 2)) - break; - -#if ULTRA_FAST || VERY_FAST - --ip; -# if VERY_FAST && !ULTRA_FAST - --ip; -# endif - hval = FRST (ip); - - hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; - ip++; - -# if VERY_FAST && !ULTRA_FAST - hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; - ip++; -# endif -#else - ip -= len + 1; - - do - { - hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; - ip++; - } - while (len--); -#endif - } - else - { - /* one more literal byte we must copy */ - if (expect_false (op >= out_end)) - return 0; - - lit++; *op++ = *ip++; - - if (expect_false (lit == MAX_LIT)) - { - op [- lit - 1] = lit - 1; /* stop run */ - lit = 0; op++; /* start run */ - } - } - } - - if (op + 3 > out_end) /* at most 3 bytes can 
be missing here */ - return 0; - - while (ip < in_end) - { - lit++; *op++ = *ip++; - - if (expect_false (lit == MAX_LIT)) - { - op [- lit - 1] = lit - 1; /* stop run */ - lit = 0; op++; /* start run */ - } - } - - op [- lit - 1] = lit - 1; /* end run */ - op -= !lit; /* undo run if length is zero */ - - return op - (u8 *)out_data; -} - diff --git a/lzf_d.c b/lzf_d.c deleted file mode 100644 index e7e48c138..000000000 --- a/lzf_d.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2000-2007 Marc Alexander Lehmann - * - * Redistribution and use in source and binary forms, with or without modifica- - * tion, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- - * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- - * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- - * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * Alternatively, the contents of this file may be used under the terms of - * the GNU General Public License ("GPL") version 2 or any later version, - * in which case the provisions of the GPL are applicable instead of - * the above. If you wish to allow the use of your version of this file - * only under the terms of the GPL and not to allow others to use your - * version of this file under the BSD license, indicate your decision - * by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL. If you do not delete the - * provisions above, a recipient may use your version of this file under - * either the BSD or the GPL. - */ - -#include "lzfP.h" - -#if AVOID_ERRNO -# define SET_ERRNO(n) -#else -# include -# define SET_ERRNO(n) errno = (n) -#endif - -/* -#if (__i386 || __amd64) && __GNUC__ >= 3 -# define lzf_movsb(dst, src, len) \ - asm ("rep movsb" \ - : "=D" (dst), "=S" (src), "=c" (len) \ - : "0" (dst), "1" (src), "2" (len)); -#endif -*/ - -unsigned int -lzf_decompress (const void *const in_data, unsigned int in_len, - void *out_data, unsigned int out_len) -{ - u8 const *ip = (const u8 *)in_data; - u8 *op = (u8 *)out_data; - u8 const *const in_end = ip + in_len; - u8 *const out_end = op + out_len; - - do - { - unsigned int ctrl = *ip++; - - if (ctrl < (1 << 5)) /* literal run */ - { - ctrl++; - - if (op + ctrl > out_end) - { - SET_ERRNO (E2BIG); - return 0; - } - -#if CHECK_INPUT - if (ip + ctrl > in_end) - { - SET_ERRNO (EINVAL); - return 0; - } -#endif - -#ifdef lzf_movsb - lzf_movsb (op, ip, ctrl); -#else - do - *op++ = *ip++; - while (--ctrl); -#endif - } - else /* back reference */ - { - unsigned int len = ctrl >> 5; - - u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; - -#if CHECK_INPUT - if (ip >= in_end) - { - SET_ERRNO (EINVAL); - return 0; - } -#endif - if (len == 7) - { - len += *ip++; -#if CHECK_INPUT - if (ip >= in_end) - { - SET_ERRNO (EINVAL); - return 0; - } -#endif - } - - ref -= *ip++; - - if 
(op + len + 2 > out_end) - { - SET_ERRNO (E2BIG); - return 0; - } - - if (ref < (u8 *)out_data) - { - SET_ERRNO (EINVAL); - return 0; - } - -#ifdef lzf_movsb - len += 2; - lzf_movsb (op, ref, len); -#else - *op++ = *ref++; - *op++ = *ref++; - - do - *op++ = *ref++; - while (--len); -#endif - } - } - while (ip < in_end); - - return op - (u8 *)out_data; -} - diff --git a/mkreleasehdr.sh b/mkreleasehdr.sh deleted file mode 100755 index 30984160e..000000000 --- a/mkreleasehdr.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh -GIT_SHA1=`(git show-ref --head --hash=8 2> /dev/null || echo 00000000) | head -n1` -GIT_DIRTY=`git diff 2> /dev/null | wc -l` -test -f release.h || touch release.h -(cat release.h | grep SHA1 | grep $GIT_SHA1) && \ -(cat release.h | grep DIRTY | grep $GIT_DIRTY) && exit 0 # Already uptodate -echo "#define REDIS_GIT_SHA1 \"$GIT_SHA1\"" > release.h -echo "#define REDIS_GIT_DIRTY \"$GIT_DIRTY\"" >> release.h -touch release.c # Force recompile of release.c diff --git a/pqsort.c b/pqsort.c deleted file mode 100644 index 257756376..000000000 --- a/pqsort.c +++ /dev/null @@ -1,197 +0,0 @@ -/* The following is the NetBSD libc qsort implementation modified in order to - * support partial sorting of ranges for Redis. - * - * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. - * - * The original copyright notice follows. */ - - -/* $NetBSD: qsort.c,v 1.19 2009/01/30 23:38:44 lukem Exp $ */ - -/*- - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include - -#include -#include -#include - -static inline char *med3 (char *, char *, char *, - int (*)(const void *, const void *)); -static inline void swapfunc (char *, char *, size_t, int); - -#define min(a, b) (a) < (b) ? a : b - -/* - * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". - */ -#define swapcode(TYPE, parmi, parmj, n) { \ - size_t i = (n) / sizeof (TYPE); \ - TYPE *pi = (TYPE *)(void *)(parmi); \ - TYPE *pj = (TYPE *)(void *)(parmj); \ - do { \ - TYPE t = *pi; \ - *pi++ = *pj; \ - *pj++ = t; \ - } while (--i > 0); \ -} - -#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ - es % sizeof(long) ? 2 : es == sizeof(long)? 
0 : 1; - -static inline void -swapfunc(char *a, char *b, size_t n, int swaptype) -{ - - if (swaptype <= 1) - swapcode(long, a, b, n) - else - swapcode(char, a, b, n) -} - -#define swap(a, b) \ - if (swaptype == 0) { \ - long t = *(long *)(void *)(a); \ - *(long *)(void *)(a) = *(long *)(void *)(b); \ - *(long *)(void *)(b) = t; \ - } else \ - swapfunc(a, b, es, swaptype) - -#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype) - -static inline char * -med3(char *a, char *b, char *c, - int (*cmp) (const void *, const void *)) -{ - - return cmp(a, b) < 0 ? - (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a )) - :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c )); -} - -static void -_pqsort(void *a, size_t n, size_t es, - int (*cmp) (const void *, const void *), void *lrange, void *rrange) -{ - char *pa, *pb, *pc, *pd, *pl, *pm, *pn; - size_t d, r; - int swaptype, swap_cnt, cmp_result; - -loop: SWAPINIT(a, es); - swap_cnt = 0; - if (n < 7) { - for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) - for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; - pl -= es) - swap(pl, pl - es); - return; - } - pm = (char *) a + (n / 2) * es; - if (n > 7) { - pl = (char *) a; - pn = (char *) a + (n - 1) * es; - if (n > 40) { - d = (n / 8) * es; - pl = med3(pl, pl + d, pl + 2 * d, cmp); - pm = med3(pm - d, pm, pm + d, cmp); - pn = med3(pn - 2 * d, pn - d, pn, cmp); - } - pm = med3(pl, pm, pn, cmp); - } - swap(a, pm); - pa = pb = (char *) a + es; - - pc = pd = (char *) a + (n - 1) * es; - for (;;) { - while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) { - if (cmp_result == 0) { - swap_cnt = 1; - swap(pa, pb); - pa += es; - } - pb += es; - } - while (pb <= pc && (cmp_result = cmp(pc, a)) >= 0) { - if (cmp_result == 0) { - swap_cnt = 1; - swap(pc, pd); - pd -= es; - } - pc -= es; - } - if (pb > pc) - break; - swap(pb, pc); - swap_cnt = 1; - pb += es; - pc -= es; - } - if (swap_cnt == 0) { /* Switch to insertion sort */ - for (pm = (char *) a + es; pm < 
(char *) a + n * es; pm += es) - for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; - pl -= es) - swap(pl, pl - es); - return; - } - - pn = (char *) a + n * es; - r = min(pa - (char *) a, pb - pa); - vecswap(a, pb - r, r); - r = min((size_t)(pd - pc), pn - pd - es); - vecswap(pb, pn - r, r); - if ((r = pb - pa) > es) { - void *_l = a, *_r = ((unsigned char*)a)+r-1; - if (!((lrange < _l && rrange < _l) || - (lrange > _r && rrange > _r))) - _pqsort(a, r / es, es, cmp, lrange, rrange); - } - if ((r = pd - pc) > es) { - void *_l, *_r; - - /* Iterate rather than recurse to save stack space */ - a = pn - r; - n = r / es; - - _l = a; - _r = ((unsigned char*)a)+r-1; - if (!((lrange < _l && rrange < _l) || - (lrange > _r && rrange > _r))) - goto loop; - } -/* qsort(pn - r, r / es, es, cmp);*/ -} - -void -pqsort(void *a, size_t n, size_t es, - int (*cmp) (const void *, const void *), size_t lrange, size_t rrange) -{ - _pqsort(a,n,es,cmp,((unsigned char*)a)+(lrange*es), - ((unsigned char*)a)+((rrange+1)*es)-1); -} diff --git a/pqsort.h b/pqsort.h deleted file mode 100644 index 5054d5209..000000000 --- a/pqsort.h +++ /dev/null @@ -1,15 +0,0 @@ -/* The following is the NetBSD libc qsort implementation modified in order to - * support partial sorting of ranges for Redis. - * - * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. - * - * See the pqsort.c file for the original copyright notice. */ - -#ifndef __PQSORT_H -#define __PQSORT_H - -void -pqsort(void *a, size_t n, size_t es, - int (*cmp) (const void *, const void *), size_t lrange, size_t rrange); - -#endif diff --git a/redis-benchmark.c b/redis-benchmark.c deleted file mode 100644 index 123d81180..000000000 --- a/redis-benchmark.c +++ /dev/null @@ -1,665 +0,0 @@ -/* Redis benchmark utility. - * - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "fmacros.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ae.h" -#include "anet.h" -#include "sds.h" -#include "adlist.h" -#include "zmalloc.h" - -#define REPLY_INT 0 -#define REPLY_RETCODE 1 -#define REPLY_BULK 2 -#define REPLY_MBULK 3 - -#define CLIENT_CONNECTING 0 -#define CLIENT_SENDQUERY 1 -#define CLIENT_READREPLY 2 - -#define MAX_LATENCY 5000 - -#define REDIS_NOTUSED(V) ((void) V) - -static struct config { - int debug; - int numclients; - int requests; - int liveclients; - int donerequests; - int keysize; - int datasize; - int randomkeys; - int randomkeys_keyspacelen; - aeEventLoop *el; - char *hostip; - int hostport; - int keepalive; - long long start; - long long totlatency; - int *latency; - list *clients; - int quiet; - int loop; - int idlemode; -} config; - -typedef struct _client { - int state; - int fd; - sds obuf; - sds ibuf; - int mbulk; /* Number of elements in an mbulk reply */ - int readlen; /* readlen == -1 means read a single line */ - int totreceived; - unsigned int written; /* bytes of 'obuf' already written */ - int replytype; - long long start; /* start time in milliseconds */ -} *client; - -/* Prototypes */ -static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask); -static void createMissingClients(client c); - -/* Implementation */ -static long long mstime(void) { - struct timeval tv; - long long mst; - - gettimeofday(&tv, NULL); - mst = ((long)tv.tv_sec)*1000; - mst += tv.tv_usec/1000; - return mst; -} - -static void freeClient(client c) { - listNode *ln; - - aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); - aeDeleteFileEvent(config.el,c->fd,AE_READABLE); - sdsfree(c->ibuf); - sdsfree(c->obuf); - close(c->fd); - zfree(c); - config.liveclients--; - ln = listSearchKey(config.clients,c); - assert(ln != NULL); - listDelNode(config.clients,ln); -} - -static void freeAllClients(void) { - listNode *ln = config.clients->head, *next; - - while(ln) { - next = 
ln->next; - freeClient(ln->value); - ln = next; - } -} - -static void resetClient(client c) { - aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); - aeDeleteFileEvent(config.el,c->fd,AE_READABLE); - aeCreateFileEvent(config.el,c->fd, AE_WRITABLE,writeHandler,c); - sdsfree(c->ibuf); - c->ibuf = sdsempty(); - c->readlen = (c->replytype == REPLY_BULK || - c->replytype == REPLY_MBULK) ? -1 : 0; - c->mbulk = -1; - c->written = 0; - c->totreceived = 0; - c->state = CLIENT_SENDQUERY; - c->start = mstime(); - createMissingClients(c); -} - -static void randomizeClientKey(client c) { - char *p; - char buf[32]; - long r; - - p = strstr(c->obuf, "_rand"); - if (!p) return; - p += 5; - r = random() % config.randomkeys_keyspacelen; - sprintf(buf,"%ld",r); - memcpy(p,buf,strlen(buf)); -} - -static void prepareClientForReply(client c, int type) { - if (type == REPLY_BULK) { - c->replytype = REPLY_BULK; - c->readlen = -1; - } else if (type == REPLY_MBULK) { - c->replytype = REPLY_MBULK; - c->readlen = -1; - c->mbulk = -1; - } else { - c->replytype = type; - c->readlen = 0; - } -} - -static void clientDone(client c) { - static int last_tot_received = 1; - - long long latency; - config.donerequests ++; - latency = mstime() - c->start; - if (latency > MAX_LATENCY) latency = MAX_LATENCY; - config.latency[latency]++; - - if (config.debug && last_tot_received != c->totreceived) { - printf("Tot bytes received: %d\n", c->totreceived); - last_tot_received = c->totreceived; - } - if (config.donerequests == config.requests) { - freeClient(c); - aeStop(config.el); - return; - } - if (config.keepalive) { - resetClient(c); - if (config.randomkeys) randomizeClientKey(c); - } else { - config.liveclients--; - createMissingClients(c); - config.liveclients++; - freeClient(c); - } -} - -static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) -{ - char buf[1024]; - int nread; - client c = privdata; - REDIS_NOTUSED(el); - REDIS_NOTUSED(fd); - REDIS_NOTUSED(mask); - - nread = read(c->fd, 
buf, 1024); - if (nread == -1) { - fprintf(stderr, "Reading from socket: %s\n", strerror(errno)); - freeClient(c); - return; - } - if (nread == 0) { - fprintf(stderr, "EOF from client\n"); - freeClient(c); - return; - } - c->totreceived += nread; - c->ibuf = sdscatlen(c->ibuf,buf,nread); - -processdata: - /* Are we waiting for the first line of the command of for sdf - * count in bulk or multi bulk operations? */ - if (c->replytype == REPLY_INT || - c->replytype == REPLY_RETCODE || - (c->replytype == REPLY_BULK && c->readlen == -1) || - (c->replytype == REPLY_MBULK && c->readlen == -1) || - (c->replytype == REPLY_MBULK && c->mbulk == -1)) { - char *p; - - /* Check if the first line is complete. This is only true if - * there is a newline inside the buffer. */ - if ((p = strchr(c->ibuf,'\n')) != NULL) { - if (c->replytype == REPLY_BULK || - (c->replytype == REPLY_MBULK && c->mbulk != -1)) - { - /* Read the count of a bulk reply (being it a single bulk or - * a multi bulk reply). "$" for the protocol spec. */ - *p = '\0'; - *(p-1) = '\0'; - c->readlen = atoi(c->ibuf+1)+2; - // printf("BULK ATOI: %s\n", c->ibuf+1); - /* Handle null bulk reply "$-1" */ - if (c->readlen-2 == -1) { - clientDone(c); - return; - } - /* Leave all the rest in the input buffer */ - c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1); - /* fall through to reach the point where the code will try - * to check if the bulk reply is complete. */ - } else if (c->replytype == REPLY_MBULK && c->mbulk == -1) { - /* Read the count of a multi bulk reply. That is, how many - * bulk replies we have to read next. "*" protocol. 
*/ - *p = '\0'; - *(p-1) = '\0'; - c->mbulk = atoi(c->ibuf+1); - /* Handle null bulk reply "*-1" */ - if (c->mbulk == -1) { - clientDone(c); - return; - } - // printf("%p) %d elements list\n", c, c->mbulk); - /* Leave all the rest in the input buffer */ - c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1); - goto processdata; - } else { - c->ibuf = sdstrim(c->ibuf,"\r\n"); - clientDone(c); - return; - } - } - } - /* bulk read, did we read everything? */ - if (((c->replytype == REPLY_MBULK && c->mbulk != -1) || - (c->replytype == REPLY_BULK)) && c->readlen != -1 && - (unsigned)c->readlen <= sdslen(c->ibuf)) - { - // printf("BULKSTATUS mbulk:%d readlen:%d sdslen:%d\n", - // c->mbulk,c->readlen,sdslen(c->ibuf)); - if (c->replytype == REPLY_BULK) { - clientDone(c); - } else if (c->replytype == REPLY_MBULK) { - // printf("%p) %d (%d)) ",c, c->mbulk, c->readlen); - // fwrite(c->ibuf,c->readlen,1,stdout); - // printf("\n"); - if (--c->mbulk == 0) { - clientDone(c); - } else { - c->ibuf = sdsrange(c->ibuf,c->readlen,-1); - c->readlen = -1; - goto processdata; - } - } - } -} - -static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask) -{ - client c = privdata; - REDIS_NOTUSED(el); - REDIS_NOTUSED(fd); - REDIS_NOTUSED(mask); - - if (c->state == CLIENT_CONNECTING) { - c->state = CLIENT_SENDQUERY; - c->start = mstime(); - } - if (sdslen(c->obuf) > c->written) { - void *ptr = c->obuf+c->written; - int len = sdslen(c->obuf) - c->written; - int nwritten = write(c->fd, ptr, len); - if (nwritten == -1) { - if (errno != EPIPE) - fprintf(stderr, "Writing to socket: %s\n", strerror(errno)); - freeClient(c); - return; - } - c->written += nwritten; - if (sdslen(c->obuf) == c->written) { - aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); - aeCreateFileEvent(config.el,c->fd,AE_READABLE,readHandler,c); - c->state = CLIENT_READREPLY; - } - } -} - -static client createClient(void) { - client c = zmalloc(sizeof(struct _client)); - char err[ANET_ERR_LEN]; - - c->fd = 
anetTcpNonBlockConnect(err,config.hostip,config.hostport); - if (c->fd == ANET_ERR) { - zfree(c); - fprintf(stderr,"Connect: %s\n",err); - return NULL; - } - anetTcpNoDelay(NULL,c->fd); - c->obuf = sdsempty(); - c->ibuf = sdsempty(); - c->mbulk = -1; - c->readlen = 0; - c->written = 0; - c->totreceived = 0; - c->state = CLIENT_CONNECTING; - aeCreateFileEvent(config.el, c->fd, AE_WRITABLE, writeHandler, c); - config.liveclients++; - listAddNodeTail(config.clients,c); - return c; -} - -static void createMissingClients(client c) { - while(config.liveclients < config.numclients) { - client new = createClient(); - if (!new) continue; - sdsfree(new->obuf); - new->obuf = sdsdup(c->obuf); - if (config.randomkeys) randomizeClientKey(c); - prepareClientForReply(new,c->replytype); - } -} - -static void showLatencyReport(char *title) { - int j, seen = 0; - float perc, reqpersec; - - reqpersec = (float)config.donerequests/((float)config.totlatency/1000); - if (!config.quiet) { - printf("====== %s ======\n", title); - printf(" %d requests completed in %.2f seconds\n", config.donerequests, - (float)config.totlatency/1000); - printf(" %d parallel clients\n", config.numclients); - printf(" %d bytes payload\n", config.datasize); - printf(" keep alive: %d\n", config.keepalive); - printf("\n"); - for (j = 0; j <= MAX_LATENCY; j++) { - if (config.latency[j]) { - seen += config.latency[j]; - perc = ((float)seen*100)/config.donerequests; - printf("%.2f%% <= %d milliseconds\n", perc, j); - } - } - printf("%.2f requests per second\n\n", reqpersec); - } else { - printf("%s: %.2f requests per second\n", title, reqpersec); - } -} - -static void prepareForBenchmark(void) -{ - memset(config.latency,0,sizeof(int)*(MAX_LATENCY+1)); - config.start = mstime(); - config.donerequests = 0; -} - -static void endBenchmark(char *title) { - config.totlatency = mstime()-config.start; - showLatencyReport(title); - freeAllClients(); -} - -void parseOptions(int argc, char **argv) { - int i; - - for (i = 1; i 
< argc; i++) { - int lastarg = i==argc-1; - - if (!strcmp(argv[i],"-c") && !lastarg) { - config.numclients = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-n") && !lastarg) { - config.requests = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-k") && !lastarg) { - config.keepalive = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-h") && !lastarg) { - char *ip = zmalloc(32); - if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) { - printf("Can't resolve %s\n", argv[i]); - exit(1); - } - config.hostip = ip; - i++; - } else if (!strcmp(argv[i],"-p") && !lastarg) { - config.hostport = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-d") && !lastarg) { - config.datasize = atoi(argv[i+1]); - i++; - if (config.datasize < 1) config.datasize=1; - if (config.datasize > 1024*1024) config.datasize = 1024*1024; - } else if (!strcmp(argv[i],"-r") && !lastarg) { - config.randomkeys = 1; - config.randomkeys_keyspacelen = atoi(argv[i+1]); - if (config.randomkeys_keyspacelen < 0) - config.randomkeys_keyspacelen = 0; - i++; - } else if (!strcmp(argv[i],"-q")) { - config.quiet = 1; - } else if (!strcmp(argv[i],"-l")) { - config.loop = 1; - } else if (!strcmp(argv[i],"-D")) { - config.debug = 1; - } else if (!strcmp(argv[i],"-I")) { - config.idlemode = 1; - } else { - printf("Wrong option '%s' or option argument missing\n\n",argv[i]); - printf("Usage: redis-benchmark [-h ] [-p ] [-c ] [-n [-k ]\n\n"); - printf(" -h Server hostname (default 127.0.0.1)\n"); - printf(" -p Server port (default 6379)\n"); - printf(" -c Number of parallel connections (default 50)\n"); - printf(" -n Total number of requests (default 10000)\n"); - printf(" -d Data size of SET/GET value in bytes (default 2)\n"); - printf(" -k 1=keep alive 0=reconnect (default 1)\n"); - printf(" -r Use random keys for SET/GET/INCR, random values for SADD\n"); - printf(" Using this option the benchmark will get/set keys\n"); - printf(" in the form mykey_rand000000012456 instead of constant\n"); - printf(" 
keys, the argument determines the max\n"); - printf(" number of values for the random number. For instance\n"); - printf(" if set to 10 only rand000000000000 - rand000000000009\n"); - printf(" range will be allowed.\n"); - printf(" -q Quiet. Just show query/sec values\n"); - printf(" -l Loop. Run the tests forever\n"); - printf(" -I Idle mode. Just open N idle connections and wait.\n"); - printf(" -D Debug mode. more verbose.\n"); - exit(1); - } - } -} - -int main(int argc, char **argv) { - client c; - - signal(SIGHUP, SIG_IGN); - signal(SIGPIPE, SIG_IGN); - - config.debug = 0; - config.numclients = 50; - config.requests = 10000; - config.liveclients = 0; - config.el = aeCreateEventLoop(); - config.keepalive = 1; - config.donerequests = 0; - config.datasize = 3; - config.randomkeys = 0; - config.randomkeys_keyspacelen = 0; - config.quiet = 0; - config.loop = 0; - config.idlemode = 0; - config.latency = NULL; - config.clients = listCreate(); - config.latency = zmalloc(sizeof(int)*(MAX_LATENCY+1)); - - config.hostip = "127.0.0.1"; - config.hostport = 6379; - - parseOptions(argc,argv); - - if (config.keepalive == 0) { - printf("WARNING: keepalive disabled, you probably need 'echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse' for Linux and 'sudo sysctl -w net.inet.tcp.msl=1000' for Mac OS X in order to use a lot of clients/requests\n"); - } - - if (config.idlemode) { - printf("Creating %d idle connections and waiting forever (Ctrl+C when done)\n", config.numclients); - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdsempty(); - prepareClientForReply(c,REPLY_RETCODE); /* will never receive it */ - createMissingClients(c); - aeMain(config.el); - /* and will wait for every */ - } - - do { - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"PING\r\n"); - prepareClientForReply(c,REPLY_RETCODE); - createMissingClients(c); - aeMain(config.el); - endBenchmark("PING"); - - prepareForBenchmark(); - c = 
createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"*1\r\n$4\r\nPING\r\n"); - prepareClientForReply(c,REPLY_RETCODE); - createMissingClients(c); - aeMain(config.el); - endBenchmark("PING (multi bulk)"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscatprintf(c->obuf,"SET foo_rand000000000000 %d\r\n",config.datasize); - { - char *data = zmalloc(config.datasize+2); - memset(data,'x',config.datasize); - data[config.datasize] = '\r'; - data[config.datasize+1] = '\n'; - c->obuf = sdscatlen(c->obuf,data,config.datasize+2); - } - prepareClientForReply(c,REPLY_RETCODE); - createMissingClients(c); - aeMain(config.el); - endBenchmark("SET"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"GET foo_rand000000000000\r\n"); - prepareClientForReply(c,REPLY_BULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("GET"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"INCR counter_rand000000000000\r\n"); - prepareClientForReply(c,REPLY_INT); - createMissingClients(c); - aeMain(config.el); - endBenchmark("INCR"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); - prepareClientForReply(c,REPLY_INT); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LPUSH"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LPOP mylist\r\n"); - prepareClientForReply(c,REPLY_BULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LPOP"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"SADD myset 24\r\ncounter_rand000000000000\r\n"); - prepareClientForReply(c,REPLY_RETCODE); - createMissingClients(c); - aeMain(config.el); - endBenchmark("SADD"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"SPOP 
myset\r\n"); - prepareClientForReply(c,REPLY_BULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("SPOP"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); - prepareClientForReply(c,REPLY_RETCODE); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LPUSH (again, in order to bench LRANGE)"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LRANGE mylist 0 99\r\n"); - prepareClientForReply(c,REPLY_MBULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LRANGE (first 100 elements)"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LRANGE mylist 0 299\r\n"); - prepareClientForReply(c,REPLY_MBULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LRANGE (first 300 elements)"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LRANGE mylist 0 449\r\n"); - prepareClientForReply(c,REPLY_MBULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LRANGE (first 450 elements)"); - - prepareForBenchmark(); - c = createClient(); - if (!c) exit(1); - c->obuf = sdscat(c->obuf,"LRANGE mylist 0 599\r\n"); - prepareClientForReply(c,REPLY_MBULK); - createMissingClients(c); - aeMain(config.el); - endBenchmark("LRANGE (first 600 elements)"); - - printf("\n"); - } while(config.loop); - - return 0; -} diff --git a/redis-check-aof.c b/redis-check-aof.c deleted file mode 100644 index ff0d1f82c..000000000 --- a/redis-check-aof.c +++ /dev/null @@ -1,185 +0,0 @@ -#include "fmacros.h" -#include -#include -#include -#include -#include -#include "config.h" - -#define ERROR(...) 
{ \ - char __buf[1024]; \ - sprintf(__buf, __VA_ARGS__); \ - sprintf(error, "0x%08lx: %s", epos, __buf); \ -} - -static char error[1024]; -static long epos; - -int consumeNewline(char *buf) { - if (strncmp(buf,"\r\n",2) != 0) { - ERROR("Expected \\r\\n, got: %02x%02x",buf[0],buf[1]); - return 0; - } - return 1; -} - -int readLong(FILE *fp, char prefix, long *target) { - char buf[128], *eptr; - epos = ftell(fp); - if (fgets(buf,sizeof(buf),fp) == NULL) { - return 0; - } - if (buf[0] != prefix) { - ERROR("Expected prefix '%c', got: '%c'",buf[0],prefix); - return 0; - } - *target = strtol(buf+1,&eptr,10); - return consumeNewline(eptr); -} - -int readBytes(FILE *fp, char *target, long length) { - long real; - epos = ftell(fp); - real = fread(target,1,length,fp); - if (real != length) { - ERROR("Expected to read %ld bytes, got %ld bytes",length,real); - return 0; - } - return 1; -} - -int readString(FILE *fp, char** target) { - long len; - *target = NULL; - if (!readLong(fp,'$',&len)) { - return 0; - } - - /* Increase length to also consume \r\n */ - len += 2; - *target = (char*)malloc(len); - if (!readBytes(fp,*target,len)) { - return 0; - } - if (!consumeNewline(*target+len-2)) { - return 0; - } - (*target)[len-2] = '\0'; - return 1; -} - -int readArgc(FILE *fp, long *target) { - return readLong(fp,'*',target); -} - -long process(FILE *fp) { - long argc, pos = 0; - int i, multi = 0; - char *str; - - while(1) { - if (!multi) pos = ftell(fp); - if (!readArgc(fp, &argc)) break; - - for (i = 0; i < argc; i++) { - if (!readString(fp,&str)) break; - if (i == 0) { - if (strcasecmp(str, "multi") == 0) { - if (multi++) { - ERROR("Unexpected MULTI"); - break; - } - } else if (strcasecmp(str, "exec") == 0) { - if (--multi) { - ERROR("Unexpected EXEC"); - break; - } - } - } - free(str); - } - - /* Stop if the loop did not finish */ - if (i < argc) { - if (str) free(str); - break; - } - } - - if (feof(fp) && multi && strlen(error) == 0) { - ERROR("Reached EOF before reading EXEC 
for MULTI"); - } - if (strlen(error) > 0) { - printf("%s\n", error); - } - return pos; -} - -int main(int argc, char **argv) { - char *filename; - int fix = 0; - - if (argc < 2) { - printf("Usage: %s [--fix] \n", argv[0]); - exit(1); - } else if (argc == 2) { - filename = argv[1]; - } else if (argc == 3) { - if (strcmp(argv[1],"--fix") != 0) { - printf("Invalid argument: %s\n", argv[1]); - exit(1); - } - filename = argv[2]; - fix = 1; - } else { - printf("Invalid arguments\n"); - exit(1); - } - - FILE *fp = fopen(filename,"r+"); - if (fp == NULL) { - printf("Cannot open file: %s\n", filename); - exit(1); - } - - struct redis_stat sb; - if (redis_fstat(fileno(fp),&sb) == -1) { - printf("Cannot stat file: %s\n", filename); - exit(1); - } - - long size = sb.st_size; - if (size == 0) { - printf("Empty file: %s\n", filename); - exit(1); - } - - long pos = process(fp); - long diff = size-pos; - if (diff > 0) { - if (fix) { - char buf[2]; - printf("This will shrink the AOF from %ld bytes, with %ld bytes, to %ld bytes\n",size,diff,pos); - printf("Continue? [y/N]: "); - if (fgets(buf,sizeof(buf),stdin) == NULL || - strncasecmp(buf,"y",1) != 0) { - printf("Aborting...\n"); - exit(1); - } - if (ftruncate(fileno(fp), pos) == -1) { - printf("Failed to truncate AOF\n"); - exit(1); - } else { - printf("Successfully truncated AOF\n"); - } - } else { - printf("AOF is not valid\n"); - exit(1); - } - } else { - printf("AOF is valid\n"); - } - - fclose(fp); - return 0; -} diff --git a/redis-check-dump.c b/redis-check-dump.c deleted file mode 100644 index 0b002790d..000000000 --- a/redis-check-dump.c +++ /dev/null @@ -1,671 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "lzf.h" - -/* Object types */ -#define REDIS_STRING 0 -#define REDIS_LIST 1 -#define REDIS_SET 2 -#define REDIS_ZSET 3 -#define REDIS_HASH 4 - -/* Objects encoding. 
Some kind of objects like Strings and Hashes can be - * internally represented in multiple ways. The 'encoding' field of the object - * is set to one of this fields for this object. */ -#define REDIS_ENCODING_RAW 0 /* Raw representation */ -#define REDIS_ENCODING_INT 1 /* Encoded as integer */ -#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */ -#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */ - -/* Object types only used for dumping to disk */ -#define REDIS_EXPIRETIME 253 -#define REDIS_SELECTDB 254 -#define REDIS_EOF 255 - -/* Defines related to the dump file format. To store 32 bits lengths for short - * keys requires a lot of space, so we check the most significant 2 bits of - * the first byte to interpreter the length: - * - * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte - * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte - * 10|000000 [32 bit integer] => if it's 01, a full 32 bit len will follow - * 11|000000 this means: specially encoded object will follow. The six bits - * number specify the kind of object that follows. - * See the REDIS_RDB_ENC_* defines. - * - * Lenghts up to 63 are stored using a single byte, most DB keys, and may - * values, will fit inside. */ -#define REDIS_RDB_6BITLEN 0 -#define REDIS_RDB_14BITLEN 1 -#define REDIS_RDB_32BITLEN 2 -#define REDIS_RDB_ENCVAL 3 -#define REDIS_RDB_LENERR UINT_MAX - -/* When a length of a string object stored on disk has the first two bits - * set, the remaining two bits specify a special encoding for the object - * accordingly to the following defines: */ -#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ -#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ -#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ -#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */ - -#define ERROR(...) 
{ \ - printf(__VA_ARGS__); \ - exit(1); \ -} - -/* data type to hold offset in file and size */ -typedef struct { - void *data; - unsigned long size; - unsigned long offset; -} pos; - -static unsigned char level = 0; -static pos positions[16]; - -#define CURR_OFFSET (positions[level].offset) - -/* Hold a stack of errors */ -typedef struct { - char error[16][1024]; - unsigned long offset[16]; - unsigned int level; -} errors_t; -static errors_t errors; - -#define SHIFT_ERROR(provided_offset, ...) { \ - sprintf(errors.error[errors.level], __VA_ARGS__); \ - errors.offset[errors.level] = provided_offset; \ - errors.level++; \ -} - -/* Data type to hold opcode with optional key name an success status */ -typedef struct { - char* key; - int type; - char success; -} entry; - -/* Global vars that are actally used as constants. The following double - * values are used for double on-disk serialization, and are initialized - * at runtime to avoid strange compiler optimizations. */ -static double R_Zero, R_PosInf, R_NegInf, R_Nan; - -/* store string types for output */ -static char types[256][16]; - -/* when number of bytes to read is negative, do a peek */ -int readBytes(void *target, long num) { - char peek = (num < 0) ? 1 : 0; - num = (num < 0) ? 
-num : num; - - pos p = positions[level]; - if (p.offset + num > p.size) { - return 0; - } else { - memcpy(target, (void*)((unsigned long)p.data + p.offset), num); - if (!peek) positions[level].offset += num; - } - return 1; -} - -int processHeader() { - char buf[10] = "_________"; - int dump_version; - - if (!readBytes(buf, 9)) { - ERROR("Cannot read header\n"); - } - - /* expect the first 5 bytes to equal REDIS */ - if (memcmp(buf,"REDIS",5) != 0) { - ERROR("Wrong signature in header\n"); - } - - dump_version = (int)strtol(buf + 5, NULL, 10); - if (dump_version != 1) { - ERROR("Unknown RDB format version: %d\n", dump_version); - } - return 1; -} - -int loadType(entry *e) { - uint32_t offset = CURR_OFFSET; - - /* this byte needs to qualify as type */ - unsigned char t; - if (readBytes(&t, 1)) { - if (t <= 4 || t >= 253) { - e->type = t; - return 1; - } else { - SHIFT_ERROR(offset, "Unknown type (0x%02x)", t); - } - } else { - SHIFT_ERROR(offset, "Could not read type"); - } - - /* failure */ - return 0; -} - -int peekType() { - unsigned char t; - if (readBytes(&t, -1) && (t <= 4 || t >= 253)) return t; - return -1; -} - -/* discard time, just consume the bytes */ -int processTime() { - uint32_t offset = CURR_OFFSET; - unsigned char t[4]; - if (readBytes(t, 4)) { - return 1; - } else { - SHIFT_ERROR(offset, "Could not read time"); - } - - /* failure */ - return 0; -} - -uint32_t loadLength(int *isencoded) { - unsigned char buf[2]; - uint32_t len; - int type; - - if (isencoded) *isencoded = 0; - if (!readBytes(buf, 1)) return REDIS_RDB_LENERR; - type = (buf[0] & 0xC0) >> 6; - if (type == REDIS_RDB_6BITLEN) { - /* Read a 6 bit len */ - return buf[0] & 0x3F; - } else if (type == REDIS_RDB_ENCVAL) { - /* Read a 6 bit len encoding type */ - if (isencoded) *isencoded = 1; - return buf[0] & 0x3F; - } else if (type == REDIS_RDB_14BITLEN) { - /* Read a 14 bit len */ - if (!readBytes(buf+1,1)) return REDIS_RDB_LENERR; - return ((buf[0] & 0x3F) << 8) | buf[1]; - } else { - /* 
Read a 32 bit len */ - if (!readBytes(&len, 4)) return REDIS_RDB_LENERR; - return (unsigned int)ntohl(len); - } -} - -char *loadIntegerObject(int enctype) { - uint32_t offset = CURR_OFFSET; - unsigned char enc[4]; - long long val; - - if (enctype == REDIS_RDB_ENC_INT8) { - uint8_t v; - if (!readBytes(enc, 1)) return NULL; - v = enc[0]; - val = (int8_t)v; - } else if (enctype == REDIS_RDB_ENC_INT16) { - uint16_t v; - if (!readBytes(enc, 2)) return NULL; - v = enc[0]|(enc[1]<<8); - val = (int16_t)v; - } else if (enctype == REDIS_RDB_ENC_INT32) { - uint32_t v; - if (!readBytes(enc, 4)) return NULL; - v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); - val = (int32_t)v; - } else { - SHIFT_ERROR(offset, "Unknown integer encoding (0x%02x)", enctype); - return NULL; - } - - /* convert val into string */ - char *buf; - buf = malloc(sizeof(char) * 128); - sprintf(buf, "%lld", val); - return buf; -} - -char* loadLzfStringObject() { - unsigned int slen, clen; - char *c, *s; - - if ((clen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; - if ((slen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; - - c = malloc(clen); - if (!readBytes(c, clen)) { - free(c); - return NULL; - } - - s = malloc(slen+1); - if (lzf_decompress(c,clen,s,slen) == 0) { - free(c); free(s); - return NULL; - } - - free(c); - return s; -} - -/* returns NULL when not processable, char* when valid */ -char* loadStringObject() { - uint32_t offset = CURR_OFFSET; - int isencoded; - uint32_t len; - - len = loadLength(&isencoded); - if (isencoded) { - switch(len) { - case REDIS_RDB_ENC_INT8: - case REDIS_RDB_ENC_INT16: - case REDIS_RDB_ENC_INT32: - return loadIntegerObject(len); - case REDIS_RDB_ENC_LZF: - return loadLzfStringObject(); - default: - /* unknown encoding */ - SHIFT_ERROR(offset, "Unknown string encoding (0x%02x)", len); - return NULL; - } - } - - if (len == REDIS_RDB_LENERR) return NULL; - - char *buf = malloc(sizeof(char) * (len+1)); - buf[len] = '\0'; - if (!readBytes(buf, len)) { - 
free(buf); - return NULL; - } - return buf; -} - -int processStringObject(char** store) { - unsigned long offset = CURR_OFFSET; - char *key = loadStringObject(); - if (key == NULL) { - SHIFT_ERROR(offset, "Error reading string object"); - free(key); - return 0; - } - - if (store != NULL) { - *store = key; - } else { - free(key); - } - return 1; -} - -double* loadDoubleValue() { - char buf[256]; - unsigned char len; - double* val; - - if (!readBytes(&len,1)) return NULL; - - val = malloc(sizeof(double)); - switch(len) { - case 255: *val = R_NegInf; return val; - case 254: *val = R_PosInf; return val; - case 253: *val = R_Nan; return val; - default: - if (!readBytes(buf, len)) { - free(val); - return NULL; - } - buf[len] = '\0'; - sscanf(buf, "%lg", val); - return val; - } -} - -int processDoubleValue(double** store) { - unsigned long offset = CURR_OFFSET; - double *val = loadDoubleValue(); - if (val == NULL) { - SHIFT_ERROR(offset, "Error reading double value"); - free(val); - return 0; - } - - if (store != NULL) { - *store = val; - } else { - free(val); - } - return 1; -} - -int loadPair(entry *e) { - uint32_t offset = CURR_OFFSET; - uint32_t i; - - /* read key first */ - char *key; - if (processStringObject(&key)) { - e->key = key; - } else { - SHIFT_ERROR(offset, "Error reading entry key"); - return 0; - } - - uint32_t length = 0; - if (e->type == REDIS_LIST || - e->type == REDIS_SET || - e->type == REDIS_ZSET || - e->type == REDIS_HASH) { - if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { - SHIFT_ERROR(offset, "Error reading %s length", types[e->type]); - return 0; - } - } - - switch(e->type) { - case REDIS_STRING: - if (!processStringObject(NULL)) { - SHIFT_ERROR(offset, "Error reading entry value"); - return 0; - } - break; - case REDIS_LIST: - case REDIS_SET: - for (i = 0; i < length; i++) { - offset = CURR_OFFSET; - if (!processStringObject(NULL)) { - SHIFT_ERROR(offset, "Error reading element at index %d (length: %d)", i, length); - return 0; - } - } 
- break; - case REDIS_ZSET: - for (i = 0; i < length; i++) { - offset = CURR_OFFSET; - if (!processStringObject(NULL)) { - SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); - return 0; - } - offset = CURR_OFFSET; - if (!processDoubleValue(NULL)) { - SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); - return 0; - } - } - break; - case REDIS_HASH: - for (i = 0; i < length; i++) { - offset = CURR_OFFSET; - if (!processStringObject(NULL)) { - SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); - return 0; - } - offset = CURR_OFFSET; - if (!processStringObject(NULL)) { - SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); - return 0; - } - } - break; - default: - SHIFT_ERROR(offset, "Type not implemented"); - return 0; - } - /* because we're done, we assume success */ - e->success = 1; - return 1; -} - -entry loadEntry() { - entry e = { NULL, -1, 0 }; - uint32_t length, offset[4]; - - /* reset error container */ - errors.level = 0; - - offset[0] = CURR_OFFSET; - if (!loadType(&e)) { - return e; - } - - offset[1] = CURR_OFFSET; - if (e.type == REDIS_SELECTDB) { - if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { - SHIFT_ERROR(offset[1], "Error reading database number"); - return e; - } - if (length > 63) { - SHIFT_ERROR(offset[1], "Database number out of range (%d)", length); - return e; - } - } else if (e.type == REDIS_EOF) { - if (positions[level].offset < positions[level].size) { - SHIFT_ERROR(offset[0], "Unexpected EOF"); - } else { - e.success = 1; - } - return e; - } else { - /* optionally consume expire */ - if (e.type == REDIS_EXPIRETIME) { - if (!processTime()) return e; - if (!loadType(&e)) return e; - } - - offset[1] = CURR_OFFSET; - if (!loadPair(&e)) { - SHIFT_ERROR(offset[1], "Error for type %s", types[e.type]); - return e; - } - } - - /* all entries are followed by a valid type: - * e.g. 
a new entry, SELECTDB, EXPIRE, EOF */ - offset[2] = CURR_OFFSET; - if (peekType() == -1) { - SHIFT_ERROR(offset[2], "Followed by invalid type"); - SHIFT_ERROR(offset[0], "Error for type %s", types[e.type]); - e.success = 0; - } else { - e.success = 1; - } - - return e; -} - -void printCentered(int indent, int width, char* body) { - char head[256], tail[256]; - memset(head, '\0', 256); - memset(tail, '\0', 256); - - memset(head, '=', indent); - memset(tail, '=', width - 2 - indent - strlen(body)); - printf("%s %s %s\n", head, body, tail); -} - -void printValid(int ops, int bytes) { - char body[80]; - sprintf(body, "Processed %d valid opcodes (in %d bytes)", ops, bytes); - printCentered(4, 80, body); -} - -void printSkipped(int bytes, int offset) { - char body[80]; - sprintf(body, "Skipped %d bytes (resuming at 0x%08x)", bytes, offset); - printCentered(4, 80, body); -} - -void printErrorStack(entry *e) { - unsigned int i; - char body[64]; - - if (e->type == -1) { - sprintf(body, "Error trace"); - } else if (e->type >= 253) { - sprintf(body, "Error trace (%s)", types[e->type]); - } else if (!e->key) { - sprintf(body, "Error trace (%s: (unknown))", types[e->type]); - } else { - char tmp[41]; - strncpy(tmp, e->key, 40); - - /* display truncation at the last 3 chars */ - if (strlen(e->key) > 40) { - memset(&tmp[37], '.', 3); - } - - /* display unprintable characters as ? 
*/ - for (i = 0; i < strlen(tmp); i++) { - if (tmp[i] <= 32) tmp[i] = '?'; - } - sprintf(body, "Error trace (%s: %s)", types[e->type], tmp); - } - - printCentered(4, 80, body); - - /* display error stack */ - for (i = 0; i < errors.level; i++) { - printf("0x%08lx - %s\n", errors.offset[i], errors.error[i]); - } -} - -void process() { - int i, num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0; - entry entry; - processHeader(); - - level = 1; - while(positions[0].offset < positions[0].size) { - positions[1] = positions[0]; - - entry = loadEntry(); - if (!entry.success) { - printValid(num_valid_ops, num_valid_bytes); - printErrorStack(&entry); - num_errors++; - num_valid_ops = 0; - num_valid_bytes = 0; - - /* search for next valid entry */ - unsigned long offset = positions[0].offset + 1; - while (!entry.success && offset < positions[0].size) { - positions[1].offset = offset; - - /* find 3 consecutive valid entries */ - for (i = 0; i < 3; i++) { - entry = loadEntry(); - if (!entry.success) break; - } - /* check if we found 3 consecutive valid entries */ - if (i < 3) { - offset++; - } - } - - /* print how many bytes we have skipped to find a new valid opcode */ - if (offset < positions[0].size) { - printSkipped(offset - positions[0].offset, offset); - } - - positions[0].offset = offset; - } else { - num_valid_ops++; - num_valid_bytes += positions[1].offset - positions[0].offset; - - /* advance position */ - positions[0] = positions[1]; - } - } - - /* because there is another potential error, - * print how many valid ops we have processed */ - printValid(num_valid_ops, num_valid_bytes); - - /* expect an eof */ - if (entry.type != REDIS_EOF) { - /* last byte should be EOF, add error */ - errors.level = 0; - SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s", types[entry.type]); - - /* this is an EOF error so reset type */ - entry.type = -1; - printErrorStack(&entry); - - num_errors++; - } - - /* print summary on errors */ - if (num_errors > 0) { - printf("\n"); 
- printf("Total unprocessable opcodes: %d\n", num_errors); - } -} - -int main(int argc, char **argv) { - /* expect the first argument to be the dump file */ - if (argc <= 1) { - printf("Usage: %s \n", argv[0]); - exit(0); - } - - int fd; - unsigned long size; - struct stat stat; - void *data; - - fd = open(argv[1], O_RDONLY); - if (fd < 1) { - ERROR("Cannot open file: %s\n", argv[1]); - } - if (fstat(fd, &stat) == -1) { - ERROR("Cannot stat: %s\n", argv[1]); - } else { - size = stat.st_size; - } - - data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); - if (data == MAP_FAILED) { - ERROR("Cannot mmap: %s\n", argv[1]); - } - - /* Initialize static vars */ - positions[0].data = data; - positions[0].size = size; - positions[0].offset = 0; - errors.level = 0; - - /* Object types */ - sprintf(types[REDIS_STRING], "STRING"); - sprintf(types[REDIS_LIST], "LIST"); - sprintf(types[REDIS_SET], "SET"); - sprintf(types[REDIS_ZSET], "ZSET"); - sprintf(types[REDIS_HASH], "HASH"); - - /* Object types only used for dumping to disk */ - sprintf(types[REDIS_EXPIRETIME], "EXPIRETIME"); - sprintf(types[REDIS_SELECTDB], "SELECTDB"); - sprintf(types[REDIS_EOF], "EOF"); - - /* Double constants initialization */ - R_Zero = 0.0; - R_PosInf = 1.0/R_Zero; - R_NegInf = -1.0/R_Zero; - R_Nan = R_Zero/R_Zero; - - process(); - - munmap(data, size); - close(fd); - return 0; -} diff --git a/redis-cli.c b/redis-cli.c deleted file mode 100644 index 2daa7c461..000000000 --- a/redis-cli.c +++ /dev/null @@ -1,493 +0,0 @@ -/* Redis CLI (command line interface) - * - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "fmacros.h" - -#include -#include -#include -#include -#include - -#include "anet.h" -#include "sds.h" -#include "adlist.h" -#include "zmalloc.h" -#include "linenoise.h" - -#define REDIS_CMD_INLINE 1 -#define REDIS_CMD_BULK 2 -#define REDIS_CMD_MULTIBULK 4 - -#define REDIS_NOTUSED(V) ((void) V) - -static struct config { - char *hostip; - int hostport; - long repeat; - int dbnum; - int argn_from_stdin; - int interactive; - int shutdown; - int monitor_mode; - int pubsub_mode; - int raw_output; - char *auth; -} config; - -static int cliReadReply(int fd); -static void usage(); - -static int cliConnect(void) { - char err[ANET_ERR_LEN]; - static int fd = ANET_ERR; - - if (fd == ANET_ERR) { - fd = anetTcpConnect(err,config.hostip,config.hostport); - if (fd == ANET_ERR) { - fprintf(stderr, "Could not connect to Redis at %s:%d: %s", config.hostip, config.hostport, err); - return -1; - } - anetTcpNoDelay(NULL,fd); - } - return fd; -} - -static sds cliReadLine(int fd) { - sds line = sdsempty(); - - while(1) { - char c; - ssize_t ret; - - ret = read(fd,&c,1); - if (ret == -1) { - sdsfree(line); - return NULL; - } else if ((ret == 0) || (c == '\n')) { - break; - } else { - line = sdscatlen(line,&c,1); - } - } - return sdstrim(line,"\r\n"); -} - -static int cliReadSingleLineReply(int fd, int quiet) { - sds reply = cliReadLine(fd); - - if (reply == NULL) return 1; - if (!quiet) - printf("%s\n", reply); - sdsfree(reply); - return 0; -} - -static void printStringRepr(char *s, int len) { - printf("\""); - while(len--) { - switch(*s) { - case '\\': - case '"': - printf("\\%c",*s); - break; - case '\n': printf("\\n"); break; - case '\r': printf("\\r"); break; - case '\t': printf("\\t"); break; - case '\a': printf("\\a"); break; - case '\b': printf("\\b"); break; - default: - if (isprint(*s)) - printf("%c",*s); - else - printf("\\x%02x",(unsigned char)*s); - break; - } - s++; - } - printf("\"\n"); -} - -static int cliReadBulkReply(int fd) { - sds replylen = 
cliReadLine(fd); - char *reply, crlf[2]; - int bulklen; - - if (replylen == NULL) return 1; - bulklen = atoi(replylen); - if (bulklen == -1) { - sdsfree(replylen); - printf("(nil)\n"); - return 0; - } - reply = zmalloc(bulklen); - anetRead(fd,reply,bulklen); - anetRead(fd,crlf,2); - if (config.raw_output || !isatty(fileno(stdout))) { - if (bulklen && fwrite(reply,bulklen,1,stdout) == 0) { - zfree(reply); - return 1; - } - } else { - /* If you are producing output for the standard output we want - * a more interesting output with quoted characters and so forth */ - printStringRepr(reply,bulklen); - } - zfree(reply); - return 0; -} - -static int cliReadMultiBulkReply(int fd) { - sds replylen = cliReadLine(fd); - int elements, c = 1; - - if (replylen == NULL) return 1; - elements = atoi(replylen); - if (elements == -1) { - sdsfree(replylen); - printf("(nil)\n"); - return 0; - } - if (elements == 0) { - printf("(empty list or set)\n"); - } - while(elements--) { - printf("%d. ", c); - if (cliReadReply(fd)) return 1; - c++; - } - return 0; -} - -static int cliReadReply(int fd) { - char type; - - if (anetRead(fd,&type,1) <= 0) { - if (config.shutdown) return 0; - exit(1); - } - switch(type) { - case '-': - printf("(error) "); - cliReadSingleLineReply(fd,0); - return 1; - case '+': - return cliReadSingleLineReply(fd,0); - case ':': - printf("(integer) "); - return cliReadSingleLineReply(fd,0); - case '$': - return cliReadBulkReply(fd); - case '*': - return cliReadMultiBulkReply(fd); - default: - printf("protocol error, got '%c' as reply type byte\n", type); - return 1; - } -} - -static int selectDb(int fd) { - int retval; - sds cmd; - char type; - - if (config.dbnum == 0) - return 0; - - cmd = sdsempty(); - cmd = sdscatprintf(cmd,"SELECT %d\r\n",config.dbnum); - anetWrite(fd,cmd,sdslen(cmd)); - anetRead(fd,&type,1); - if (type <= 0 || type != '+') return 1; - retval = cliReadSingleLineReply(fd,1); - if (retval) { - return retval; - } - return 0; -} - -static int 
cliSendCommand(int argc, char **argv, int repeat) { - char *command = argv[0]; - int fd, j, retval = 0; - sds cmd; - - config.raw_output = !strcasecmp(command,"info"); - if (!strcasecmp(command,"shutdown")) config.shutdown = 1; - if (!strcasecmp(command,"monitor")) config.monitor_mode = 1; - if (!strcasecmp(command,"subscribe") || - !strcasecmp(command,"psubscribe")) config.pubsub_mode = 1; - if ((fd = cliConnect()) == -1) return 1; - - /* Select db number */ - retval = selectDb(fd); - if (retval) { - fprintf(stderr,"Error setting DB num\n"); - return 1; - } - - /* Build the command to send */ - cmd = sdscatprintf(sdsempty(),"*%d\r\n",argc); - for (j = 0; j < argc; j++) { - cmd = sdscatprintf(cmd,"$%lu\r\n", - (unsigned long)sdslen(argv[j])); - cmd = sdscatlen(cmd,argv[j],sdslen(argv[j])); - cmd = sdscatlen(cmd,"\r\n",2); - } - - while(repeat--) { - anetWrite(fd,cmd,sdslen(cmd)); - while (config.monitor_mode) { - cliReadSingleLineReply(fd,0); - } - - if (config.pubsub_mode) { - printf("Reading messages... 
(press Ctrl-c to quit)\n"); - while (1) { - cliReadReply(fd); - printf("\n"); - } - } - - retval = cliReadReply(fd); - if (retval) { - return retval; - } - } - return 0; -} - -static int parseOptions(int argc, char **argv) { - int i; - - for (i = 1; i < argc; i++) { - int lastarg = i==argc-1; - - if (!strcmp(argv[i],"-h") && !lastarg) { - char *ip = zmalloc(32); - if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) { - printf("Can't resolve %s\n", argv[i]); - exit(1); - } - config.hostip = ip; - i++; - } else if (!strcmp(argv[i],"-h") && lastarg) { - usage(); - } else if (!strcmp(argv[i],"-p") && !lastarg) { - config.hostport = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-r") && !lastarg) { - config.repeat = strtoll(argv[i+1],NULL,10); - i++; - } else if (!strcmp(argv[i],"-n") && !lastarg) { - config.dbnum = atoi(argv[i+1]); - i++; - } else if (!strcmp(argv[i],"-a") && !lastarg) { - config.auth = argv[i+1]; - i++; - } else if (!strcmp(argv[i],"-i")) { - config.interactive = 1; - } else if (!strcmp(argv[i],"-c")) { - config.argn_from_stdin = 1; - } else { - break; - } - } - return i; -} - -static sds readArgFromStdin(void) { - char buf[1024]; - sds arg = sdsempty(); - - while(1) { - int nread = read(fileno(stdin),buf,1024); - - if (nread == 0) break; - else if (nread == -1) { - perror("Reading from standard input"); - exit(1); - } - arg = sdscatlen(arg,buf,nread); - } - return arg; -} - -static void usage() { - fprintf(stderr, "usage: redis-cli [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] [-i] cmd arg1 arg2 arg3 ... argN\n"); - fprintf(stderr, "usage: echo \"argN\" | redis-cli -c [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] cmd arg1 arg2 ... 
arg(N-1)\n"); - fprintf(stderr, "\nIf a pipe from standard input is detected this data is used as last argument.\n\n"); - fprintf(stderr, "example: cat /etc/passwd | redis-cli set my_passwd\n"); - fprintf(stderr, "example: redis-cli get my_passwd\n"); - fprintf(stderr, "example: redis-cli -r 100 lpush mylist x\n"); - fprintf(stderr, "\nRun in interactive mode: redis-cli -i or just don't pass any command\n"); - exit(1); -} - -/* Turn the plain C strings into Sds strings */ -static char **convertToSds(int count, char** args) { - int j; - char **sds = zmalloc(sizeof(char*)*count); - - for(j = 0; j < count; j++) - sds[j] = sdsnew(args[j]); - - return sds; -} - -static char **splitArguments(char *line, int *argc) { - char *p = line; - char *current = NULL; - char **vector = NULL; - - *argc = 0; - while(1) { - /* skip blanks */ - while(*p && isspace(*p)) p++; - if (*p) { - /* get a token */ - int inq=0; /* set to 1 if we are in "quotes" */ - int done = 0; - - if (current == NULL) current = sdsempty(); - while(!done) { - if (inq) { - if (*p == '\\' && *(p+1)) { - char c; - - p++; - switch(*p) { - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'b': c = '\b'; break; - case 'a': c = '\a'; break; - default: c = *p; break; - } - current = sdscatlen(current,&c,1); - } else if (*p == '"') { - done = 1; - } else { - current = sdscatlen(current,p,1); - } - } else { - switch(*p) { - case ' ': - case '\n': - case '\r': - case '\t': - case '\0': - done=1; - break; - case '"': - inq=1; - break; - default: - current = sdscatlen(current,p,1); - break; - } - } - if (*p) p++; - } - /* add the token to the vector */ - vector = zrealloc(vector,((*argc)+1)*sizeof(char*)); - vector[*argc] = current; - (*argc)++; - current = NULL; - } else { - return vector; - } - } -} - -#define LINE_BUFLEN 4096 -static void repl() { - int argc, j; - char *line, **argv; - - while((line = linenoise("redis> ")) != NULL) { - if (line[0] != '\0') { - argv = 
splitArguments(line,&argc); - linenoiseHistoryAdd(line); - if (argc > 0) { - if (strcasecmp(argv[0],"quit") == 0 || - strcasecmp(argv[0],"exit") == 0) - exit(0); - else - cliSendCommand(argc, argv, 1); - } - /* Free the argument vector */ - for (j = 0; j < argc; j++) - sdsfree(argv[j]); - zfree(argv); - } - /* linenoise() returns malloc-ed lines like readline() */ - free(line); - } - exit(0); -} - -int main(int argc, char **argv) { - int firstarg; - char **argvcopy; - - config.hostip = "127.0.0.1"; - config.hostport = 6379; - config.repeat = 1; - config.dbnum = 0; - config.argn_from_stdin = 0; - config.shutdown = 0; - config.interactive = 0; - config.monitor_mode = 0; - config.pubsub_mode = 0; - config.raw_output = 0; - config.auth = NULL; - - firstarg = parseOptions(argc,argv); - argc -= firstarg; - argv += firstarg; - - if (config.auth != NULL) { - char *authargv[2]; - - authargv[0] = "AUTH"; - authargv[1] = config.auth; - cliSendCommand(2, convertToSds(2, authargv), 1); - } - - if (argc == 0 || config.interactive == 1) repl(); - - argvcopy = convertToSds(argc+1, argv); - if (config.argn_from_stdin) { - sds lastarg = readArgFromStdin(); - argvcopy[argc] = lastarg; - argc++; - } - return cliSendCommand(argc, argvcopy, config.repeat); -} diff --git a/redis.c b/redis.c deleted file mode 100644 index 1e1cd7816..000000000 --- a/redis.c +++ /dev/null @@ -1,11621 +0,0 @@ -/* - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#define REDIS_VERSION "2.1.1" - -#include "fmacros.h" -#include "config.h" - -#include -#include -#include -#include -#include -#include - -#ifdef HAVE_BACKTRACE -#include -#include -#endif /* HAVE_BACKTRACE */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__sun) -#include "solarisfixes.h" -#endif - -#include "redis.h" -#include "ae.h" /* Event driven programming library */ -#include "sds.h" /* Dynamic safe strings */ -#include "anet.h" /* Networking the easy way */ -#include "dict.h" /* Hash tables */ -#include "adlist.h" /* Linked lists */ -#include "zmalloc.h" /* total memory usage aware version of malloc/free */ -#include "lzf.h" /* LZF compression library */ -#include "pqsort.h" /* Partial qsort for SORT+LIMIT */ -#include "zipmap.h" /* Compact dictionary-alike data structure */ -#include "ziplist.h" /* Compact list data structure */ -#include "sha1.h" 
/* SHA1 is used for DEBUG DIGEST */ - -/* Error codes */ -#define REDIS_OK 0 -#define REDIS_ERR -1 - -/* Static server configuration */ -#define REDIS_SERVERPORT 6379 /* TCP port */ -#define REDIS_MAXIDLETIME (60*5) /* default client timeout */ -#define REDIS_IOBUF_LEN 1024 -#define REDIS_LOADBUF_LEN 1024 -#define REDIS_STATIC_ARGS 8 -#define REDIS_DEFAULT_DBNUM 16 -#define REDIS_CONFIGLINE_MAX 1024 -#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */ -#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */ -#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */ -#define REDIS_MAX_WRITE_PER_EVENT (1024*64) -#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */ - -/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */ -#define REDIS_WRITEV_THRESHOLD 3 -/* Max number of iovecs used for each writev call */ -#define REDIS_WRITEV_IOVEC_COUNT 256 - -/* Hash table parameters */ -#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */ - -/* Command flags */ -#define REDIS_CMD_BULK 1 /* Bulk write command */ -#define REDIS_CMD_INLINE 2 /* Inline command */ -/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with - this flag will return an error when the 'maxmemory' option is set in the - config file and the server is using more than maxmemory bytes of memory. - In short these commands are denied on low memory conditions. */ -#define REDIS_CMD_DENYOOM 4 -#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */ - -/* Object types */ -#define REDIS_STRING 0 -#define REDIS_LIST 1 -#define REDIS_SET 2 -#define REDIS_ZSET 3 -#define REDIS_HASH 4 -#define REDIS_VMPOINTER 8 - -/* Objects encoding. Some kinds of objects like Strings and Hashes can be - * internally represented in multiple ways. The 'encoding' field of the object - * is set to one of these values for this object.
*/ -#define REDIS_ENCODING_RAW 0 /* Raw representation */ -#define REDIS_ENCODING_INT 1 /* Encoded as integer */ -#define REDIS_ENCODING_HT 2 /* Encoded as hash table */ -#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */ -#define REDIS_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */ -#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */ - -static char* strencoding[] = { - "raw", "int", "hashtable", "zipmap", "linkedlist", "ziplist" -}; - -/* Object types only used for dumping to disk */ -#define REDIS_EXPIRETIME 253 -#define REDIS_SELECTDB 254 -#define REDIS_EOF 255 - -/* Defines related to the dump file format. To store 32 bits lengths for short - * keys requires a lot of space, so we check the most significant 2 bits of - * the first byte to interpret the length: - * - * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte - * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte - * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow - * 11|000000 this means: specially encoded object will follow. The six bits - * number specify the kind of object that follows. - * See the REDIS_RDB_ENC_* defines. - * - * Lengths up to 63 are stored using a single byte, most DB keys, and many - * values, will fit inside. */ -#define REDIS_RDB_6BITLEN 0 -#define REDIS_RDB_14BITLEN 1 -#define REDIS_RDB_32BITLEN 2 -#define REDIS_RDB_ENCVAL 3 -#define REDIS_RDB_LENERR UINT_MAX - -/* When a length of a string object stored on disk has the first two bits - * set, the remaining six bits specify a special encoding for the object - * according to the following defines: */ -#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ -#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ -#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ -#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */ - -/* Virtual memory object->storage field.
*/ -#define REDIS_VM_MEMORY 0 /* The object is on memory */ -#define REDIS_VM_SWAPPED 1 /* The object is on disk */ -#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */ -#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */ - -/* Virtual memory static configuration stuff. - * Check vmFindContiguousPages() to know more about these magic numbers. */ -#define REDIS_VM_MAX_NEAR_PAGES 65536 -#define REDIS_VM_MAX_RANDOM_JUMP 4096 -#define REDIS_VM_MAX_THREADS 32 -#define REDIS_THREAD_STACK_SIZE (1024*1024*4) -/* The following is the *percentage* of completed I/O jobs to process when the - * handler is called. While Virtual Memory I/O operations are performed by - * threads, these operations must be processed by the main thread when completed - * in order to take effect. */ -#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1 - -/* Client flags */ -#define REDIS_SLAVE 1 /* This client is a slave server */ -#define REDIS_MASTER 2 /* This client is a master server */ -#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */ -#define REDIS_MULTI 8 /* This client is in a MULTI context */ -#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */ -#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */ -#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */ - -/* Slave replication state - slave side */ -#define REDIS_REPL_NONE 0 /* No active replication */ -#define REDIS_REPL_CONNECT 1 /* Must connect to master */ -#define REDIS_REPL_CONNECTED 2 /* Connected to master */ - -/* Slave replication state - from the point of view of master - * Note that in SEND_BULK and ONLINE state the slave receives new updates - * in its output queue. In the WAIT_BGSAVE state instead the server is waiting - * to start the next background saving in order to send updates to it.
*/ -#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */ -#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */ -#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */ -#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */ - -/* List related stuff */ -#define REDIS_HEAD 0 -#define REDIS_TAIL 1 - -/* Sort operations */ -#define REDIS_SORT_GET 0 -#define REDIS_SORT_ASC 1 -#define REDIS_SORT_DESC 2 -#define REDIS_SORTKEY_MAX 1024 - -/* Log levels */ -#define REDIS_DEBUG 0 -#define REDIS_VERBOSE 1 -#define REDIS_NOTICE 2 -#define REDIS_WARNING 3 - -/* Anti-warning macro... */ -#define REDIS_NOTUSED(V) ((void) V) - -#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */ -#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */ - -/* Append only defines */ -#define APPENDFSYNC_NO 0 -#define APPENDFSYNC_ALWAYS 1 -#define APPENDFSYNC_EVERYSEC 2 - -/* Zip structure related defaults */ -#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64 -#define REDIS_HASH_MAX_ZIPMAP_VALUE 512 -#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024 -#define REDIS_LIST_MAX_ZIPLIST_VALUE 32 - -/* We can print the stacktrace, so our assert is defined this way. Both macros call the reporting function and then _exit(1). NOTE(review): unlike redisAssert, the redisPanic expansion is a bare comma expression without enclosing parentheses, so it is only safe when used as a full statement. */ -#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1))) -#define redisPanic(_e) _redisPanic(#_e,__FILE__,__LINE__),_exit(1) -static void _redisAssert(char *estr, char *file, int line); -static void _redisPanic(char *msg, char *file, int line); - -/*================================= Data types ============================== */ - -/* A redis object, that is a type able to hold a string / list / set */ - -/* The actual Redis Object */ -typedef struct redisObject { - unsigned type:4; - unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */ - unsigned encoding:4; - unsigned lru:22; /* lru time (relative to server.lruclock) */ - int refcount; - void *ptr; - /* VM fields are only allocated if VM is active,
otherwise the - * object allocation function will just allocate - * sizeof(redisObject) minus sizeof(redisObjectVM), so using - * Redis without VM active will not have any overhead. - * NOTE(review): no VM fields are declared in this struct; this - * comment looks stale - confirm before relying on it. */ -} robj; - -/* The VM pointer structure - identifies an object in the swap file. - * - * This object is stored in place of the value - * object in the main key->value hash table representing a database. - * Note that the first fields (type, storage) are the same as the redisObject - * structure so that vmPointer structures can be accessed even when cast - * as redisObject structures. - * - * This is useful as we don't know if a value object is on disk or not, but we - * are always able to read obj->storage to check this. For vmPointer - * structures "type" is set to REDIS_VMPOINTER (even if without this field - * it is still possible to check the kind of object from the value of 'storage').*/ -typedef struct vmPointer { - unsigned type:4; - unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */ - unsigned notused:26; - unsigned int vtype; /* type of the object stored in the swap file */ - off_t page; /* the page at which the object is stored on disk */ - off_t usedpages; /* number of pages used on disk */ -} vmpointer; - -/* Macro used to initalize a Redis object allocated on the stack. - * Note that this macro is taken near the structure definition to make sure - * we'll update it when the structure is changed, to avoid bugs like - * bug #85 introduced exactly in this way.
*/ -#define initStaticStringObject(_var,_ptr) do { \ - _var.refcount = 1; \ - _var.type = REDIS_STRING; \ - _var.encoding = REDIS_ENCODING_RAW; \ - _var.ptr = _ptr; \ - _var.storage = REDIS_VM_MEMORY; \ -} while(0) /* no trailing ';': the caller supplies it, so the macro expands to exactly one statement and stays usable before an 'else' */ - -/* A single database: the keyspace plus the per-DB bookkeeping dictionaries. */ -typedef struct redisDb { - dict *dict; /* The keyspace for this DB */ - dict *expires; /* Timeout of keys with a timeout set */ - dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */ - dict *io_keys; /* Keys with clients waiting for VM I/O */ - dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ - int id; -} redisDb; - -/* Client MULTI/EXEC state */ -typedef struct multiCmd { - robj **argv; - int argc; - struct redisCommand *cmd; -} multiCmd; - -typedef struct multiState { - multiCmd *commands; /* Array of MULTI commands */ - int count; /* Total number of MULTI commands */ -} multiState; - -/* With multiplexing we need to take per-client state. - * Clients are taken in a linked list. */ -typedef struct redisClient { - int fd; - redisDb *db; - int dictid; - sds querybuf; - robj **argv, **mbargv; - int argc, mbargc; - int bulklen; /* bulk read len. -1 if not in bulk read mode */ - int multibulk; /* multi bulk command format active */ - list *reply; - int sentlen; - time_t lastinteraction; /* time of the last interaction, used for timeout */ - int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */ - int slaveseldb; /* slave selected db, if this client is a slave */ - int authenticated; /* when requirepass is non-NULL */ - int replstate; /* replication state if this is a slave */ - int repldbfd; /* replication DB file descriptor */ - long repldboff; /* replication DB file offset */ - off_t repldbsize; /* replication DB file size */ - multiState mstate; /* MULTI/EXEC state */ - robj **blocking_keys; /* The key we are waiting to terminate a blocking - * operation such as BLPOP. Otherwise NULL. */ - int blocking_keys_num; /* Number of blocking keys */ - time_t blockingto; /* Blocking operation timeout.
If UNIX current time - * is >= blockingto then the operation timed out. */ - list *io_keys; /* Keys this client is waiting to be loaded from the - * swap file in order to continue. */ - list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */ - dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ - list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */ -} redisClient; - -struct saveparam { - time_t seconds; - int changes; -}; - -/* Global server state structure */ -struct redisServer { - int port; - int fd; - redisDb *db; - long long dirty; /* changes to DB from the last save */ - list *clients; - list *slaves, *monitors; - char neterr[ANET_ERR_LEN]; - aeEventLoop *el; - int cronloops; /* number of times the cron function ran */ - list *objfreelist; /* A list of freed objects to avoid malloc() */ - time_t lastsave; /* Unix time of last successful save */ - /* Fields used only for stats */ - time_t stat_starttime; /* server start time */ - long long stat_numcommands; /* number of processed commands */ - long long stat_numconnections; /* number of connections received */ - long long stat_expiredkeys; /* number of expired keys */ - /* Configuration */ - int verbosity; - int glueoutputbuf; - int maxidletime; - int dbnum; - int daemonize; - int appendonly; - int appendfsync; - int no_appendfsync_on_rewrite; - int shutdown_asap; - time_t lastfsync; - int appendfd; - int appendseldb; - char *pidfile; - pid_t bgsavechildpid; - pid_t bgrewritechildpid; - sds bgrewritebuf; /* buffer taken by parent during append only rewrite */ - sds aofbuf; /* AOF buffer, written before entering the event loop */ - struct saveparam *saveparams; - int saveparamslen; - char *logfile; - char *bindaddr; - char *dbfilename; - char *appendfilename; - char *requirepass; - int rdbcompression; - int activerehashing; - /* Replication related */ - int isslave; - char *masterauth; - char *masterhost; - int masterport; - redisClient *master; /* client that is
master for this slave */ - int replstate; - unsigned int maxclients; - unsigned long long maxmemory; - unsigned int blpop_blocked_clients; - unsigned int vm_blocked_clients; - /* Sort parameters - qsort_r() is only available under BSD so we - * have to take this state global, in order to pass it to sortCompare() */ - int sort_desc; - int sort_alpha; - int sort_bypattern; - /* Virtual memory configuration */ - int vm_enabled; - char *vm_swap_file; - off_t vm_page_size; - off_t vm_pages; - unsigned long long vm_max_memory; - /* Zip structure config */ - size_t hash_max_zipmap_entries; - size_t hash_max_zipmap_value; - size_t list_max_ziplist_entries; - size_t list_max_ziplist_value; - /* Virtual memory state */ - FILE *vm_fp; - int vm_fd; - off_t vm_next_page; /* Next probably empty page */ - off_t vm_near_pages; /* Number of pages allocated sequentially */ - unsigned char *vm_bitmap; /* Bitmap of free/used pages */ - time_t unixtime; /* Unix time sampled every second. */ - /* Virtual memory I/O threads stuff */ - /* An I/O thread processes an element taken from the io_newjobs queue and - * puts the result of the operation in the io_processed list. While the - * job is being processed, it's put on the io_processing queue. */ - list *io_newjobs; /* List of VM I/O jobs yet to be processed */ - list *io_processing; /* List of VM I/O jobs being processed */ - list *io_processed; /* List of VM I/O jobs already processed */ - list *io_ready_clients; /* Clients ready to be unblocked.
All keys loaded */ - pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */ - pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */ - pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */ - pthread_attr_t io_threads_attr; /* attributes for threads creation */ - int io_active_threads; /* Number of running I/O threads */ - int vm_max_threads; /* Max number of I/O threads running at the same time */ - /* Our main thread is blocked on the event loop, looking for sockets ready - * to be read or written, so when a threaded I/O operation is ready to be - * processed by the main thread, the I/O thread will use a unix pipe to - * awake the main thread. The following are the two pipe FDs. */ - int io_ready_pipe_read; - int io_ready_pipe_write; - /* Virtual memory stats */ - unsigned long long vm_stats_used_pages; - unsigned long long vm_stats_swapped_objects; - unsigned long long vm_stats_swapouts; - unsigned long long vm_stats_swapins; - /* Pubsub */ - dict *pubsub_channels; /* Map channels to list of subscribed clients */ - list *pubsub_patterns; /* A list of pubsub_patterns */ - /* Misc */ - FILE *devnull; - unsigned lruclock:22; /* clock incrementing every minute, for LRU */ - unsigned lruclock_padding:10; -}; - -/* A client paired with a pattern object it registered. */ -typedef struct pubsubPattern { - redisClient *client; - robj *pattern; -} pubsubPattern; - -typedef void redisCommandProc(redisClient *c); -typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); -struct redisCommand { - char *name; - redisCommandProc *proc; - int arity; - int flags; - /* Use a function to determine which keys need to be loaded - * in the background prior to executing this command. Takes precedence - * over vm_firstkey and others, ignored when NULL */ - redisVmPreloadProc *vm_preload_proc; - /* What keys should be loaded in background when calling this command?
*/ - int vm_firstkey; /* The first argument that's a key (0 = no keys) */ - int vm_lastkey; /* The last argument that's a key */ - int vm_keystep; /* The step between first and last key */ -}; - -struct redisFunctionSym { - char *name; - unsigned long pointer; -}; - -typedef struct _redisSortObject { - robj *obj; - union { - double score; - robj *cmpobj; - } u; -} redisSortObject; - -typedef struct _redisSortOperation { - int type; - robj *pattern; -} redisSortOperation; - -/* ZSETs use a specialized version of Skiplists */ - -typedef struct zskiplistNode { - struct zskiplistNode **forward; - struct zskiplistNode *backward; - unsigned int *span; - double score; - robj *obj; -} zskiplistNode; - -typedef struct zskiplist { - struct zskiplistNode *header, *tail; - unsigned long length; - int level; -} zskiplist; - -typedef struct zset { - dict *dict; - zskiplist *zsl; -} zset; - -/* Our shared "common" objects */ - -#define REDIS_SHARED_INTEGERS 10000 -struct sharedObjectsStruct { - robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space, - *colon, *nullbulk, *nullmultibulk, *queued, - *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr, - *outofrangeerr, *plus, - *select0, *select1, *select2, *select3, *select4, - *select5, *select6, *select7, *select8, *select9, - *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3, - *mbulk4, *psubscribebulk, *punsubscribebulk, - *integers[REDIS_SHARED_INTEGERS]; -} shared; - -/* Global vars that are actually used as constants. The following double - * values are used for double on-disk serialization, and are initialized - * at runtime to avoid strange compiler optimizations.
*/ - -static double R_Zero, R_PosInf, R_NegInf, R_Nan; - -/* VM threaded I/O request message */ -#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */ -#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */ -#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */ -typedef struct iojob { - int type; /* Request type, REDIS_IOJOB_* */ - redisDb *db;/* Redis database */ - robj *key; /* This I/O request is about swapping this key */ - robj *id; /* Unique identifier of this job: - this is the object to swap for REDIS_IOREQ_*_SWAP, or the - vmpointer objct for REDIS_IOREQ_LOAD. */ - robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this - * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */ - off_t page; /* Swap page where to read/write the object */ - off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */ - int canceled; /* True if this command was canceled by blocking side of VM */ - pthread_t thread; /* ID of the thread processing this entry */ -} iojob; - -/*================================ Prototypes =============================== */ -char *redisGitSHA1(void); -char *redisGitDirty(void); - -static void freeStringObject(robj *o); -static void freeListObject(robj *o); -static void freeSetObject(robj *o); -static void decrRefCount(void *o); -static robj *createObject(int type, void *ptr); -static void freeClient(redisClient *c); -static int rdbLoad(char *filename); -static void addReply(redisClient *c, robj *obj); -static void addReplySds(redisClient *c, sds s); -static void incrRefCount(robj *o); -static int rdbSaveBackground(char *filename); -static robj *createStringObject(char *ptr, size_t len); -static robj *dupStringObject(robj *o); -static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc); -static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc); -static void flushAppendOnlyFile(void); -static void feedAppendOnlyFile(struct redisCommand *cmd, 
int dictid, robj **argv, int argc); -static int syncWithMaster(void); -static robj *tryObjectEncoding(robj *o); -static robj *getDecodedObject(robj *o); -static int removeExpire(redisDb *db, robj *key); -static int expireIfNeeded(redisDb *db, robj *key); -static int deleteIfVolatile(redisDb *db, robj *key); -static int dbDelete(redisDb *db, robj *key); -static time_t getExpire(redisDb *db, robj *key); -static int setExpire(redisDb *db, robj *key, time_t when); -static void updateSlavesWaitingBgsave(int bgsaveerr); -static void freeMemoryIfNeeded(void); -static int processCommand(redisClient *c); -static void setupSigSegvAction(void); -static void rdbRemoveTempFile(pid_t childpid); -static void aofRemoveTempFile(pid_t childpid); -static size_t stringObjectLen(robj *o); -static void processInputBuffer(redisClient *c); -static zskiplist *zslCreate(void); -static void zslFree(zskiplist *zsl); -static void zslInsert(zskiplist *zsl, double score, robj *obj); -static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask); -static void initClientMultiState(redisClient *c); -static void freeClientMultiState(redisClient *c); -static void queueMultiCommand(redisClient *c, struct redisCommand *cmd); -static void unblockClientWaitingData(redisClient *c); -static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele); -static void vmInit(void); -static void vmMarkPagesFree(off_t page, off_t count); -static robj *vmLoadObject(robj *o); -static robj *vmPreviewObject(robj *o); -static int vmSwapOneObjectBlocking(void); -static int vmSwapOneObjectThreaded(void); -static int vmCanSwapOut(void); -static int tryFreeOneObjectFromFreelist(void); -static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); -static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask); -static void vmCancelThreadedIOJob(robj *o); -static void lockThreadedIO(void); -static void unlockThreadedIO(void); -static int 
vmSwapObjectThreaded(robj *key, robj *val, redisDb *db); -static void freeIOJob(iojob *j); -static void queueIOJob(iojob *j); -static int vmWriteObjectOnSwap(robj *o, off_t page); -static robj *vmReadObjectFromSwap(off_t page, int type); -static void waitEmptyIOJobsQueue(void); -static void vmReopenSwapFile(void); -static int vmFreePage(off_t page); -static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); -static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); -static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd); -static int dontWaitForSwappedKey(redisClient *c, robj *key); -static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key); -static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask); -static struct redisCommand *lookupCommand(char *name); -static void call(redisClient *c, struct redisCommand *cmd); -static void resetClient(redisClient *c); -static void convertToRealHash(robj *o); -static void listTypeConvert(robj *o, int enc); -static int pubsubUnsubscribeAllChannels(redisClient *c, int notify); -static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify); -static void freePubsubPattern(void *p); -static int listMatchPubsubPattern(void *a, void *b); -static int compareStringObjects(robj *a, robj *b); -static int equalStringObjects(robj *a, robj *b); -static void usage(); -static int rewriteAppendOnlyFileBackground(void); -static vmpointer *vmSwapObjectBlocking(robj *val); -static int prepareForShutdown(); -static void touchWatchedKey(redisDb *db, robj *key); -static void touchWatchedKeysOnFlush(int dbid); -static void unwatchAllKeys(redisClient *c); - -static void authCommand(redisClient *c); -static void pingCommand(redisClient *c); -static void echoCommand(redisClient *c); -static void setCommand(redisClient *c); -static void setnxCommand(redisClient *c); -static void 
setexCommand(redisClient *c); -static void getCommand(redisClient *c); -static void delCommand(redisClient *c); -static void existsCommand(redisClient *c); -static void incrCommand(redisClient *c); -static void decrCommand(redisClient *c); -static void incrbyCommand(redisClient *c); -static void decrbyCommand(redisClient *c); -static void selectCommand(redisClient *c); -static void randomkeyCommand(redisClient *c); -static void keysCommand(redisClient *c); -static void dbsizeCommand(redisClient *c); -static void lastsaveCommand(redisClient *c); -static void saveCommand(redisClient *c); -static void bgsaveCommand(redisClient *c); -static void bgrewriteaofCommand(redisClient *c); -static void shutdownCommand(redisClient *c); -static void moveCommand(redisClient *c); -static void renameCommand(redisClient *c); -static void renamenxCommand(redisClient *c); -static void lpushCommand(redisClient *c); -static void rpushCommand(redisClient *c); -static void lpushxCommand(redisClient *c); -static void rpushxCommand(redisClient *c); -static void linsertCommand(redisClient *c); -static void lpopCommand(redisClient *c); -static void rpopCommand(redisClient *c); -static void llenCommand(redisClient *c); -static void lindexCommand(redisClient *c); -static void lrangeCommand(redisClient *c); -static void ltrimCommand(redisClient *c); -static void typeCommand(redisClient *c); -static void lsetCommand(redisClient *c); -static void saddCommand(redisClient *c); -static void sremCommand(redisClient *c); -static void smoveCommand(redisClient *c); -static void sismemberCommand(redisClient *c); -static void scardCommand(redisClient *c); -static void spopCommand(redisClient *c); -static void srandmemberCommand(redisClient *c); -static void sinterCommand(redisClient *c); -static void sinterstoreCommand(redisClient *c); -static void sunionCommand(redisClient *c); -static void sunionstoreCommand(redisClient *c); -static void sdiffCommand(redisClient *c); -static void 
sdiffstoreCommand(redisClient *c); -static void syncCommand(redisClient *c); -static void flushdbCommand(redisClient *c); -static void flushallCommand(redisClient *c); -static void sortCommand(redisClient *c); -static void lremCommand(redisClient *c); -static void rpoplpushcommand(redisClient *c); -static void infoCommand(redisClient *c); -static void mgetCommand(redisClient *c); -static void monitorCommand(redisClient *c); -static void expireCommand(redisClient *c); -static void expireatCommand(redisClient *c); -static void getsetCommand(redisClient *c); -static void ttlCommand(redisClient *c); -static void slaveofCommand(redisClient *c); -static void debugCommand(redisClient *c); -static void msetCommand(redisClient *c); -static void msetnxCommand(redisClient *c); -static void zaddCommand(redisClient *c); -static void zincrbyCommand(redisClient *c); -static void zrangeCommand(redisClient *c); -static void zrangebyscoreCommand(redisClient *c); -static void zcountCommand(redisClient *c); -static void zrevrangeCommand(redisClient *c); -static void zcardCommand(redisClient *c); -static void zremCommand(redisClient *c); -static void zscoreCommand(redisClient *c); -static void zremrangebyscoreCommand(redisClient *c); -static void multiCommand(redisClient *c); -static void execCommand(redisClient *c); -static void discardCommand(redisClient *c); -static void blpopCommand(redisClient *c); -static void brpopCommand(redisClient *c); -static void appendCommand(redisClient *c); -static void substrCommand(redisClient *c); -static void zrankCommand(redisClient *c); -static void zrevrankCommand(redisClient *c); -static void hsetCommand(redisClient *c); -static void hsetnxCommand(redisClient *c); -static void hgetCommand(redisClient *c); -static void hmsetCommand(redisClient *c); -static void hmgetCommand(redisClient *c); -static void hdelCommand(redisClient *c); -static void hlenCommand(redisClient *c); -static void zremrangebyrankCommand(redisClient *c); -static void 
zunionstoreCommand(redisClient *c); -static void zinterstoreCommand(redisClient *c); -static void hkeysCommand(redisClient *c); -static void hvalsCommand(redisClient *c); -static void hgetallCommand(redisClient *c); -static void hexistsCommand(redisClient *c); -static void configCommand(redisClient *c); -static void hincrbyCommand(redisClient *c); -static void subscribeCommand(redisClient *c); -static void unsubscribeCommand(redisClient *c); -static void psubscribeCommand(redisClient *c); -static void punsubscribeCommand(redisClient *c); -static void publishCommand(redisClient *c); -static void watchCommand(redisClient *c); -static void unwatchCommand(redisClient *c); - -/*================================= Globals ================================= */ - -/* Global vars */ -static struct redisServer server; /* server global state */ -static struct redisCommand *commandTable; -static struct redisCommand readonlyCommandTable[] = { - {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, - {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, - {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, - {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1}, - {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - 
{"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1}, - {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1}, - {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1}, - {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, - {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, - {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, - {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, - {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, - {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, - {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - 
{"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, - {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, - {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1}, - {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, - {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, - {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, - {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - 
{"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, - {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, - {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, - {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0}, - {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0}, - {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, - {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, - {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, - {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0}, - {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, - 
{"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, - {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0}, - {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, - {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0} -}; - -/*============================ Utility functions ============================ */ - -/* Glob-style pattern matching. */ -static int stringmatchlen(const char *pattern, int patternLen, - const char *string, int stringLen, int nocase) -{ - while(patternLen) { - switch(pattern[0]) { - case '*': - while (pattern[1] == '*') { - pattern++; - patternLen--; - } - if (patternLen == 1) - return 1; /* match */ - while(stringLen) { - if (stringmatchlen(pattern+1, patternLen-1, - string, stringLen, nocase)) - return 1; /* match */ - string++; - stringLen--; - } - return 0; /* no match */ - break; - case '?': - if (stringLen == 0) - return 0; /* no match */ - string++; - stringLen--; - break; - case '[': - { - int not, match; - - pattern++; - patternLen--; - not = pattern[0] == '^'; - if (not) { - pattern++; - patternLen--; - } - match = 0; - while(1) { - if (pattern[0] == '\\') { - pattern++; - patternLen--; - if (pattern[0] == string[0]) - match = 1; - } else if (pattern[0] == ']') { - break; - } else if (patternLen == 0) { - pattern--; - patternLen++; - break; - } else if (pattern[1] == '-' && patternLen >= 3) { - int start = pattern[0]; - int end = pattern[2]; - int c = string[0]; - if (start > end) { - int t = start; - start = end; - end = t; - } - if (nocase) { - start = tolower(start); - end = tolower(end); - c = tolower(c); - } - pattern += 2; - patternLen -= 2; - if (c >= start && c <= end) - match = 1; - } else { - if (!nocase) { - if (pattern[0] == string[0]) - match = 1; - } else { - if (tolower((int)pattern[0]) == tolower((int)string[0])) - match = 1; - } - } - pattern++; - patternLen--; - } - if (not) - match = !match; - if (!match) - return 0; /* no match */ - string++; - stringLen--; - 
break; - } - case '\\': - if (patternLen >= 2) { - pattern++; - patternLen--; - } - /* fall through */ - default: - if (!nocase) { - if (pattern[0] != string[0]) - return 0; /* no match */ - } else { - if (tolower((int)pattern[0]) != tolower((int)string[0])) - return 0; /* no match */ - } - string++; - stringLen--; - break; - } - pattern++; - patternLen--; - if (stringLen == 0) { - while(*pattern == '*') { - pattern++; - patternLen--; - } - break; - } - } - if (patternLen == 0 && stringLen == 0) - return 1; - return 0; -} - -static int stringmatch(const char *pattern, const char *string, int nocase) { - return stringmatchlen(pattern,strlen(pattern),string,strlen(string),nocase); -} - -/* Convert a string representing an amount of memory into the number of - * bytes, so for instance memtoll("1Gi") will return 1073741824 that is - * (1024*1024*1024). - * - * On parsing error, if *err is not NULL, it's set to 1, otherwise it's - * set to 0 */ -static long long memtoll(const char *p, int *err) { - const char *u; - char buf[128]; - long mul; /* unit multiplier */ - long long val; - unsigned int digits; - - if (err) *err = 0; - /* Search the first non digit character. */ - u = p; - if (*u == '-') u++; - while(*u && isdigit(*u)) u++; - if (*u == '\0' || !strcasecmp(u,"b")) { - mul = 1; - } else if (!strcasecmp(u,"k")) { - mul = 1000; - } else if (!strcasecmp(u,"kb")) { - mul = 1024; - } else if (!strcasecmp(u,"m")) { - mul = 1000*1000; - } else if (!strcasecmp(u,"mb")) { - mul = 1024*1024; - } else if (!strcasecmp(u,"g")) { - mul = 1000L*1000*1000; - } else if (!strcasecmp(u,"gb")) { - mul = 1024L*1024*1024; - } else { - if (err) *err = 1; - mul = 1; - } - digits = u-p; - if (digits >= sizeof(buf)) { - if (err) *err = 1; - return LLONG_MAX; - } - memcpy(buf,p,digits); - buf[digits] = '\0'; - val = strtoll(buf,NULL,10); - return val*mul; -} - -/* Convert a long long into a string. 
Returns the number of - * characters needed to represent the number, that can be shorter if passed - * buffer length is not enough to store the whole number. */ -static int ll2string(char *s, size_t len, long long value) { - char buf[32], *p; - unsigned long long v; - size_t l; - - if (len == 0) return 0; - v = (value < 0) ? -value : value; - p = buf+31; /* point to the last character */ - do { - *p-- = '0'+(v%10); - v /= 10; - } while(v); - if (value < 0) *p-- = '-'; - p++; - l = 32-(p-buf); - if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */ - memcpy(s,p,l); - s[l] = '\0'; - return l; -} - -static void redisLog(int level, const char *fmt, ...) { - va_list ap; - FILE *fp; - - fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a"); - if (!fp) return; - - va_start(ap, fmt); - if (level >= server.verbosity) { - char *c = ".-*#"; - char buf[64]; - time_t now; - - now = time(NULL); - strftime(buf,64,"%d %b %H:%M:%S",localtime(&now)); - fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]); - vfprintf(fp, fmt, ap); - fprintf(fp,"\n"); - fflush(fp); - } - va_end(ap); - - if (server.logfile) fclose(fp); -} - -/*====================== Hash table type implementation ==================== */ - -/* This is an hash table type that uses the SDS dynamic strings libary as - * keys and radis objects as values (objects can hold SDS strings, - * lists, sets). 
*/ - -static void dictVanillaFree(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); - zfree(val); -} - -static void dictListDestructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); - listRelease((list*)val); -} - -static int dictSdsKeyCompare(void *privdata, const void *key1, - const void *key2) -{ - int l1,l2; - DICT_NOTUSED(privdata); - - l1 = sdslen((sds)key1); - l2 = sdslen((sds)key2); - if (l1 != l2) return 0; - return memcmp(key1, key2, l1) == 0; -} - -static void dictRedisObjectDestructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); - - if (val == NULL) return; /* Values of swapped out keys as set to NULL */ - decrRefCount(val); -} - -static void dictSdsDestructor(void *privdata, void *val) -{ - DICT_NOTUSED(privdata); - - sdsfree(val); -} - -static int dictObjKeyCompare(void *privdata, const void *key1, - const void *key2) -{ - const robj *o1 = key1, *o2 = key2; - return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); -} - -static unsigned int dictObjHash(const void *key) { - const robj *o = key; - return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); -} - -static unsigned int dictSdsHash(const void *key) { - return dictGenHashFunction((unsigned char*)key, sdslen((char*)key)); -} - -static int dictEncObjKeyCompare(void *privdata, const void *key1, - const void *key2) -{ - robj *o1 = (robj*) key1, *o2 = (robj*) key2; - int cmp; - - if (o1->encoding == REDIS_ENCODING_INT && - o2->encoding == REDIS_ENCODING_INT) - return o1->ptr == o2->ptr; - - o1 = getDecodedObject(o1); - o2 = getDecodedObject(o2); - cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); - decrRefCount(o1); - decrRefCount(o2); - return cmp; -} - -static unsigned int dictEncObjHash(const void *key) { - robj *o = (robj*) key; - - if (o->encoding == REDIS_ENCODING_RAW) { - return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); - } else { - if (o->encoding == REDIS_ENCODING_INT) { - char buf[32]; - int len; - - len = ll2string(buf,32,(long)o->ptr); - return 
dictGenHashFunction((unsigned char*)buf, len); - } else { - unsigned int hash; - - o = getDecodedObject(o); - hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); - decrRefCount(o); - return hash; - } - } -} - -/* Sets type */ -static dictType setDictType = { - dictEncObjHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictEncObjKeyCompare, /* key compare */ - dictRedisObjectDestructor, /* key destructor */ - NULL /* val destructor */ -}; - -/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ -static dictType zsetDictType = { - dictEncObjHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictEncObjKeyCompare, /* key compare */ - dictRedisObjectDestructor, /* key destructor */ - dictVanillaFree /* val destructor of malloc(sizeof(double)) */ -}; - -/* Db->dict, keys are sds strings, vals are Redis objects. */ -static dictType dbDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - dictRedisObjectDestructor /* val destructor */ -}; - -/* Db->expires */ -static dictType keyptrDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL /* val destructor */ -}; - -/* Hash type hash table (note that small hashes are represented with zimpaps) */ -static dictType hashDictType = { - dictEncObjHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictEncObjKeyCompare, /* key compare */ - dictRedisObjectDestructor, /* key destructor */ - dictRedisObjectDestructor /* val destructor */ -}; - -/* Keylist hash table type has unencoded redis objects as keys and - * lists as values. It's used for blocking operations (BLPOP) and to - * map swapped keys to a list of clients waiting for this keys to be loaded. 
*/ -static dictType keylistDictType = { - dictObjHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictObjKeyCompare, /* key compare */ - dictRedisObjectDestructor, /* key destructor */ - dictListDestructor /* val destructor */ -}; - -static void version(); - -/* ========================= Random utility functions ======================= */ - -/* Redis generally does not try to recover from out of memory conditions - * when allocating objects or strings, it is not clear if it will be possible - * to report this condition to the client since the networking layer itself - * is based on heap allocation for send buffers, so we simply abort. - * At least the code will be simpler to read... */ -static void oom(const char *msg) { - redisLog(REDIS_WARNING, "%s: Out of memory\n",msg); - sleep(1); - abort(); -} - -/* ====================== Redis server networking stuff ===================== */ -static void closeTimedoutClients(void) { - redisClient *c; - listNode *ln; - time_t now = time(NULL); - listIter li; - - listRewind(server.clients,&li); - while ((ln = listNext(&li)) != NULL) { - c = listNodeValue(ln); - if (server.maxidletime && - !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ - !(c->flags & REDIS_MASTER) && /* no timeout for masters */ - dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */ - listLength(c->pubsub_patterns) == 0 && - (now - c->lastinteraction > server.maxidletime)) - { - redisLog(REDIS_VERBOSE,"Closing idle client"); - freeClient(c); - } else if (c->flags & REDIS_BLOCKED) { - if (c->blockingto != 0 && c->blockingto < now) { - addReply(c,shared.nullmultibulk); - unblockClientWaitingData(c); - } - } - } -} - -static int htNeedsResize(dict *dict) { - long long size, used; - - size = dictSlots(dict); - used = dictSize(dict); - return (size && used && size > DICT_HT_INITIAL_SIZE && - (used*100/size < REDIS_HT_MINFILL)); -} - -/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL - * we resize the 
hash table to save memory */ -static void tryResizeHashTables(void) { - int j; - - for (j = 0; j < server.dbnum; j++) { - if (htNeedsResize(server.db[j].dict)) - dictResize(server.db[j].dict); - if (htNeedsResize(server.db[j].expires)) - dictResize(server.db[j].expires); - } -} - -/* Our hash table implementation performs rehashing incrementally while - * we write/read from the hash table. Still if the server is idle, the hash - * table will use two tables for a long time. So we try to use 1 millisecond - * of CPU time at every serverCron() loop in order to rehash some key. */ -static void incrementallyRehash(void) { - int j; - - for (j = 0; j < server.dbnum; j++) { - if (dictIsRehashing(server.db[j].dict)) { - dictRehashMilliseconds(server.db[j].dict,1); - break; /* already used our millisecond for this loop... */ - } - } -} - -/* A background saving child (BGSAVE) terminated its work. Handle this. */ -void backgroundSaveDoneHandler(int statloc) { - int exitcode = WEXITSTATUS(statloc); - int bysignal = WIFSIGNALED(statloc); - - if (!bysignal && exitcode == 0) { - redisLog(REDIS_NOTICE, - "Background saving terminated with success"); - server.dirty = 0; - server.lastsave = time(NULL); - } else if (!bysignal && exitcode != 0) { - redisLog(REDIS_WARNING, "Background saving error"); - } else { - redisLog(REDIS_WARNING, - "Background saving terminated by signal %d", WTERMSIG(statloc)); - rdbRemoveTempFile(server.bgsavechildpid); - } - server.bgsavechildpid = -1; - /* Possibly there are slaves waiting for a BGSAVE in order to be served - * (the first stage of SYNC is a bulk transfer of dump.rdb) */ - updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR); -} - -/* A background append only file rewriting (BGREWRITEAOF) terminated its work. - * Handle this. 
*/ -void backgroundRewriteDoneHandler(int statloc) { - int exitcode = WEXITSTATUS(statloc); - int bysignal = WIFSIGNALED(statloc); - - if (!bysignal && exitcode == 0) { - int fd; - char tmpfile[256]; - - redisLog(REDIS_NOTICE, - "Background append only file rewriting terminated with success"); - /* Now it's time to flush the differences accumulated by the parent */ - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid); - fd = open(tmpfile,O_WRONLY|O_APPEND); - if (fd == -1) { - redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno)); - goto cleanup; - } - /* Flush our data... */ - if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) != - (signed) sdslen(server.bgrewritebuf)) { - redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno)); - close(fd); - goto cleanup; - } - redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf)); - /* Now our work is to rename the temp file into the stable file. And - * switch the file descriptor used by the server for append only. */ - if (rename(tmpfile,server.appendfilename) == -1) { - redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno)); - close(fd); - goto cleanup; - } - /* Mission completed... almost */ - redisLog(REDIS_NOTICE,"Append only file successfully rewritten."); - if (server.appendfd != -1) { - /* If append only is actually enabled... */ - close(server.appendfd); - server.appendfd = fd; - if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); - server.appendseldb = -1; /* Make sure it will issue SELECT */ - redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); - } else { - /* If append only is disabled we just generate a dump in this - * format. Why not? 
*/ - close(fd); - } - } else if (!bysignal && exitcode != 0) { - redisLog(REDIS_WARNING, "Background append only file rewriting error"); - } else { - redisLog(REDIS_WARNING, - "Background append only file rewriting terminated by signal %d", - WTERMSIG(statloc)); - } -cleanup: - sdsfree(server.bgrewritebuf); - server.bgrewritebuf = sdsempty(); - aofRemoveTempFile(server.bgrewritechildpid); - server.bgrewritechildpid = -1; -} - -/* This function is called once a background process of some kind terminates, - * as we want to avoid resizing the hash tables when there is a child in order - * to play well with copy-on-write (otherwise when a resize happens lots of - * memory pages are copied). The goal of this function is to update the ability - * for dict.c to resize the hash tables accordingly to the fact we have o not - * running childs. */ -static void updateDictResizePolicy(void) { - if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) - dictEnableResize(); - else - dictDisableResize(); -} - -static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { - int j, loops = server.cronloops++; - REDIS_NOTUSED(eventLoop); - REDIS_NOTUSED(id); - REDIS_NOTUSED(clientData); - - /* We take a cached value of the unix time in the global state because - * with virtual memory and aging there is to store the current time - * in objects at every object access, and accuracy is not needed. - * To access a global var is faster than calling time(NULL) */ - server.unixtime = time(NULL); - /* We have just 21 bits per object for LRU information. - * So we use an (eventually wrapping) LRU clock with minutes resolution. - * - * When we need to select what object to swap, we compute the minimum - * time distance between the current lruclock and the object last access - * lruclock info. 
Even if clocks will wrap on overflow, there is - * the interesting property that we are sure that at least - * ABS(A-B) minutes passed between current time and timestamp B. - * - * This is not precise but we don't need at all precision, but just - * something statistically reasonable. - */ - server.lruclock = (time(NULL)/60)&((1<<21)-1); - - /* We received a SIGTERM, shutting down here in a safe way, as it is - * not ok doing so inside the signal handler. */ - if (server.shutdown_asap) { - if (prepareForShutdown() == REDIS_OK) exit(0); - redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information"); - } - - /* Show some info about non-empty databases */ - for (j = 0; j < server.dbnum; j++) { - long long size, used, vkeys; - - size = dictSlots(server.db[j].dict); - used = dictSize(server.db[j].dict); - vkeys = dictSize(server.db[j].expires); - if (!(loops % 50) && (used || vkeys)) { - redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size); - /* dictPrintStats(server.dict); */ - } - } - - /* We don't want to resize the hash tables while a bacground saving - * is in progress: the saving child is created using fork() that is - * implemented with a copy-on-write semantic in most modern systems, so - * if we resize the HT while there is the saving child at work actually - * a lot of memory movements in the parent will cause a lot of pages - * copied. 
*/ - if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) { - if (!(loops % 10)) tryResizeHashTables(); - if (server.activerehashing) incrementallyRehash(); - } - - /* Show information about connected clients */ - if (!(loops % 50)) { - redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use", - listLength(server.clients)-listLength(server.slaves), - listLength(server.slaves), - zmalloc_used_memory()); - } - - /* Close connections of timedout clients */ - if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients) - closeTimedoutClients(); - - /* Check if a background saving or AOF rewrite in progress terminated */ - if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) { - int statloc; - pid_t pid; - - if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { - if (pid == server.bgsavechildpid) { - backgroundSaveDoneHandler(statloc); - } else { - backgroundRewriteDoneHandler(statloc); - } - updateDictResizePolicy(); - } - } else { - /* If there is not a background saving in progress check if - * we have to save now */ - time_t now = time(NULL); - for (j = 0; j < server.saveparamslen; j++) { - struct saveparam *sp = server.saveparams+j; - - if (server.dirty >= sp->changes && - now-server.lastsave > sp->seconds) { - redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...", - sp->changes, sp->seconds); - rdbSaveBackground(server.dbfilename); - break; - } - } - } - - /* Try to expire a few timed out keys. The algorithm used is adaptive and - * will use few CPU cycles if there are few expiring keys, otherwise - * it will get more aggressive to avoid that too much memory is used by - * keys that can be removed from the keyspace. */ - for (j = 0; j < server.dbnum; j++) { - int expired; - redisDb *db = server.db+j; - - /* Continue to expire if at the end of the cycle more than 25% - * of the keys were expired. 
*/ - do { - long num = dictSize(db->expires); - time_t now = time(NULL); - - expired = 0; - if (num > REDIS_EXPIRELOOKUPS_PER_CRON) - num = REDIS_EXPIRELOOKUPS_PER_CRON; - while (num--) { - dictEntry *de; - time_t t; - - if ((de = dictGetRandomKey(db->expires)) == NULL) break; - t = (time_t) dictGetEntryVal(de); - if (now > t) { - sds key = dictGetEntryKey(de); - robj *keyobj = createStringObject(key,sdslen(key)); - - dbDelete(db,keyobj); - decrRefCount(keyobj); - expired++; - server.stat_expiredkeys++; - } - } - } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); - } - - /* Swap a few keys on disk if we are over the memory limit and VM - * is enbled. Try to free objects from the free list first. */ - if (vmCanSwapOut()) { - while (server.vm_enabled && zmalloc_used_memory() > - server.vm_max_memory) - { - int retval; - - if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; - retval = (server.vm_max_threads == 0) ? - vmSwapOneObjectBlocking() : - vmSwapOneObjectThreaded(); - if (retval == REDIS_ERR && !(loops % 300) && - zmalloc_used_memory() > - (server.vm_max_memory+server.vm_max_memory/10)) - { - redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!"); - } - /* Note that when using threade I/O we free just one object, - * because anyway when the I/O thread in charge to swap this - * object out will finish, the handler of completed jobs - * will try to swap more objects if we are still out of memory. 
*/ - if (retval == REDIS_ERR || server.vm_max_threads > 0) break; - } - } - - /* Check if we should connect to a MASTER */ - if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) { - redisLog(REDIS_NOTICE,"Connecting to MASTER..."); - if (syncWithMaster() == REDIS_OK) { - redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded"); - if (server.appendonly) rewriteAppendOnlyFileBackground(); - } - } - return 100; -} - -/* This function gets called every time Redis is entering the - * main loop of the event driven library, that is, before to sleep - * for ready file descriptors. */ -static void beforeSleep(struct aeEventLoop *eventLoop) { - REDIS_NOTUSED(eventLoop); - - /* Awake clients that got all the swapped keys they requested */ - if (server.vm_enabled && listLength(server.io_ready_clients)) { - listIter li; - listNode *ln; - - listRewind(server.io_ready_clients,&li); - while((ln = listNext(&li))) { - redisClient *c = ln->value; - struct redisCommand *cmd; - - /* Resume the client. */ - listDelNode(server.io_ready_clients,ln); - c->flags &= (~REDIS_IO_WAIT); - server.vm_blocked_clients--; - aeCreateFileEvent(server.el, c->fd, AE_READABLE, - readQueryFromClient, c); - cmd = lookupCommand(c->argv[0]->ptr); - assert(cmd != NULL); - call(c,cmd); - resetClient(c); - /* There may be more data to process in the input buffer. 
*/ - if (c->querybuf && sdslen(c->querybuf) > 0) - processInputBuffer(c); - } - } - /* Write the AOF buffer on disk */ - flushAppendOnlyFile(); -} - -static void createSharedObjects(void) { - int j; - - shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n")); - shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n")); - shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n")); - shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n")); - shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n")); - shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n")); - shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n")); - shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n")); - shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n")); - shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n")); - shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n")); - shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n")); - shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew( - "-ERR Operation against a key holding the wrong kind of value\r\n")); - shared.nokeyerr = createObject(REDIS_STRING,sdsnew( - "-ERR no such key\r\n")); - shared.syntaxerr = createObject(REDIS_STRING,sdsnew( - "-ERR syntax error\r\n")); - shared.sameobjecterr = createObject(REDIS_STRING,sdsnew( - "-ERR source and destination objects are the same\r\n")); - shared.outofrangeerr = createObject(REDIS_STRING,sdsnew( - "-ERR index out of range\r\n")); - shared.space = createObject(REDIS_STRING,sdsnew(" ")); - shared.colon = createObject(REDIS_STRING,sdsnew(":")); - shared.plus = createObject(REDIS_STRING,sdsnew("+")); - shared.select0 = createStringObject("select 0\r\n",10); - shared.select1 = createStringObject("select 1\r\n",10); - shared.select2 = createStringObject("select 2\r\n",10); - shared.select3 = createStringObject("select 3\r\n",10); - shared.select4 = createStringObject("select 4\r\n",10); - shared.select5 = createStringObject("select 
5\r\n",10); - shared.select6 = createStringObject("select 6\r\n",10); - shared.select7 = createStringObject("select 7\r\n",10); - shared.select8 = createStringObject("select 8\r\n",10); - shared.select9 = createStringObject("select 9\r\n",10); - shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13); - shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14); - shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15); - shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18); - shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17); - shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19); - shared.mbulk3 = createStringObject("*3\r\n",4); - shared.mbulk4 = createStringObject("*4\r\n",4); - for (j = 0; j < REDIS_SHARED_INTEGERS; j++) { - shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j); - shared.integers[j]->encoding = REDIS_ENCODING_INT; - } -} - -static void appendServerSaveParams(time_t seconds, int changes) { - server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1)); - server.saveparams[server.saveparamslen].seconds = seconds; - server.saveparams[server.saveparamslen].changes = changes; - server.saveparamslen++; -} - -static void resetServerSaveParams() { - zfree(server.saveparams); - server.saveparams = NULL; - server.saveparamslen = 0; -} - -static void initServerConfig() { - server.dbnum = REDIS_DEFAULT_DBNUM; - server.port = REDIS_SERVERPORT; - server.verbosity = REDIS_VERBOSE; - server.maxidletime = REDIS_MAXIDLETIME; - server.saveparams = NULL; - server.logfile = NULL; /* NULL = log on standard output */ - server.bindaddr = NULL; - server.glueoutputbuf = 1; - server.daemonize = 0; - server.appendonly = 0; - server.appendfsync = APPENDFSYNC_EVERYSEC; - server.no_appendfsync_on_rewrite = 0; - server.lastfsync = time(NULL); - server.appendfd = -1; - server.appendseldb = -1; /* Make sure the first time will not match */ - 
server.pidfile = zstrdup("/var/run/redis.pid"); - server.dbfilename = zstrdup("dump.rdb"); - server.appendfilename = zstrdup("appendonly.aof"); - server.requirepass = NULL; - server.rdbcompression = 1; - server.activerehashing = 1; - server.maxclients = 0; - server.blpop_blocked_clients = 0; - server.maxmemory = 0; - server.vm_enabled = 0; - server.vm_swap_file = zstrdup("/tmp/redis-%p.vm"); - server.vm_page_size = 256; /* 256 bytes per page */ - server.vm_pages = 1024*1024*100; /* 104 millions of pages */ - server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */ - server.vm_max_threads = 4; - server.vm_blocked_clients = 0; - server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES; - server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE; - server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES; - server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE; - server.shutdown_asap = 0; - - resetServerSaveParams(); - - appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */ - appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */ - appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */ - /* Replication related */ - server.isslave = 0; - server.masterauth = NULL; - server.masterhost = NULL; - server.masterport = 6379; - server.master = NULL; - server.replstate = REDIS_REPL_NONE; - - /* Double constants initialization */ - R_Zero = 0.0; - R_PosInf = 1.0/R_Zero; - R_NegInf = -1.0/R_Zero; - R_Nan = R_Zero/R_Zero; -} - -static void initServer() { - int j; - - signal(SIGHUP, SIG_IGN); - signal(SIGPIPE, SIG_IGN); - setupSigSegvAction(); - - server.devnull = fopen("/dev/null","w"); - if (server.devnull == NULL) { - redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr); - exit(1); - } - server.clients = listCreate(); - server.slaves = listCreate(); - server.monitors = listCreate(); - server.objfreelist = listCreate(); - createSharedObjects(); - server.el = aeCreateEventLoop(); - 
server.db = zmalloc(sizeof(redisDb)*server.dbnum); - server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr); - if (server.fd == -1) { - redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr); - exit(1); - } - for (j = 0; j < server.dbnum; j++) { - server.db[j].dict = dictCreate(&dbDictType,NULL); - server.db[j].expires = dictCreate(&keyptrDictType,NULL); - server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL); - server.db[j].watched_keys = dictCreate(&keylistDictType,NULL); - if (server.vm_enabled) - server.db[j].io_keys = dictCreate(&keylistDictType,NULL); - server.db[j].id = j; - } - server.pubsub_channels = dictCreate(&keylistDictType,NULL); - server.pubsub_patterns = listCreate(); - listSetFreeMethod(server.pubsub_patterns,freePubsubPattern); - listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern); - server.cronloops = 0; - server.bgsavechildpid = -1; - server.bgrewritechildpid = -1; - server.bgrewritebuf = sdsempty(); - server.aofbuf = sdsempty(); - server.lastsave = time(NULL); - server.dirty = 0; - server.stat_numcommands = 0; - server.stat_numconnections = 0; - server.stat_expiredkeys = 0; - server.stat_starttime = time(NULL); - server.unixtime = time(NULL); - aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL); - if (aeCreateFileEvent(server.el, server.fd, AE_READABLE, - acceptHandler, NULL) == AE_ERR) oom("creating file event"); - - if (server.appendonly) { - server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); - if (server.appendfd == -1) { - redisLog(REDIS_WARNING, "Can't open the append-only file: %s", - strerror(errno)); - exit(1); - } - } - - if (server.vm_enabled) vmInit(); -} - -/* Empty the whole database */ -static long long emptyDb() { - int j; - long long removed = 0; - - for (j = 0; j < server.dbnum; j++) { - removed += dictSize(server.db[j].dict); - dictEmpty(server.db[j].dict); - dictEmpty(server.db[j].expires); - } - return removed; -} - -static int yesnotoi(char *s) 
{ - if (!strcasecmp(s,"yes")) return 1; - else if (!strcasecmp(s,"no")) return 0; - else return -1; -} - -/* I agree, this is a very rudimental way to load a configuration... - will improve later if the config gets more complex */ -static void loadServerConfig(char *filename) { - FILE *fp; - char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL; - int linenum = 0; - sds line = NULL; - - if (filename[0] == '-' && filename[1] == '\0') - fp = stdin; - else { - if ((fp = fopen(filename,"r")) == NULL) { - redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename); - exit(1); - } - } - - while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) { - sds *argv; - int argc, j; - - linenum++; - line = sdsnew(buf); - line = sdstrim(line," \t\r\n"); - - /* Skip comments and blank lines*/ - if (line[0] == '#' || line[0] == '\0') { - sdsfree(line); - continue; - } - - /* Split into arguments */ - argv = sdssplitlen(line,sdslen(line)," ",1,&argc); - sdstolower(argv[0]); - - /* Execute config directives */ - if (!strcasecmp(argv[0],"timeout") && argc == 2) { - server.maxidletime = atoi(argv[1]); - if (server.maxidletime < 0) { - err = "Invalid timeout value"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"port") && argc == 2) { - server.port = atoi(argv[1]); - if (server.port < 1 || server.port > 65535) { - err = "Invalid port"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"bind") && argc == 2) { - server.bindaddr = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"save") && argc == 3) { - int seconds = atoi(argv[1]); - int changes = atoi(argv[2]); - if (seconds < 1 || changes < 0) { - err = "Invalid save parameters"; goto loaderr; - } - appendServerSaveParams(seconds,changes); - } else if (!strcasecmp(argv[0],"dir") && argc == 2) { - if (chdir(argv[1]) == -1) { - redisLog(REDIS_WARNING,"Can't chdir to '%s': %s", - argv[1], strerror(errno)); - exit(1); - } - } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) { - if (!strcasecmp(argv[1],"debug")) 
server.verbosity = REDIS_DEBUG; - else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE; - else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE; - else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING; - else { - err = "Invalid log level. Must be one of debug, notice, warning"; - goto loaderr; - } - } else if (!strcasecmp(argv[0],"logfile") && argc == 2) { - FILE *logfp; - - server.logfile = zstrdup(argv[1]); - if (!strcasecmp(server.logfile,"stdout")) { - zfree(server.logfile); - server.logfile = NULL; - } - if (server.logfile) { - /* Test if we are able to open the file. The server will not - * be able to abort just for this problem later... */ - logfp = fopen(server.logfile,"a"); - if (logfp == NULL) { - err = sdscatprintf(sdsempty(), - "Can't open the log file: %s", strerror(errno)); - goto loaderr; - } - fclose(logfp); - } - } else if (!strcasecmp(argv[0],"databases") && argc == 2) { - server.dbnum = atoi(argv[1]); - if (server.dbnum < 1) { - err = "Invalid number of databases"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"include") && argc == 2) { - loadServerConfig(argv[1]); - } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) { - server.maxclients = atoi(argv[1]); - } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) { - server.maxmemory = memtoll(argv[1],NULL); - } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) { - server.masterhost = sdsnew(argv[1]); - server.masterport = atoi(argv[2]); - server.replstate = REDIS_REPL_CONNECT; - } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) { - server.masterauth = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) { - if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) { - if ((server.rdbcompression = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 
'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) { - if ((server.activerehashing = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) { - if ((server.daemonize = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) { - if ((server.appendonly = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) { - zfree(server.appendfilename); - server.appendfilename = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite") - && argc == 2) { - if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) { - if (!strcasecmp(argv[1],"no")) { - server.appendfsync = APPENDFSYNC_NO; - } else if (!strcasecmp(argv[1],"always")) { - server.appendfsync = APPENDFSYNC_ALWAYS; - } else if (!strcasecmp(argv[1],"everysec")) { - server.appendfsync = APPENDFSYNC_EVERYSEC; - } else { - err = "argument must be 'no', 'always' or 'everysec'"; - goto loaderr; - } - } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) { - server.requirepass = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) { - zfree(server.pidfile); - server.pidfile = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) { - zfree(server.dbfilename); - server.dbfilename = zstrdup(argv[1]); - } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) { - if ((server.vm_enabled = yesnotoi(argv[1])) == -1) { - err = "argument must be 'yes' or 'no'"; goto loaderr; - } - } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) { - zfree(server.vm_swap_file); - server.vm_swap_file = zstrdup(argv[1]); 
- } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) { - server.vm_max_memory = memtoll(argv[1],NULL); - } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) { - server.vm_page_size = memtoll(argv[1], NULL); - } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) { - server.vm_pages = memtoll(argv[1], NULL); - } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) { - server.vm_max_threads = strtoll(argv[1], NULL, 10); - } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){ - server.hash_max_zipmap_entries = memtoll(argv[1], NULL); - } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){ - server.hash_max_zipmap_value = memtoll(argv[1], NULL); - } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){ - server.list_max_ziplist_entries = memtoll(argv[1], NULL); - } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){ - server.list_max_ziplist_value = memtoll(argv[1], NULL); - } else { - err = "Bad directive or wrong number of arguments"; goto loaderr; - } - for (j = 0; j < argc; j++) - sdsfree(argv[j]); - zfree(argv); - sdsfree(line); - } - if (fp != stdin) fclose(fp); - return; - -loaderr: - fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n"); - fprintf(stderr, "Reading the configuration file, at line %d\n", linenum); - fprintf(stderr, ">>> '%s'\n", line); - fprintf(stderr, "%s\n", err); - exit(1); -} - -static void freeClientArgv(redisClient *c) { - int j; - - for (j = 0; j < c->argc; j++) - decrRefCount(c->argv[j]); - for (j = 0; j < c->mbargc; j++) - decrRefCount(c->mbargv[j]); - c->argc = 0; - c->mbargc = 0; -} - -static void freeClient(redisClient *c) { - listNode *ln; - - /* Note that if the client we are freeing is blocked into a blocking - * call, we have to set querybuf to NULL *before* to call - * unblockClientWaitingData() to avoid processInputBuffer() will get - * called. 
Also it is important to remove the file events after - * this, because this call adds the READABLE event. */ - sdsfree(c->querybuf); - c->querybuf = NULL; - if (c->flags & REDIS_BLOCKED) - unblockClientWaitingData(c); - - /* UNWATCH all the keys */ - unwatchAllKeys(c); - listRelease(c->watched_keys); - /* Unsubscribe from all the pubsub channels */ - pubsubUnsubscribeAllChannels(c,0); - pubsubUnsubscribeAllPatterns(c,0); - dictRelease(c->pubsub_channels); - listRelease(c->pubsub_patterns); - /* Obvious cleanup */ - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); - listRelease(c->reply); - freeClientArgv(c); - close(c->fd); - /* Remove from the list of clients */ - ln = listSearchKey(server.clients,c); - redisAssert(ln != NULL); - listDelNode(server.clients,ln); - /* Remove from the list of clients that are now ready to be restarted - * after waiting for swapped keys */ - if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) { - ln = listSearchKey(server.io_ready_clients,c); - if (ln) { - listDelNode(server.io_ready_clients,ln); - server.vm_blocked_clients--; - } - } - /* Remove from the list of clients waiting for swapped keys */ - while (server.vm_enabled && listLength(c->io_keys)) { - ln = listFirst(c->io_keys); - dontWaitForSwappedKey(c,ln->value); - } - listRelease(c->io_keys); - /* Master/slave cleanup */ - if (c->flags & REDIS_SLAVE) { - if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1) - close(c->repldbfd); - list *l = (c->flags & REDIS_MONITOR) ? 
server.monitors : server.slaves; - ln = listSearchKey(l,c); - redisAssert(ln != NULL); - listDelNode(l,ln); - } - if (c->flags & REDIS_MASTER) { - server.master = NULL; - server.replstate = REDIS_REPL_CONNECT; - } - /* Release memory */ - zfree(c->argv); - zfree(c->mbargv); - freeClientMultiState(c); - zfree(c); -} - -#define GLUEREPLY_UP_TO (1024) -static void glueReplyBuffersIfNeeded(redisClient *c) { - int copylen = 0; - char buf[GLUEREPLY_UP_TO]; - listNode *ln; - listIter li; - robj *o; - - listRewind(c->reply,&li); - while((ln = listNext(&li))) { - int objlen; - - o = ln->value; - objlen = sdslen(o->ptr); - if (copylen + objlen <= GLUEREPLY_UP_TO) { - memcpy(buf+copylen,o->ptr,objlen); - copylen += objlen; - listDelNode(c->reply,ln); - } else { - if (copylen == 0) return; - break; - } - } - /* Now the output buffer is empty, add the new single element */ - o = createObject(REDIS_STRING,sdsnewlen(buf,copylen)); - listAddNodeHead(c->reply,o); -} - -static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { - redisClient *c = privdata; - int nwritten = 0, totwritten = 0, objlen; - robj *o; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - - /* Use writev() if we have enough buffers to send */ - if (!server.glueoutputbuf && - listLength(c->reply) > REDIS_WRITEV_THRESHOLD && - !(c->flags & REDIS_MASTER)) - { - sendReplyToClientWritev(el, fd, privdata, mask); - return; - } - - while(listLength(c->reply)) { - if (server.glueoutputbuf && listLength(c->reply) > 1) - glueReplyBuffersIfNeeded(c); - - o = listNodeValue(listFirst(c->reply)); - objlen = sdslen(o->ptr); - - if (objlen == 0) { - listDelNode(c->reply,listFirst(c->reply)); - continue; - } - - if (c->flags & REDIS_MASTER) { - /* Don't reply to a master */ - nwritten = objlen - c->sentlen; - } else { - nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen); - if (nwritten <= 0) break; - } - c->sentlen += nwritten; - totwritten += nwritten; - /* If we fully sent the object on 
head go to the next one */ - if (c->sentlen == objlen) { - listDelNode(c->reply,listFirst(c->reply)); - c->sentlen = 0; - } - /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT - * bytes, in a single threaded server it's a good idea to serve - * other clients as well, even if a very large request comes from - * super fast link that is always able to accept data (in real world - * scenario think about 'KEYS *' against the loopback interfae) */ - if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break; - } - if (nwritten == -1) { - if (errno == EAGAIN) { - nwritten = 0; - } else { - redisLog(REDIS_VERBOSE, - "Error writing to client: %s", strerror(errno)); - freeClient(c); - return; - } - } - if (totwritten > 0) c->lastinteraction = time(NULL); - if (listLength(c->reply) == 0) { - c->sentlen = 0; - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); - } -} - -static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask) -{ - redisClient *c = privdata; - int nwritten = 0, totwritten = 0, objlen, willwrite; - robj *o; - struct iovec iov[REDIS_WRITEV_IOVEC_COUNT]; - int offset, ion = 0; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - - listNode *node; - while (listLength(c->reply)) { - offset = c->sentlen; - ion = 0; - willwrite = 0; - - /* fill-in the iov[] array */ - for(node = listFirst(c->reply); node; node = listNextNode(node)) { - o = listNodeValue(node); - objlen = sdslen(o->ptr); - - if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT) - break; - - if(ion == REDIS_WRITEV_IOVEC_COUNT) - break; /* no more iovecs */ - - iov[ion].iov_base = ((char*)o->ptr) + offset; - iov[ion].iov_len = objlen - offset; - willwrite += objlen - offset; - offset = 0; /* just for the first item */ - ion++; - } - - if(willwrite == 0) - break; - - /* write all collected blocks at once */ - if((nwritten = writev(fd, iov, ion)) < 0) { - if (errno != EAGAIN) { - redisLog(REDIS_VERBOSE, - "Error writing to client: %s", strerror(errno)); - freeClient(c); 
- return; - } - break; - } - - totwritten += nwritten; - offset = c->sentlen; - - /* remove written robjs from c->reply */ - while (nwritten && listLength(c->reply)) { - o = listNodeValue(listFirst(c->reply)); - objlen = sdslen(o->ptr); - - if(nwritten >= objlen - offset) { - listDelNode(c->reply, listFirst(c->reply)); - nwritten -= objlen - offset; - c->sentlen = 0; - } else { - /* partial write */ - c->sentlen += nwritten; - break; - } - offset = 0; - } - } - - if (totwritten > 0) - c->lastinteraction = time(NULL); - - if (listLength(c->reply) == 0) { - c->sentlen = 0; - aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); - } -} - -static int qsortRedisCommands(const void *r1, const void *r2) { - return strcasecmp( - ((struct redisCommand*)r1)->name, - ((struct redisCommand*)r2)->name); -} - -static void sortCommandTable() { - /* Copy and sort the read-only version of the command table */ - commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable)); - memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable)); - qsort(commandTable, - sizeof(readonlyCommandTable)/sizeof(struct redisCommand), - sizeof(struct redisCommand),qsortRedisCommands); -} - -static struct redisCommand *lookupCommand(char *name) { - struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0}; - return bsearch( - &tmp, - commandTable, - sizeof(readonlyCommandTable)/sizeof(struct redisCommand), - sizeof(struct redisCommand), - qsortRedisCommands); -} - -/* resetClient prepare the client to process the next command */ -static void resetClient(redisClient *c) { - freeClientArgv(c); - c->bulklen = -1; - c->multibulk = 0; -} - -/* Call() is the core of Redis execution of a command */ -static void call(redisClient *c, struct redisCommand *cmd) { - long long dirty; - - dirty = server.dirty; - cmd->proc(c); - dirty = server.dirty-dirty; - - if (server.appendonly && dirty) - feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc); - if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) && - 
listLength(server.slaves)) - replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc); - if (listLength(server.monitors)) - replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc); - server.stat_numcommands++; -} - -/* If this function gets called we already read a whole - * command, argments are in the client argv/argc fields. - * processCommand() execute the command or prepare the - * server for a bulk read from the client. - * - * If 1 is returned the client is still alive and valid and - * and other operations can be performed by the caller. Otherwise - * if 0 is returned the client was destroied (i.e. after QUIT). */ -static int processCommand(redisClient *c) { - struct redisCommand *cmd; - - /* Free some memory if needed (maxmemory setting) */ - if (server.maxmemory) freeMemoryIfNeeded(); - - /* Handle the multi bulk command type. This is an alternative protocol - * supported by Redis in order to receive commands that are composed of - * multiple binary-safe "bulk" arguments. The latency of processing is - * a bit higher but this allows things like multi-sets, so if this - * protocol is used only for MSET and similar commands this is a big win. 
*/ - if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') { - c->multibulk = atoi(((char*)c->argv[0]->ptr)+1); - if (c->multibulk <= 0) { - resetClient(c); - return 1; - } else { - decrRefCount(c->argv[c->argc-1]); - c->argc--; - return 1; - } - } else if (c->multibulk) { - if (c->bulklen == -1) { - if (((char*)c->argv[0]->ptr)[0] != '$') { - addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n")); - resetClient(c); - return 1; - } else { - int bulklen = atoi(((char*)c->argv[0]->ptr)+1); - decrRefCount(c->argv[0]); - if (bulklen < 0 || bulklen > 1024*1024*1024) { - c->argc--; - addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); - resetClient(c); - return 1; - } - c->argc--; - c->bulklen = bulklen+2; /* add two bytes for CR+LF */ - return 1; - } - } else { - c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1)); - c->mbargv[c->mbargc] = c->argv[0]; - c->mbargc++; - c->argc--; - c->multibulk--; - if (c->multibulk == 0) { - robj **auxargv; - int auxargc; - - /* Here we need to swap the multi-bulk argc/argv with the - * normal argc/argv of the client structure. */ - auxargv = c->argv; - c->argv = c->mbargv; - c->mbargv = auxargv; - - auxargc = c->argc; - c->argc = c->mbargc; - c->mbargc = auxargc; - - /* We need to set bulklen to something different than -1 - * in order for the code below to process the command without - * to try to read the last argument of a bulk command as - * a special argument. */ - c->bulklen = 0; - /* continue below and process the command */ - } else { - c->bulklen = -1; - return 1; - } - } - } - /* -- end of multi bulk commands processing -- */ - - /* The QUIT command is handled as a special case. Normal command - * procs are unable to close the client connection safely */ - if (!strcasecmp(c->argv[0]->ptr,"quit")) { - freeClient(c); - return 0; - } - - /* Now lookup the command and check ASAP about trivial error conditions - * such wrong arity, bad command name and so forth. 
*/ - cmd = lookupCommand(c->argv[0]->ptr); - if (!cmd) { - addReplySds(c, - sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n", - (char*)c->argv[0]->ptr)); - resetClient(c); - return 1; - } else if ((cmd->arity > 0 && cmd->arity != c->argc) || - (c->argc < -cmd->arity)) { - addReplySds(c, - sdscatprintf(sdsempty(), - "-ERR wrong number of arguments for '%s' command\r\n", - cmd->name)); - resetClient(c); - return 1; - } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) { - /* This is a bulk command, we have to read the last argument yet. */ - int bulklen = atoi(c->argv[c->argc-1]->ptr); - - decrRefCount(c->argv[c->argc-1]); - if (bulklen < 0 || bulklen > 1024*1024*1024) { - c->argc--; - addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); - resetClient(c); - return 1; - } - c->argc--; - c->bulklen = bulklen+2; /* add two bytes for CR+LF */ - /* It is possible that the bulk read is already in the - * buffer. Check this condition and handle it accordingly. - * This is just a fast path, alternative to call processInputBuffer(). - * It's a good idea since the code is small and this condition - * happens most of the times. */ - if ((signed)sdslen(c->querybuf) >= c->bulklen) { - c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); - c->argc++; - c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); - } else { - /* Otherwise return... there is to read the last argument - * from the socket. */ - return 1; - } - } - /* Let's try to encode the bulk object to save space. 
*/ - if (cmd->flags & REDIS_CMD_BULK) - c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]); - - /* Check if the user is authenticated */ - if (server.requirepass && !c->authenticated && cmd->proc != authCommand) { - addReplySds(c,sdsnew("-ERR operation not permitted\r\n")); - resetClient(c); - return 1; - } - - /* Handle the maxmemory directive */ - if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) && - zmalloc_used_memory() > server.maxmemory) - { - addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n")); - resetClient(c); - return 1; - } - - /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */ - if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0) - && - cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand && - cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) { - addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n")); - resetClient(c); - return 1; - } - - /* Exec the command */ - if (c->flags & REDIS_MULTI && - cmd->proc != execCommand && cmd->proc != discardCommand && - cmd->proc != multiCommand && cmd->proc != watchCommand) - { - queueMultiCommand(c,cmd); - addReply(c,shared.queued); - } else { - if (server.vm_enabled && server.vm_max_threads > 0 && - blockClientOnSwappedKeys(c,cmd)) return 1; - call(c,cmd); - } - - /* Prepare the client for the next command */ - resetClient(c); - return 1; -} - -static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { - listNode *ln; - listIter li; - int outc = 0, j; - robj **outv; - /* We need 1+(ARGS*3) objects since commands are using the new protocol - * and we one 1 object for the first "*\r\n" multibulk count, then - * for every additional object we have "$\r\n" + object + "\r\n". 
*/ - robj *static_outv[REDIS_STATIC_ARGS*3+1]; - robj *lenobj; - - if (argc <= REDIS_STATIC_ARGS) { - outv = static_outv; - } else { - outv = zmalloc(sizeof(robj*)*(argc*3+1)); - } - - lenobj = createObject(REDIS_STRING, - sdscatprintf(sdsempty(), "*%d\r\n", argc)); - lenobj->refcount = 0; - outv[outc++] = lenobj; - for (j = 0; j < argc; j++) { - lenobj = createObject(REDIS_STRING, - sdscatprintf(sdsempty(),"$%lu\r\n", - (unsigned long) stringObjectLen(argv[j]))); - lenobj->refcount = 0; - outv[outc++] = lenobj; - outv[outc++] = argv[j]; - outv[outc++] = shared.crlf; - } - - /* Increment all the refcounts at start and decrement at end in order to - * be sure to free objects if there is no slave in a replication state - * able to be feed with commands */ - for (j = 0; j < outc; j++) incrRefCount(outv[j]); - listRewind(slaves,&li); - while((ln = listNext(&li))) { - redisClient *slave = ln->value; - - /* Don't feed slaves that are still waiting for BGSAVE to start */ - if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue; - - /* Feed all the other slaves, MONITORs and so on */ - if (slave->slaveseldb != dictid) { - robj *selectcmd; - - switch(dictid) { - case 0: selectcmd = shared.select0; break; - case 1: selectcmd = shared.select1; break; - case 2: selectcmd = shared.select2; break; - case 3: selectcmd = shared.select3; break; - case 4: selectcmd = shared.select4; break; - case 5: selectcmd = shared.select5; break; - case 6: selectcmd = shared.select6; break; - case 7: selectcmd = shared.select7; break; - case 8: selectcmd = shared.select8; break; - case 9: selectcmd = shared.select9; break; - default: - selectcmd = createObject(REDIS_STRING, - sdscatprintf(sdsempty(),"select %d\r\n",dictid)); - selectcmd->refcount = 0; - break; - } - addReply(slave,selectcmd); - slave->slaveseldb = dictid; - } - for (j = 0; j < outc; j++) addReply(slave,outv[j]); - } - for (j = 0; j < outc; j++) decrRefCount(outv[j]); - if (outv != static_outv) zfree(outv); -} - -static 
sds sdscatrepr(sds s, char *p, size_t len) { - s = sdscatlen(s,"\"",1); - while(len--) { - switch(*p) { - case '\\': - case '"': - s = sdscatprintf(s,"\\%c",*p); - break; - case '\n': s = sdscatlen(s,"\\n",1); break; - case '\r': s = sdscatlen(s,"\\r",1); break; - case '\t': s = sdscatlen(s,"\\t",1); break; - case '\a': s = sdscatlen(s,"\\a",1); break; - case '\b': s = sdscatlen(s,"\\b",1); break; - default: - if (isprint(*p)) - s = sdscatprintf(s,"%c",*p); - else - s = sdscatprintf(s,"\\x%02x",(unsigned char)*p); - break; - } - p++; - } - return sdscatlen(s,"\"",1); -} - -static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) { - listNode *ln; - listIter li; - int j; - sds cmdrepr = sdsnew("+"); - robj *cmdobj; - struct timeval tv; - - gettimeofday(&tv,NULL); - cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec); - if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid); - - for (j = 0; j < argc; j++) { - if (argv[j]->encoding == REDIS_ENCODING_INT) { - cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr); - } else { - cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr, - sdslen(argv[j]->ptr)); - } - if (j != argc-1) - cmdrepr = sdscatlen(cmdrepr," ",1); - } - cmdrepr = sdscatlen(cmdrepr,"\r\n",2); - cmdobj = createObject(REDIS_STRING,cmdrepr); - - listRewind(monitors,&li); - while((ln = listNext(&li))) { - redisClient *monitor = ln->value; - addReply(monitor,cmdobj); - } - decrRefCount(cmdobj); -} - -static void processInputBuffer(redisClient *c) { -again: - /* Before to process the input buffer, make sure the client is not - * waitig for a blocking operation such as BLPOP. Note that the first - * iteration the client is never blocked, otherwise the processInputBuffer - * would not be called at all, but after the execution of the first commands - * in the input buffer the client may be blocked, and the "goto again" - * will try to reiterate. The following line will make it return asap. 
*/ - if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return; - if (c->bulklen == -1) { - /* Read the first line of the query */ - char *p = strchr(c->querybuf,'\n'); - size_t querylen; - - if (p) { - sds query, *argv; - int argc, j; - - query = c->querybuf; - c->querybuf = sdsempty(); - querylen = 1+(p-(query)); - if (sdslen(query) > querylen) { - /* leave data after the first line of the query in the buffer */ - c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen); - } - *p = '\0'; /* remove "\n" */ - if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */ - sdsupdatelen(query); - - /* Now we can split the query in arguments */ - argv = sdssplitlen(query,sdslen(query)," ",1,&argc); - sdsfree(query); - - if (c->argv) zfree(c->argv); - c->argv = zmalloc(sizeof(robj*)*argc); - - for (j = 0; j < argc; j++) { - if (sdslen(argv[j])) { - c->argv[c->argc] = createObject(REDIS_STRING,argv[j]); - c->argc++; - } else { - sdsfree(argv[j]); - } - } - zfree(argv); - if (c->argc) { - /* Execute the command. If the client is still valid - * after processCommand() return and there is something - * on the query buffer try to process the next command. */ - if (processCommand(c) && sdslen(c->querybuf)) goto again; - } else { - /* Nothing to process, argc == 0. Just process the query - * buffer if it's not empty or return to the caller */ - if (sdslen(c->querybuf)) goto again; - } - return; - } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) { - redisLog(REDIS_VERBOSE, "Client protocol error"); - freeClient(c); - return; - } - } else { - /* Bulk read handling. Note that if we are at this point - the client already sent a command terminated with a newline, - we are reading the bulk data that is actually the last - argument of the command. 
*/ - int qbl = sdslen(c->querybuf); - - if (c->bulklen <= qbl) { - /* Copy everything but the final CRLF as final argument */ - c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); - c->argc++; - c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); - /* Process the command. If the client is still valid after - * the processing and there is more data in the buffer - * try to parse it. */ - if (processCommand(c) && sdslen(c->querybuf)) goto again; - return; - } - } -} - -static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { - redisClient *c = (redisClient*) privdata; - char buf[REDIS_IOBUF_LEN]; - int nread; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - - nread = read(fd, buf, REDIS_IOBUF_LEN); - if (nread == -1) { - if (errno == EAGAIN) { - nread = 0; - } else { - redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno)); - freeClient(c); - return; - } - } else if (nread == 0) { - redisLog(REDIS_VERBOSE, "Client closed connection"); - freeClient(c); - return; - } - if (nread) { - c->querybuf = sdscatlen(c->querybuf, buf, nread); - c->lastinteraction = time(NULL); - } else { - return; - } - processInputBuffer(c); -} - -static int selectDb(redisClient *c, int id) { - if (id < 0 || id >= server.dbnum) - return REDIS_ERR; - c->db = &server.db[id]; - return REDIS_OK; -} - -static void *dupClientReplyValue(void *o) { - incrRefCount((robj*)o); - return o; -} - -static int listMatchObjects(void *a, void *b) { - return equalStringObjects(a,b); -} - -static redisClient *createClient(int fd) { - redisClient *c = zmalloc(sizeof(*c)); - - anetNonBlock(NULL,fd); - anetTcpNoDelay(NULL,fd); - if (!c) return NULL; - selectDb(c,0); - c->fd = fd; - c->querybuf = sdsempty(); - c->argc = 0; - c->argv = NULL; - c->bulklen = -1; - c->multibulk = 0; - c->mbargc = 0; - c->mbargv = NULL; - c->sentlen = 0; - c->flags = 0; - c->lastinteraction = time(NULL); - c->authenticated = 0; - c->replstate = REDIS_REPL_NONE; - c->reply = listCreate(); - 
listSetFreeMethod(c->reply,decrRefCount); - listSetDupMethod(c->reply,dupClientReplyValue); - c->blocking_keys = NULL; - c->blocking_keys_num = 0; - c->io_keys = listCreate(); - c->watched_keys = listCreate(); - listSetFreeMethod(c->io_keys,decrRefCount); - c->pubsub_channels = dictCreate(&setDictType,NULL); - c->pubsub_patterns = listCreate(); - listSetFreeMethod(c->pubsub_patterns,decrRefCount); - listSetMatchMethod(c->pubsub_patterns,listMatchObjects); - if (aeCreateFileEvent(server.el, c->fd, AE_READABLE, - readQueryFromClient, c) == AE_ERR) { - freeClient(c); - return NULL; - } - listAddNodeTail(server.clients,c); - initClientMultiState(c); - return c; -} - -static void addReply(redisClient *c, robj *obj) { - if (listLength(c->reply) == 0 && - (c->replstate == REDIS_REPL_NONE || - c->replstate == REDIS_REPL_ONLINE) && - aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, - sendReplyToClient, c) == AE_ERR) return; - - if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) { - obj = dupStringObject(obj); - obj->refcount = 0; /* getDecodedObject() will increment the refcount */ - } - listAddNodeTail(c->reply,getDecodedObject(obj)); -} - -static void addReplySds(redisClient *c, sds s) { - robj *o = createObject(REDIS_STRING,s); - addReply(c,o); - decrRefCount(o); -} - -static void addReplyDouble(redisClient *c, double d) { - char buf[128]; - - snprintf(buf,sizeof(buf),"%.17g",d); - addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n", - (unsigned long) strlen(buf),buf)); -} - -static void addReplyLongLong(redisClient *c, long long ll) { - char buf[128]; - size_t len; - - if (ll == 0) { - addReply(c,shared.czero); - return; - } else if (ll == 1) { - addReply(c,shared.cone); - return; - } - buf[0] = ':'; - len = ll2string(buf+1,sizeof(buf)-1,ll); - buf[len+1] = '\r'; - buf[len+2] = '\n'; - addReplySds(c,sdsnewlen(buf,len+3)); -} - -static void addReplyUlong(redisClient *c, unsigned long ul) { - char buf[128]; - size_t len; - - if (ul == 0) { - 
addReply(c,shared.czero); - return; - } else if (ul == 1) { - addReply(c,shared.cone); - return; - } - len = snprintf(buf,sizeof(buf),":%lu\r\n",ul); - addReplySds(c,sdsnewlen(buf,len)); -} - -static void addReplyBulkLen(redisClient *c, robj *obj) { - size_t len, intlen; - char buf[128]; - - if (obj->encoding == REDIS_ENCODING_RAW) { - len = sdslen(obj->ptr); - } else { - long n = (long)obj->ptr; - - /* Compute how many bytes will take this integer as a radix 10 string */ - len = 1; - if (n < 0) { - len++; - n = -n; - } - while((n = n/10) != 0) { - len++; - } - } - buf[0] = '$'; - intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len); - buf[intlen+1] = '\r'; - buf[intlen+2] = '\n'; - addReplySds(c,sdsnewlen(buf,intlen+3)); -} - -static void addReplyBulk(redisClient *c, robj *obj) { - addReplyBulkLen(c,obj); - addReply(c,obj); - addReply(c,shared.crlf); -} - -static void addReplyBulkSds(redisClient *c, sds s) { - robj *o = createStringObject(s, sdslen(s)); - addReplyBulk(c,o); - decrRefCount(o); -} - -/* In the CONFIG command we need to add vanilla C string as bulk replies */ -static void addReplyBulkCString(redisClient *c, char *s) { - if (s == NULL) { - addReply(c,shared.nullbulk); - } else { - robj *o = createStringObject(s,strlen(s)); - addReplyBulk(c,o); - decrRefCount(o); - } -} - -static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { - int cport, cfd; - char cip[128]; - redisClient *c; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - REDIS_NOTUSED(privdata); - - cfd = anetAccept(server.neterr, fd, cip, &cport); - if (cfd == AE_ERR) { - redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr); - return; - } - redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport); - if ((c = createClient(cfd)) == NULL) { - redisLog(REDIS_WARNING,"Error allocating resoures for the client"); - close(cfd); /* May be already closed, just ingore errors */ - return; - } - /* If maxclient directive is set and this is one client more... 
close the - * connection. Note that we create the client instead to check before - * for this condition, since now the socket is already set in nonblocking - * mode and we can send an error for free using the Kernel I/O */ - if (server.maxclients && listLength(server.clients) > server.maxclients) { - char *err = "-ERR max number of clients reached\r\n"; - - /* That's a best effort error message, don't check write errors */ - if (write(c->fd,err,strlen(err)) == -1) { - /* Nothing to do, Just to avoid the warning... */ - } - freeClient(c); - return; - } - server.stat_numconnections++; -} - -/* ======================= Redis objects implementation ===================== */ - -static robj *createObject(int type, void *ptr) { - robj *o; - - if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); - if (listLength(server.objfreelist)) { - listNode *head = listFirst(server.objfreelist); - o = listNodeValue(head); - listDelNode(server.objfreelist,head); - if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); - } else { - if (server.vm_enabled) - pthread_mutex_unlock(&server.obj_freelist_mutex); - o = zmalloc(sizeof(*o)); - } - o->type = type; - o->encoding = REDIS_ENCODING_RAW; - o->ptr = ptr; - o->refcount = 1; - if (server.vm_enabled) { - /* Note that this code may run in the context of an I/O thread - * and accessing server.lruclock in theory is an error - * (no locks). But in practice this is safe, and even if we read - * garbage Redis will not fail. 
*/ - o->lru = server.lruclock; - o->storage = REDIS_VM_MEMORY; - } - return o; -} - -static robj *createStringObject(char *ptr, size_t len) { - return createObject(REDIS_STRING,sdsnewlen(ptr,len)); -} - -static robj *createStringObjectFromLongLong(long long value) { - robj *o; - if (value >= 0 && value < REDIS_SHARED_INTEGERS) { - incrRefCount(shared.integers[value]); - o = shared.integers[value]; - } else { - if (value >= LONG_MIN && value <= LONG_MAX) { - o = createObject(REDIS_STRING, NULL); - o->encoding = REDIS_ENCODING_INT; - o->ptr = (void*)((long)value); - } else { - o = createObject(REDIS_STRING,sdsfromlonglong(value)); - } - } - return o; -} - -static robj *dupStringObject(robj *o) { - assert(o->encoding == REDIS_ENCODING_RAW); - return createStringObject(o->ptr,sdslen(o->ptr)); -} - -static robj *createListObject(void) { - list *l = listCreate(); - robj *o = createObject(REDIS_LIST,l); - listSetFreeMethod(l,decrRefCount); - o->encoding = REDIS_ENCODING_LINKEDLIST; - return o; -} - -static robj *createZiplistObject(void) { - unsigned char *zl = ziplistNew(); - robj *o = createObject(REDIS_LIST,zl); - o->encoding = REDIS_ENCODING_ZIPLIST; - return o; -} - -static robj *createSetObject(void) { - dict *d = dictCreate(&setDictType,NULL); - return createObject(REDIS_SET,d); -} - -static robj *createHashObject(void) { - /* All the Hashes start as zipmaps. Will be automatically converted - * into hash tables if there are enough elements or big elements - * inside. 
*/ - unsigned char *zm = zipmapNew(); - robj *o = createObject(REDIS_HASH,zm); - o->encoding = REDIS_ENCODING_ZIPMAP; - return o; -} - -static robj *createZsetObject(void) { - zset *zs = zmalloc(sizeof(*zs)); - - zs->dict = dictCreate(&zsetDictType,NULL); - zs->zsl = zslCreate(); - return createObject(REDIS_ZSET,zs); -} - -static void freeStringObject(robj *o) { - if (o->encoding == REDIS_ENCODING_RAW) { - sdsfree(o->ptr); - } -} - -static void freeListObject(robj *o) { - switch (o->encoding) { - case REDIS_ENCODING_LINKEDLIST: - listRelease((list*) o->ptr); - break; - case REDIS_ENCODING_ZIPLIST: - zfree(o->ptr); - break; - default: - redisPanic("Unknown list encoding type"); - } -} - -static void freeSetObject(robj *o) { - dictRelease((dict*) o->ptr); -} - -static void freeZsetObject(robj *o) { - zset *zs = o->ptr; - - dictRelease(zs->dict); - zslFree(zs->zsl); - zfree(zs); -} - -static void freeHashObject(robj *o) { - switch (o->encoding) { - case REDIS_ENCODING_HT: - dictRelease((dict*) o->ptr); - break; - case REDIS_ENCODING_ZIPMAP: - zfree(o->ptr); - break; - default: - redisPanic("Unknown hash encoding type"); - break; - } -} - -static void incrRefCount(robj *o) { - o->refcount++; -} - -static void decrRefCount(void *obj) { - robj *o = obj; - - /* Object is a swapped out value, or in the process of being loaded. */ - if (server.vm_enabled && - (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING)) - { - vmpointer *vp = obj; - if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o); - vmMarkPagesFree(vp->page,vp->usedpages); - server.vm_stats_swapped_objects--; - zfree(vp); - return; - } - - if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0"); - /* Object is in memory, or in the process of being swapped out. 
- * - * If the object is being swapped out, abort the operation on - * decrRefCount even if the refcount does not drop to 0: the object - * is referenced at least two times, as value of the key AND as - * job->val in the iojob. So if we don't invalidate the iojob, when it is - * done but the relevant key was removed in the meantime, the - * complete jobs handler will not find the key about the job and the - * assert will fail. */ - if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING) - vmCancelThreadedIOJob(o); - if (--(o->refcount) == 0) { - switch(o->type) { - case REDIS_STRING: freeStringObject(o); break; - case REDIS_LIST: freeListObject(o); break; - case REDIS_SET: freeSetObject(o); break; - case REDIS_ZSET: freeZsetObject(o); break; - case REDIS_HASH: freeHashObject(o); break; - default: redisPanic("Unknown object type"); break; - } - if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); - if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX || - !listAddNodeHead(server.objfreelist,o)) - zfree(o); - if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); - } -} - -static int checkType(redisClient *c, robj *o, int type) { - if (o->type != type) { - addReply(c,shared.wrongtypeerr); - return 1; - } - return 0; -} - -/* Check if the nul-terminated string 's' can be represented by a long - * (that is, is a number that fits into long without any other space or - * character before or after the digits). - * - * If so, the function returns REDIS_OK and *longval is set to the value - * of the number. 
Otherwise REDIS_ERR is returned */ -static int isStringRepresentableAsLong(sds s, long *longval) { - char buf[32], *endptr; - long value; - int slen; - - value = strtol(s, &endptr, 10); - if (endptr[0] != '\0') return REDIS_ERR; - slen = ll2string(buf,32,value); - - /* If the number converted back into a string is not identical - * then it's not possible to encode the string as integer */ - if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR; - if (longval) *longval = value; - return REDIS_OK; -} - -/* Try to encode a string object in order to save space */ -static robj *tryObjectEncoding(robj *o) { - long value; - sds s = o->ptr; - - if (o->encoding != REDIS_ENCODING_RAW) - return o; /* Already encoded */ - - /* It's not safe to encode shared objects: shared objects can be shared - * everywhere in the "object space" of Redis. Encoded objects can only - * appear as "values" (and not, for instance, as keys) */ - if (o->refcount > 1) return o; - - /* Currently we try to encode only strings */ - redisAssert(o->type == REDIS_STRING); - - /* Check if we can represent this string as a long integer */ - if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o; - - /* Ok, this object can be encoded */ - if (value >= 0 && value < REDIS_SHARED_INTEGERS) { - decrRefCount(o); - incrRefCount(shared.integers[value]); - return shared.integers[value]; - } else { - o->encoding = REDIS_ENCODING_INT; - sdsfree(o->ptr); - o->ptr = (void*) value; - return o; - } -} - -/* Get a decoded version of an encoded object (returned as a new object). - * If the object is already raw-encoded just increment the ref count. 
*/ -static robj *getDecodedObject(robj *o) { - robj *dec; - - if (o->encoding == REDIS_ENCODING_RAW) { - incrRefCount(o); - return o; - } - if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) { - char buf[32]; - - ll2string(buf,32,(long)o->ptr); - dec = createStringObject(buf,strlen(buf)); - return dec; - } else { - redisPanic("Unknown encoding type"); - } -} - -/* Compare two string objects via strcmp() or alike. - * Note that the objects may be integer-encoded. In such a case we - * use ll2string() to get a string representation of the numbers on the stack - * and compare the strings, it's much faster than calling getDecodedObject(). - * - * Important note: if objects are not integer encoded, but binary-safe strings, - * sdscmp() from sds.c will apply memcmp() so this function ca be considered - * binary safe. */ -static int compareStringObjects(robj *a, robj *b) { - redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING); - char bufa[128], bufb[128], *astr, *bstr; - int bothsds = 1; - - if (a == b) return 0; - if (a->encoding != REDIS_ENCODING_RAW) { - ll2string(bufa,sizeof(bufa),(long) a->ptr); - astr = bufa; - bothsds = 0; - } else { - astr = a->ptr; - } - if (b->encoding != REDIS_ENCODING_RAW) { - ll2string(bufb,sizeof(bufb),(long) b->ptr); - bstr = bufb; - bothsds = 0; - } else { - bstr = b->ptr; - } - return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr); -} - -/* Equal string objects return 1 if the two objects are the same from the - * point of view of a string comparison, otherwise 0 is returned. Note that - * this function is faster then checking for (compareStringObject(a,b) == 0) - * because it can perform some more optimization. 
*/ -static int equalStringObjects(robj *a, robj *b) { - if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){ - return a->ptr == b->ptr; - } else { - return compareStringObjects(a,b) == 0; - } -} - -static size_t stringObjectLen(robj *o) { - redisAssert(o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { - return sdslen(o->ptr); - } else { - char buf[32]; - - return ll2string(buf,32,(long)o->ptr); - } -} - -static int getDoubleFromObject(robj *o, double *target) { - double value; - char *eptr; - - if (o == NULL) { - value = 0; - } else { - redisAssert(o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { - value = strtod(o->ptr, &eptr); - if (eptr[0] != '\0') return REDIS_ERR; - } else if (o->encoding == REDIS_ENCODING_INT) { - value = (long)o->ptr; - } else { - redisPanic("Unknown string encoding"); - } - } - - *target = value; - return REDIS_OK; -} - -static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) { - double value; - if (getDoubleFromObject(o, &value) != REDIS_OK) { - if (msg != NULL) { - addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); - } else { - addReplySds(c, sdsnew("-ERR value is not a double\r\n")); - } - return REDIS_ERR; - } - - *target = value; - return REDIS_OK; -} - -static int getLongLongFromObject(robj *o, long long *target) { - long long value; - char *eptr; - - if (o == NULL) { - value = 0; - } else { - redisAssert(o->type == REDIS_STRING); - if (o->encoding == REDIS_ENCODING_RAW) { - value = strtoll(o->ptr, &eptr, 10); - if (eptr[0] != '\0') return REDIS_ERR; - } else if (o->encoding == REDIS_ENCODING_INT) { - value = (long)o->ptr; - } else { - redisPanic("Unknown string encoding"); - } - } - - *target = value; - return REDIS_OK; -} - -static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) { - long long value; - if (getLongLongFromObject(o, &value) != REDIS_OK) { - if (msg != NULL) { - 
addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); - } else { - addReplySds(c, sdsnew("-ERR value is not an integer\r\n")); - } - return REDIS_ERR; - } - - *target = value; - return REDIS_OK; -} - -static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) { - long long value; - - if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR; - if (value < LONG_MIN || value > LONG_MAX) { - if (msg != NULL) { - addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); - } else { - addReplySds(c, sdsnew("-ERR value is out of range\r\n")); - } - return REDIS_ERR; - } - - *target = value; - return REDIS_OK; -} - -/* =========================== Keyspace access API ========================== */ - -static robj *lookupKey(redisDb *db, robj *key) { - dictEntry *de = dictFind(db->dict,key->ptr); - if (de) { - robj *val = dictGetEntryVal(de); - - if (server.vm_enabled) { - if (val->storage == REDIS_VM_MEMORY || - val->storage == REDIS_VM_SWAPPING) - { - /* If we were swapping the object out, cancel the operation */ - if (val->storage == REDIS_VM_SWAPPING) - vmCancelThreadedIOJob(val); - /* Update the access time for the aging algorithm. */ - val->lru = server.lruclock; - } else { - int notify = (val->storage == REDIS_VM_LOADING); - - /* Our value was swapped on disk. Bring it at home. */ - redisAssert(val->type == REDIS_VMPOINTER); - val = vmLoadObject(val); - dictGetEntryVal(de) = val; - - /* Clients blocked by the VM subsystem may be waiting for - * this key... 
*/ - if (notify) handleClientsBlockedOnSwappedKey(db,key); - } - } - return val; - } else { - return NULL; - } -} - -static robj *lookupKeyRead(redisDb *db, robj *key) { - expireIfNeeded(db,key); - return lookupKey(db,key); -} - -static robj *lookupKeyWrite(redisDb *db, robj *key) { - deleteIfVolatile(db,key); - touchWatchedKey(db,key); - return lookupKey(db,key); -} - -static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) { - robj *o = lookupKeyRead(c->db, key); - if (!o) addReply(c,reply); - return o; -} - -static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) { - robj *o = lookupKeyWrite(c->db, key); - if (!o) addReply(c,reply); - return o; -} - -/* Add the key to the DB. If the key already exists REDIS_ERR is returned, - * otherwise REDIS_OK is returned, and the caller should increment the - * refcount of 'val'. */ -static int dbAdd(redisDb *db, robj *key, robj *val) { - /* Perform a lookup before adding the key, as we need to copy the - * key value. */ - if (dictFind(db->dict, key->ptr) != NULL) { - return REDIS_ERR; - } else { - sds copy = sdsdup(key->ptr); - dictAdd(db->dict, copy, val); - return REDIS_OK; - } -} - -/* If the key does not exist, this is just like dbAdd(). Otherwise - * the value associated to the key is replaced with the new one. - * - * On update (key already existed) 0 is returned. Otherwise 1. */ -static int dbReplace(redisDb *db, robj *key, robj *val) { - if (dictFind(db->dict,key->ptr) == NULL) { - sds copy = sdsdup(key->ptr); - dictAdd(db->dict, copy, val); - return 1; - } else { - dictReplace(db->dict, key->ptr, val); - return 0; - } -} - -static int dbExists(redisDb *db, robj *key) { - return dictFind(db->dict,key->ptr) != NULL; -} - -/* Return a random key, in form of a Redis object. - * If there are no keys, NULL is returned. - * - * The function makes sure to return keys not already expired. 
*/ -static robj *dbRandomKey(redisDb *db) { - struct dictEntry *de; - - while(1) { - sds key; - robj *keyobj; - - de = dictGetRandomKey(db->dict); - if (de == NULL) return NULL; - - key = dictGetEntryKey(de); - keyobj = createStringObject(key,sdslen(key)); - if (dictFind(db->expires,key)) { - if (expireIfNeeded(db,keyobj)) { - decrRefCount(keyobj); - continue; /* search for another key. This expired. */ - } - } - return keyobj; - } -} - -/* Delete a key, value, and associated expiration entry if any, from the DB */ -static int dbDelete(redisDb *db, robj *key) { - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. */ - if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr); - return dictDelete(db->dict,key->ptr) == DICT_OK; -} - -/*============================ RDB saving/loading =========================== */ - -static int rdbSaveType(FILE *fp, unsigned char type) { - if (fwrite(&type,1,1,fp) == 0) return -1; - return 0; -} - -static int rdbSaveTime(FILE *fp, time_t t) { - int32_t t32 = (int32_t) t; - if (fwrite(&t32,4,1,fp) == 0) return -1; - return 0; -} - -/* check rdbLoadLen() comments for more info */ -static int rdbSaveLen(FILE *fp, uint32_t len) { - unsigned char buf[2]; - - if (len < (1<<6)) { - /* Save a 6 bit len */ - buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6); - if (fwrite(buf,1,1,fp) == 0) return -1; - } else if (len < (1<<14)) { - /* Save a 14 bit len */ - buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6); - buf[1] = len&0xFF; - if (fwrite(buf,2,1,fp) == 0) return -1; - } else { - /* Save a 32 bit len */ - buf[0] = (REDIS_RDB_32BITLEN<<6); - if (fwrite(buf,1,1,fp) == 0) return -1; - len = htonl(len); - if (fwrite(&len,4,1,fp) == 0) return -1; - } - return 0; -} - -/* Encode 'value' as an integer if possible (if integer will fit the - * supported range). 
If the function sucessful encoded the integer - * then the (up to 5 bytes) encoded representation is written in the - * string pointed by 'enc' and the length is returned. Otherwise - * 0 is returned. */ -static int rdbEncodeInteger(long long value, unsigned char *enc) { - /* Finally check if it fits in our ranges */ - if (value >= -(1<<7) && value <= (1<<7)-1) { - enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8; - enc[1] = value&0xFF; - return 2; - } else if (value >= -(1<<15) && value <= (1<<15)-1) { - enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16; - enc[1] = value&0xFF; - enc[2] = (value>>8)&0xFF; - return 3; - } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) { - enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32; - enc[1] = value&0xFF; - enc[2] = (value>>8)&0xFF; - enc[3] = (value>>16)&0xFF; - enc[4] = (value>>24)&0xFF; - return 5; - } else { - return 0; - } -} - -/* String objects in the form "2391" "-100" without any space and with a - * range of values that can fit in an 8, 16 or 32 bit signed value can be - * encoded as integers to save space */ -static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) { - long long value; - char *endptr, buf[32]; - - /* Check if it's possible to encode this value as a number */ - value = strtoll(s, &endptr, 10); - if (endptr[0] != '\0') return 0; - ll2string(buf,32,value); - - /* If the number converted back into a string is not identical - * then it's not possible to encode the string as integer */ - if (strlen(buf) != len || memcmp(buf,s,len)) return 0; - - return rdbEncodeInteger(value,enc); -} - -static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) { - size_t comprlen, outlen; - unsigned char byte; - void *out; - - /* We require at least four bytes compression for this to be worth it */ - if (len <= 4) return 0; - outlen = len-4; - if ((out = zmalloc(outlen+1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { 
- zfree(out); - return 0; - } - /* Data compressed! Let's save it on disk */ - byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF; - if (fwrite(&byte,1,1,fp) == 0) goto writeerr; - if (rdbSaveLen(fp,comprlen) == -1) goto writeerr; - if (rdbSaveLen(fp,len) == -1) goto writeerr; - if (fwrite(out,comprlen,1,fp) == 0) goto writeerr; - zfree(out); - return comprlen; - -writeerr: - zfree(out); - return -1; -} - -/* Save a string objet as [len][data] on disk. If the object is a string - * representation of an integer value we try to safe it in a special form */ -static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) { - int enclen; - - /* Try integer encoding */ - if (len <= 11) { - unsigned char buf[5]; - if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) { - if (fwrite(buf,enclen,1,fp) == 0) return -1; - return 0; - } - } - - /* Try LZF compression - under 20 bytes it's unable to compress even - * aaaaaaaaaaaaaaaaaa so skip it */ - if (server.rdbcompression && len > 20) { - int retval; - - retval = rdbSaveLzfStringObject(fp,s,len); - if (retval == -1) return -1; - if (retval > 0) return 0; - /* retval == 0 means data can't be compressed, save the old way */ - } - - /* Store verbatim */ - if (rdbSaveLen(fp,len) == -1) return -1; - if (len && fwrite(s,len,1,fp) == 0) return -1; - return 0; -} - -/* Save a long long value as either an encoded string or a string. 
*/ -static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) { - unsigned char buf[32]; - int enclen = rdbEncodeInteger(value,buf); - if (enclen > 0) { - if (fwrite(buf,enclen,1,fp) == 0) return -1; - } else { - /* Encode as string */ - enclen = ll2string((char*)buf,32,value); - redisAssert(enclen < 32); - if (rdbSaveLen(fp,enclen) == -1) return -1; - if (fwrite(buf,enclen,1,fp) == 0) return -1; - } - return 0; -} - -/* Like rdbSaveStringObjectRaw() but handle encoded objects */ -static int rdbSaveStringObject(FILE *fp, robj *obj) { - /* Avoid to decode the object, then encode it again, if the - * object is alrady integer encoded. */ - if (obj->encoding == REDIS_ENCODING_INT) { - return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr); - } else { - redisAssert(obj->encoding == REDIS_ENCODING_RAW); - return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr)); - } -} - -/* Save a double value. Doubles are saved as strings prefixed by an unsigned - * 8 bit integer specifing the length of the representation. - * This 8 bit integer has special values in order to specify the following - * conditions: - * 253: not a number - * 254: + inf - * 255: - inf - */ -static int rdbSaveDoubleValue(FILE *fp, double val) { - unsigned char buf[128]; - int len; - - if (isnan(val)) { - buf[0] = 253; - len = 1; - } else if (!isfinite(val)) { - len = 1; - buf[0] = (val < 0) ? 255 : 254; - } else { -#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL) - /* Check if the float is in a safe range to be casted into a - * long long. We are assuming that long long is 64 bit here. - * Also we are assuming that there are no implementations around where - * double has precision < 52 bit. - * - * Under this assumptions we test if a double is inside an interval - * where casting to long long is safe. Then using two castings we - * make sure the decimal part is zero. If all this is true we use - * integer printing function that is much faster. 
*/ - double min = -4503599627370495; /* (2^52)-1 */ - double max = 4503599627370496; /* -(2^52) */ - if (val > min && val < max && val == ((double)((long long)val))) - ll2string((char*)buf+1,sizeof(buf),(long long)val); - else -#endif - snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val); - buf[0] = strlen((char*)buf+1); - len = buf[0]+1; - } - if (fwrite(buf,len,1,fp) == 0) return -1; - return 0; -} - -/* Save a Redis object. */ -static int rdbSaveObject(FILE *fp, robj *o) { - if (o->type == REDIS_STRING) { - /* Save a string value */ - if (rdbSaveStringObject(fp,o) == -1) return -1; - } else if (o->type == REDIS_LIST) { - /* Save a list value */ - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p; - unsigned char *vstr; - unsigned int vlen; - long long vlong; - - if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1; - p = ziplistIndex(o->ptr,0); - while(ziplistGet(p,&vstr,&vlen,&vlong)) { - if (vstr) { - if (rdbSaveRawString(fp,vstr,vlen) == -1) - return -1; - } else { - if (rdbSaveLongLongAsStringObject(fp,vlong) == -1) - return -1; - } - p = ziplistNext(o->ptr,p); - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = o->ptr; - listIter li; - listNode *ln; - - if (rdbSaveLen(fp,listLength(list)) == -1) return -1; - listRewind(list,&li); - while((ln = listNext(&li))) { - robj *eleobj = listNodeValue(ln); - if (rdbSaveStringObject(fp,eleobj) == -1) return -1; - } - } else { - redisPanic("Unknown list encoding"); - } - } else if (o->type == REDIS_SET) { - /* Save a set value */ - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; - - if (rdbSaveLen(fp,dictSize(set)) == -1) return -1; - while((de = dictNext(di)) != NULL) { - robj *eleobj = dictGetEntryKey(de); - - if (rdbSaveStringObject(fp,eleobj) == -1) return -1; - } - dictReleaseIterator(di); - } else if (o->type == REDIS_ZSET) { - /* Save a set value */ - zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - if 
(rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1; - while((de = dictNext(di)) != NULL) { - robj *eleobj = dictGetEntryKey(de); - double *score = dictGetEntryVal(de); - - if (rdbSaveStringObject(fp,eleobj) == -1) return -1; - if (rdbSaveDoubleValue(fp,*score) == -1) return -1; - } - dictReleaseIterator(di); - } else if (o->type == REDIS_HASH) { - /* Save a hash value */ - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - unsigned char *p = zipmapRewind(o->ptr); - unsigned int count = zipmapLen(o->ptr); - unsigned char *key, *val; - unsigned int klen, vlen; - - if (rdbSaveLen(fp,count) == -1) return -1; - while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) { - if (rdbSaveRawString(fp,key,klen) == -1) return -1; - if (rdbSaveRawString(fp,val,vlen) == -1) return -1; - } - } else { - dictIterator *di = dictGetIterator(o->ptr); - dictEntry *de; - - if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1; - while((de = dictNext(di)) != NULL) { - robj *key = dictGetEntryKey(de); - robj *val = dictGetEntryVal(de); - - if (rdbSaveStringObject(fp,key) == -1) return -1; - if (rdbSaveStringObject(fp,val) == -1) return -1; - } - dictReleaseIterator(di); - } - } else { - redisPanic("Unknown object type"); - } - return 0; -} - -/* Return the length the object will have on disk if saved with - * the rdbSaveObject() function. Currently we use a trick to get - * this length with very little changes to the code. In the future - * we could switch to a faster solution. */ -static off_t rdbSavedObjectLen(robj *o, FILE *fp) { - if (fp == NULL) fp = server.devnull; - rewind(fp); - assert(rdbSaveObject(fp,o) != 1); - return ftello(fp); -} - -/* Return the number of pages required to save this object in the swap file */ -static off_t rdbSavedObjectPages(robj *o, FILE *fp) { - off_t bytes = rdbSavedObjectLen(o,fp); - - return (bytes+(server.vm_page_size-1))/server.vm_page_size; -} - -/* Save the DB on disk. 
Return REDIS_ERR on error, REDIS_OK on success */ -static int rdbSave(char *filename) { - dictIterator *di = NULL; - dictEntry *de; - FILE *fp; - char tmpfile[256]; - int j; - time_t now = time(NULL); - - /* Wait for I/O therads to terminate, just in case this is a - * foreground-saving, to avoid seeking the swap file descriptor at the - * same time. */ - if (server.vm_enabled) - waitEmptyIOJobsQueue(); - - snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); - fp = fopen(tmpfile,"w"); - if (!fp) { - redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno)); - return REDIS_ERR; - } - if (fwrite("REDIS0001",9,1,fp) == 0) goto werr; - for (j = 0; j < server.dbnum; j++) { - redisDb *db = server.db+j; - dict *d = db->dict; - if (dictSize(d) == 0) continue; - di = dictGetIterator(d); - if (!di) { - fclose(fp); - return REDIS_ERR; - } - - /* Write the SELECT DB opcode */ - if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr; - if (rdbSaveLen(fp,j) == -1) goto werr; - - /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { - sds keystr = dictGetEntryKey(de); - robj key, *o = dictGetEntryVal(de); - time_t expiretime; - - initStaticStringObject(key,keystr); - expiretime = getExpire(db,&key); - - /* Save the expire time */ - if (expiretime != -1) { - /* If this key is already expired skip it */ - if (expiretime < now) continue; - if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr; - if (rdbSaveTime(fp,expiretime) == -1) goto werr; - } - /* Save the key and associated value. This requires special - * handling if the value is swapped out. 
*/ - if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || - o->storage == REDIS_VM_SWAPPING) { - /* Save type, key, value */ - if (rdbSaveType(fp,o->type) == -1) goto werr; - if (rdbSaveStringObject(fp,&key) == -1) goto werr; - if (rdbSaveObject(fp,o) == -1) goto werr; - } else { - /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */ - robj *po; - /* Get a preview of the object in memory */ - po = vmPreviewObject(o); - /* Save type, key, value */ - if (rdbSaveType(fp,po->type) == -1) goto werr; - if (rdbSaveStringObject(fp,&key) == -1) goto werr; - if (rdbSaveObject(fp,po) == -1) goto werr; - /* Remove the loaded object from memory */ - decrRefCount(po); - } - } - dictReleaseIterator(di); - } - /* EOF opcode */ - if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr; - - /* Make sure data will not remain on the OS's output buffers */ - fflush(fp); - fsync(fileno(fp)); - fclose(fp); - - /* Use RENAME to make sure the DB file is changed atomically only - * if the generate DB file is ok. */ - if (rename(tmpfile,filename) == -1) { - redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno)); - unlink(tmpfile); - return REDIS_ERR; - } - redisLog(REDIS_NOTICE,"DB saved on disk"); - server.dirty = 0; - server.lastsave = time(NULL); - return REDIS_OK; - -werr: - fclose(fp); - unlink(tmpfile); - redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno)); - if (di) dictReleaseIterator(di); - return REDIS_ERR; -} - -static int rdbSaveBackground(char *filename) { - pid_t childpid; - - if (server.bgsavechildpid != -1) return REDIS_ERR; - if (server.vm_enabled) waitEmptyIOJobsQueue(); - if ((childpid = fork()) == 0) { - /* Child */ - if (server.vm_enabled) vmReopenSwapFile(); - close(server.fd); - if (rdbSave(filename) == REDIS_OK) { - _exit(0); - } else { - _exit(1); - } - } else { - /* Parent */ - if (childpid == -1) { - redisLog(REDIS_WARNING,"Can't save in background: fork: %s", - strerror(errno)); - return REDIS_ERR; - } - 
redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid); - server.bgsavechildpid = childpid; - updateDictResizePolicy(); - return REDIS_OK; - } - return REDIS_OK; /* unreached */ -} - -static void rdbRemoveTempFile(pid_t childpid) { - char tmpfile[256]; - - snprintf(tmpfile,256,"temp-%d.rdb", (int) childpid); - unlink(tmpfile); -} - -static int rdbLoadType(FILE *fp) { - unsigned char type; - if (fread(&type,1,1,fp) == 0) return -1; - return type; -} - -static time_t rdbLoadTime(FILE *fp) { - int32_t t32; - if (fread(&t32,4,1,fp) == 0) return -1; - return (time_t) t32; -} - -/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top - * of this file for a description of how this are stored on disk. - * - * isencoded is set to 1 if the readed length is not actually a length but - * an "encoding type", check the above comments for more info */ -static uint32_t rdbLoadLen(FILE *fp, int *isencoded) { - unsigned char buf[2]; - uint32_t len; - int type; - - if (isencoded) *isencoded = 0; - if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR; - type = (buf[0]&0xC0)>>6; - if (type == REDIS_RDB_6BITLEN) { - /* Read a 6 bit len */ - return buf[0]&0x3F; - } else if (type == REDIS_RDB_ENCVAL) { - /* Read a 6 bit len encoding type */ - if (isencoded) *isencoded = 1; - return buf[0]&0x3F; - } else if (type == REDIS_RDB_14BITLEN) { - /* Read a 14 bit len */ - if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR; - return ((buf[0]&0x3F)<<8)|buf[1]; - } else { - /* Read a 32 bit len */ - if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR; - return ntohl(len); - } -} - -/* Load an integer-encoded object from file 'fp', with the specified - * encoding type 'enctype'. If encode is true the function may return - * an integer-encoded object as reply, otherwise the returned object - * will always be encoded as a raw string. 
*/ -static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) { - unsigned char enc[4]; - long long val; - - if (enctype == REDIS_RDB_ENC_INT8) { - if (fread(enc,1,1,fp) == 0) return NULL; - val = (signed char)enc[0]; - } else if (enctype == REDIS_RDB_ENC_INT16) { - uint16_t v; - if (fread(enc,2,1,fp) == 0) return NULL; - v = enc[0]|(enc[1]<<8); - val = (int16_t)v; - } else if (enctype == REDIS_RDB_ENC_INT32) { - uint32_t v; - if (fread(enc,4,1,fp) == 0) return NULL; - v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); - val = (int32_t)v; - } else { - val = 0; /* anti-warning */ - redisPanic("Unknown RDB integer encoding type"); - } - if (encode) - return createStringObjectFromLongLong(val); - else - return createObject(REDIS_STRING,sdsfromlonglong(val)); -} - -static robj *rdbLoadLzfStringObject(FILE*fp) { - unsigned int len, clen; - unsigned char *c = NULL; - sds val = NULL; - - if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - if ((c = zmalloc(clen)) == NULL) goto err; - if ((val = sdsnewlen(NULL,len)) == NULL) goto err; - if (fread(c,clen,1,fp) == 0) goto err; - if (lzf_decompress(c,clen,val,len) == 0) goto err; - zfree(c); - return createObject(REDIS_STRING,val); -err: - zfree(c); - sdsfree(val); - return NULL; -} - -static robj *rdbGenericLoadStringObject(FILE*fp, int encode) { - int isencoded; - uint32_t len; - sds val; - - len = rdbLoadLen(fp,&isencoded); - if (isencoded) { - switch(len) { - case REDIS_RDB_ENC_INT8: - case REDIS_RDB_ENC_INT16: - case REDIS_RDB_ENC_INT32: - return rdbLoadIntegerObject(fp,len,encode); - case REDIS_RDB_ENC_LZF: - return rdbLoadLzfStringObject(fp); - default: - redisPanic("Unknown RDB encoding type"); - } - } - - if (len == REDIS_RDB_LENERR) return NULL; - val = sdsnewlen(NULL,len); - if (len && fread(val,len,1,fp) == 0) { - sdsfree(val); - return NULL; - } - return createObject(REDIS_STRING,val); -} - -static robj 
*rdbLoadStringObject(FILE *fp) { - return rdbGenericLoadStringObject(fp,0); -} - -static robj *rdbLoadEncodedStringObject(FILE *fp) { - return rdbGenericLoadStringObject(fp,1); -} - -/* For information about double serialization check rdbSaveDoubleValue() */ -static int rdbLoadDoubleValue(FILE *fp, double *val) { - char buf[128]; - unsigned char len; - - if (fread(&len,1,1,fp) == 0) return -1; - switch(len) { - case 255: *val = R_NegInf; return 0; - case 254: *val = R_PosInf; return 0; - case 253: *val = R_Nan; return 0; - default: - if (fread(buf,len,1,fp) == 0) return -1; - buf[len] = '\0'; - sscanf(buf, "%lg", val); - return 0; - } -} - -/* Load a Redis object of the specified type from the specified file. - * On success a newly allocated object is returned, otherwise NULL. */ -static robj *rdbLoadObject(int type, FILE *fp) { - robj *o, *ele, *dec; - size_t len; - - redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp)); - if (type == REDIS_STRING) { - /* Read string value */ - if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - o = tryObjectEncoding(o); - } else if (type == REDIS_LIST) { - /* Read list value */ - if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - - /* Use a real list when there are too many entries */ - if (len > server.list_max_ziplist_entries) { - o = createListObject(); - } else { - o = createZiplistObject(); - } - - /* Load every single element of the list */ - while(len--) { - if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - - /* If we are using a ziplist and the value is too big, convert - * the object to a real list. 
*/ - if (o->encoding == REDIS_ENCODING_ZIPLIST && - ele->encoding == REDIS_ENCODING_RAW && - sdslen(ele->ptr) > server.list_max_ziplist_value) - listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); - - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - dec = getDecodedObject(ele); - o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL); - decrRefCount(dec); - decrRefCount(ele); - } else { - ele = tryObjectEncoding(ele); - listAddNodeTail(o->ptr,ele); - } - } - } else if (type == REDIS_SET) { - /* Read list/set value */ - if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - o = createSetObject(); - /* It's faster to expand the dict to the right size asap in order - * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE) - dictExpand(o->ptr,len); - /* Load every single element of the list/set */ - while(len--) { - if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - ele = tryObjectEncoding(ele); - dictAdd((dict*)o->ptr,ele,NULL); - } - } else if (type == REDIS_ZSET) { - /* Read list/set value */ - size_t zsetlen; - zset *zs; - - if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - o = createZsetObject(); - zs = o->ptr; - /* Load every single element of the list/set */ - while(zsetlen--) { - robj *ele; - double *score = zmalloc(sizeof(double)); - - if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - ele = tryObjectEncoding(ele); - if (rdbLoadDoubleValue(fp,score) == -1) return NULL; - dictAdd(zs->dict,ele,score); - zslInsert(zs->zsl,*score,ele); - incrRefCount(ele); /* added to skiplist */ - } - } else if (type == REDIS_HASH) { - size_t hashlen; - - if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; - o = createHashObject(); - /* Too many entries? Use an hash table. */ - if (hashlen > server.hash_max_zipmap_entries) - convertToRealHash(o); - /* Load every key/value, then set it into the zipmap or hash - * table, as needed. 
*/ - while(hashlen--) { - robj *key, *val; - - if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; - /* If we are using a zipmap and there are too big values - * the object is converted to real hash table encoding. */ - if (o->encoding != REDIS_ENCODING_HT && - ((key->encoding == REDIS_ENCODING_RAW && - sdslen(key->ptr) > server.hash_max_zipmap_value) || - (val->encoding == REDIS_ENCODING_RAW && - sdslen(val->ptr) > server.hash_max_zipmap_value))) - { - convertToRealHash(o); - } - - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - unsigned char *zm = o->ptr; - robj *deckey, *decval; - - /* We need raw string objects to add them to the zipmap */ - deckey = getDecodedObject(key); - decval = getDecodedObject(val); - zm = zipmapSet(zm,deckey->ptr,sdslen(deckey->ptr), - decval->ptr,sdslen(decval->ptr),NULL); - o->ptr = zm; - decrRefCount(deckey); - decrRefCount(decval); - decrRefCount(key); - decrRefCount(val); - } else { - key = tryObjectEncoding(key); - val = tryObjectEncoding(val); - dictAdd((dict*)o->ptr,key,val); - } - } - } else { - redisPanic("Unknown object type"); - } - return o; -} - -static int rdbLoad(char *filename) { - FILE *fp; - uint32_t dbid; - int type, retval, rdbver; - int swap_all_values = 0; - redisDb *db = server.db+0; - char buf[1024]; - time_t expiretime, now = time(NULL); - - fp = fopen(filename,"r"); - if (!fp) return REDIS_ERR; - if (fread(buf,9,1,fp) == 0) goto eoferr; - buf[9] = '\0'; - if (memcmp(buf,"REDIS",5) != 0) { - fclose(fp); - redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file"); - return REDIS_ERR; - } - rdbver = atoi(buf+5); - if (rdbver != 1) { - fclose(fp); - redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver); - return REDIS_ERR; - } - while(1) { - robj *key, *val; - int force_swapout; - - expiretime = -1; - /* Read type. 
*/ - if ((type = rdbLoadType(fp)) == -1) goto eoferr; - if (type == REDIS_EXPIRETIME) { - if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr; - /* We read the time so we need to read the object type again */ - if ((type = rdbLoadType(fp)) == -1) goto eoferr; - } - if (type == REDIS_EOF) break; - /* Handle SELECT DB opcode as a special case */ - if (type == REDIS_SELECTDB) { - if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) - goto eoferr; - if (dbid >= (unsigned)server.dbnum) { - redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum); - exit(1); - } - db = server.db+dbid; - continue; - } - /* Read key */ - if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr; - /* Read value */ - if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr; - /* Check if the key already expired */ - if (expiretime != -1 && expiretime < now) { - decrRefCount(key); - decrRefCount(val); - continue; - } - /* Add the new object in the hash table */ - retval = dbAdd(db,key,val); - if (retval == REDIS_ERR) { - redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr); - exit(1); - } - /* Set the expire time if needed */ - if (expiretime != -1) setExpire(db,key,expiretime); - - /* Handle swapping while loading big datasets when VM is on */ - - /* If we detecter we are hopeless about fitting something in memory - * we just swap every new key on disk. Directly... - * Note that's important to check for this condition before resorting - * to random sampling, otherwise we may try to swap already - * swapped keys. 
*/ - if (swap_all_values) { - dictEntry *de = dictFind(db->dict,key->ptr); - - /* de may be NULL since the key already expired */ - if (de) { - vmpointer *vp; - val = dictGetEntryVal(de); - - if (val->refcount == 1 && - (vp = vmSwapObjectBlocking(val)) != NULL) - dictGetEntryVal(de) = vp; - } - decrRefCount(key); - continue; - } - decrRefCount(key); - - /* Flush data on disk once 32 MB of additional RAM are used... */ - force_swapout = 0; - if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) - force_swapout = 1; - - /* If we have still some hope of having some value fitting memory - * then we try random sampling. */ - if (!swap_all_values && server.vm_enabled && force_swapout) { - while (zmalloc_used_memory() > server.vm_max_memory) { - if (vmSwapOneObjectBlocking() == REDIS_ERR) break; - } - if (zmalloc_used_memory() > server.vm_max_memory) - swap_all_values = 1; /* We are already using too much mem */ - } - } - fclose(fp); - return REDIS_OK; - -eoferr: /* unexpected end of file is handled here with a fatal exit */ - redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now."); - exit(1); - return REDIS_ERR; /* Just to avoid warning */ -} - -/*================================== Shutdown =============================== */ -static int prepareForShutdown() { - redisLog(REDIS_WARNING,"User requested shutdown, saving DB..."); - /* Kill the saving child if there is a background saving in progress. - We want to avoid race conditions, for instance our saving child may - overwrite the synchronous saving did by SHUTDOWN. */ - if (server.bgsavechildpid != -1) { - redisLog(REDIS_WARNING,"There is a live saving child. Killing it!"); - kill(server.bgsavechildpid,SIGKILL); - rdbRemoveTempFile(server.bgsavechildpid); - } - if (server.appendonly) { - /* Append only file: fsync() the AOF and exit */ - aof_fsync(server.appendfd); - if (server.vm_enabled) unlink(server.vm_swap_file); - } else { - /* Snapshotting. 
Perform a SYNC SAVE and exit */ - if (rdbSave(server.dbfilename) == REDIS_OK) { - if (server.daemonize) - unlink(server.pidfile); - redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory()); - } else { - /* Ooops.. error saving! The best we can do is to continue - * operating. Note that if there was a background saving process, - * in the next cron() Redis will be notified that the background - * saving aborted, handling special stuff like slaves pending for - * synchronization... */ - redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit"); - return REDIS_ERR; - } - } - redisLog(REDIS_WARNING,"Server exit now, bye bye..."); - return REDIS_OK; -} - -/*================================== Commands =============================== */ - -static void authCommand(redisClient *c) { - if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) { - c->authenticated = 1; - addReply(c,shared.ok); - } else { - c->authenticated = 0; - addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n")); - } -} - -static void pingCommand(redisClient *c) { - addReply(c,shared.pong); -} - -static void echoCommand(redisClient *c) { - addReplyBulk(c,c->argv[1]); -} - -/*=================================== Strings =============================== */ - -static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) { - int retval; - long seconds = 0; /* initialized to avoid an harmness warning */ - - if (expire) { - if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK) - return; - if (seconds <= 0) { - addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n")); - return; - } - } - - touchWatchedKey(c->db,key); - if (nx) deleteIfVolatile(c->db,key); - retval = dbAdd(c->db,key,val); - if (retval == REDIS_ERR) { - if (!nx) { - dbReplace(c->db,key,val); - incrRefCount(val); - } else { - addReply(c,shared.czero); - return; - } - } else { - incrRefCount(val); - } - server.dirty++; - removeExpire(c->db,key); - if 
(expire) setExpire(c->db,key,time(NULL)+seconds); - addReply(c, nx ? shared.cone : shared.ok); -} - -static void setCommand(redisClient *c) { - setGenericCommand(c,0,c->argv[1],c->argv[2],NULL); -} - -static void setnxCommand(redisClient *c) { - setGenericCommand(c,1,c->argv[1],c->argv[2],NULL); -} - -static void setexCommand(redisClient *c) { - setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]); -} - -static int getGenericCommand(redisClient *c) { - robj *o; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL) - return REDIS_OK; - - if (o->type != REDIS_STRING) { - addReply(c,shared.wrongtypeerr); - return REDIS_ERR; - } else { - addReplyBulk(c,o); - return REDIS_OK; - } -} - -static void getCommand(redisClient *c) { - getGenericCommand(c); -} - -static void getsetCommand(redisClient *c) { - if (getGenericCommand(c) == REDIS_ERR) return; - dbReplace(c->db,c->argv[1],c->argv[2]); - incrRefCount(c->argv[2]); - server.dirty++; - removeExpire(c->db,c->argv[1]); -} - -static void mgetCommand(redisClient *c) { - int j; - - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1)); - for (j = 1; j < c->argc; j++) { - robj *o = lookupKeyRead(c->db,c->argv[j]); - if (o == NULL) { - addReply(c,shared.nullbulk); - } else { - if (o->type != REDIS_STRING) { - addReply(c,shared.nullbulk); - } else { - addReplyBulk(c,o); - } - } - } -} - -static void msetGenericCommand(redisClient *c, int nx) { - int j, busykeys = 0; - - if ((c->argc % 2) == 0) { - addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n")); - return; - } - /* Handle the NX flag. The MSETNX semantic is to return zero and don't - * set nothing at all if at least one already key exists. 
*/ - if (nx) { - for (j = 1; j < c->argc; j += 2) { - if (lookupKeyWrite(c->db,c->argv[j]) != NULL) { - busykeys++; - } - } - } - if (busykeys) { - addReply(c, shared.czero); - return; - } - - for (j = 1; j < c->argc; j += 2) { - c->argv[j+1] = tryObjectEncoding(c->argv[j+1]); - dbReplace(c->db,c->argv[j],c->argv[j+1]); - incrRefCount(c->argv[j+1]); - removeExpire(c->db,c->argv[j]); - } - server.dirty += (c->argc-1)/2; - addReply(c, nx ? shared.cone : shared.ok); -} - -static void msetCommand(redisClient *c) { - msetGenericCommand(c,0); -} - -static void msetnxCommand(redisClient *c) { - msetGenericCommand(c,1); -} - -static void incrDecrCommand(redisClient *c, long long incr) { - long long value; - robj *o; - - o = lookupKeyWrite(c->db,c->argv[1]); - if (o != NULL && checkType(c,o,REDIS_STRING)) return; - if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return; - - value += incr; - o = createStringObjectFromLongLong(value); - dbReplace(c->db,c->argv[1],o); - server.dirty++; - addReply(c,shared.colon); - addReply(c,o); - addReply(c,shared.crlf); -} - -static void incrCommand(redisClient *c) { - incrDecrCommand(c,1); -} - -static void decrCommand(redisClient *c) { - incrDecrCommand(c,-1); -} - -static void incrbyCommand(redisClient *c) { - long long incr; - - if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; - incrDecrCommand(c,incr); -} - -static void decrbyCommand(redisClient *c) { - long long incr; - - if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; - incrDecrCommand(c,-incr); -} - -static void appendCommand(redisClient *c) { - int retval; - size_t totlen; - robj *o; - - o = lookupKeyWrite(c->db,c->argv[1]); - if (o == NULL) { - /* Create the key */ - retval = dbAdd(c->db,c->argv[1],c->argv[2]); - incrRefCount(c->argv[2]); - totlen = stringObjectLen(c->argv[2]); - } else { - if (o->type != REDIS_STRING) { - addReply(c,shared.wrongtypeerr); - return; - } - /* If the object is specially 
encoded or shared we have to make - * a copy */ - if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { - robj *decoded = getDecodedObject(o); - - o = createStringObject(decoded->ptr, sdslen(decoded->ptr)); - decrRefCount(decoded); - dbReplace(c->db,c->argv[1],o); - } - /* APPEND! */ - if (c->argv[2]->encoding == REDIS_ENCODING_RAW) { - o->ptr = sdscatlen(o->ptr, - c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); - } else { - o->ptr = sdscatprintf(o->ptr, "%ld", - (unsigned long) c->argv[2]->ptr); - } - totlen = sdslen(o->ptr); - } - server.dirty++; - addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen)); -} - -static void substrCommand(redisClient *c) { - robj *o; - long start = atoi(c->argv[2]->ptr); - long end = atoi(c->argv[3]->ptr); - size_t rangelen, strlen; - sds range; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,o,REDIS_STRING)) return; - - o = getDecodedObject(o); - strlen = sdslen(o->ptr); - - /* convert negative indexes */ - if (start < 0) start = strlen+start; - if (end < 0) end = strlen+end; - if (start < 0) start = 0; - if (end < 0) end = 0; - - /* indexes sanity checks */ - if (start > end || (size_t)start >= strlen) { - /* Out of range start or start > end result in null reply */ - addReply(c,shared.nullbulk); - decrRefCount(o); - return; - } - if ((size_t)end >= strlen) end = strlen-1; - rangelen = (end-start)+1; - - /* Return the result */ - addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen)); - range = sdsnewlen((char*)o->ptr+start,rangelen); - addReplySds(c,range); - addReply(c,shared.crlf); - decrRefCount(o); -} - -/* ========================= Type agnostic commands ========================= */ - -static void delCommand(redisClient *c) { - int deleted = 0, j; - - for (j = 1; j < c->argc; j++) { - if (dbDelete(c->db,c->argv[j])) { - touchWatchedKey(c->db,c->argv[j]); - server.dirty++; - deleted++; - } - } - addReplyLongLong(c,deleted); -} - -static void 
existsCommand(redisClient *c) { - expireIfNeeded(c->db,c->argv[1]); - if (dbExists(c->db,c->argv[1])) { - addReply(c, shared.cone); - } else { - addReply(c, shared.czero); - } -} - -static void selectCommand(redisClient *c) { - int id = atoi(c->argv[1]->ptr); - - if (selectDb(c,id) == REDIS_ERR) { - addReplySds(c,sdsnew("-ERR invalid DB index\r\n")); - } else { - addReply(c,shared.ok); - } -} - -static void randomkeyCommand(redisClient *c) { - robj *key; - - if ((key = dbRandomKey(c->db)) == NULL) { - addReply(c,shared.nullbulk); - return; - } - - addReplyBulk(c,key); - decrRefCount(key); -} - -static void keysCommand(redisClient *c) { - dictIterator *di; - dictEntry *de; - sds pattern = c->argv[1]->ptr; - int plen = sdslen(pattern); - unsigned long numkeys = 0; - robj *lenobj = createObject(REDIS_STRING,NULL); - - di = dictGetIterator(c->db->dict); - addReply(c,lenobj); - decrRefCount(lenobj); - while((de = dictNext(di)) != NULL) { - sds key = dictGetEntryKey(de); - robj *keyobj; - - if ((pattern[0] == '*' && pattern[1] == '\0') || - stringmatchlen(pattern,plen,key,sdslen(key),0)) { - keyobj = createStringObject(key,sdslen(key)); - if (expireIfNeeded(c->db,keyobj) == 0) { - addReplyBulk(c,keyobj); - numkeys++; - } - decrRefCount(keyobj); - } - } - dictReleaseIterator(di); - lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys); -} - -static void dbsizeCommand(redisClient *c) { - addReplySds(c, - sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict))); -} - -static void lastsaveCommand(redisClient *c) { - addReplySds(c, - sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave)); -} - -static void typeCommand(redisClient *c) { - robj *o; - char *type; - - o = lookupKeyRead(c->db,c->argv[1]); - if (o == NULL) { - type = "+none"; - } else { - switch(o->type) { - case REDIS_STRING: type = "+string"; break; - case REDIS_LIST: type = "+list"; break; - case REDIS_SET: type = "+set"; break; - case REDIS_ZSET: type = "+zset"; break; - case REDIS_HASH: type = "+hash"; 
break; - default: type = "+unknown"; break; - } - } - addReplySds(c,sdsnew(type)); - addReply(c,shared.crlf); -} - -static void saveCommand(redisClient *c) { - if (server.bgsavechildpid != -1) { - addReplySds(c,sdsnew("-ERR background save in progress\r\n")); - return; - } - if (rdbSave(server.dbfilename) == REDIS_OK) { - addReply(c,shared.ok); - } else { - addReply(c,shared.err); - } -} - -static void bgsaveCommand(redisClient *c) { - if (server.bgsavechildpid != -1) { - addReplySds(c,sdsnew("-ERR background save already in progress\r\n")); - return; - } - if (rdbSaveBackground(server.dbfilename) == REDIS_OK) { - char *status = "+Background saving started\r\n"; - addReplySds(c,sdsnew(status)); - } else { - addReply(c,shared.err); - } -} - -static void shutdownCommand(redisClient *c) { - if (prepareForShutdown() == REDIS_OK) - exit(0); - addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n")); -} - -static void renameGenericCommand(redisClient *c, int nx) { - robj *o; - - /* To use the same key as src and dst is probably an error */ - if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) { - addReply(c,shared.sameobjecterr); - return; - } - - if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL) - return; - - incrRefCount(o); - deleteIfVolatile(c->db,c->argv[2]); - if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) { - if (nx) { - decrRefCount(o); - addReply(c,shared.czero); - return; - } - dbReplace(c->db,c->argv[2],o); - } - dbDelete(c->db,c->argv[1]); - touchWatchedKey(c->db,c->argv[2]); - server.dirty++; - addReply(c,nx ? 
shared.cone : shared.ok); -} - -static void renameCommand(redisClient *c) { - renameGenericCommand(c,0); -} - -static void renamenxCommand(redisClient *c) { - renameGenericCommand(c,1); -} - -static void moveCommand(redisClient *c) { - robj *o; - redisDb *src, *dst; - int srcid; - - /* Obtain source and target DB pointers */ - src = c->db; - srcid = c->db->id; - if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) { - addReply(c,shared.outofrangeerr); - return; - } - dst = c->db; - selectDb(c,srcid); /* Back to the source DB */ - - /* If the user is moving using as target the same - * DB as the source DB it is probably an error. */ - if (src == dst) { - addReply(c,shared.sameobjecterr); - return; - } - - /* Check if the element exists and get a reference */ - o = lookupKeyWrite(c->db,c->argv[1]); - if (!o) { - addReply(c,shared.czero); - return; - } - - /* Try to add the element to the target DB */ - deleteIfVolatile(dst,c->argv[1]); - if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) { - addReply(c,shared.czero); - return; - } - incrRefCount(o); - - /* OK! key moved, free the entry in the source DB */ - dbDelete(src,c->argv[1]); - server.dirty++; - addReply(c,shared.cone); -} - -/* =================================== Lists ================================ */ - - -/* Check the argument length to see if it requires us to convert the ziplist - * to a real list. Only check raw-encoded objects because integer encoded - * objects are never too long. 
*/ -static void listTypeTryConversion(robj *subject, robj *value) { - if (subject->encoding != REDIS_ENCODING_ZIPLIST) return; - if (value->encoding == REDIS_ENCODING_RAW && - sdslen(value->ptr) > server.list_max_ziplist_value) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); -} - -static void listTypePush(robj *subject, robj *value, int where) { - /* Check if we need to convert the ziplist */ - listTypeTryConversion(subject,value); - if (subject->encoding == REDIS_ENCODING_ZIPLIST && - ziplistLen(subject->ptr) >= server.list_max_ziplist_entries) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); - - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL; - value = getDecodedObject(value); - subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos); - decrRefCount(value); - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - if (where == REDIS_HEAD) { - listAddNodeHead(subject->ptr,value); - } else { - listAddNodeTail(subject->ptr,value); - } - incrRefCount(value); - } else { - redisPanic("Unknown list encoding"); - } -} - -static robj *listTypePop(robj *subject, int where) { - robj *value = NULL; - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p; - unsigned char *vstr; - unsigned int vlen; - long long vlong; - int pos = (where == REDIS_HEAD) ? 
0 : -1; - p = ziplistIndex(subject->ptr,pos); - if (ziplistGet(p,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); - } else { - value = createStringObjectFromLongLong(vlong); - } - /* We only need to delete an element when it exists */ - subject->ptr = ziplistDelete(subject->ptr,&p); - } - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = subject->ptr; - listNode *ln; - if (where == REDIS_HEAD) { - ln = listFirst(list); - } else { - ln = listLast(list); - } - if (ln != NULL) { - value = listNodeValue(ln); - incrRefCount(value); - listDelNode(list,ln); - } - } else { - redisPanic("Unknown list encoding"); - } - return value; -} - -static unsigned long listTypeLength(robj *subject) { - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - return ziplistLen(subject->ptr); - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - return listLength((list*)subject->ptr); - } else { - redisPanic("Unknown list encoding"); - } -} - -/* Structure to hold set iteration abstraction. */ -typedef struct { - robj *subject; - unsigned char encoding; - unsigned char direction; /* Iteration direction */ - unsigned char *zi; - listNode *ln; -} listTypeIterator; - -/* Structure for an entry while iterating over a list. */ -typedef struct { - listTypeIterator *li; - unsigned char *zi; /* Entry in ziplist */ - listNode *ln; /* Entry in linked list */ -} listTypeEntry; - -/* Initialize an iterator at the specified index. 
*/ -static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) { - listTypeIterator *li = zmalloc(sizeof(listTypeIterator)); - li->subject = subject; - li->encoding = subject->encoding; - li->direction = direction; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - li->zi = ziplistIndex(subject->ptr,index); - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - li->ln = listIndex(subject->ptr,index); - } else { - redisPanic("Unknown list encoding"); - } - return li; -} - -/* Clean up the iterator. */ -static void listTypeReleaseIterator(listTypeIterator *li) { - zfree(li); -} - -/* Stores pointer to current the entry in the provided entry structure - * and advances the position of the iterator. Returns 1 when the current - * entry is in fact an entry, 0 otherwise. */ -static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) { - /* Protect from converting when iterating */ - redisAssert(li->subject->encoding == li->encoding); - - entry->li = li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - entry->zi = li->zi; - if (entry->zi != NULL) { - if (li->direction == REDIS_TAIL) - li->zi = ziplistNext(li->subject->ptr,li->zi); - else - li->zi = ziplistPrev(li->subject->ptr,li->zi); - return 1; - } - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - entry->ln = li->ln; - if (entry->ln != NULL) { - if (li->direction == REDIS_TAIL) - li->ln = li->ln->next; - else - li->ln = li->ln->prev; - return 1; - } - } else { - redisPanic("Unknown list encoding"); - } - return 0; -} - -/* Return entry or NULL at the current position of the iterator. 
*/ -static robj *listTypeGet(listTypeEntry *entry) { - listTypeIterator *li = entry->li; - robj *value = NULL; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *vstr; - unsigned int vlen; - long long vlong; - redisAssert(entry->zi != NULL); - if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); - } else { - value = createStringObjectFromLongLong(vlong); - } - } - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - redisAssert(entry->ln != NULL); - value = listNodeValue(entry->ln); - incrRefCount(value); - } else { - redisPanic("Unknown list encoding"); - } - return value; -} - -static void listTypeInsert(listTypeEntry *entry, robj *value, int where) { - robj *subject = entry->li->subject; - if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) { - value = getDecodedObject(value); - if (where == REDIS_TAIL) { - unsigned char *next = ziplistNext(subject->ptr,entry->zi); - - /* When we insert after the current element, but the current element - * is the tail of the list, we need to do a push. */ - if (next == NULL) { - subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL); - } else { - subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr)); - } - } else { - subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr)); - } - decrRefCount(value); - } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) { - if (where == REDIS_TAIL) { - listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL); - } else { - listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD); - } - incrRefCount(value); - } else { - redisPanic("Unknown list encoding"); - } -} - -/* Compare the given object with the entry at the current position. 
*/ -static int listTypeEqual(listTypeEntry *entry, robj *o) { - listTypeIterator *li = entry->li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - redisAssert(o->encoding == REDIS_ENCODING_RAW); - return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr)); - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - return equalStringObjects(o,listNodeValue(entry->ln)); - } else { - redisPanic("Unknown list encoding"); - } -} - -/* Delete the element pointed to. */ -static void listTypeDelete(listTypeEntry *entry) { - listTypeIterator *li = entry->li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p = entry->zi; - li->subject->ptr = ziplistDelete(li->subject->ptr,&p); - - /* Update position of the iterator depending on the direction */ - if (li->direction == REDIS_TAIL) - li->zi = p; - else - li->zi = ziplistPrev(li->subject->ptr,p); - } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *next; - if (li->direction == REDIS_TAIL) - next = entry->ln->next; - else - next = entry->ln->prev; - listDelNode(li->subject->ptr,entry->ln); - li->ln = next; - } else { - redisPanic("Unknown list encoding"); - } -} - -static void listTypeConvert(robj *subject, int enc) { - listTypeIterator *li; - listTypeEntry entry; - redisAssert(subject->type == REDIS_LIST); - - if (enc == REDIS_ENCODING_LINKEDLIST) { - list *l = listCreate(); - listSetFreeMethod(l,decrRefCount); - - /* listTypeGet returns a robj with incremented refcount */ - li = listTypeInitIterator(subject,0,REDIS_TAIL); - while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry)); - listTypeReleaseIterator(li); - - subject->encoding = REDIS_ENCODING_LINKEDLIST; - zfree(subject->ptr); - subject->ptr = l; - } else { - redisPanic("Unsupported list conversion"); - } -} - -static void pushGenericCommand(redisClient *c, int where) { - robj *lobj = lookupKeyWrite(c->db,c->argv[1]); - if (lobj == NULL) { - if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) { - 
addReply(c,shared.cone); - return; - } - lobj = createZiplistObject(); - dbAdd(c->db,c->argv[1],lobj); - } else { - if (lobj->type != REDIS_LIST) { - addReply(c,shared.wrongtypeerr); - return; - } - if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) { - addReply(c,shared.cone); - return; - } - } - listTypePush(lobj,c->argv[2],where); - addReplyLongLong(c,listTypeLength(lobj)); - server.dirty++; -} - -static void lpushCommand(redisClient *c) { - pushGenericCommand(c,REDIS_HEAD); -} - -static void rpushCommand(redisClient *c) { - pushGenericCommand(c,REDIS_TAIL); -} - -static void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) { - robj *subject; - listTypeIterator *iter; - listTypeEntry entry; - int inserted = 0; - - if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,subject,REDIS_LIST)) return; - - if (refval != NULL) { - /* Note: we expect refval to be string-encoded because it is *not* the - * last argument of the multi-bulk LINSERT. */ - redisAssert(refval->encoding == REDIS_ENCODING_RAW); - - /* We're not sure if this value can be inserted yet, but we cannot - * convert the list inside the iterator. We don't want to loop over - * the list twice (once to see if the value can be inserted and once - * to do the actual insert), so we assume this value can be inserted - * and convert the ziplist to a regular list if necessary. */ - listTypeTryConversion(subject,val); - - /* Seek refval from head to tail */ - iter = listTypeInitIterator(subject,0,REDIS_TAIL); - while (listTypeNext(iter,&entry)) { - if (listTypeEqual(&entry,refval)) { - listTypeInsert(&entry,val,where); - inserted = 1; - break; - } - } - listTypeReleaseIterator(iter); - - if (inserted) { - /* Check if the length exceeds the ziplist length threshold. 
*/ - if (subject->encoding == REDIS_ENCODING_ZIPLIST && - ziplistLen(subject->ptr) > server.list_max_ziplist_entries) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); - server.dirty++; - } else { - /* Notify client of a failed insert */ - addReply(c,shared.cnegone); - return; - } - } else { - listTypePush(subject,val,where); - server.dirty++; - } - - addReplyUlong(c,listTypeLength(subject)); -} - -static void lpushxCommand(redisClient *c) { - pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD); -} - -static void rpushxCommand(redisClient *c) { - pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL); -} - -static void linsertCommand(redisClient *c) { - if (strcasecmp(c->argv[2]->ptr,"after") == 0) { - pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL); - } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) { - pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD); - } else { - addReply(c,shared.syntaxerr); - } -} - -static void llenCommand(redisClient *c) { - robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero); - if (o == NULL || checkType(c,o,REDIS_LIST)) return; - addReplyUlong(c,listTypeLength(o)); -} - -static void lindexCommand(redisClient *c) { - robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk); - if (o == NULL || checkType(c,o,REDIS_LIST)) return; - int index = atoi(c->argv[2]->ptr); - robj *value = NULL; - - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p; - unsigned char *vstr; - unsigned int vlen; - long long vlong; - p = ziplistIndex(o->ptr,index); - if (ziplistGet(p,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); - } else { - value = createStringObjectFromLongLong(vlong); - } - addReplyBulk(c,value); - decrRefCount(value); - } else { - addReply(c,shared.nullbulk); - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *ln = listIndex(o->ptr,index); - if (ln != NULL) { - value = listNodeValue(ln); - addReplyBulk(c,value); - } else { - 
addReply(c,shared.nullbulk); - } - } else { - redisPanic("Unknown list encoding"); - } -} - -static void lsetCommand(redisClient *c) { - robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr); - if (o == NULL || checkType(c,o,REDIS_LIST)) return; - int index = atoi(c->argv[2]->ptr); - robj *value = c->argv[3]; - - listTypeTryConversion(o,value); - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p, *zl = o->ptr; - p = ziplistIndex(zl,index); - if (p == NULL) { - addReply(c,shared.outofrangeerr); - } else { - o->ptr = ziplistDelete(o->ptr,&p); - value = getDecodedObject(value); - o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr)); - decrRefCount(value); - addReply(c,shared.ok); - server.dirty++; - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *ln = listIndex(o->ptr,index); - if (ln == NULL) { - addReply(c,shared.outofrangeerr); - } else { - decrRefCount((robj*)listNodeValue(ln)); - listNodeValue(ln) = value; - incrRefCount(value); - addReply(c,shared.ok); - server.dirty++; - } - } else { - redisPanic("Unknown list encoding"); - } -} - -static void popGenericCommand(redisClient *c, int where) { - robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk); - if (o == NULL || checkType(c,o,REDIS_LIST)) return; - - robj *value = listTypePop(o,where); - if (value == NULL) { - addReply(c,shared.nullbulk); - } else { - addReplyBulk(c,value); - decrRefCount(value); - if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]); - server.dirty++; - } -} - -static void lpopCommand(redisClient *c) { - popGenericCommand(c,REDIS_HEAD); -} - -static void rpopCommand(redisClient *c) { - popGenericCommand(c,REDIS_TAIL); -} - -static void lrangeCommand(redisClient *c) { - robj *o, *value; - int start = atoi(c->argv[2]->ptr); - int end = atoi(c->argv[3]->ptr); - int llen; - int rangelen, j; - listTypeEntry entry; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL - || checkType(c,o,REDIS_LIST)) return; - 
llen = listTypeLength(o); - - /* convert negative indexes */ - if (start < 0) start = llen+start; - if (end < 0) end = llen+end; - if (start < 0) start = 0; - if (end < 0) end = 0; - - /* indexes sanity checks */ - if (start > end || start >= llen) { - /* Out of range start or start > end result in empty list */ - addReply(c,shared.emptymultibulk); - return; - } - if (end >= llen) end = llen-1; - rangelen = (end-start)+1; - - /* Return the result in form of a multi-bulk reply */ - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen)); - listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL); - for (j = 0; j < rangelen; j++) { - redisAssert(listTypeNext(li,&entry)); - value = listTypeGet(&entry); - addReplyBulk(c,value); - decrRefCount(value); - } - listTypeReleaseIterator(li); -} - -static void ltrimCommand(redisClient *c) { - robj *o; - int start = atoi(c->argv[2]->ptr); - int end = atoi(c->argv[3]->ptr); - int llen; - int j, ltrim, rtrim; - list *list; - listNode *ln; - - if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL || - checkType(c,o,REDIS_LIST)) return; - llen = listTypeLength(o); - - /* convert negative indexes */ - if (start < 0) start = llen+start; - if (end < 0) end = llen+end; - if (start < 0) start = 0; - if (end < 0) end = 0; - - /* indexes sanity checks */ - if (start > end || start >= llen) { - /* Out of range start or start > end result in empty list */ - ltrim = llen; - rtrim = 0; - } else { - if (end >= llen) end = llen-1; - ltrim = start; - rtrim = llen-end-1; - } - - /* Remove list elements to perform the trim */ - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - o->ptr = ziplistDeleteRange(o->ptr,0,ltrim); - o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim); - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list = o->ptr; - for (j = 0; j < ltrim; j++) { - ln = listFirst(list); - listDelNode(list,ln); - } - for (j = 0; j < rtrim; j++) { - ln = listLast(list); - listDelNode(list,ln); - } - } else { - 
redisPanic("Unknown list encoding"); - } - if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]); - server.dirty++; - addReply(c,shared.ok); -} - -static void lremCommand(redisClient *c) { - robj *subject, *obj = c->argv[3]; - int toremove = atoi(c->argv[2]->ptr); - int removed = 0; - listTypeEntry entry; - - subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero); - if (subject == NULL || checkType(c,subject,REDIS_LIST)) return; - - /* Make sure obj is raw when we're dealing with a ziplist */ - if (subject->encoding == REDIS_ENCODING_ZIPLIST) - obj = getDecodedObject(obj); - - listTypeIterator *li; - if (toremove < 0) { - toremove = -toremove; - li = listTypeInitIterator(subject,-1,REDIS_HEAD); - } else { - li = listTypeInitIterator(subject,0,REDIS_TAIL); - } - - while (listTypeNext(li,&entry)) { - if (listTypeEqual(&entry,obj)) { - listTypeDelete(&entry); - server.dirty++; - removed++; - if (toremove && removed == toremove) break; - } - } - listTypeReleaseIterator(li); - - /* Clean up raw encoded object */ - if (subject->encoding == REDIS_ENCODING_ZIPLIST) - decrRefCount(obj); - - if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]); - addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed)); -} - -/* This is the semantic of this command: - * RPOPLPUSH srclist dstlist: - * IF LLEN(srclist) > 0 - * element = RPOP srclist - * LPUSH dstlist element - * RETURN element - * ELSE - * RETURN nil - * END - * END - * - * The idea is to be able to get an element from a list in a reliable way - * since the element is not just returned but pushed against another list - * as well. This command was originally proposed by Ezra Zygmuntowicz. 
- */ -static void rpoplpushcommand(redisClient *c) { - robj *sobj, *value; - if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,sobj,REDIS_LIST)) return; - - if (listTypeLength(sobj) == 0) { - addReply(c,shared.nullbulk); - } else { - robj *dobj = lookupKeyWrite(c->db,c->argv[2]); - if (dobj && checkType(c,dobj,REDIS_LIST)) return; - value = listTypePop(sobj,REDIS_TAIL); - - /* Add the element to the target list (unless it's directly - * passed to some BLPOP-ing client */ - if (!handleClientsWaitingListPush(c,c->argv[2],value)) { - /* Create the list if the key does not exist */ - if (!dobj) { - dobj = createZiplistObject(); - dbAdd(c->db,c->argv[2],dobj); - } - listTypePush(dobj,value,REDIS_HEAD); - } - - /* Send the element to the client as reply as well */ - addReplyBulk(c,value); - - /* listTypePop returns an object with its refcount incremented */ - decrRefCount(value); - - /* Delete the source list when it is empty */ - if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]); - server.dirty++; - } -} - -/* ==================================== Sets ================================ */ - -static void saddCommand(redisClient *c) { - robj *set; - - set = lookupKeyWrite(c->db,c->argv[1]); - if (set == NULL) { - set = createSetObject(); - dbAdd(c->db,c->argv[1],set); - } else { - if (set->type != REDIS_SET) { - addReply(c,shared.wrongtypeerr); - return; - } - } - if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) { - incrRefCount(c->argv[2]); - server.dirty++; - addReply(c,shared.cone); - } else { - addReply(c,shared.czero); - } -} - -static void sremCommand(redisClient *c) { - robj *set; - - if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,set,REDIS_SET)) return; - - if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) { - server.dirty++; - if (htNeedsResize(set->ptr)) dictResize(set->ptr); - if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); - addReply(c,shared.cone); - } else { - 
addReply(c,shared.czero); - } -} - -static void smoveCommand(redisClient *c) { - robj *srcset, *dstset; - - srcset = lookupKeyWrite(c->db,c->argv[1]); - dstset = lookupKeyWrite(c->db,c->argv[2]); - - /* If the source key does not exist return 0, if it's of the wrong type - * raise an error */ - if (srcset == NULL || srcset->type != REDIS_SET) { - addReply(c, srcset ? shared.wrongtypeerr : shared.czero); - return; - } - /* Error if the destination key is not a set as well */ - if (dstset && dstset->type != REDIS_SET) { - addReply(c,shared.wrongtypeerr); - return; - } - /* Remove the element from the source set */ - if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) { - /* Key not found in the src set! return zero */ - addReply(c,shared.czero); - return; - } - if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset) - dbDelete(c->db,c->argv[1]); - server.dirty++; - /* Add the element to the destination set */ - if (!dstset) { - dstset = createSetObject(); - dbAdd(c->db,c->argv[2],dstset); - } - if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK) - incrRefCount(c->argv[3]); - addReply(c,shared.cone); -} - -static void sismemberCommand(redisClient *c) { - robj *set; - - if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,set,REDIS_SET)) return; - - if (dictFind(set->ptr,c->argv[2])) - addReply(c,shared.cone); - else - addReply(c,shared.czero); -} - -static void scardCommand(redisClient *c) { - robj *o; - dict *s; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,REDIS_SET)) return; - - s = o->ptr; - addReplyUlong(c,dictSize(s)); -} - -static void spopCommand(redisClient *c) { - robj *set; - dictEntry *de; - - if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,set,REDIS_SET)) return; - - de = dictGetRandomKey(set->ptr); - if (de == NULL) { - addReply(c,shared.nullbulk); - } else { - robj *ele = dictGetEntryKey(de); - - addReplyBulk(c,ele); - 
dictDelete(set->ptr,ele); - if (htNeedsResize(set->ptr)) dictResize(set->ptr); - if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); - server.dirty++; - } -} - -static void srandmemberCommand(redisClient *c) { - robj *set; - dictEntry *de; - - if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,set,REDIS_SET)) return; - - de = dictGetRandomKey(set->ptr); - if (de == NULL) { - addReply(c,shared.nullbulk); - } else { - robj *ele = dictGetEntryKey(de); - - addReplyBulk(c,ele); - } -} - -static int qsortCompareSetsByCardinality(const void *s1, const void *s2) { - dict **d1 = (void*) s1, **d2 = (void*) s2; - - return dictSize(*d1)-dictSize(*d2); -} - -static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) { - dict **dv = zmalloc(sizeof(dict*)*setsnum); - dictIterator *di; - dictEntry *de; - robj *lenobj = NULL, *dstset = NULL; - unsigned long j, cardinality = 0; - - for (j = 0; j < setsnum; j++) { - robj *setobj; - - setobj = dstkey ? - lookupKeyWrite(c->db,setskeys[j]) : - lookupKeyRead(c->db,setskeys[j]); - if (!setobj) { - zfree(dv); - if (dstkey) { - if (dbDelete(c->db,dstkey)) - server.dirty++; - addReply(c,shared.czero); - } else { - addReply(c,shared.emptymultibulk); - } - return; - } - if (setobj->type != REDIS_SET) { - zfree(dv); - addReply(c,shared.wrongtypeerr); - return; - } - dv[j] = setobj->ptr; - } - /* Sort sets from the smallest to largest, this will improve our - * algorithm's performace */ - qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality); - - /* The first thing we should output is the total number of elements... 
- * since this is a multi-bulk write, but at this stage we don't know - * the intersection set size, so we use a trick, append an empty object - * to the output list and save the pointer to later modify it with the - * right length */ - if (!dstkey) { - lenobj = createObject(REDIS_STRING,NULL); - addReply(c,lenobj); - decrRefCount(lenobj); - } else { - /* If we have a target key where to store the resulting set - * create this key with an empty set inside */ - dstset = createSetObject(); - } - - /* Iterate all the elements of the first (smallest) set, and test - * the element against all the other sets, if at least one set does - * not include the element it is discarded */ - di = dictGetIterator(dv[0]); - - while((de = dictNext(di)) != NULL) { - robj *ele; - - for (j = 1; j < setsnum; j++) - if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break; - if (j != setsnum) - continue; /* at least one set does not contain the member */ - ele = dictGetEntryKey(de); - if (!dstkey) { - addReplyBulk(c,ele); - cardinality++; - } else { - dictAdd(dstset->ptr,ele,NULL); - incrRefCount(ele); - } - } - dictReleaseIterator(di); - - if (dstkey) { - /* Store the resulting set into the target, if the intersection - * is not an empty set. 
*/ - dbDelete(c->db,dstkey); - if (dictSize((dict*)dstset->ptr) > 0) { - dbAdd(c->db,dstkey,dstset); - addReplyLongLong(c,dictSize((dict*)dstset->ptr)); - } else { - decrRefCount(dstset); - addReply(c,shared.czero); - } - server.dirty++; - } else { - lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality); - } - zfree(dv); -} - -static void sinterCommand(redisClient *c) { - sinterGenericCommand(c,c->argv+1,c->argc-1,NULL); -} - -static void sinterstoreCommand(redisClient *c) { - sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]); -} - -#define REDIS_OP_UNION 0 -#define REDIS_OP_DIFF 1 -#define REDIS_OP_INTER 2 - -static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) { - dict **dv = zmalloc(sizeof(dict*)*setsnum); - dictIterator *di; - dictEntry *de; - robj *dstset = NULL; - int j, cardinality = 0; - - for (j = 0; j < setsnum; j++) { - robj *setobj; - - setobj = dstkey ? - lookupKeyWrite(c->db,setskeys[j]) : - lookupKeyRead(c->db,setskeys[j]); - if (!setobj) { - dv[j] = NULL; - continue; - } - if (setobj->type != REDIS_SET) { - zfree(dv); - addReply(c,shared.wrongtypeerr); - return; - } - dv[j] = setobj->ptr; - } - - /* We need a temp set object to store our union. 
If the dstkey - * is not NULL (that is, we are inside an SUNIONSTORE operation) then - * this set object will be the resulting object to set into the target key*/ - dstset = createSetObject(); - - /* Iterate all the elements of all the sets, add every element a single - * time to the result set */ - for (j = 0; j < setsnum; j++) { - if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */ - if (!dv[j]) continue; /* non existing keys are like empty sets */ - - di = dictGetIterator(dv[j]); - - while((de = dictNext(di)) != NULL) { - robj *ele; - - /* dictAdd will not add the same element multiple times */ - ele = dictGetEntryKey(de); - if (op == REDIS_OP_UNION || j == 0) { - if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) { - incrRefCount(ele); - cardinality++; - } - } else if (op == REDIS_OP_DIFF) { - if (dictDelete(dstset->ptr,ele) == DICT_OK) { - cardinality--; - } - } - } - dictReleaseIterator(di); - - /* result set is empty? Exit asap. */ - if (op == REDIS_OP_DIFF && cardinality == 0) break; - } - - /* Output the content of the resulting set, if not in STORE mode */ - if (!dstkey) { - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality)); - di = dictGetIterator(dstset->ptr); - while((de = dictNext(di)) != NULL) { - robj *ele; - - ele = dictGetEntryKey(de); - addReplyBulk(c,ele); - } - dictReleaseIterator(di); - decrRefCount(dstset); - } else { - /* If we have a target key where to store the resulting set - * create this key with the result set inside */ - dbDelete(c->db,dstkey); - if (dictSize((dict*)dstset->ptr) > 0) { - dbAdd(c->db,dstkey,dstset); - addReplyLongLong(c,dictSize((dict*)dstset->ptr)); - } else { - decrRefCount(dstset); - addReply(c,shared.czero); - } - server.dirty++; - } - zfree(dv); -} - -static void sunionCommand(redisClient *c) { - sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION); -} - -static void sunionstoreCommand(redisClient *c) { - 
sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION); -} - -static void sdiffCommand(redisClient *c) { - sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF); -} - -static void sdiffstoreCommand(redisClient *c) { - sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF); -} - -/* ==================================== ZSets =============================== */ - -/* ZSETs are ordered sets using two data structures to hold the same elements - * in order to get O(log(N)) INSERT and REMOVE operations into a sorted - * data structure. - * - * The elements are added to an hash table mapping Redis objects to scores. - * At the same time the elements are added to a skip list mapping scores - * to Redis objects (so objects are sorted by scores in this "view"). */ - -/* This skiplist implementation is almost a C translation of the original - * algorithm described by William Pugh in "Skip Lists: A Probabilistic - * Alternative to Balanced Trees", modified in three ways: - * a) this implementation allows for repeated values. - * b) the comparison is not just by key (our 'score') but by satellite data. - * c) there is a back pointer, so it's a doubly linked list with the back - * pointers being only at "level 1". This allows to traverse the list - * from tail to head, useful for ZREVRANGE. 
*/ - -static zskiplistNode *zslCreateNode(int level, double score, robj *obj) { - zskiplistNode *zn = zmalloc(sizeof(*zn)); - - zn->forward = zmalloc(sizeof(zskiplistNode*) * level); - if (level > 1) - zn->span = zmalloc(sizeof(unsigned int) * (level - 1)); - else - zn->span = NULL; - zn->score = score; - zn->obj = obj; - return zn; -} - -static zskiplist *zslCreate(void) { - int j; - zskiplist *zsl; - - zsl = zmalloc(sizeof(*zsl)); - zsl->level = 1; - zsl->length = 0; - zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL); - for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) { - zsl->header->forward[j] = NULL; - - /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */ - if (j < ZSKIPLIST_MAXLEVEL-1) - zsl->header->span[j] = 0; - } - zsl->header->backward = NULL; - zsl->tail = NULL; - return zsl; -} - -static void zslFreeNode(zskiplistNode *node) { - decrRefCount(node->obj); - zfree(node->forward); - zfree(node->span); - zfree(node); -} - -static void zslFree(zskiplist *zsl) { - zskiplistNode *node = zsl->header->forward[0], *next; - - zfree(zsl->header->forward); - zfree(zsl->header->span); - zfree(zsl->header); - while(node) { - next = node->forward[0]; - zslFreeNode(node); - node = next; - } - zfree(zsl); -} - -static int zslRandomLevel(void) { - int level = 1; - while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF)) - level += 1; - return (levelheader; - for (i = zsl->level-1; i >= 0; i--) { - /* store rank that is crossed to reach the insert position */ - rank[i] = i == (zsl->level-1) ? 0 : rank[i+1]; - - while (x->forward[i] && - (x->forward[i]->score < score || - (x->forward[i]->score == score && - compareStringObjects(x->forward[i]->obj,obj) < 0))) { - rank[i] += i > 0 ? 
x->span[i-1] : 1; - x = x->forward[i]; - } - update[i] = x; - } - /* we assume the key is not already inside, since we allow duplicated - * scores, and the re-insertion of score and redis object should never - * happpen since the caller of zslInsert() should test in the hash table - * if the element is already inside or not. */ - level = zslRandomLevel(); - if (level > zsl->level) { - for (i = zsl->level; i < level; i++) { - rank[i] = 0; - update[i] = zsl->header; - update[i]->span[i-1] = zsl->length; - } - zsl->level = level; - } - x = zslCreateNode(level,score,obj); - for (i = 0; i < level; i++) { - x->forward[i] = update[i]->forward[i]; - update[i]->forward[i] = x; - - /* update span covered by update[i] as x is inserted here */ - if (i > 0) { - x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]); - update[i]->span[i-1] = (rank[0] - rank[i]) + 1; - } - } - - /* increment span for untouched levels */ - for (i = level; i < zsl->level; i++) { - update[i]->span[i-1]++; - } - - x->backward = (update[0] == zsl->header) ? NULL : update[0]; - if (x->forward[0]) - x->forward[0]->backward = x; - else - zsl->tail = x; - zsl->length++; -} - -/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */ -void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { - int i; - for (i = 0; i < zsl->level; i++) { - if (update[i]->forward[i] == x) { - if (i > 0) { - update[i]->span[i-1] += x->span[i-1] - 1; - } - update[i]->forward[i] = x->forward[i]; - } else { - /* invariant: i > 0, because update[0]->forward[0] - * is always equal to x */ - update[i]->span[i-1] -= 1; - } - } - if (x->forward[0]) { - x->forward[0]->backward = x->backward; - } else { - zsl->tail = x->backward; - } - while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL) - zsl->level--; - zsl->length--; -} - -/* Delete an element with matching score/object from the skiplist. 
*/ -static int zslDelete(zskiplist *zsl, double score, robj *obj) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && - (x->forward[i]->score < score || - (x->forward[i]->score == score && - compareStringObjects(x->forward[i]->obj,obj) < 0))) - x = x->forward[i]; - update[i] = x; - } - /* We may have multiple elements with the same score, what we need - * is to find the element with both the right score and object. */ - x = x->forward[0]; - if (x && score == x->score && equalStringObjects(x->obj,obj)) { - zslDeleteNode(zsl, x, update); - zslFreeNode(x); - return 1; - } else { - return 0; /* not found */ - } - return 0; /* not found */ -} - -/* Delete all the elements with score between min and max from the skiplist. - * Min and mx are inclusive, so a score >= min || score <= max is deleted. - * Note that this function takes the reference to the hash table view of the - * sorted set, in order to remove the elements from the hash table too. */ -static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; - unsigned long removed = 0; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && x->forward[i]->score < min) - x = x->forward[i]; - update[i] = x; - } - /* We may have multiple elements with the same score, what we need - * is to find the element with both the right score and object. */ - x = x->forward[0]; - while (x && x->score <= max) { - zskiplistNode *next = x->forward[0]; - zslDeleteNode(zsl, x, update); - dictDelete(dict,x->obj); - zslFreeNode(x); - removed++; - x = next; - } - return removed; /* not found */ -} - -/* Delete all the elements with rank between start and end from the skiplist. - * Start and end are inclusive. 
Note that start and end need to be 1-based */ -static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; - unsigned long traversed = 0, removed = 0; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) { - traversed += i > 0 ? x->span[i-1] : 1; - x = x->forward[i]; - } - update[i] = x; - } - - traversed++; - x = x->forward[0]; - while (x && traversed <= end) { - zskiplistNode *next = x->forward[0]; - zslDeleteNode(zsl, x, update); - dictDelete(dict,x->obj); - zslFreeNode(x); - removed++; - traversed++; - x = next; - } - return removed; -} - -/* Find the first node having a score equal or greater than the specified one. - * Returns NULL if there is no match. */ -static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) { - zskiplistNode *x; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && x->forward[i]->score < score) - x = x->forward[i]; - } - /* We may have multiple elements with the same score, what we need - * is to find the element with both the right score and object. */ - return x->forward[0]; -} - -/* Find the rank for an element by both score and key. - * Returns 0 when the element cannot be found, rank otherwise. - * Note that the rank is 1-based due to the span of zsl->header to the - * first element. */ -static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) { - zskiplistNode *x; - unsigned long rank = 0; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && - (x->forward[i]->score < score || - (x->forward[i]->score == score && - compareStringObjects(x->forward[i]->obj,o) <= 0))) { - rank += i > 0 ? 
x->span[i-1] : 1; - x = x->forward[i]; - } - - /* x might be equal to zsl->header, so test if obj is non-NULL */ - if (x->obj && equalStringObjects(x->obj,o)) { - return rank; - } - } - return 0; -} - -/* Finds an element by its rank. The rank argument needs to be 1-based. */ -zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) { - zskiplistNode *x; - unsigned long traversed = 0; - int i; - - x = zsl->header; - for (i = zsl->level-1; i >= 0; i--) { - while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank) - { - traversed += i > 0 ? x->span[i-1] : 1; - x = x->forward[i]; - } - if (traversed == rank) { - return x; - } - } - return NULL; -} - -/* The actual Z-commands implementations */ - -/* This generic command implements both ZADD and ZINCRBY. - * scoreval is the score if the operation is a ZADD (doincrement == 0) or - * the increment if the operation is a ZINCRBY (doincrement == 1). */ -static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) { - robj *zsetobj; - zset *zs; - double *score; - - if (isnan(scoreval)) { - addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n")); - return; - } - - zsetobj = lookupKeyWrite(c->db,key); - if (zsetobj == NULL) { - zsetobj = createZsetObject(); - dbAdd(c->db,key,zsetobj); - } else { - if (zsetobj->type != REDIS_ZSET) { - addReply(c,shared.wrongtypeerr); - return; - } - } - zs = zsetobj->ptr; - - /* Ok now since we implement both ZADD and ZINCRBY here the code - * needs to handle the two different conditions. It's all about setting - * '*score', that is, the new score to set, to the right value. */ - score = zmalloc(sizeof(double)); - if (doincrement) { - dictEntry *de; - - /* Read the old score. 
If the element was not present starts from 0 */ - de = dictFind(zs->dict,ele); - if (de) { - double *oldscore = dictGetEntryVal(de); - *score = *oldscore + scoreval; - } else { - *score = scoreval; - } - if (isnan(*score)) { - addReplySds(c, - sdsnew("-ERR resulting score is Not A Number (nan)\r\n")); - zfree(score); - /* Note that we don't need to check if the zset may be empty and - * should be removed here, as we can only obtain Nan as score if - * there was already an element in the sorted set. */ - return; - } - } else { - *score = scoreval; - } - - /* What follows is a simple remove and re-insert operation that is common - * to both ZADD and ZINCRBY... */ - if (dictAdd(zs->dict,ele,score) == DICT_OK) { - /* case 1: New element */ - incrRefCount(ele); /* added to hash */ - zslInsert(zs->zsl,*score,ele); - incrRefCount(ele); /* added to skiplist */ - server.dirty++; - if (doincrement) - addReplyDouble(c,*score); - else - addReply(c,shared.cone); - } else { - dictEntry *de; - double *oldscore; - - /* case 2: Score update operation */ - de = dictFind(zs->dict,ele); - redisAssert(de != NULL); - oldscore = dictGetEntryVal(de); - if (*score != *oldscore) { - int deleted; - - /* Remove and insert the element in the skip list with new score */ - deleted = zslDelete(zs->zsl,*oldscore,ele); - redisAssert(deleted != 0); - zslInsert(zs->zsl,*score,ele); - incrRefCount(ele); - /* Update the score in the hash table */ - dictReplace(zs->dict,ele,score); - server.dirty++; - } else { - zfree(score); - } - if (doincrement) - addReplyDouble(c,*score); - else - addReply(c,shared.czero); - } -} - -static void zaddCommand(redisClient *c) { - double scoreval; - - if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return; - zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0); -} - -static void zincrbyCommand(redisClient *c) { - double scoreval; - - if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return; - 
zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1); -} - -static void zremCommand(redisClient *c) { - robj *zsetobj; - zset *zs; - dictEntry *de; - double *oldscore; - int deleted; - - if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,zsetobj,REDIS_ZSET)) return; - - zs = zsetobj->ptr; - de = dictFind(zs->dict,c->argv[2]); - if (de == NULL) { - addReply(c,shared.czero); - return; - } - /* Delete from the skiplist */ - oldscore = dictGetEntryVal(de); - deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]); - redisAssert(deleted != 0); - - /* Delete from the hash table */ - dictDelete(zs->dict,c->argv[2]); - if (htNeedsResize(zs->dict)) dictResize(zs->dict); - if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); - server.dirty++; - addReply(c,shared.cone); -} - -static void zremrangebyscoreCommand(redisClient *c) { - double min; - double max; - long deleted; - robj *zsetobj; - zset *zs; - - if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) || - (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return; - - if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,zsetobj,REDIS_ZSET)) return; - - zs = zsetobj->ptr; - deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict); - if (htNeedsResize(zs->dict)) dictResize(zs->dict); - if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); - server.dirty += deleted; - addReplyLongLong(c,deleted); -} - -static void zremrangebyrankCommand(redisClient *c) { - long start; - long end; - int llen; - long deleted; - robj *zsetobj; - zset *zs; - - if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) || - (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return; - - if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,zsetobj,REDIS_ZSET)) return; - zs = zsetobj->ptr; - llen = zs->zsl->length; - - /* convert negative indexes */ - if (start < 0) 
start = llen+start; - if (end < 0) end = llen+end; - if (start < 0) start = 0; - if (end < 0) end = 0; - - /* indexes sanity checks */ - if (start > end || start >= llen) { - addReply(c,shared.czero); - return; - } - if (end >= llen) end = llen-1; - - /* increment start and end because zsl*Rank functions - * use 1-based rank */ - deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict); - if (htNeedsResize(zs->dict)) dictResize(zs->dict); - if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); - server.dirty += deleted; - addReplyLongLong(c, deleted); -} - -typedef struct { - dict *dict; - double weight; -} zsetopsrc; - -static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) { - zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2; - unsigned long size1, size2; - size1 = d1->dict ? dictSize(d1->dict) : 0; - size2 = d2->dict ? dictSize(d2->dict) : 0; - return size1 - size2; -} - -#define REDIS_AGGR_SUM 1 -#define REDIS_AGGR_MIN 2 -#define REDIS_AGGR_MAX 3 -#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e)) - -inline static void zunionInterAggregate(double *target, double val, int aggregate) { - if (aggregate == REDIS_AGGR_SUM) { - *target = *target + val; - } else if (aggregate == REDIS_AGGR_MIN) { - *target = val < *target ? val : *target; - } else if (aggregate == REDIS_AGGR_MAX) { - *target = val > *target ? 
val : *target; - } else { - /* safety net */ - redisPanic("Unknown ZUNION/INTER aggregate type"); - } -} - -static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { - int i, j, setnum; - int aggregate = REDIS_AGGR_SUM; - zsetopsrc *src; - robj *dstobj; - zset *dstzset; - dictIterator *di; - dictEntry *de; - - /* expect setnum input keys to be given */ - setnum = atoi(c->argv[2]->ptr); - if (setnum < 1) { - addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n")); - return; - } - - /* test if the expected number of keys would overflow */ - if (3+setnum > c->argc) { - addReply(c,shared.syntaxerr); - return; - } - - /* read keys to be used for input */ - src = zmalloc(sizeof(zsetopsrc) * setnum); - for (i = 0, j = 3; i < setnum; i++, j++) { - robj *obj = lookupKeyWrite(c->db,c->argv[j]); - if (!obj) { - src[i].dict = NULL; - } else { - if (obj->type == REDIS_ZSET) { - src[i].dict = ((zset*)obj->ptr)->dict; - } else if (obj->type == REDIS_SET) { - src[i].dict = (obj->ptr); - } else { - zfree(src); - addReply(c,shared.wrongtypeerr); - return; - } - } - - /* default all weights to 1 */ - src[i].weight = 1.0; - } - - /* parse optional extra arguments */ - if (j < c->argc) { - int remaining = c->argc - j; - - while (remaining) { - if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) { - j++; remaining--; - for (i = 0; i < setnum; i++, j++, remaining--) { - if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK) - return; - } - } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) { - j++; remaining--; - if (!strcasecmp(c->argv[j]->ptr,"sum")) { - aggregate = REDIS_AGGR_SUM; - } else if (!strcasecmp(c->argv[j]->ptr,"min")) { - aggregate = REDIS_AGGR_MIN; - } else if (!strcasecmp(c->argv[j]->ptr,"max")) { - aggregate = REDIS_AGGR_MAX; - } else { - zfree(src); - addReply(c,shared.syntaxerr); - return; - } - j++; remaining--; - } else { - zfree(src); - 
addReply(c,shared.syntaxerr); - return; - } - } - } - - /* sort sets from the smallest to largest, this will improve our - * algorithm's performance */ - qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality); - - dstobj = createZsetObject(); - dstzset = dstobj->ptr; - - if (op == REDIS_OP_INTER) { - /* skip going over all entries if the smallest zset is NULL or empty */ - if (src[0].dict && dictSize(src[0].dict) > 0) { - /* precondition: as src[0].dict is non-empty and the zsets are ordered - * from small to large, all src[i > 0].dict are non-empty too */ - di = dictGetIterator(src[0].dict); - while((de = dictNext(di)) != NULL) { - double *score = zmalloc(sizeof(double)), value; - *score = src[0].weight * zunionInterDictValue(de); - - for (j = 1; j < setnum; j++) { - dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de)); - if (other) { - value = src[j].weight * zunionInterDictValue(other); - zunionInterAggregate(score, value, aggregate); - } else { - break; - } - } - - /* skip entry when not present in every source dict */ - if (j != setnum) { - zfree(score); - } else { - robj *o = dictGetEntryKey(de); - dictAdd(dstzset->dict,o,score); - incrRefCount(o); /* added to dictionary */ - zslInsert(dstzset->zsl,*score,o); - incrRefCount(o); /* added to skiplist */ - } - } - dictReleaseIterator(di); - } - } else if (op == REDIS_OP_UNION) { - for (i = 0; i < setnum; i++) { - if (!src[i].dict) continue; - - di = dictGetIterator(src[i].dict); - while((de = dictNext(di)) != NULL) { - /* skip key when already processed */ - if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue; - - double *score = zmalloc(sizeof(double)), value; - *score = src[i].weight * zunionInterDictValue(de); - - /* because the zsets are sorted by size, its only possible - * for sets at larger indices to hold this entry */ - for (j = (i+1); j < setnum; j++) { - dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de)); - if (other) { - value = src[j].weight * 
zunionInterDictValue(other); - zunionInterAggregate(score, value, aggregate); - } - } - - robj *o = dictGetEntryKey(de); - dictAdd(dstzset->dict,o,score); - incrRefCount(o); /* added to dictionary */ - zslInsert(dstzset->zsl,*score,o); - incrRefCount(o); /* added to skiplist */ - } - dictReleaseIterator(di); - } - } else { - /* unknown operator */ - redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION); - } - - dbDelete(c->db,dstkey); - if (dstzset->zsl->length) { - dbAdd(c->db,dstkey,dstobj); - addReplyLongLong(c, dstzset->zsl->length); - server.dirty++; - } else { - decrRefCount(dstobj); - addReply(c, shared.czero); - } - zfree(src); -} - -static void zunionstoreCommand(redisClient *c) { - zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION); -} - -static void zinterstoreCommand(redisClient *c) { - zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER); -} - -static void zrangeGenericCommand(redisClient *c, int reverse) { - robj *o; - long start; - long end; - int withscores = 0; - int llen; - int rangelen, j; - zset *zsetobj; - zskiplist *zsl; - zskiplistNode *ln; - robj *ele; - - if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) || - (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return; - - if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) { - withscores = 1; - } else if (c->argc >= 5) { - addReply(c,shared.syntaxerr); - return; - } - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL - || checkType(c,o,REDIS_ZSET)) return; - zsetobj = o->ptr; - zsl = zsetobj->zsl; - llen = zsl->length; - - /* convert negative indexes */ - if (start < 0) start = llen+start; - if (end < 0) end = llen+end; - if (start < 0) start = 0; - if (end < 0) end = 0; - - /* indexes sanity checks */ - if (start > end || start >= llen) { - /* Out of range start or start > end result in empty list */ - addReply(c,shared.emptymultibulk); - return; - } - if (end >= llen) end = llen-1; - rangelen = 
(end-start)+1; - - /* check if starting point is trivial, before searching - * the element in log(N) time */ - if (reverse) { - ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start); - } else { - ln = start == 0 ? - zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1); - } - - /* Return the result in form of a multi-bulk reply */ - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n", - withscores ? (rangelen*2) : rangelen)); - for (j = 0; j < rangelen; j++) { - ele = ln->obj; - addReplyBulk(c,ele); - if (withscores) - addReplyDouble(c,ln->score); - ln = reverse ? ln->backward : ln->forward[0]; - } -} - -static void zrangeCommand(redisClient *c) { - zrangeGenericCommand(c,0); -} - -static void zrevrangeCommand(redisClient *c) { - zrangeGenericCommand(c,1); -} - -/* This command implements both ZRANGEBYSCORE and ZCOUNT. - * If justcount is non-zero, just the count is returned. */ -static void genericZrangebyscoreCommand(redisClient *c, int justcount) { - robj *o; - double min, max; - int minex = 0, maxex = 0; /* are min or max exclusive? */ - int offset = 0, limit = -1; - int withscores = 0; - int badsyntax = 0; - - /* Parse the min-max interval. If one of the values is prefixed - * by the "(" character, it's considered "open". For instance - * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max - * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */ - if (((char*)c->argv[2]->ptr)[0] == '(') { - min = strtod((char*)c->argv[2]->ptr+1,NULL); - minex = 1; - } else { - min = strtod(c->argv[2]->ptr,NULL); - } - if (((char*)c->argv[3]->ptr)[0] == '(') { - max = strtod((char*)c->argv[3]->ptr+1,NULL); - maxex = 1; - } else { - max = strtod(c->argv[3]->ptr,NULL); - } - - /* Parse "WITHSCORES": note that if the command was called with - * the name ZCOUNT then we are sure that c->argc == 4, so we'll never - * enter the following paths to parse WITHSCORES and LIMIT. 
*/ - if (c->argc == 5 || c->argc == 8) { - if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0) - withscores = 1; - else - badsyntax = 1; - } - if (c->argc != (4 + withscores) && c->argc != (7 + withscores)) - badsyntax = 1; - if (badsyntax) { - addReplySds(c, - sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n")); - return; - } - - /* Parse "LIMIT" */ - if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) { - addReply(c,shared.syntaxerr); - return; - } else if (c->argc == (7 + withscores)) { - offset = atoi(c->argv[5]->ptr); - limit = atoi(c->argv[6]->ptr); - if (offset < 0) offset = 0; - } - - /* Ok, lookup the key and get the range */ - o = lookupKeyRead(c->db,c->argv[1]); - if (o == NULL) { - addReply(c,justcount ? shared.czero : shared.emptymultibulk); - } else { - if (o->type != REDIS_ZSET) { - addReply(c,shared.wrongtypeerr); - } else { - zset *zsetobj = o->ptr; - zskiplist *zsl = zsetobj->zsl; - zskiplistNode *ln; - robj *ele, *lenobj = NULL; - unsigned long rangelen = 0; - - /* Get the first node with the score >= min, or with - * score > min if 'minex' is true. */ - ln = zslFirstWithScore(zsl,min); - while (minex && ln && ln->score == min) ln = ln->forward[0]; - - if (ln == NULL) { - /* No element matching the speciifed interval */ - addReply(c,justcount ? shared.czero : shared.emptymultibulk); - return; - } - - /* We don't know in advance how many matching elements there - * are in the list, so we push this object that will represent - * the multi-bulk length in the output buffer, and will "fix" - * it later */ - if (!justcount) { - lenobj = createObject(REDIS_STRING,NULL); - addReply(c,lenobj); - decrRefCount(lenobj); - } - - while(ln && (maxex ? 
(ln->score < max) : (ln->score <= max))) { - if (offset) { - offset--; - ln = ln->forward[0]; - continue; - } - if (limit == 0) break; - if (!justcount) { - ele = ln->obj; - addReplyBulk(c,ele); - if (withscores) - addReplyDouble(c,ln->score); - } - ln = ln->forward[0]; - rangelen++; - if (limit > 0) limit--; - } - if (justcount) { - addReplyLongLong(c,(long)rangelen); - } else { - lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n", - withscores ? (rangelen*2) : rangelen); - } - } - } -} - -static void zrangebyscoreCommand(redisClient *c) { - genericZrangebyscoreCommand(c,0); -} - -static void zcountCommand(redisClient *c) { - genericZrangebyscoreCommand(c,1); -} - -static void zcardCommand(redisClient *c) { - robj *o; - zset *zs; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,REDIS_ZSET)) return; - - zs = o->ptr; - addReplyUlong(c,zs->zsl->length); -} - -static void zscoreCommand(redisClient *c) { - robj *o; - zset *zs; - dictEntry *de; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,o,REDIS_ZSET)) return; - - zs = o->ptr; - de = dictFind(zs->dict,c->argv[2]); - if (!de) { - addReply(c,shared.nullbulk); - } else { - double *score = dictGetEntryVal(de); - - addReplyDouble(c,*score); - } -} - -static void zrankGenericCommand(redisClient *c, int reverse) { - robj *o; - zset *zs; - zskiplist *zsl; - dictEntry *de; - unsigned long rank; - double *score; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,o,REDIS_ZSET)) return; - - zs = o->ptr; - zsl = zs->zsl; - de = dictFind(zs->dict,c->argv[2]); - if (!de) { - addReply(c,shared.nullbulk); - return; - } - - score = dictGetEntryVal(de); - rank = zslistTypeGetRank(zsl, *score, c->argv[2]); - if (rank) { - if (reverse) { - addReplyLongLong(c, zsl->length - rank); - } else { - addReplyLongLong(c, rank-1); - } - } else { - addReply(c,shared.nullbulk); - } -} - -static void zrankCommand(redisClient *c) { - 
zrankGenericCommand(c, 0); -} - -static void zrevrankCommand(redisClient *c) { - zrankGenericCommand(c, 1); -} - -/* ========================= Hashes utility functions ======================= */ -#define REDIS_HASH_KEY 1 -#define REDIS_HASH_VALUE 2 - -/* Check the length of a number of objects to see if we need to convert a - * zipmap to a real hash. Note that we only check string encoded objects - * as their string length can be queried in constant time. */ -static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) { - int i; - if (subject->encoding != REDIS_ENCODING_ZIPMAP) return; - - for (i = start; i <= end; i++) { - if (argv[i]->encoding == REDIS_ENCODING_RAW && - sdslen(argv[i]->ptr) > server.hash_max_zipmap_value) - { - convertToRealHash(subject); - return; - } - } -} - -/* Encode given objects in-place when the hash uses a dict. */ -static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) { - if (subject->encoding == REDIS_ENCODING_HT) { - if (o1) *o1 = tryObjectEncoding(*o1); - if (o2) *o2 = tryObjectEncoding(*o2); - } -} - -/* Get the value from a hash identified by key. Returns either a string - * object or NULL if the value cannot be found. The refcount of the object - * is always increased by 1 when the value was found. */ -static robj *hashTypeGet(robj *o, robj *key) { - robj *value = NULL; - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - unsigned char *v; - unsigned int vlen; - key = getDecodedObject(key); - if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) { - value = createStringObject((char*)v,vlen); - } - decrRefCount(key); - } else { - dictEntry *de = dictFind(o->ptr,key); - if (de != NULL) { - value = dictGetEntryVal(de); - incrRefCount(value); - } - } - return value; -} - -/* Test if the key exists in the given hash. Returns 1 if the key - * exists and 0 when it doesn't. 
*/ -static int hashTypeExists(robj *o, robj *key) { - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - key = getDecodedObject(key); - if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) { - decrRefCount(key); - return 1; - } - decrRefCount(key); - } else { - if (dictFind(o->ptr,key) != NULL) { - return 1; - } - } - return 0; -} - -/* Add an element, discard the old if the key already exists. - * Return 0 on insert and 1 on update. */ -static int hashTypeSet(robj *o, robj *key, robj *value) { - int update = 0; - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - key = getDecodedObject(key); - value = getDecodedObject(value); - o->ptr = zipmapSet(o->ptr, - key->ptr,sdslen(key->ptr), - value->ptr,sdslen(value->ptr), &update); - decrRefCount(key); - decrRefCount(value); - - /* Check if the zipmap needs to be upgraded to a real hash table */ - if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries) - convertToRealHash(o); - } else { - if (dictReplace(o->ptr,key,value)) { - /* Insert */ - incrRefCount(key); - } else { - /* Update */ - update = 1; - } - incrRefCount(value); - } - return update; -} - -/* Delete an element from a hash. - * Return 1 on deleted and 0 on not found. */ -static int hashTypeDelete(robj *o, robj *key) { - int deleted = 0; - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - key = getDecodedObject(key); - o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted); - decrRefCount(key); - } else { - deleted = dictDelete((dict*)o->ptr,key) == DICT_OK; - /* Always check if the dictionary needs a resize after a delete. */ - if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr); - } - return deleted; -} - -/* Return the number of elements in a hash. */ -static unsigned long hashTypeLength(robj *o) { - return (o->encoding == REDIS_ENCODING_ZIPMAP) ? - zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr); -} - -/* Structure to hold hash iteration abstration. Note that iteration over - * hashes involves both fields and values. 
Because it is possible that - * not both are required, store pointers in the iterator to avoid - * unnecessary memory allocation for fields/values. */ -typedef struct { - int encoding; - unsigned char *zi; - unsigned char *zk, *zv; - unsigned int zklen, zvlen; - - dictIterator *di; - dictEntry *de; -} hashTypeIterator; - -static hashTypeIterator *hashTypeInitIterator(robj *subject) { - hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator)); - hi->encoding = subject->encoding; - if (hi->encoding == REDIS_ENCODING_ZIPMAP) { - hi->zi = zipmapRewind(subject->ptr); - } else if (hi->encoding == REDIS_ENCODING_HT) { - hi->di = dictGetIterator(subject->ptr); - } else { - redisAssert(NULL); - } - return hi; -} - -static void hashTypeReleaseIterator(hashTypeIterator *hi) { - if (hi->encoding == REDIS_ENCODING_HT) { - dictReleaseIterator(hi->di); - } - zfree(hi); -} - -/* Move to the next entry in the hash. Return REDIS_OK when the next entry - * could be found and REDIS_ERR when the iterator reaches the end. */ -static int hashTypeNext(hashTypeIterator *hi) { - if (hi->encoding == REDIS_ENCODING_ZIPMAP) { - if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen, - &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR; - } else { - if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR; - } - return REDIS_OK; -} - -/* Get key or value object at current iteration position. - * This increases the refcount of the field object by 1. 
*/ -static robj *hashTypeCurrent(hashTypeIterator *hi, int what) { - robj *o; - if (hi->encoding == REDIS_ENCODING_ZIPMAP) { - if (what & REDIS_HASH_KEY) { - o = createStringObject((char*)hi->zk,hi->zklen); - } else { - o = createStringObject((char*)hi->zv,hi->zvlen); - } - } else { - if (what & REDIS_HASH_KEY) { - o = dictGetEntryKey(hi->de); - } else { - o = dictGetEntryVal(hi->de); - } - incrRefCount(o); - } - return o; -} - -static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) { - robj *o = lookupKeyWrite(c->db,key); - if (o == NULL) { - o = createHashObject(); - dbAdd(c->db,key,o); - } else { - if (o->type != REDIS_HASH) { - addReply(c,shared.wrongtypeerr); - return NULL; - } - } - return o; -} - -/* ============================= Hash commands ============================== */ -static void hsetCommand(redisClient *c) { - int update; - robj *o; - - if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; - hashTypeTryConversion(o,c->argv,2,3); - hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]); - update = hashTypeSet(o,c->argv[2],c->argv[3]); - addReply(c, update ? 
shared.czero : shared.cone); - server.dirty++; -} - -static void hsetnxCommand(redisClient *c) { - robj *o; - if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; - hashTypeTryConversion(o,c->argv,2,3); - - if (hashTypeExists(o, c->argv[2])) { - addReply(c, shared.czero); - } else { - hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]); - hashTypeSet(o,c->argv[2],c->argv[3]); - addReply(c, shared.cone); - server.dirty++; - } -} - -static void hmsetCommand(redisClient *c) { - int i; - robj *o; - - if ((c->argc % 2) == 1) { - addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n")); - return; - } - - if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; - hashTypeTryConversion(o,c->argv,2,c->argc-1); - for (i = 2; i < c->argc; i += 2) { - hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]); - hashTypeSet(o,c->argv[i],c->argv[i+1]); - } - addReply(c, shared.ok); - server.dirty++; -} - -static void hincrbyCommand(redisClient *c) { - long long value, incr; - robj *o, *current, *new; - - if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return; - if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; - if ((current = hashTypeGet(o,c->argv[2])) != NULL) { - if (getLongLongFromObjectOrReply(c,current,&value, - "hash value is not an integer") != REDIS_OK) { - decrRefCount(current); - return; - } - decrRefCount(current); - } else { - value = 0; - } - - value += incr; - new = createStringObjectFromLongLong(value); - hashTypeTryObjectEncoding(o,&c->argv[2],NULL); - hashTypeSet(o,c->argv[2],new); - decrRefCount(new); - addReplyLongLong(c,value); - server.dirty++; -} - -static void hgetCommand(redisClient *c) { - robj *o, *value; - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || - checkType(c,o,REDIS_HASH)) return; - - if ((value = hashTypeGet(o,c->argv[2])) != NULL) { - addReplyBulk(c,value); - decrRefCount(value); - } else { - addReply(c,shared.nullbulk); - } -} - 
-static void hmgetCommand(redisClient *c) { - int i; - robj *o, *value; - o = lookupKeyRead(c->db,c->argv[1]); - if (o != NULL && o->type != REDIS_HASH) { - addReply(c,shared.wrongtypeerr); - } - - /* Note the check for o != NULL happens inside the loop. This is - * done because objects that cannot be found are considered to be - * an empty hash. The reply should then be a series of NULLs. */ - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2)); - for (i = 2; i < c->argc; i++) { - if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) { - addReplyBulk(c,value); - decrRefCount(value); - } else { - addReply(c,shared.nullbulk); - } - } -} - -static void hdelCommand(redisClient *c) { - robj *o; - if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,REDIS_HASH)) return; - - if (hashTypeDelete(o,c->argv[2])) { - if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]); - addReply(c,shared.cone); - server.dirty++; - } else { - addReply(c,shared.czero); - } -} - -static void hlenCommand(redisClient *c) { - robj *o; - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,REDIS_HASH)) return; - - addReplyUlong(c,hashTypeLength(o)); -} - -static void genericHgetallCommand(redisClient *c, int flags) { - robj *o, *lenobj, *obj; - unsigned long count = 0; - hashTypeIterator *hi; - - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL - || checkType(c,o,REDIS_HASH)) return; - - lenobj = createObject(REDIS_STRING,NULL); - addReply(c,lenobj); - decrRefCount(lenobj); - - hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != REDIS_ERR) { - if (flags & REDIS_HASH_KEY) { - obj = hashTypeCurrent(hi,REDIS_HASH_KEY); - addReplyBulk(c,obj); - decrRefCount(obj); - count++; - } - if (flags & REDIS_HASH_VALUE) { - obj = hashTypeCurrent(hi,REDIS_HASH_VALUE); - addReplyBulk(c,obj); - decrRefCount(obj); - count++; - } - } - hashTypeReleaseIterator(hi); - - lenobj->ptr = 
sdscatprintf(sdsempty(),"*%lu\r\n",count); -} - -static void hkeysCommand(redisClient *c) { - genericHgetallCommand(c,REDIS_HASH_KEY); -} - -static void hvalsCommand(redisClient *c) { - genericHgetallCommand(c,REDIS_HASH_VALUE); -} - -static void hgetallCommand(redisClient *c) { - genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE); -} - -static void hexistsCommand(redisClient *c) { - robj *o; - if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || - checkType(c,o,REDIS_HASH)) return; - - addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero); -} - -static void convertToRealHash(robj *o) { - unsigned char *key, *val, *p, *zm = o->ptr; - unsigned int klen, vlen; - dict *dict = dictCreate(&hashDictType,NULL); - - assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT); - p = zipmapRewind(zm); - while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) { - robj *keyobj, *valobj; - - keyobj = createStringObject((char*)key,klen); - valobj = createStringObject((char*)val,vlen); - keyobj = tryObjectEncoding(keyobj); - valobj = tryObjectEncoding(valobj); - dictAdd(dict,keyobj,valobj); - } - o->encoding = REDIS_ENCODING_HT; - o->ptr = dict; - zfree(zm); -} - -/* ========================= Non type-specific commands ==================== */ - -static void flushdbCommand(redisClient *c) { - server.dirty += dictSize(c->db->dict); - touchWatchedKeysOnFlush(c->db->id); - dictEmpty(c->db->dict); - dictEmpty(c->db->expires); - addReply(c,shared.ok); -} - -static void flushallCommand(redisClient *c) { - touchWatchedKeysOnFlush(-1); - server.dirty += emptyDb(); - addReply(c,shared.ok); - if (server.bgsavechildpid != -1) { - kill(server.bgsavechildpid,SIGKILL); - rdbRemoveTempFile(server.bgsavechildpid); - } - rdbSave(server.dbfilename); - server.dirty++; -} - -static redisSortOperation *createSortOperation(int type, robj *pattern) { - redisSortOperation *so = zmalloc(sizeof(*so)); - so->type = type; - so->pattern = pattern; - return so; -} 
- -/* Return the value associated to the key with a name obtained - * substituting the first occurence of '*' in 'pattern' with 'subst'. - * The returned object will always have its refcount increased by 1 - * when it is non-NULL. */ -static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) { - char *p, *f; - sds spat, ssub; - robj keyobj, fieldobj, *o; - int prefixlen, sublen, postfixlen, fieldlen; - /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */ - struct { - long len; - long free; - char buf[REDIS_SORTKEY_MAX+1]; - } keyname, fieldname; - - /* If the pattern is "#" return the substitution object itself in order - * to implement the "SORT ... GET #" feature. */ - spat = pattern->ptr; - if (spat[0] == '#' && spat[1] == '\0') { - incrRefCount(subst); - return subst; - } - - /* The substitution object may be specially encoded. If so we create - * a decoded object on the fly. Otherwise getDecodedObject will just - * increment the ref count, that we'll decrement later. */ - subst = getDecodedObject(subst); - - ssub = subst->ptr; - if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL; - p = strchr(spat,'*'); - if (!p) { - decrRefCount(subst); - return NULL; - } - - /* Find out if we're dealing with a hash dereference. 
*/ - if ((f = strstr(p+1, "->")) != NULL) { - fieldlen = sdslen(spat)-(f-spat); - /* this also copies \0 character */ - memcpy(fieldname.buf,f+2,fieldlen-1); - fieldname.len = fieldlen-2; - } else { - fieldlen = 0; - } - - prefixlen = p-spat; - sublen = sdslen(ssub); - postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen; - memcpy(keyname.buf,spat,prefixlen); - memcpy(keyname.buf+prefixlen,ssub,sublen); - memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen); - keyname.buf[prefixlen+sublen+postfixlen] = '\0'; - keyname.len = prefixlen+sublen+postfixlen; - decrRefCount(subst); - - /* Lookup substituted key */ - initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2)); - o = lookupKeyRead(db,&keyobj); - if (o == NULL) return NULL; - - if (fieldlen > 0) { - if (o->type != REDIS_HASH || fieldname.len < 1) return NULL; - - /* Retrieve value from hash by the field name. This operation - * already increases the refcount of the returned object. */ - initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2)); - o = hashTypeGet(o, &fieldobj); - } else { - if (o->type != REDIS_STRING) return NULL; - - /* Every object that this function returns needs to have its refcount - * increased. sortCommand decreases it again. */ - incrRefCount(o); - } - - return o; -} - -/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with - * the additional parameter is not standard but a BSD-specific we have to - * pass sorting parameters via the global 'server' structure */ -static int sortCompare(const void *s1, const void *s2) { - const redisSortObject *so1 = s1, *so2 = s2; - int cmp; - - if (!server.sort_alpha) { - /* Numeric sorting. 
Here it's trivial as we precomputed scores */ - if (so1->u.score > so2->u.score) { - cmp = 1; - } else if (so1->u.score < so2->u.score) { - cmp = -1; - } else { - cmp = 0; - } - } else { - /* Alphanumeric sorting */ - if (server.sort_bypattern) { - if (!so1->u.cmpobj || !so2->u.cmpobj) { - /* At least one compare object is NULL */ - if (so1->u.cmpobj == so2->u.cmpobj) - cmp = 0; - else if (so1->u.cmpobj == NULL) - cmp = -1; - else - cmp = 1; - } else { - /* We have both the objects, use strcoll */ - cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr); - } - } else { - /* Compare elements directly. */ - cmp = compareStringObjects(so1->obj,so2->obj); - } - } - return server.sort_desc ? -cmp : cmp; -} - -/* The SORT command is the most complex command in Redis. Warning: this code - * is optimized for speed and a bit less for readability */ -static void sortCommand(redisClient *c) { - list *operations; - unsigned int outputlen = 0; - int desc = 0, alpha = 0; - int limit_start = 0, limit_count = -1, start, end; - int j, dontsort = 0, vectorlen; - int getop = 0; /* GET operation counter */ - robj *sortval, *sortby = NULL, *storekey = NULL; - redisSortObject *vector; /* Resulting vector to sort */ - - /* Lookup the key to sort. It must be of the right types */ - sortval = lookupKeyRead(c->db,c->argv[1]); - if (sortval == NULL) { - addReply(c,shared.emptymultibulk); - return; - } - if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST && - sortval->type != REDIS_ZSET) - { - addReply(c,shared.wrongtypeerr); - return; - } - - /* Create a list of operations to perform for every sorted element. 
- * Operations can be GET/DEL/INCR/DECR */ - operations = listCreate(); - listSetFreeMethod(operations,zfree); - j = 2; - - /* Now we need to protect sortval incrementing its count, in the future - * SORT may have options able to overwrite/delete keys during the sorting - * and the sorted key itself may get destroied */ - incrRefCount(sortval); - - /* The SORT command has an SQL-alike syntax, parse it */ - while(j < c->argc) { - int leftargs = c->argc-j-1; - if (!strcasecmp(c->argv[j]->ptr,"asc")) { - desc = 0; - } else if (!strcasecmp(c->argv[j]->ptr,"desc")) { - desc = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) { - alpha = 1; - } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) { - limit_start = atoi(c->argv[j+1]->ptr); - limit_count = atoi(c->argv[j+2]->ptr); - j+=2; - } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) { - storekey = c->argv[j+1]; - j++; - } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) { - sortby = c->argv[j+1]; - /* If the BY pattern does not contain '*', i.e. it is constant, - * we don't need to sort nor to lookup the weight keys. 
*/ - if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1; - j++; - } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) { - listAddNodeTail(operations,createSortOperation( - REDIS_SORT_GET,c->argv[j+1])); - getop++; - j++; - } else { - decrRefCount(sortval); - listRelease(operations); - addReply(c,shared.syntaxerr); - return; - } - j++; - } - - /* Load the sorting vector with all the objects to sort */ - switch(sortval->type) { - case REDIS_LIST: vectorlen = listTypeLength(sortval); break; - case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break; - case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break; - default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */ - } - vector = zmalloc(sizeof(redisSortObject)*vectorlen); - j = 0; - - if (sortval->type == REDIS_LIST) { - listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL); - listTypeEntry entry; - while(listTypeNext(li,&entry)) { - vector[j].obj = listTypeGet(&entry); - vector[j].u.score = 0; - vector[j].u.cmpobj = NULL; - j++; - } - listTypeReleaseIterator(li); - } else { - dict *set; - dictIterator *di; - dictEntry *setele; - - if (sortval->type == REDIS_SET) { - set = sortval->ptr; - } else { - zset *zs = sortval->ptr; - set = zs->dict; - } - - di = dictGetIterator(set); - while((setele = dictNext(di)) != NULL) { - vector[j].obj = dictGetEntryKey(setele); - vector[j].u.score = 0; - vector[j].u.cmpobj = NULL; - j++; - } - dictReleaseIterator(di); - } - redisAssert(j == vectorlen); - - /* Now it's time to load the right scores in the sorting vector */ - if (dontsort == 0) { - for (j = 0; j < vectorlen; j++) { - robj *byval; - if (sortby) { - /* lookup value to sort by */ - byval = lookupKeyByPattern(c->db,sortby,vector[j].obj); - if (!byval) continue; - } else { - /* use object itself to sort by */ - byval = vector[j].obj; - } - - if (alpha) { - if (sortby) vector[j].u.cmpobj = getDecodedObject(byval); - } else { - if (byval->encoding == 
REDIS_ENCODING_RAW) { - vector[j].u.score = strtod(byval->ptr,NULL); - } else if (byval->encoding == REDIS_ENCODING_INT) { - /* Don't need to decode the object if it's - * integer-encoded (the only encoding supported) so - * far. We can just cast it */ - vector[j].u.score = (long)byval->ptr; - } else { - redisAssert(1 != 1); - } - } - - /* when the object was retrieved using lookupKeyByPattern, - * its refcount needs to be decreased. */ - if (sortby) { - decrRefCount(byval); - } - } - } - - /* We are ready to sort the vector... perform a bit of sanity check - * on the LIMIT option too. We'll use a partial version of quicksort. */ - start = (limit_start < 0) ? 0 : limit_start; - end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1; - if (start >= vectorlen) { - start = vectorlen-1; - end = vectorlen-2; - } - if (end >= vectorlen) end = vectorlen-1; - - if (dontsort == 0) { - server.sort_desc = desc; - server.sort_alpha = alpha; - server.sort_bypattern = sortby ? 1 : 0; - if (sortby && (start != 0 || end != vectorlen-1)) - pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end); - else - qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare); - } - - /* Send command output to the output buffer, performing the specified - * GET/DEL/INCR/DECR operations if any. */ - outputlen = getop ? 
getop*(end-start+1) : end-start+1; - if (storekey == NULL) { - /* STORE option not specified, sent the sorting result to client */ - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen)); - for (j = start; j <= end; j++) { - listNode *ln; - listIter li; - - if (!getop) addReplyBulk(c,vector[j].obj); - listRewind(operations,&li); - while((ln = listNext(&li))) { - redisSortOperation *sop = ln->value; - robj *val = lookupKeyByPattern(c->db,sop->pattern, - vector[j].obj); - - if (sop->type == REDIS_SORT_GET) { - if (!val) { - addReply(c,shared.nullbulk); - } else { - addReplyBulk(c,val); - decrRefCount(val); - } - } else { - redisAssert(sop->type == REDIS_SORT_GET); /* always fails */ - } - } - } - } else { - robj *sobj = createZiplistObject(); - - /* STORE option specified, set the sorting result as a List object */ - for (j = start; j <= end; j++) { - listNode *ln; - listIter li; - - if (!getop) { - listTypePush(sobj,vector[j].obj,REDIS_TAIL); - } else { - listRewind(operations,&li); - while((ln = listNext(&li))) { - redisSortOperation *sop = ln->value; - robj *val = lookupKeyByPattern(c->db,sop->pattern, - vector[j].obj); - - if (sop->type == REDIS_SORT_GET) { - if (!val) val = createStringObject("",0); - - /* listTypePush does an incrRefCount, so we should take care - * care of the incremented refcount caused by either - * lookupKeyByPattern or createStringObject("",0) */ - listTypePush(sobj,val,REDIS_TAIL); - decrRefCount(val); - } else { - /* always fails */ - redisAssert(sop->type == REDIS_SORT_GET); - } - } - } - } - dbReplace(c->db,storekey,sobj); - /* Note: we add 1 because the DB is dirty anyway since even if the - * SORT result is empty a new key is set and maybe the old content - * replaced. 
*/ - server.dirty += 1+outputlen; - addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen)); - } - - /* Cleanup */ - if (sortval->type == REDIS_LIST) - for (j = 0; j < vectorlen; j++) - decrRefCount(vector[j].obj); - decrRefCount(sortval); - listRelease(operations); - for (j = 0; j < vectorlen; j++) { - if (alpha && vector[j].u.cmpobj) - decrRefCount(vector[j].u.cmpobj); - } - zfree(vector); -} - -/* Convert an amount of bytes into a human readable string in the form - * of 100B, 2G, 100M, 4K, and so forth. */ -static void bytesToHuman(char *s, unsigned long long n) { - double d; - - if (n < 1024) { - /* Bytes */ - sprintf(s,"%lluB",n); - return; - } else if (n < (1024*1024)) { - d = (double)n/(1024); - sprintf(s,"%.2fK",d); - } else if (n < (1024LL*1024*1024)) { - d = (double)n/(1024*1024); - sprintf(s,"%.2fM",d); - } else if (n < (1024LL*1024*1024*1024)) { - d = (double)n/(1024LL*1024*1024); - sprintf(s,"%.2fG",d); - } -} - -/* Create the string returned by the INFO command. This is decoupled - * by the INFO command itself as we need to report the same information - * on memory corruption problems. 
*/
static sds genRedisInfoString(void) {
    sds info;
    time_t uptime = time(NULL)-server.stat_starttime;
    int j;
    char hmem[64]; /* human readable form of the used memory */

    bytesToHuman(hmem,zmalloc_used_memory());
    /* General section. The arguments below must stay in the same order as
     * the conversion specifiers of the format string. */
    info = sdscatprintf(sdsempty(),
        "redis_version:%s\r\n"
        "redis_git_sha1:%s\r\n"
        "redis_git_dirty:%d\r\n"
        "arch_bits:%s\r\n"
        "multiplexing_api:%s\r\n"
        "process_id:%ld\r\n"
        "uptime_in_seconds:%ld\r\n"
        "uptime_in_days:%ld\r\n"
        "connected_clients:%d\r\n"
        "connected_slaves:%d\r\n"
        "blocked_clients:%d\r\n"
        "used_memory:%zu\r\n"
        "used_memory_human:%s\r\n"
        "changes_since_last_save:%lld\r\n"
        "bgsave_in_progress:%d\r\n"
        "last_save_time:%ld\r\n"
        "bgrewriteaof_in_progress:%d\r\n"
        "total_connections_received:%lld\r\n"
        "total_commands_processed:%lld\r\n"
        "expired_keys:%lld\r\n"
        "hash_max_zipmap_entries:%zu\r\n"
        "hash_max_zipmap_value:%zu\r\n"
        "pubsub_channels:%ld\r\n"
        "pubsub_patterns:%u\r\n"
        "vm_enabled:%d\r\n"
        "role:%s\r\n"
        ,REDIS_VERSION,
        redisGitSHA1(),
        strtol(redisGitDirty(),NULL,10) > 0,
        (sizeof(long) == 8) ? "64" : "32",
        aeGetApiName(),
        (long) getpid(),
        uptime,
        uptime/(3600*24),
        /* connected_clients excludes the entries of server.slaves */
        listLength(server.clients)-listLength(server.slaves),
        listLength(server.slaves),
        server.blpop_blocked_clients,
        zmalloc_used_memory(),
        hmem,
        server.dirty,
        server.bgsavechildpid != -1,
        server.lastsave,
        server.bgrewritechildpid != -1,
        server.stat_numconnections,
        server.stat_numcommands,
        server.stat_expiredkeys,
        server.hash_max_zipmap_entries,
        server.hash_max_zipmap_value,
        dictSize(server.pubsub_channels),
        listLength(server.pubsub_patterns),
        server.vm_enabled != 0,
        server.masterhost == NULL ? "master" : "slave"
    );
    /* Replication details, only when a master host is configured. */
    if (server.masterhost) {
        info = sdscatprintf(info,
            "master_host:%s\r\n"
            "master_port:%d\r\n"
            "master_link_status:%s\r\n"
            "master_last_io_seconds_ago:%d\r\n"
            ,server.masterhost,
            server.masterport,
            (server.replstate == REDIS_REPL_CONNECTED) ?
                "up" : "down",
            /* -1 when there is no master client object */
            server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
        );
    }
    /* Virtual memory statistics, read between lockThreadedIO() and
     * unlockThreadedIO(). */
    if (server.vm_enabled) {
        lockThreadedIO();
        info = sdscatprintf(info,
            "vm_conf_max_memory:%llu\r\n"
            "vm_conf_page_size:%llu\r\n"
            "vm_conf_pages:%llu\r\n"
            "vm_stats_used_pages:%llu\r\n"
            "vm_stats_swapped_objects:%llu\r\n"
            "vm_stats_swappin_count:%llu\r\n"
            "vm_stats_swappout_count:%llu\r\n"
            "vm_stats_io_newjobs_len:%lu\r\n"
            "vm_stats_io_processing_len:%lu\r\n"
            "vm_stats_io_processed_len:%lu\r\n"
            "vm_stats_io_active_threads:%lu\r\n"
            "vm_stats_blocked_clients:%lu\r\n"
            ,(unsigned long long) server.vm_max_memory,
            (unsigned long long) server.vm_page_size,
            (unsigned long long) server.vm_pages,
            (unsigned long long) server.vm_stats_used_pages,
            (unsigned long long) server.vm_stats_swapped_objects,
            (unsigned long long) server.vm_stats_swapins,
            (unsigned long long) server.vm_stats_swapouts,
            (unsigned long) listLength(server.io_newjobs),
            (unsigned long) listLength(server.io_processing),
            (unsigned long) listLength(server.io_processed),
            (unsigned long) server.io_active_threads,
            (unsigned long) server.vm_blocked_clients
        );
        unlockThreadedIO();
    }
    /* One line per database that holds at least one key or one expire. */
    for (j = 0; j < server.dbnum; j++) {
        long long keys, vkeys;

        keys = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (keys || vkeys) {
            info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
                j, keys, vkeys);
        }
    }
    return info;
}

/* INFO: send the string built by genRedisInfoString() as a single bulk
 * reply ("$<len>\r\n", payload, trailing CRLF). */
static void infoCommand(redisClient *c) {
    sds info = genRedisInfoString();
    addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
        (unsigned long)sdslen(info)));
    addReplySds(c,info);
    addReply(c,shared.crlf);
}

/* MONITOR: flag the client as REDIS_SLAVE|REDIS_MONITOR and append it to
 * server.monitors. */
static void monitorCommand(redisClient *c) {
    /* ignore MONITOR if already slave or in monitor mode */
    if (c->flags & REDIS_SLAVE) return;

    c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
    c->slaveseldb = 0;
    listAddNodeTail(server.monitors,c);
    addReply(c,shared.ok);
}

/* ================================= Expire
================================= */ -static int removeExpire(redisDb *db, robj *key) { - /* An expire may only be removed if there is a corresponding entry in the - * main dict. Otherwise, the key will never be freed. */ - redisAssert(dictFind(db->dict,key->ptr) != NULL); - if (dictDelete(db->expires,key->ptr) == DICT_OK) { - return 1; - } else { - return 0; - } -} - -static int setExpire(redisDb *db, robj *key, time_t when) { - dictEntry *de; - - /* Reuse the sds from the main dict in the expire dict */ - redisAssert((de = dictFind(db->dict,key->ptr)) != NULL); - if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) { - return 0; - } else { - return 1; - } -} - -/* Return the expire time of the specified key, or -1 if no expire - * is associated with this key (i.e. the key is non volatile) */ -static time_t getExpire(redisDb *db, robj *key) { - dictEntry *de; - - /* No expire? return ASAP */ - if (dictSize(db->expires) == 0 || - (de = dictFind(db->expires,key->ptr)) == NULL) return -1; - - /* The entry was found in the expire dict, this means it should also - * be present in the main dict (safety check). 
*/ - redisAssert(dictFind(db->dict,key->ptr) != NULL); - return (time_t) dictGetEntryVal(de); -} - -static int expireIfNeeded(redisDb *db, robj *key) { - time_t when = getExpire(db,key); - if (when < 0) return 0; - - /* Return when this key has not expired */ - if (time(NULL) <= when) return 0; - - /* Delete the key */ - server.stat_expiredkeys++; - server.dirty++; - return dbDelete(db,key); -} - -static int deleteIfVolatile(redisDb *db, robj *key) { - if (getExpire(db,key) < 0) return 0; - - /* Delete the key */ - server.stat_expiredkeys++; - server.dirty++; - return dbDelete(db,key); -} - -static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) { - dictEntry *de; - time_t seconds; - - if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return; - - seconds -= offset; - - de = dictFind(c->db->dict,key->ptr); - if (de == NULL) { - addReply(c,shared.czero); - return; - } - if (seconds <= 0) { - if (dbDelete(c->db,key)) server.dirty++; - addReply(c, shared.cone); - return; - } else { - time_t when = time(NULL)+seconds; - if (setExpire(c->db,key,when)) { - addReply(c,shared.cone); - server.dirty++; - } else { - addReply(c,shared.czero); - } - return; - } -} - -static void expireCommand(redisClient *c) { - expireGenericCommand(c,c->argv[1],c->argv[2],0); -} - -static void expireatCommand(redisClient *c) { - expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL)); -} - -static void ttlCommand(redisClient *c) { - time_t expire; - int ttl = -1; - - expire = getExpire(c->db,c->argv[1]); - if (expire != -1) { - ttl = (int) (expire-time(NULL)); - if (ttl < 0) ttl = -1; - } - addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl)); -} - -/* ================================ MULTI/EXEC ============================== */ - -/* Client state initialization for MULTI/EXEC */ -static void initClientMultiState(redisClient *c) { - c->mstate.commands = NULL; - c->mstate.count = 0; -} - -/* Release all the resources associated with MULTI/EXEC 
state */ -static void freeClientMultiState(redisClient *c) { - int j; - - for (j = 0; j < c->mstate.count; j++) { - int i; - multiCmd *mc = c->mstate.commands+j; - - for (i = 0; i < mc->argc; i++) - decrRefCount(mc->argv[i]); - zfree(mc->argv); - } - zfree(c->mstate.commands); -} - -/* Add a new command into the MULTI commands queue */ -static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) { - multiCmd *mc; - int j; - - c->mstate.commands = zrealloc(c->mstate.commands, - sizeof(multiCmd)*(c->mstate.count+1)); - mc = c->mstate.commands+c->mstate.count; - mc->cmd = cmd; - mc->argc = c->argc; - mc->argv = zmalloc(sizeof(robj*)*c->argc); - memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc); - for (j = 0; j < c->argc; j++) - incrRefCount(mc->argv[j]); - c->mstate.count++; -} - -static void multiCommand(redisClient *c) { - if (c->flags & REDIS_MULTI) { - addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n")); - return; - } - c->flags |= REDIS_MULTI; - addReply(c,shared.ok); -} - -static void discardCommand(redisClient *c) { - if (!(c->flags & REDIS_MULTI)) { - addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n")); - return; - } - - freeClientMultiState(c); - initClientMultiState(c); - c->flags &= (~REDIS_MULTI); - unwatchAllKeys(c); - addReply(c,shared.ok); -} - -/* Send a MULTI command to all the slaves and AOF file. Check the execCommand - * implememntation for more information. 
*/ -static void execCommandReplicateMulti(redisClient *c) { - struct redisCommand *cmd; - robj *multistring = createStringObject("MULTI",5); - - cmd = lookupCommand("multi"); - if (server.appendonly) - feedAppendOnlyFile(cmd,c->db->id,&multistring,1); - if (listLength(server.slaves)) - replicationFeedSlaves(server.slaves,c->db->id,&multistring,1); - decrRefCount(multistring); -} - -static void execCommand(redisClient *c) { - int j; - robj **orig_argv; - int orig_argc; - - if (!(c->flags & REDIS_MULTI)) { - addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n")); - return; - } - - /* Check if we need to abort the EXEC if some WATCHed key was touched. - * A failed EXEC will return a multi bulk nil object. */ - if (c->flags & REDIS_DIRTY_CAS) { - freeClientMultiState(c); - initClientMultiState(c); - c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); - unwatchAllKeys(c); - addReply(c,shared.nullmultibulk); - return; - } - - /* Replicate a MULTI request now that we are sure the block is executed. - * This way we'll deliver the MULTI/..../EXEC block as a whole and - * both the AOF and the replication link will have the same consistency - * and atomicity guarantees. */ - execCommandReplicateMulti(c); - - /* Exec all the queued commands */ - unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */ - orig_argv = c->argv; - orig_argc = c->argc; - addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count)); - for (j = 0; j < c->mstate.count; j++) { - c->argc = c->mstate.commands[j].argc; - c->argv = c->mstate.commands[j].argv; - call(c,c->mstate.commands[j].cmd); - } - c->argv = orig_argv; - c->argc = orig_argc; - freeClientMultiState(c); - initClientMultiState(c); - c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); - /* Make sure the EXEC command is always replicated / AOF, since we - * always send the MULTI command (we can't know beforehand if the - * next operations will contain at least a modification to the DB). 
*/ - server.dirty++; -} - -/* =========================== Blocking Operations ========================= */ - -/* Currently Redis blocking operations support is limited to list POP ops, - * so the current implementation is not fully generic, but it is also not - * completely specific so it will not require a rewrite to support new - * kind of blocking operations in the future. - * - * Still it's important to note that list blocking operations can be already - * used as a notification mechanism in order to implement other blocking - * operations at application level, so there must be a very strong evidence - * of usefulness and generality before new blocking operations are implemented. - * - * This is how the current blocking POP works, we use BLPOP as example: - * - If the user calls BLPOP and the key exists and contains a non empty list - * then LPOP is called instead. So BLPOP is semantically the same as LPOP - * if there is not to block. - * - If instead BLPOP is called and the key does not exists or the list is - * empty we need to block. In order to do so we remove the notification for - * new data to read in the client socket (so that we'll not serve new - * requests if the blocking request is not served). Also we put the client - * in a dictionary (db->blocking_keys) mapping keys to a list of clients - * blocking for this keys. - * - If a PUSH operation against a key with blocked clients waiting is - * performed, we serve the first in the list: basically instead to push - * the new element inside the list we return it to the (first / oldest) - * blocking client, unblock the client, and remove it form the list. - * - * The above comment and the source code should be enough in order to understand - * the implementation and modify / fix it later. 
- */ - -/* Set a client in blocking mode for the specified key, with the specified - * timeout */ -static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) { - dictEntry *de; - list *l; - int j; - - c->blocking_keys = zmalloc(sizeof(robj*)*numkeys); - c->blocking_keys_num = numkeys; - c->blockingto = timeout; - for (j = 0; j < numkeys; j++) { - /* Add the key in the client structure, to map clients -> keys */ - c->blocking_keys[j] = keys[j]; - incrRefCount(keys[j]); - - /* And in the other "side", to map keys -> clients */ - de = dictFind(c->db->blocking_keys,keys[j]); - if (de == NULL) { - int retval; - - /* For every key we take a list of clients blocked for it */ - l = listCreate(); - retval = dictAdd(c->db->blocking_keys,keys[j],l); - incrRefCount(keys[j]); - assert(retval == DICT_OK); - } else { - l = dictGetEntryVal(de); - } - listAddNodeTail(l,c); - } - /* Mark the client as a blocked client */ - c->flags |= REDIS_BLOCKED; - server.blpop_blocked_clients++; -} - -/* Unblock a client that's waiting in a blocking operation such as BLPOP */ -static void unblockClientWaitingData(redisClient *c) { - dictEntry *de; - list *l; - int j; - - assert(c->blocking_keys != NULL); - /* The client may wait for multiple keys, so unblock it for every key. */ - for (j = 0; j < c->blocking_keys_num; j++) { - /* Remove this client from the list of clients waiting for this key. */ - de = dictFind(c->db->blocking_keys,c->blocking_keys[j]); - assert(de != NULL); - l = dictGetEntryVal(de); - listDelNode(l,listSearchKey(l,c)); - /* If the list is empty we need to remove it to avoid wasting memory */ - if (listLength(l) == 0) - dictDelete(c->db->blocking_keys,c->blocking_keys[j]); - decrRefCount(c->blocking_keys[j]); - } - /* Cleanup the client structure */ - zfree(c->blocking_keys); - c->blocking_keys = NULL; - c->flags &= (~REDIS_BLOCKED); - server.blpop_blocked_clients--; - /* We want to process data if there is some command waiting - * in the input buffer. 
Note that this is safe even if - * unblockClientWaitingData() gets called from freeClient() because - * freeClient() will be smart enough to call this function - * *after* c->querybuf was set to NULL. */ - if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c); -} - -/* This should be called from any function PUSHing into lists. - * 'c' is the "pushing client", 'key' is the key it is pushing data against, - * 'ele' is the element pushed. - * - * If the function returns 0 there was no client waiting for a list push - * against this key. - * - * If the function returns 1 there was a client waiting for a list push - * against this key, the element was passed to this client thus it's not - * needed to actually add it to the list and the caller should return asap. */ -static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) { - struct dictEntry *de; - redisClient *receiver; - list *l; - listNode *ln; - - de = dictFind(c->db->blocking_keys,key); - if (de == NULL) return 0; - l = dictGetEntryVal(de); - ln = listFirst(l); - assert(ln != NULL); - receiver = ln->value; - - addReplySds(receiver,sdsnew("*2\r\n")); - addReplyBulk(receiver,key); - addReplyBulk(receiver,ele); - unblockClientWaitingData(receiver); - return 1; -} - -/* Blocking RPOP/LPOP */ -static void blockingPopGenericCommand(redisClient *c, int where) { - robj *o; - time_t timeout; - int j; - - for (j = 1; j < c->argc-1; j++) { - o = lookupKeyWrite(c->db,c->argv[j]); - if (o != NULL) { - if (o->type != REDIS_LIST) { - addReply(c,shared.wrongtypeerr); - return; - } else { - if (listTypeLength(o) != 0) { - /* If the list contains elements fall back to the usual - * non-blocking POP operation */ - robj *argv[2], **orig_argv; - int orig_argc; - - /* We need to alter the command arguments before to call - * popGenericCommand() as the command takes a single key. 
*/ - orig_argv = c->argv; - orig_argc = c->argc; - argv[1] = c->argv[j]; - c->argv = argv; - c->argc = 2; - - /* Also the return value is different, we need to output - * the multi bulk reply header and the key name. The - * "real" command will add the last element (the value) - * for us. If this souds like an hack to you it's just - * because it is... */ - addReplySds(c,sdsnew("*2\r\n")); - addReplyBulk(c,argv[1]); - popGenericCommand(c,where); - - /* Fix the client structure with the original stuff */ - c->argv = orig_argv; - c->argc = orig_argc; - return; - } - } - } - } - /* If the list is empty or the key does not exists we must block */ - timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10); - if (timeout > 0) timeout += time(NULL); - blockForKeys(c,c->argv+1,c->argc-2,timeout); -} - -static void blpopCommand(redisClient *c) { - blockingPopGenericCommand(c,REDIS_HEAD); -} - -static void brpopCommand(redisClient *c) { - blockingPopGenericCommand(c,REDIS_TAIL); -} - -/* =============================== Replication ============================= */ - -static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) { - ssize_t nwritten, ret = size; - time_t start = time(NULL); - - timeout++; - while(size) { - if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) { - nwritten = write(fd,ptr,size); - if (nwritten == -1) return -1; - ptr += nwritten; - size -= nwritten; - } - if ((time(NULL)-start) > timeout) { - errno = ETIMEDOUT; - return -1; - } - } - return ret; -} - -static int syncRead(int fd, char *ptr, ssize_t size, int timeout) { - ssize_t nread, totread = 0; - time_t start = time(NULL); - - timeout++; - while(size) { - if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) { - nread = read(fd,ptr,size); - if (nread == -1) return -1; - ptr += nread; - size -= nread; - totread += nread; - } - if ((time(NULL)-start) > timeout) { - errno = ETIMEDOUT; - return -1; - } - } - return totread; -} - -static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) { - ssize_t 
nread = 0; - - size--; - while(size) { - char c; - - if (syncRead(fd,&c,1,timeout) == -1) return -1; - if (c == '\n') { - *ptr = '\0'; - if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0'; - return nread; - } else { - *ptr++ = c; - *ptr = '\0'; - nread++; - } - } - return nread; -} - -static void syncCommand(redisClient *c) { - /* ignore SYNC if aleady slave or in monitor mode */ - if (c->flags & REDIS_SLAVE) return; - - /* SYNC can't be issued when the server has pending data to send to - * the client about already issued commands. We need a fresh reply - * buffer registering the differences between the BGSAVE and the current - * dataset, so that we can copy to other slaves if needed. */ - if (listLength(c->reply) != 0) { - addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n")); - return; - } - - redisLog(REDIS_NOTICE,"Slave ask for synchronization"); - /* Here we need to check if there is a background saving operation - * in progress, or if it is required to start one */ - if (server.bgsavechildpid != -1) { - /* Ok a background save is in progress. Let's check if it is a good - * one for replication, i.e. if there is another slave that is - * registering differences since the server forked to save */ - redisClient *slave; - listNode *ln; - listIter li; - - listRewind(server.slaves,&li); - while((ln = listNext(&li))) { - slave = ln->value; - if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break; - } - if (ln) { - /* Perfect, the server is already registering differences for - * another slave. Set the right state, and copy the buffer. 
*/ - listRelease(c->reply); - c->reply = listDup(slave->reply); - c->replstate = REDIS_REPL_WAIT_BGSAVE_END; - redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC"); - } else { - /* No way, we need to wait for the next BGSAVE in order to - * register differences */ - c->replstate = REDIS_REPL_WAIT_BGSAVE_START; - redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC"); - } - } else { - /* Ok we don't have a BGSAVE in progress, let's start one */ - redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC"); - if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { - redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE"); - addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n")); - return; - } - c->replstate = REDIS_REPL_WAIT_BGSAVE_END; - } - c->repldbfd = -1; - c->flags |= REDIS_SLAVE; - c->slaveseldb = 0; - listAddNodeTail(server.slaves,c); - return; -} - -static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { - redisClient *slave = privdata; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - char buf[REDIS_IOBUF_LEN]; - ssize_t nwritten, buflen; - - if (slave->repldboff == 0) { - /* Write the bulk write count before to transfer the DB. In theory here - * we don't know how much room there is in the output buffer of the - * socket, but in pratice SO_SNDLOWAT (the minimum count for output - * operations) will never be smaller than the few bytes we need. */ - sds bulkcount; - - bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long) - slave->repldbsize); - if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount)) - { - sdsfree(bulkcount); - freeClient(slave); - return; - } - sdsfree(bulkcount); - } - lseek(slave->repldbfd,slave->repldboff,SEEK_SET); - buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); - if (buflen <= 0) { - redisLog(REDIS_WARNING,"Read error sending DB to slave: %s", - (buflen == 0) ? 
"premature EOF" : strerror(errno)); - freeClient(slave); - return; - } - if ((nwritten = write(fd,buf,buflen)) == -1) { - redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s", - strerror(errno)); - freeClient(slave); - return; - } - slave->repldboff += nwritten; - if (slave->repldboff == slave->repldbsize) { - close(slave->repldbfd); - slave->repldbfd = -1; - aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); - slave->replstate = REDIS_REPL_ONLINE; - if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, - sendReplyToClient, slave) == AE_ERR) { - freeClient(slave); - return; - } - addReplySds(slave,sdsempty()); - redisLog(REDIS_NOTICE,"Synchronization with slave succeeded"); - } -} - -/* This function is called at the end of every backgrond saving. - * The argument bgsaveerr is REDIS_OK if the background saving succeeded - * otherwise REDIS_ERR is passed to the function. - * - * The goal of this function is to handle slaves waiting for a successful - * background saving in order to perform non-blocking synchronization. */ -static void updateSlavesWaitingBgsave(int bgsaveerr) { - listNode *ln; - int startbgsave = 0; - listIter li; - - listRewind(server.slaves,&li); - while((ln = listNext(&li))) { - redisClient *slave = ln->value; - - if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) { - startbgsave = 1; - slave->replstate = REDIS_REPL_WAIT_BGSAVE_END; - } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) { - struct redis_stat buf; - - if (bgsaveerr != REDIS_OK) { - freeClient(slave); - redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error"); - continue; - } - if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 || - redis_fstat(slave->repldbfd,&buf) == -1) { - freeClient(slave); - redisLog(REDIS_WARNING,"SYNC failed. 
Can't open/stat DB after BGSAVE: %s", strerror(errno)); - continue; - } - slave->repldboff = 0; - slave->repldbsize = buf.st_size; - slave->replstate = REDIS_REPL_SEND_BULK; - aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); - if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) { - freeClient(slave); - continue; - } - } - } - if (startbgsave) { - if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { - listIter li; - - listRewind(server.slaves,&li); - redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed"); - while((ln = listNext(&li))) { - redisClient *slave = ln->value; - - if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) - freeClient(slave); - } - } - } -} - -static int syncWithMaster(void) { - char buf[1024], tmpfile[256], authcmd[1024]; - long dumpsize; - int fd = anetTcpConnect(NULL,server.masterhost,server.masterport); - int dfd, maxtries = 5; - - if (fd == -1) { - redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s", - strerror(errno)); - return REDIS_ERR; - } - - /* AUTH with the master if required. */ - if(server.masterauth) { - snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth); - if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) { - close(fd); - redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s", - strerror(errno)); - return REDIS_ERR; - } - /* Read the AUTH result. 
*/ - if (syncReadLine(fd,buf,1024,3600) == -1) { - close(fd); - redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s", - strerror(errno)); - return REDIS_ERR; - } - if (buf[0] != '+') { - close(fd); - redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?"); - return REDIS_ERR; - } - } - - /* Issue the SYNC command */ - if (syncWrite(fd,"SYNC \r\n",7,5) == -1) { - close(fd); - redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s", - strerror(errno)); - return REDIS_ERR; - } - /* Read the bulk write count */ - if (syncReadLine(fd,buf,1024,3600) == -1) { - close(fd); - redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s", - strerror(errno)); - return REDIS_ERR; - } - if (buf[0] != '$') { - close(fd); - redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?"); - return REDIS_ERR; - } - dumpsize = strtol(buf+1,NULL,10); - redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize); - /* Read the bulk write data on a temp file */ - while(maxtries--) { - snprintf(tmpfile,256, - "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid()); - dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); - if (dfd != -1) break; - sleep(1); - } - if (dfd == -1) { - close(fd); - redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno)); - return REDIS_ERR; - } - while(dumpsize) { - int nread, nwritten; - - nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024); - if (nread == -1) { - redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s", - strerror(errno)); - close(fd); - close(dfd); - return REDIS_ERR; - } - nwritten = write(dfd,buf,nread); - if (nwritten == -1) { - redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno)); - close(fd); - close(dfd); - return REDIS_ERR; - } - dumpsize -= nread; - } - close(dfd); - 
if (rename(tmpfile,server.dbfilename) == -1) { - redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno)); - unlink(tmpfile); - close(fd); - return REDIS_ERR; - } - emptyDb(); - if (rdbLoad(server.dbfilename) != REDIS_OK) { - redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk"); - close(fd); - return REDIS_ERR; - } - server.master = createClient(fd); - server.master->flags |= REDIS_MASTER; - server.master->authenticated = 1; - server.replstate = REDIS_REPL_CONNECTED; - return REDIS_OK; -} - -static void slaveofCommand(redisClient *c) { - if (!strcasecmp(c->argv[1]->ptr,"no") && - !strcasecmp(c->argv[2]->ptr,"one")) { - if (server.masterhost) { - sdsfree(server.masterhost); - server.masterhost = NULL; - if (server.master) freeClient(server.master); - server.replstate = REDIS_REPL_NONE; - redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)"); - } - } else { - sdsfree(server.masterhost); - server.masterhost = sdsdup(c->argv[1]->ptr); - server.masterport = atoi(c->argv[2]->ptr); - if (server.master) freeClient(server.master); - server.replstate = REDIS_REPL_CONNECT; - redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)", - server.masterhost, server.masterport); - } - addReply(c,shared.ok); -} - -/* ============================ Maxmemory directive ======================== */ - -/* Try to free one object form the pre-allocated objects free list. - * This is useful under low mem conditions as by default we take 1 million - * free objects allocated. On success REDIS_OK is returned, otherwise - * REDIS_ERR. 
*/ -static int tryFreeOneObjectFromFreelist(void) { - robj *o; - - if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); - if (listLength(server.objfreelist)) { - listNode *head = listFirst(server.objfreelist); - o = listNodeValue(head); - listDelNode(server.objfreelist,head); - if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); - zfree(o); - return REDIS_OK; - } else { - if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); - return REDIS_ERR; - } -} - -/* This function gets called when 'maxmemory' is set on the config file to limit - * the max memory used by the server, and we are out of memory. - * This function will try to, in order: - * - * - Free objects from the free list - * - Try to remove keys with an EXPIRE set - * - * It is not possible to free enough memory to reach used-memory < maxmemory - * the server will start refusing commands that will enlarge even more the - * memory usage. - */ -static void freeMemoryIfNeeded(void) { - while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) { - int j, k, freed = 0; - - if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; - for (j = 0; j < server.dbnum; j++) { - int minttl = -1; - robj *minkey = NULL; - struct dictEntry *de; - - if (dictSize(server.db[j].expires)) { - freed = 1; - /* From a sample of three keys drop the one nearest to - * the natural expire */ - for (k = 0; k < 3; k++) { - time_t t; - - de = dictGetRandomKey(server.db[j].expires); - t = (time_t) dictGetEntryVal(de); - if (minttl == -1 || t < minttl) { - minkey = dictGetEntryKey(de); - minttl = t; - } - } - dbDelete(server.db+j,minkey); - } - } - if (!freed) return; /* nothing to free... */ - } -} - -/* ============================== Append Only file ========================== */ - -/* Called when the user switches from "appendonly yes" to "appendonly no" - * at runtime using the CONFIG command. 
*/ -static void stopAppendOnly(void) { - flushAppendOnlyFile(); - aof_fsync(server.appendfd); - close(server.appendfd); - - server.appendfd = -1; - server.appendseldb = -1; - server.appendonly = 0; - /* rewrite operation in progress? kill it, wait child exit */ - if (server.bgsavechildpid != -1) { - int statloc; - - if (kill(server.bgsavechildpid,SIGKILL) != -1) - wait3(&statloc,0,NULL); - /* reset the buffer accumulating changes while the child saves */ - sdsfree(server.bgrewritebuf); - server.bgrewritebuf = sdsempty(); - server.bgsavechildpid = -1; - } -} - -/* Called when the user switches from "appendonly no" to "appendonly yes" - * at runtime using the CONFIG command. */ -static int startAppendOnly(void) { - server.appendonly = 1; - server.lastfsync = time(NULL); - server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); - if (server.appendfd == -1) { - redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno)); - return REDIS_ERR; - } - if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { - server.appendonly = 0; - close(server.appendfd); - redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno)); - return REDIS_ERR; - } - return REDIS_OK; -} - -/* Write the append only file buffer on disk. - * - * Since we are required to write the AOF before replying to the client, - * and the only way the client socket can get a write is entering when the - * the event loop, we accumulate all the AOF writes in a memory - * buffer and write it on disk using this function just before entering - * the event loop again. */ -static void flushAppendOnlyFile(void) { - time_t now; - ssize_t nwritten; - - if (sdslen(server.aofbuf) == 0) return; - - /* We want to perform a single write. 
This should be guaranteed atomic - * at least if the filesystem we are writing is a real physical one. - * While this will save us against the server being killed I don't think - * there is much to do about the whole server stopping for power problems - * or alike */ - nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); - if (nwritten != (signed)sdslen(server.aofbuf)) { - /* Ooops, we are in troubles. The best thing to do for now is - * aborting instead of giving the illusion that everything is - * working as expected. */ - if (nwritten == -1) { - redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); - } else { - redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); - } - exit(1); - } - sdsfree(server.aofbuf); - server.aofbuf = sdsempty(); - - /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have - * childs performing heavy I/O on disk. */ - if (server.no_appendfsync_on_rewrite && - (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) - return; - /* Fsync if needed */ - now = time(NULL); - if (server.appendfsync == APPENDFSYNC_ALWAYS || - (server.appendfsync == APPENDFSYNC_EVERYSEC && - now-server.lastfsync > 1)) - { - /* aof_fsync is defined as fdatasync() for Linux in order to avoid - * flushing metadata. 
*/ - aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ - server.lastfsync = now; - } -} - -static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { - int j; - buf = sdscatprintf(buf,"*%d\r\n",argc); - for (j = 0; j < argc; j++) { - robj *o = getDecodedObject(argv[j]); - buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); - buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); - buf = sdscatlen(buf,"\r\n",2); - decrRefCount(o); - } - return buf; -} - -static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { - int argc = 3; - long when; - robj *argv[3]; - - /* Make sure we can use strtol */ - seconds = getDecodedObject(seconds); - when = time(NULL)+strtol(seconds->ptr,NULL,10); - decrRefCount(seconds); - - argv[0] = createStringObject("EXPIREAT",8); - argv[1] = key; - argv[2] = createObject(REDIS_STRING, - sdscatprintf(sdsempty(),"%ld",when)); - buf = catAppendOnlyGenericCommand(buf, argc, argv); - decrRefCount(argv[0]); - decrRefCount(argv[2]); - return buf; -} - -static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { - sds buf = sdsempty(); - robj *tmpargv[3]; - - /* The DB this command was targetting is not the same as the last command - * we appendend. To issue a SELECT command is needed. 
*/ - if (dictid != server.appendseldb) { - char seldb[64]; - - snprintf(seldb,sizeof(seldb),"%d",dictid); - buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", - (unsigned long)strlen(seldb),seldb); - server.appendseldb = dictid; - } - - if (cmd->proc == expireCommand) { - /* Translate EXPIRE into EXPIREAT */ - buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); - } else if (cmd->proc == setexCommand) { - /* Translate SETEX to SET and EXPIREAT */ - tmpargv[0] = createStringObject("SET",3); - tmpargv[1] = argv[1]; - tmpargv[2] = argv[3]; - buf = catAppendOnlyGenericCommand(buf,3,tmpargv); - decrRefCount(tmpargv[0]); - buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); - } else { - buf = catAppendOnlyGenericCommand(buf,argc,argv); - } - - /* Append to the AOF buffer. This will be flushed on disk just before - * of re-entering the event loop, so before the client will get a - * positive reply about the operation performed. */ - server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); - - /* If a background append only file rewriting is in progress we want to - * accumulate the differences between the child DB and the current one - * in a buffer, so that when the child process will do its work we - * can append the differences to the new append only file. */ - if (server.bgrewritechildpid != -1) - server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); - - sdsfree(buf); -} - -/* In Redis commands are always executed in the context of a client, so in - * order to load the append only file we need to create a fake client. */ -static struct redisClient *createFakeClient(void) { - struct redisClient *c = zmalloc(sizeof(*c)); - - selectDb(c,0); - c->fd = -1; - c->querybuf = sdsempty(); - c->argc = 0; - c->argv = NULL; - c->flags = 0; - /* We set the fake client as a slave waiting for the synchronization - * so that Redis will not try to send replies to this client. 
*/ - c->replstate = REDIS_REPL_WAIT_BGSAVE_START; - c->reply = listCreate(); - listSetFreeMethod(c->reply,decrRefCount); - listSetDupMethod(c->reply,dupClientReplyValue); - initClientMultiState(c); - return c; -} - -static void freeFakeClient(struct redisClient *c) { - sdsfree(c->querybuf); - listRelease(c->reply); - freeClientMultiState(c); - zfree(c); -} - -/* Replay the append log file. On error REDIS_OK is returned. On non fatal - * error (the append only file is zero-length) REDIS_ERR is returned. On - * fatal error an error message is logged and the program exists. */ -int loadAppendOnlyFile(char *filename) { - struct redisClient *fakeClient; - FILE *fp = fopen(filename,"r"); - struct redis_stat sb; - int appendonly = server.appendonly; - - if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) - return REDIS_ERR; - - if (fp == NULL) { - redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); - exit(1); - } - - /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI - * to the same file we're about to read. 
*/ - server.appendonly = 0; - - fakeClient = createFakeClient(); - while(1) { - int argc, j; - unsigned long len; - robj **argv; - char buf[128]; - sds argsds; - struct redisCommand *cmd; - int force_swapout; - - if (fgets(buf,sizeof(buf),fp) == NULL) { - if (feof(fp)) - break; - else - goto readerr; - } - if (buf[0] != '*') goto fmterr; - argc = atoi(buf+1); - argv = zmalloc(sizeof(robj*)*argc); - for (j = 0; j < argc; j++) { - if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; - if (buf[0] != '$') goto fmterr; - len = strtol(buf+1,NULL,10); - argsds = sdsnewlen(NULL,len); - if (len && fread(argsds,len,1,fp) == 0) goto fmterr; - argv[j] = createObject(REDIS_STRING,argsds); - if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ - } - - /* Command lookup */ - cmd = lookupCommand(argv[0]->ptr); - if (!cmd) { - redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); - exit(1); - } - /* Try object encoding */ - if (cmd->flags & REDIS_CMD_BULK) - argv[argc-1] = tryObjectEncoding(argv[argc-1]); - /* Run the command in the context of a fake client */ - fakeClient->argc = argc; - fakeClient->argv = argv; - cmd->proc(fakeClient); - /* Discard the reply objects list from the fake client */ - while(listLength(fakeClient->reply)) - listDelNode(fakeClient->reply,listFirst(fakeClient->reply)); - /* Clean up, ready for the next command */ - for (j = 0; j < argc; j++) decrRefCount(argv[j]); - zfree(argv); - /* Handle swapping while loading big datasets when VM is on */ - force_swapout = 0; - if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) - force_swapout = 1; - - if (server.vm_enabled && force_swapout) { - while (zmalloc_used_memory() > server.vm_max_memory) { - if (vmSwapOneObjectBlocking() == REDIS_ERR) break; - } - } - } - - /* This point can only be reached when EOF is reached without errors. - * If the client is in the middle of a MULTI/EXEC, log error and quit. 
*/ - if (fakeClient->flags & REDIS_MULTI) goto readerr; - - fclose(fp); - freeFakeClient(fakeClient); - server.appendonly = appendonly; - return REDIS_OK; - -readerr: - if (feof(fp)) { - redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); - } else { - redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); - } - exit(1); -fmterr: - redisLog(REDIS_WARNING,"Bad file format reading the append only file"); - exit(1); -} - -/* Write binary-safe string into a file in the bulkformat - * $\r\n\r\n */ -static int fwriteBulkString(FILE *fp, char *s, unsigned long len) { - char cbuf[128]; - int clen; - cbuf[0] = '$'; - clen = 1+ll2string(cbuf+1,sizeof(cbuf)-1,len); - cbuf[clen++] = '\r'; - cbuf[clen++] = '\n'; - if (fwrite(cbuf,clen,1,fp) == 0) return 0; - if (len > 0 && fwrite(s,len,1,fp) == 0) return 0; - if (fwrite("\r\n",2,1,fp) == 0) return 0; - return 1; -} - -/* Write a double value in bulk format $\r\n\r\n */ -static int fwriteBulkDouble(FILE *fp, double d) { - char buf[128], dbuf[128]; - - snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d); - snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2); - if (fwrite(buf,strlen(buf),1,fp) == 0) return 0; - if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0; - return 1; -} - -/* Write a long value in bulk format $\r\n\r\n */ -static int fwriteBulkLongLong(FILE *fp, long long l) { - char bbuf[128], lbuf[128]; - unsigned int blen, llen; - llen = ll2string(lbuf,32,l); - blen = snprintf(bbuf,sizeof(bbuf),"$%u\r\n%s\r\n",llen,lbuf); - if (fwrite(bbuf,blen,1,fp) == 0) return 0; - return 1; -} - -/* Delegate writing an object to writing a bulk string or bulk long long. */ -static int fwriteBulkObject(FILE *fp, robj *obj) { - /* Avoid using getDecodedObject to help copy-on-write (we are often - * in a child process when this function is called). 
*/ - if (obj->encoding == REDIS_ENCODING_INT) { - return fwriteBulkLongLong(fp,(long)obj->ptr); - } else if (obj->encoding == REDIS_ENCODING_RAW) { - return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr)); - } else { - redisPanic("Unknown string encoding"); - } -} - -/* Write a sequence of commands able to fully rebuild the dataset into - * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */ -static int rewriteAppendOnlyFile(char *filename) { - dictIterator *di = NULL; - dictEntry *de; - FILE *fp; - char tmpfile[256]; - int j; - time_t now = time(NULL); - - /* Note that we have to use a different temp name here compared to the - * one used by rewriteAppendOnlyFileBackground() function. */ - snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); - fp = fopen(tmpfile,"w"); - if (!fp) { - redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); - return REDIS_ERR; - } - for (j = 0; j < server.dbnum; j++) { - char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; - redisDb *db = server.db+j; - dict *d = db->dict; - if (dictSize(d) == 0) continue; - di = dictGetIterator(d); - if (!di) { - fclose(fp); - return REDIS_ERR; - } - - /* SELECT the new DB */ - if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkLongLong(fp,j) == 0) goto werr; - - /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { - sds keystr = dictGetEntryKey(de); - robj key, *o; - time_t expiretime; - int swapped; - - keystr = dictGetEntryKey(de); - o = dictGetEntryVal(de); - initStaticStringObject(key,keystr); - /* If the value for this key is swapped, load a preview in memory. 
- * We use a "swapped" flag to remember if we need to free the - * value object instead to just increment the ref count anyway - * in order to avoid copy-on-write of pages if we are forked() */ - if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || - o->storage == REDIS_VM_SWAPPING) { - swapped = 0; - } else { - o = vmPreviewObject(o); - swapped = 1; - } - expiretime = getExpire(db,&key); - - /* Save the key and associated value */ - if (o->type == REDIS_STRING) { - /* Emit a SET command */ - char cmd[]="*3\r\n$3\r\nSET\r\n"; - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - /* Key and value */ - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkObject(fp,o) == 0) goto werr; - } else if (o->type == REDIS_LIST) { - /* Emit the RPUSHes needed to rebuild the list */ - char cmd[]="*3\r\n$5\r\nRPUSH\r\n"; - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *zl = o->ptr; - unsigned char *p = ziplistIndex(zl,0); - unsigned char *vstr; - unsigned int vlen; - long long vlong; - - while(ziplistGet(p,&vstr,&vlen,&vlong)) { - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (vstr) { - if (fwriteBulkString(fp,(char*)vstr,vlen) == 0) - goto werr; - } else { - if (fwriteBulkLongLong(fp,vlong) == 0) - goto werr; - } - p = ziplistNext(zl,p); - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = o->ptr; - listNode *ln; - listIter li; - - listRewind(list,&li); - while((ln = listNext(&li))) { - robj *eleobj = listNodeValue(ln); - - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkObject(fp,eleobj) == 0) goto werr; - } - } else { - redisPanic("Unknown list encoding"); - } - } else if (o->type == REDIS_SET) { - /* Emit the SADDs needed to rebuild the set */ - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - char 
cmd[]="*3\r\n$4\r\nSADD\r\n"; - robj *eleobj = dictGetEntryKey(de); - - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkObject(fp,eleobj) == 0) goto werr; - } - dictReleaseIterator(di); - } else if (o->type == REDIS_ZSET) { - /* Emit the ZADDs needed to rebuild the sorted set */ - zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - char cmd[]="*4\r\n$4\r\nZADD\r\n"; - robj *eleobj = dictGetEntryKey(de); - double *score = dictGetEntryVal(de); - - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkDouble(fp,*score) == 0) goto werr; - if (fwriteBulkObject(fp,eleobj) == 0) goto werr; - } - dictReleaseIterator(di); - } else if (o->type == REDIS_HASH) { - char cmd[]="*4\r\n$4\r\nHSET\r\n"; - - /* Emit the HSETs needed to rebuild the hash */ - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - unsigned char *p = zipmapRewind(o->ptr); - unsigned char *field, *val; - unsigned int flen, vlen; - - while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) { - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkString(fp,(char*)field,flen) == -1) - return -1; - if (fwriteBulkString(fp,(char*)val,vlen) == -1) - return -1; - } - } else { - dictIterator *di = dictGetIterator(o->ptr); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - robj *field = dictGetEntryKey(de); - robj *val = dictGetEntryVal(de); - - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkObject(fp,field) == -1) return -1; - if (fwriteBulkObject(fp,val) == -1) return -1; - } - dictReleaseIterator(di); - } - } else { - redisPanic("Unknown object type"); - } - /* Save the expire time */ - if (expiretime != -1) { - char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n"; - /* If this key is 
already expired skip it */ - if (expiretime < now) continue; - if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; - if (fwriteBulkObject(fp,&key) == 0) goto werr; - if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr; - } - if (swapped) decrRefCount(o); - } - dictReleaseIterator(di); - } - - /* Make sure data will not remain on the OS's output buffers */ - fflush(fp); - aof_fsync(fileno(fp)); - fclose(fp); - - /* Use RENAME to make sure the DB file is changed atomically only - * if the generate DB file is ok. */ - if (rename(tmpfile,filename) == -1) { - redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); - unlink(tmpfile); - return REDIS_ERR; - } - redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); - return REDIS_OK; - -werr: - fclose(fp); - unlink(tmpfile); - redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); - if (di) dictReleaseIterator(di); - return REDIS_ERR; -} - -/* This is how rewriting of the append only file in background works: - * - * 1) The user calls BGREWRITEAOF - * 2) Redis calls this function, that forks(): - * 2a) the child rewrite the append only file in a temp file. - * 2b) the parent accumulates differences in server.bgrewritebuf. - * 3) When the child finished '2a' exists. - * 4) The parent will trap the exit code, if it's OK, will append the - * data accumulated into server.bgrewritebuf into the temp file, and - * finally will rename(2) the temp file in the actual file name. - * The the new file is reopened as the new append only file. Profit! 
- */ -static int rewriteAppendOnlyFileBackground(void) { - pid_t childpid; - - if (server.bgrewritechildpid != -1) return REDIS_ERR; - if (server.vm_enabled) waitEmptyIOJobsQueue(); - if ((childpid = fork()) == 0) { - /* Child */ - char tmpfile[256]; - - if (server.vm_enabled) vmReopenSwapFile(); - close(server.fd); - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); - if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { - _exit(0); - } else { - _exit(1); - } - } else { - /* Parent */ - if (childpid == -1) { - redisLog(REDIS_WARNING, - "Can't rewrite append only file in background: fork: %s", - strerror(errno)); - return REDIS_ERR; - } - redisLog(REDIS_NOTICE, - "Background append only file rewriting started by pid %d",childpid); - server.bgrewritechildpid = childpid; - updateDictResizePolicy(); - /* We set appendseldb to -1 in order to force the next call to the - * feedAppendOnlyFile() to issue a SELECT command, so the differences - * accumulated by the parent into server.bgrewritebuf will start - * with a SELECT statement and it will be safe to merge. 
*/ - server.appendseldb = -1; - return REDIS_OK; - } - return REDIS_OK; /* unreached */ -} - -static void bgrewriteaofCommand(redisClient *c) { - if (server.bgrewritechildpid != -1) { - addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n")); - return; - } - if (rewriteAppendOnlyFileBackground() == REDIS_OK) { - char *status = "+Background append only file rewriting started\r\n"; - addReplySds(c,sdsnew(status)); - } else { - addReply(c,shared.err); - } -} - -static void aofRemoveTempFile(pid_t childpid) { - char tmpfile[256]; - - snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid); - unlink(tmpfile); -} - -/* Virtual Memory is composed mainly of two subsystems: - * - Blocking Virutal Memory - * - Threaded Virtual Memory I/O - * The two parts are not fully decoupled, but functions are split among two - * different sections of the source code (delimited by comments) in order to - * make more clear what functionality is about the blocking VM and what about - * the threaded (not blocking) VM. - * - * Redis VM design: - * - * Redis VM is a blocking VM (one that blocks reading swapped values from - * disk into memory when a value swapped out is needed in memory) that is made - * unblocking by trying to examine the command argument vector in order to - * load in background values that will likely be needed in order to exec - * the command. The command is executed only once all the relevant keys - * are loaded into memory. - * - * This basically is almost as simple of a blocking VM, but almost as parallel - * as a fully non-blocking VM. - */ - -/* =================== Virtual Memory - Blocking Side ====================== */ - -/* Create a VM pointer object. This kind of objects are used in place of - * values in the key -> value hash table, for swapped out objects. 
*/ -static vmpointer *createVmPointer(int vtype) { - vmpointer *vp = zmalloc(sizeof(vmpointer)); - - vp->type = REDIS_VMPOINTER; - vp->storage = REDIS_VM_SWAPPED; - vp->vtype = vtype; - return vp; -} - -static void vmInit(void) { - off_t totsize; - int pipefds[2]; - size_t stacksize; - struct flock fl; - - if (server.vm_max_threads != 0) - zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */ - - redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file); - /* Try to open the old swap file, otherwise create it */ - if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) { - server.vm_fp = fopen(server.vm_swap_file,"w+b"); - } - if (server.vm_fp == NULL) { - redisLog(REDIS_WARNING, - "Can't open the swap file: %s. Exiting.", - strerror(errno)); - exit(1); - } - server.vm_fd = fileno(server.vm_fp); - /* Lock the swap file for writing, this is useful in order to avoid - * another instance to use the same swap file for a config error. */ - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_start = fl.l_len = 0; - if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) { - redisLog(REDIS_WARNING, - "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno)); - exit(1); - } - /* Initialize */ - server.vm_next_page = 0; - server.vm_near_pages = 0; - server.vm_stats_used_pages = 0; - server.vm_stats_swapped_objects = 0; - server.vm_stats_swapouts = 0; - server.vm_stats_swapins = 0; - totsize = server.vm_pages*server.vm_page_size; - redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize); - if (ftruncate(server.vm_fd,totsize) == -1) { - redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. 
Exiting.", - strerror(errno)); - exit(1); - } else { - redisLog(REDIS_NOTICE,"Swap file allocated with success"); - } - server.vm_bitmap = zmalloc((server.vm_pages+7)/8); - redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages", - (long long) (server.vm_pages+7)/8, server.vm_pages); - memset(server.vm_bitmap,0,(server.vm_pages+7)/8); - - /* Initialize threaded I/O (used by Virtual Memory) */ - server.io_newjobs = listCreate(); - server.io_processing = listCreate(); - server.io_processed = listCreate(); - server.io_ready_clients = listCreate(); - pthread_mutex_init(&server.io_mutex,NULL); - pthread_mutex_init(&server.obj_freelist_mutex,NULL); - pthread_mutex_init(&server.io_swapfile_mutex,NULL); - server.io_active_threads = 0; - if (pipe(pipefds) == -1) { - redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting." - ,strerror(errno)); - exit(1); - } - server.io_ready_pipe_read = pipefds[0]; - server.io_ready_pipe_write = pipefds[1]; - redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR); - /* LZF requires a lot of stack */ - pthread_attr_init(&server.io_threads_attr); - pthread_attr_getstacksize(&server.io_threads_attr, &stacksize); - while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2; - pthread_attr_setstacksize(&server.io_threads_attr, stacksize); - /* Listen for events in the threaded I/O pipe */ - if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE, - vmThreadedIOCompletedJob, NULL) == AE_ERR) - oom("creating file event"); -} - -/* Mark the page as used */ -static void vmMarkPageUsed(off_t page) { - off_t byte = page/8; - int bit = page&7; - redisAssert(vmFreePage(page) == 1); - server.vm_bitmap[byte] |= 1<= server.vm_pages) { - this -= server.vm_pages; - if (this == 0) { - /* Just overflowed, what we found on tail is no longer - * interesting, as it's no longer contiguous. 
*/ - numfree = 0; - } - } - if (vmFreePage(this)) { - /* This is a free page */ - numfree++; - /* Already got N free pages? Return to the caller, with success */ - if (numfree == n) { - *first = this-(n-1); - server.vm_next_page = this+1; - redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first); - return REDIS_OK; - } - } else { - /* The current one is not a free page */ - numfree = 0; - } - - /* Fast-forward if the current page is not free and we already - * searched enough near this place. */ - since_jump++; - if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) { - offset += random() % REDIS_VM_MAX_RANDOM_JUMP; - since_jump = 0; - /* Note that even if we rewind after the jump, we are don't need - * to make sure numfree is set to zero as we only jump *if* it - * is set to zero. */ - } else { - /* Otherwise just check the next page */ - offset++; - } - } - return REDIS_ERR; -} - -/* Write the specified object at the specified page of the swap file */ -static int vmWriteObjectOnSwap(robj *o, off_t page) { - if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex); - if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) { - if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); - redisLog(REDIS_WARNING, - "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s", - strerror(errno)); - return REDIS_ERR; - } - rdbSaveObject(server.vm_fp,o); - fflush(server.vm_fp); - if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); - return REDIS_OK; -} - -/* Transfers the 'val' object to disk. Store all the information - * a 'vmpointer' object containing all the information needed to load the - * object back later is returned. - * - * If we can't find enough contiguous empty pages to swap the object on disk - * NULL is returned. 
*/ -static vmpointer *vmSwapObjectBlocking(robj *val) { - off_t pages = rdbSavedObjectPages(val,NULL); - off_t page; - vmpointer *vp; - - assert(val->storage == REDIS_VM_MEMORY); - assert(val->refcount == 1); - if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL; - if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL; - - vp = createVmPointer(val->type); - vp->page = page; - vp->usedpages = pages; - decrRefCount(val); /* Deallocate the object from memory. */ - vmMarkPagesUsed(page,pages); - redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)", - (void*) val, - (unsigned long long) page, (unsigned long long) pages); - server.vm_stats_swapped_objects++; - server.vm_stats_swapouts++; - return vp; -} - -static robj *vmReadObjectFromSwap(off_t page, int type) { - robj *o; - - if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex); - if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) { - redisLog(REDIS_WARNING, - "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s", - strerror(errno)); - _exit(1); - } - o = rdbLoadObject(type,server.vm_fp); - if (o == NULL) { - redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno)); - _exit(1); - } - if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); - return o; -} - -/* Load the specified object from swap to memory. - * The newly allocated object is returned. - * - * If preview is true the unserialized object is returned to the caller but - * the pages are not marked as freed, nor the vp object is freed. 
*/ -static robj *vmGenericLoadObject(vmpointer *vp, int preview) { - robj *val; - - redisAssert(vp->type == REDIS_VMPOINTER && - (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING)); - val = vmReadObjectFromSwap(vp->page,vp->vtype); - if (!preview) { - redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp); - vmMarkPagesFree(vp->page,vp->usedpages); - zfree(vp); - server.vm_stats_swapped_objects--; - } else { - redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp); - } - server.vm_stats_swapins++; - return val; -} - -/* Plain object loading, from swap to memory. - * - * 'o' is actually a redisVmPointer structure that will be freed by the call. - * The return value is the loaded object. */ -static robj *vmLoadObject(robj *o) { - /* If we are loading the object in background, stop it, we - * need to load this object synchronously ASAP. */ - if (o->storage == REDIS_VM_LOADING) - vmCancelThreadedIOJob(o); - return vmGenericLoadObject((vmpointer*)o,0); -} - -/* Just load the value on disk, without to modify the key. - * This is useful when we want to perform some operation on the value - * without to really bring it from swap to memory, like while saving the - * dataset or rewriting the append only log. */ -static robj *vmPreviewObject(robj *o) { - return vmGenericLoadObject((vmpointer*)o,1); -} - -/* How a good candidate is this object for swapping? - * The better candidate it is, the greater the returned value. - * - * Currently we try to perform a fast estimation of the object size in - * memory, and combine it with aging informations. - * - * Basically swappability = idle-time * log(estimated size) - * - * Bigger objects are preferred over smaller objects, but not - * proportionally, this is why we use the logarithm. This algorithm is - * just a first try and will probably be tuned later. */ -static double computeObjectSwappability(robj *o) { - /* actual age can be >= minage, but not < minage. 
As we use wrapping - * 21 bit clocks with minutes resolution for the LRU. */ - time_t minage = abs(server.lruclock - o->lru); - long asize = 0, elesize; - robj *ele; - list *l; - listNode *ln; - dict *d; - struct dictEntry *de; - int z; - - if (minage <= 0) return 0; - switch(o->type) { - case REDIS_STRING: - if (o->encoding != REDIS_ENCODING_RAW) { - asize = sizeof(*o); - } else { - asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2; - } - break; - case REDIS_LIST: - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - asize = sizeof(*o)+ziplistSize(o->ptr); - } else { - l = o->ptr; - ln = listFirst(l); - asize = sizeof(list); - if (ln) { - ele = ln->value; - elesize = (ele->encoding == REDIS_ENCODING_RAW) ? - (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); - asize += (sizeof(listNode)+elesize)*listLength(l); - } - } - break; - case REDIS_SET: - case REDIS_ZSET: - z = (o->type == REDIS_ZSET); - d = z ? ((zset*)o->ptr)->dict : o->ptr; - - asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); - if (z) asize += sizeof(zset)-sizeof(dict); - if (dictSize(d)) { - de = dictGetRandomKey(d); - ele = dictGetEntryKey(de); - elesize = (ele->encoding == REDIS_ENCODING_RAW) ? - (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); - asize += (sizeof(struct dictEntry)+elesize)*dictSize(d); - if (z) asize += sizeof(zskiplistNode)*dictSize(d); - } - break; - case REDIS_HASH: - if (o->encoding == REDIS_ENCODING_ZIPMAP) { - unsigned char *p = zipmapRewind((unsigned char*)o->ptr); - unsigned int len = zipmapLen((unsigned char*)o->ptr); - unsigned int klen, vlen; - unsigned char *key, *val; - - if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) { - klen = 0; - vlen = 0; - } - asize = len*(klen+vlen+3); - } else if (o->encoding == REDIS_ENCODING_HT) { - d = o->ptr; - asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); - if (dictSize(d)) { - de = dictGetRandomKey(d); - ele = dictGetEntryKey(de); - elesize = (ele->encoding == REDIS_ENCODING_RAW) ? 
- (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); - ele = dictGetEntryVal(de); - elesize = (ele->encoding == REDIS_ENCODING_RAW) ? - (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); - asize += (sizeof(struct dictEntry)+elesize)*dictSize(d); - } - } - break; - } - return (double)minage*log(1+asize); -} - -/* Try to swap an object that's a good candidate for swapping. - * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible - * to swap any object at all. - * - * If 'usethreaded' is true, Redis will try to swap the object in background - * using I/O threads. */ -static int vmSwapOneObject(int usethreads) { - int j, i; - struct dictEntry *best = NULL; - double best_swappability = 0; - redisDb *best_db = NULL; - robj *val; - sds key; - - for (j = 0; j < server.dbnum; j++) { - redisDb *db = server.db+j; - /* Why maxtries is set to 100? - * Because this way (usually) we'll find 1 object even if just 1% - 2% - * are swappable objects */ - int maxtries = 100; - - if (dictSize(db->dict) == 0) continue; - for (i = 0; i < 5; i++) { - dictEntry *de; - double swappability; - - if (maxtries) maxtries--; - de = dictGetRandomKey(db->dict); - val = dictGetEntryVal(de); - /* Only swap objects that are currently in memory. - * - * Also don't swap shared objects: not a good idea in general and - * we need to ensure that the main thread does not touch the - * object while the I/O thread is using it, but we can't - * control other keys without adding additional mutex. 
*/ - if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) { - if (maxtries) i--; /* don't count this try */ - continue; - } - swappability = computeObjectSwappability(val); - if (!best || swappability > best_swappability) { - best = de; - best_swappability = swappability; - best_db = db; - } - } - } - if (best == NULL) return REDIS_ERR; - key = dictGetEntryKey(best); - val = dictGetEntryVal(best); - - redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f", - key, best_swappability); - - /* Swap it */ - if (usethreads) { - robj *keyobj = createStringObject(key,sdslen(key)); - vmSwapObjectThreaded(keyobj,val,best_db); - decrRefCount(keyobj); - return REDIS_OK; - } else { - vmpointer *vp; - - if ((vp = vmSwapObjectBlocking(val)) != NULL) { - dictGetEntryVal(best) = vp; - return REDIS_OK; - } else { - return REDIS_ERR; - } - } -} - -static int vmSwapOneObjectBlocking() { - return vmSwapOneObject(0); -} - -static int vmSwapOneObjectThreaded() { - return vmSwapOneObject(1); -} - -/* Return true if it's safe to swap out objects in a given moment. - * Basically we don't want to swap objects out while there is a BGSAVE - * or a BGAEOREWRITE running in backgroud. */ -static int vmCanSwapOut(void) { - return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1); -} - -/* =================== Virtual Memory - Threaded I/O ======================= */ - -static void freeIOJob(iojob *j) { - if ((j->type == REDIS_IOJOB_PREPARE_SWAP || - j->type == REDIS_IOJOB_DO_SWAP || - j->type == REDIS_IOJOB_LOAD) && j->val != NULL) - { - /* we fix the storage type, otherwise decrRefCount() will try to - * kill the I/O thread Job (that does no longer exists). */ - if (j->val->storage == REDIS_VM_SWAPPING) - j->val->storage = REDIS_VM_MEMORY; - decrRefCount(j->val); - } - decrRefCount(j->key); - zfree(j); -} - -/* Every time a thread finished a Job, it writes a byte into the write side - * of an unix pipe in order to "awake" the main thread, and this function - * is called. 
*/ -static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, - int mask) -{ - char buf[1]; - int retval, processed = 0, toprocess = -1, trytoswap = 1; - REDIS_NOTUSED(el); - REDIS_NOTUSED(mask); - REDIS_NOTUSED(privdata); - - /* For every byte we read in the read side of the pipe, there is one - * I/O job completed to process. */ - while((retval = read(fd,buf,1)) == 1) { - iojob *j; - listNode *ln; - struct dictEntry *de; - - redisLog(REDIS_DEBUG,"Processing I/O completed job"); - - /* Get the processed element (the oldest one) */ - lockThreadedIO(); - assert(listLength(server.io_processed) != 0); - if (toprocess == -1) { - toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100; - if (toprocess <= 0) toprocess = 1; - } - ln = listFirst(server.io_processed); - j = ln->value; - listDelNode(server.io_processed,ln); - unlockThreadedIO(); - /* If this job is marked as canceled, just ignore it */ - if (j->canceled) { - freeIOJob(j); - continue; - } - /* Post process it in the main thread, as there are things we - * can do just here to avoid race conditions and/or invasive locks */ - redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr); - de = dictFind(j->db->dict,j->key->ptr); - redisAssert(de != NULL); - if (j->type == REDIS_IOJOB_LOAD) { - redisDb *db; - vmpointer *vp = dictGetEntryVal(de); - - /* Key loaded, bring it at home */ - vmMarkPagesFree(vp->page,vp->usedpages); - redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)", - (unsigned char*) j->key->ptr); - server.vm_stats_swapped_objects--; - server.vm_stats_swapins++; - dictGetEntryVal(de) = j->val; - incrRefCount(j->val); - db = j->db; - /* Handle clients waiting for this key to be loaded. */ - handleClientsBlockedOnSwappedKey(db,j->key); - freeIOJob(j); - zfree(vp); - } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { - /* Now we know the amount of pages required to swap this object. 
- * Let's find some space for it, and queue this task again - * rebranded as REDIS_IOJOB_DO_SWAP. */ - if (!vmCanSwapOut() || - vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) - { - /* Ooops... no space or we can't swap as there is - * a fork()ed Redis trying to save stuff on disk. */ - j->val->storage = REDIS_VM_MEMORY; /* undo operation */ - freeIOJob(j); - } else { - /* Note that we need to mark this pages as used now, - * if the job will be canceled, we'll mark them as freed - * again. */ - vmMarkPagesUsed(j->page,j->pages); - j->type = REDIS_IOJOB_DO_SWAP; - lockThreadedIO(); - queueIOJob(j); - unlockThreadedIO(); - } - } else if (j->type == REDIS_IOJOB_DO_SWAP) { - vmpointer *vp; - - /* Key swapped. We can finally free some memory. */ - if (j->val->storage != REDIS_VM_SWAPPING) { - vmpointer *vp = (vmpointer*) j->id; - printf("storage: %d\n",vp->storage); - printf("key->name: %s\n",(char*)j->key->ptr); - printf("val: %p\n",(void*)j->val); - printf("val->type: %d\n",j->val->type); - printf("val->ptr: %s\n",(char*)j->val->ptr); - } - redisAssert(j->val->storage == REDIS_VM_SWAPPING); - vp = createVmPointer(j->val->type); - vp->page = j->page; - vp->usedpages = j->pages; - dictGetEntryVal(de) = vp; - /* Fix the storage otherwise decrRefCount will attempt to - * remove the associated I/O job */ - j->val->storage = REDIS_VM_MEMORY; - decrRefCount(j->val); - redisLog(REDIS_DEBUG, - "VM: object %s swapped out at %lld (%lld pages) (threaded)", - (unsigned char*) j->key->ptr, - (unsigned long long) j->page, (unsigned long long) j->pages); - server.vm_stats_swapped_objects++; - server.vm_stats_swapouts++; - freeIOJob(j); - /* Put a few more swap requests in queue if we are still - * out of memory */ - if (trytoswap && vmCanSwapOut() && - zmalloc_used_memory() > server.vm_max_memory) - { - int more = 1; - while(more) { - lockThreadedIO(); - more = listLength(server.io_newjobs) < - (unsigned) server.vm_max_threads; - unlockThreadedIO(); - /* Don't waste CPU time 
if swappable objects are rare. */ - if (vmSwapOneObjectThreaded() == REDIS_ERR) { - trytoswap = 0; - break; - } - } - } - } - processed++; - if (processed == toprocess) return; - } - if (retval < 0 && errno != EAGAIN) { - redisLog(REDIS_WARNING, - "WARNING: read(2) error in vmThreadedIOCompletedJob() %s", - strerror(errno)); - } -} - -static void lockThreadedIO(void) { - pthread_mutex_lock(&server.io_mutex); -} - -static void unlockThreadedIO(void) { - pthread_mutex_unlock(&server.io_mutex); -} - -/* Remove the specified object from the threaded I/O queue if still not - * processed, otherwise make sure to flag it as canceled. */ -static void vmCancelThreadedIOJob(robj *o) { - list *lists[3] = { - server.io_newjobs, /* 0 */ - server.io_processing, /* 1 */ - server.io_processed /* 2 */ - }; - int i; - - assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING); -again: - lockThreadedIO(); - /* Search for a matching object in one of the queues */ - for (i = 0; i < 3; i++) { - listNode *ln; - listIter li; - - listRewind(lists[i],&li); - while ((ln = listNext(&li)) != NULL) { - iojob *job = ln->value; - - if (job->canceled) continue; /* Skip this, already canceled. */ - if (job->id == o) { - redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n", - (void*)job, (char*)job->key->ptr, job->type, i); - /* Mark the pages as free since the swap didn't happened - * or happened but is now discarded. */ - if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP) - vmMarkPagesFree(job->page,job->pages); - /* Cancel the job. It depends on the list the job is - * living in. */ - switch(i) { - case 0: /* io_newjobs */ - /* If the job was yet not processed the best thing to do - * is to remove it from the queue at all */ - freeIOJob(job); - listDelNode(lists[i],ln); - break; - case 1: /* io_processing */ - /* Oh Shi- the thread is messing with the Job: - * - * Probably it's accessing the object if this is a - * PREPARE_SWAP or DO_SWAP job. 
- * If it's a LOAD job it may be reading from disk and - * if we don't wait for the job to terminate before to - * cancel it, maybe in a few microseconds data can be - * corrupted in this pages. So the short story is: - * - * Better to wait for the job to move into the - * next queue (processed)... */ - - /* We try again and again until the job is completed. */ - unlockThreadedIO(); - /* But let's wait some time for the I/O thread - * to finish with this job. After all this condition - * should be very rare. */ - usleep(1); - goto again; - case 2: /* io_processed */ - /* The job was already processed, that's easy... - * just mark it as canceled so that we'll ignore it - * when processing completed jobs. */ - job->canceled = 1; - break; - } - /* Finally we have to adjust the storage type of the object - * in order to "UNDO" the operaiton. */ - if (o->storage == REDIS_VM_LOADING) - o->storage = REDIS_VM_SWAPPED; - else if (o->storage == REDIS_VM_SWAPPING) - o->storage = REDIS_VM_MEMORY; - unlockThreadedIO(); - redisLog(REDIS_DEBUG,"*** DONE"); - return; - } - } - } - unlockThreadedIO(); - printf("Not found: %p\n", (void*)o); - redisAssert(1 != 1); /* We should never reach this */ -} - -static void *IOThreadEntryPoint(void *arg) { - iojob *j; - listNode *ln; - REDIS_NOTUSED(arg); - - pthread_detach(pthread_self()); - while(1) { - /* Get a new job to process */ - lockThreadedIO(); - if (listLength(server.io_newjobs) == 0) { - /* No new jobs in queue, exit. 
*/ - redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do", - (long) pthread_self()); - server.io_active_threads--; - unlockThreadedIO(); - return NULL; - } - ln = listFirst(server.io_newjobs); - j = ln->value; - listDelNode(server.io_newjobs,ln); - /* Add the job in the processing queue */ - j->thread = pthread_self(); - listAddNodeTail(server.io_processing,j); - ln = listLast(server.io_processing); /* We use ln later to remove it */ - unlockThreadedIO(); - redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'", - (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr); - - /* Process the Job */ - if (j->type == REDIS_IOJOB_LOAD) { - vmpointer *vp = (vmpointer*)j->id; - j->val = vmReadObjectFromSwap(j->page,vp->vtype); - } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { - FILE *fp = fopen("/dev/null","w+"); - j->pages = rdbSavedObjectPages(j->val,fp); - fclose(fp); - } else if (j->type == REDIS_IOJOB_DO_SWAP) { - if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR) - j->canceled = 1; - } - - /* Done: insert the job into the processed queue */ - redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)", - (long) pthread_self(), (void*)j, (char*)j->key->ptr); - lockThreadedIO(); - listDelNode(server.io_processing,ln); - listAddNodeTail(server.io_processed,j); - unlockThreadedIO(); - - /* Signal the main thread there is new stuff to process */ - assert(write(server.io_ready_pipe_write,"x",1) == 1); - } - return NULL; /* never reached */ -} - -static void spawnIOThread(void) { - pthread_t thread; - sigset_t mask, omask; - int err; - - sigemptyset(&mask); - sigaddset(&mask,SIGCHLD); - sigaddset(&mask,SIGHUP); - sigaddset(&mask,SIGPIPE); - pthread_sigmask(SIG_SETMASK, &mask, &omask); - while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) { - redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s", - strerror(err)); - usleep(1000000); - } - pthread_sigmask(SIG_SETMASK, &omask, NULL); - 
server.io_active_threads++; -} - -/* We need to wait for the last thread to exit before we are able to - * fork() in order to BGSAVE or BGREWRITEAOF. */ -static void waitEmptyIOJobsQueue(void) { - while(1) { - int io_processed_len; - - lockThreadedIO(); - if (listLength(server.io_newjobs) == 0 && - listLength(server.io_processing) == 0 && - server.io_active_threads == 0) - { - unlockThreadedIO(); - return; - } - /* While waiting for empty jobs queue condition we post-process some - * finshed job, as I/O threads may be hanging trying to write against - * the io_ready_pipe_write FD but there are so much pending jobs that - * it's blocking. */ - io_processed_len = listLength(server.io_processed); - unlockThreadedIO(); - if (io_processed_len) { - vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0); - usleep(1000); /* 1 millisecond */ - } else { - usleep(10000); /* 10 milliseconds */ - } - } -} - -static void vmReopenSwapFile(void) { - /* Note: we don't close the old one as we are in the child process - * and don't want to mess at all with the original file object. */ - server.vm_fp = fopen(server.vm_swap_file,"r+b"); - if (server.vm_fp == NULL) { - redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. 
Exiting.", - server.vm_swap_file); - _exit(1); - } - server.vm_fd = fileno(server.vm_fp); -} - -/* This function must be called while with threaded IO locked */ -static void queueIOJob(iojob *j) { - redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n", - (void*)j, j->type, (char*)j->key->ptr); - listAddNodeTail(server.io_newjobs,j); - if (server.io_active_threads < server.vm_max_threads) - spawnIOThread(); -} - -static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) { - iojob *j; - - j = zmalloc(sizeof(*j)); - j->type = REDIS_IOJOB_PREPARE_SWAP; - j->db = db; - j->key = key; - incrRefCount(key); - j->id = j->val = val; - incrRefCount(val); - j->canceled = 0; - j->thread = (pthread_t) -1; - val->storage = REDIS_VM_SWAPPING; - - lockThreadedIO(); - queueIOJob(j); - unlockThreadedIO(); - return REDIS_OK; -} - -/* ============ Virtual Memory - Blocking clients on missing keys =========== */ - -/* This function makes the clinet 'c' waiting for the key 'key' to be loaded. - * If there is not already a job loading the key, it is craeted. - * The key is added to the io_keys list in the client structure, and also - * in the hash table mapping swapped keys to waiting clients, that is, - * server.io_waited_keys. */ -static int waitForSwappedKey(redisClient *c, robj *key) { - struct dictEntry *de; - robj *o; - list *l; - - /* If the key does not exist or is already in RAM we don't need to - * block the client at all. */ - de = dictFind(c->db->dict,key->ptr); - if (de == NULL) return 0; - o = dictGetEntryVal(de); - if (o->storage == REDIS_VM_MEMORY) { - return 0; - } else if (o->storage == REDIS_VM_SWAPPING) { - /* We were swapping the key, undo it! */ - vmCancelThreadedIOJob(o); - return 0; - } - - /* OK: the key is either swapped, or being loaded just now. */ - - /* Add the key to the list of keys this client is waiting for. - * This maps clients to keys they are waiting for. 
*/ - listAddNodeTail(c->io_keys,key); - incrRefCount(key); - - /* Add the client to the swapped keys => clients waiting map. */ - de = dictFind(c->db->io_keys,key); - if (de == NULL) { - int retval; - - /* For every key we take a list of clients blocked for it */ - l = listCreate(); - retval = dictAdd(c->db->io_keys,key,l); - incrRefCount(key); - assert(retval == DICT_OK); - } else { - l = dictGetEntryVal(de); - } - listAddNodeTail(l,c); - - /* Are we already loading the key from disk? If not create a job */ - if (o->storage == REDIS_VM_SWAPPED) { - iojob *j; - vmpointer *vp = (vmpointer*)o; - - o->storage = REDIS_VM_LOADING; - j = zmalloc(sizeof(*j)); - j->type = REDIS_IOJOB_LOAD; - j->db = c->db; - j->id = (robj*)vp; - j->key = key; - incrRefCount(key); - j->page = vp->page; - j->val = NULL; - j->canceled = 0; - j->thread = (pthread_t) -1; - lockThreadedIO(); - queueIOJob(j); - unlockThreadedIO(); - } - return 1; -} - -/* Preload keys for any command with first, last and step values for - * the command keys prototype, as defined in the command table. */ -static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { - int j, last; - if (cmd->vm_firstkey == 0) return; - last = cmd->vm_lastkey; - if (last < 0) last = argc+last; - for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) { - redisAssert(j < argc); - waitForSwappedKey(c,argv[j]); - } -} - -/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands. - * Note that the number of keys to preload is user-defined, so we need to - * apply a sanity check against argc. */ -static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { - int i, num; - REDIS_NOTUSED(cmd); - - num = atoi(argv[2]->ptr); - if (num > (argc-3)) return; - for (i = 0; i < num; i++) { - waitForSwappedKey(c,argv[3+i]); - } -} - -/* Preload keys needed to execute the entire MULTI/EXEC block. 
- * - * This function is called by blockClientOnSwappedKeys when EXEC is issued, - * and will block the client when any command requires a swapped out value. */ -static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { - int i, margc; - struct redisCommand *mcmd; - robj **margv; - REDIS_NOTUSED(cmd); - REDIS_NOTUSED(argc); - REDIS_NOTUSED(argv); - - if (!(c->flags & REDIS_MULTI)) return; - for (i = 0; i < c->mstate.count; i++) { - mcmd = c->mstate.commands[i].cmd; - margc = c->mstate.commands[i].argc; - margv = c->mstate.commands[i].argv; - - if (mcmd->vm_preload_proc != NULL) { - mcmd->vm_preload_proc(c,mcmd,margc,margv); - } else { - waitForMultipleSwappedKeys(c,mcmd,margc,margv); - } - } -} - -/* Is this client attempting to run a command against swapped keys? - * If so, block it ASAP, load the keys in background, then resume it. - * - * The important idea about this function is that it can fail! If keys will - * still be swapped when the client is resumed, this key lookups will - * just block loading keys from disk. In practical terms this should only - * happen with SORT BY command or if there is a bug in this function. - * - * Return 1 if the client is marked as blocked, 0 if the client can - * continue as the keys it is going to access appear to be in memory. */ -static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) { - if (cmd->vm_preload_proc != NULL) { - cmd->vm_preload_proc(c,cmd,c->argc,c->argv); - } else { - waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv); - } - - /* If the client was blocked for at least one key, mark it as blocked. */ - if (listLength(c->io_keys)) { - c->flags |= REDIS_IO_WAIT; - aeDeleteFileEvent(server.el,c->fd,AE_READABLE); - server.vm_blocked_clients++; - return 1; - } else { - return 0; - } -} - -/* Remove the 'key' from the list of blocked keys for a given client. 
- * - * The function returns 1 when there are no longer blocking keys after - * the current one was removed (and the client can be unblocked). */ -static int dontWaitForSwappedKey(redisClient *c, robj *key) { - list *l; - listNode *ln; - listIter li; - struct dictEntry *de; - - /* Remove the key from the list of keys this client is waiting for. */ - listRewind(c->io_keys,&li); - while ((ln = listNext(&li)) != NULL) { - if (equalStringObjects(ln->value,key)) { - listDelNode(c->io_keys,ln); - break; - } - } - assert(ln != NULL); - - /* Remove the client form the key => waiting clients map. */ - de = dictFind(c->db->io_keys,key); - assert(de != NULL); - l = dictGetEntryVal(de); - ln = listSearchKey(l,c); - assert(ln != NULL); - listDelNode(l,ln); - if (listLength(l) == 0) - dictDelete(c->db->io_keys,key); - - return listLength(c->io_keys) == 0; -} - -/* Every time we now a key was loaded back in memory, we handle clients - * waiting for this key if any. */ -static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) { - struct dictEntry *de; - list *l; - listNode *ln; - int len; - - de = dictFind(db->io_keys,key); - if (!de) return; - - l = dictGetEntryVal(de); - len = listLength(l); - /* Note: we can't use something like while(listLength(l)) as the list - * can be freed by the calling function when we remove the last element. */ - while (len--) { - ln = listFirst(l); - redisClient *c = ln->value; - - if (dontWaitForSwappedKey(c,key)) { - /* Put the client in the list of clients ready to go as we - * loaded all the keys about it. 
*/ - listAddNodeTail(server.io_ready_clients,c); - } - } -} - -/* =========================== Remote Configuration ========================= */ - -static void configSetCommand(redisClient *c) { - robj *o = getDecodedObject(c->argv[3]); - long long ll; - - if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) { - zfree(server.dbfilename); - server.dbfilename = zstrdup(o->ptr); - } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) { - zfree(server.requirepass); - server.requirepass = zstrdup(o->ptr); - } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) { - zfree(server.masterauth); - server.masterauth = zstrdup(o->ptr); - } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) { - if (getLongLongFromObject(o,&ll) == REDIS_ERR || - ll < 0) goto badfmt; - server.maxmemory = ll; - } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) { - if (getLongLongFromObject(o,&ll) == REDIS_ERR || - ll < 0 || ll > LONG_MAX) goto badfmt; - server.maxidletime = ll; - } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) { - if (!strcasecmp(o->ptr,"no")) { - server.appendfsync = APPENDFSYNC_NO; - } else if (!strcasecmp(o->ptr,"everysec")) { - server.appendfsync = APPENDFSYNC_EVERYSEC; - } else if (!strcasecmp(o->ptr,"always")) { - server.appendfsync = APPENDFSYNC_ALWAYS; - } else { - goto badfmt; - } - } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) { - int yn = yesnotoi(o->ptr); - - if (yn == -1) goto badfmt; - server.no_appendfsync_on_rewrite = yn; - } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) { - int old = server.appendonly; - int new = yesnotoi(o->ptr); - - if (new == -1) goto badfmt; - if (old != new) { - if (new == 0) { - stopAppendOnly(); - } else { - if (startAppendOnly() == REDIS_ERR) { - addReplySds(c,sdscatprintf(sdsempty(), - "-ERR Unable to turn on AOF. 
Check server logs.\r\n")); - decrRefCount(o); - return; - } - } - } - } else if (!strcasecmp(c->argv[2]->ptr,"save")) { - int vlen, j; - sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen); - - /* Perform sanity check before setting the new config: - * - Even number of args - * - Seconds >= 1, changes >= 0 */ - if (vlen & 1) { - sdsfreesplitres(v,vlen); - goto badfmt; - } - for (j = 0; j < vlen; j++) { - char *eptr; - long val; - - val = strtoll(v[j], &eptr, 10); - if (eptr[0] != '\0' || - ((j & 1) == 0 && val < 1) || - ((j & 1) == 1 && val < 0)) { - sdsfreesplitres(v,vlen); - goto badfmt; - } - } - /* Finally set the new config */ - resetServerSaveParams(); - for (j = 0; j < vlen; j += 2) { - time_t seconds; - int changes; - - seconds = strtoll(v[j],NULL,10); - changes = strtoll(v[j+1],NULL,10); - appendServerSaveParams(seconds, changes); - } - sdsfreesplitres(v,vlen); - } else { - addReplySds(c,sdscatprintf(sdsempty(), - "-ERR not supported CONFIG parameter %s\r\n", - (char*)c->argv[2]->ptr)); - decrRefCount(o); - return; - } - decrRefCount(o); - addReply(c,shared.ok); - return; - -badfmt: /* Bad format errors */ - addReplySds(c,sdscatprintf(sdsempty(), - "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n", - (char*)o->ptr, - (char*)c->argv[2]->ptr)); - decrRefCount(o); -} - -static void configGetCommand(redisClient *c) { - robj *o = getDecodedObject(c->argv[2]); - robj *lenobj = createObject(REDIS_STRING,NULL); - char *pattern = o->ptr; - int matches = 0; - - addReply(c,lenobj); - decrRefCount(lenobj); - - if (stringmatch(pattern,"dbfilename",0)) { - addReplyBulkCString(c,"dbfilename"); - addReplyBulkCString(c,server.dbfilename); - matches++; - } - if (stringmatch(pattern,"requirepass",0)) { - addReplyBulkCString(c,"requirepass"); - addReplyBulkCString(c,server.requirepass); - matches++; - } - if (stringmatch(pattern,"masterauth",0)) { - addReplyBulkCString(c,"masterauth"); - addReplyBulkCString(c,server.masterauth); - matches++; - } - if 
(stringmatch(pattern,"maxmemory",0)) { - char buf[128]; - - ll2string(buf,128,server.maxmemory); - addReplyBulkCString(c,"maxmemory"); - addReplyBulkCString(c,buf); - matches++; - } - if (stringmatch(pattern,"timeout",0)) { - char buf[128]; - - ll2string(buf,128,server.maxidletime); - addReplyBulkCString(c,"timeout"); - addReplyBulkCString(c,buf); - matches++; - } - if (stringmatch(pattern,"appendonly",0)) { - addReplyBulkCString(c,"appendonly"); - addReplyBulkCString(c,server.appendonly ? "yes" : "no"); - matches++; - } - if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) { - addReplyBulkCString(c,"no-appendfsync-on-rewrite"); - addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no"); - matches++; - } - if (stringmatch(pattern,"appendfsync",0)) { - char *policy; - - switch(server.appendfsync) { - case APPENDFSYNC_NO: policy = "no"; break; - case APPENDFSYNC_EVERYSEC: policy = "everysec"; break; - case APPENDFSYNC_ALWAYS: policy = "always"; break; - default: policy = "unknown"; break; /* too harmless to panic */ - } - addReplyBulkCString(c,"appendfsync"); - addReplyBulkCString(c,policy); - matches++; - } - if (stringmatch(pattern,"save",0)) { - sds buf = sdsempty(); - int j; - - for (j = 0; j < server.saveparamslen; j++) { - buf = sdscatprintf(buf,"%ld %d", - server.saveparams[j].seconds, - server.saveparams[j].changes); - if (j != server.saveparamslen-1) - buf = sdscatlen(buf," ",1); - } - addReplyBulkCString(c,"save"); - addReplyBulkCString(c,buf); - sdsfree(buf); - matches++; - } - decrRefCount(o); - lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2); -} - -static void configCommand(redisClient *c) { - if (!strcasecmp(c->argv[1]->ptr,"set")) { - if (c->argc != 4) goto badarity; - configSetCommand(c); - } else if (!strcasecmp(c->argv[1]->ptr,"get")) { - if (c->argc != 3) goto badarity; - configGetCommand(c); - } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) { - if (c->argc != 2) goto badarity; - server.stat_numcommands = 0; - 
server.stat_numconnections = 0; - server.stat_expiredkeys = 0; - server.stat_starttime = time(NULL); - addReply(c,shared.ok); - } else { - addReplySds(c,sdscatprintf(sdsempty(), - "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n")); - } - return; - -badarity: - addReplySds(c,sdscatprintf(sdsempty(), - "-ERR Wrong number of arguments for CONFIG %s\r\n", - (char*) c->argv[1]->ptr)); -} - -/* =========================== Pubsub implementation ======================== */ - -static void freePubsubPattern(void *p) { - pubsubPattern *pat = p; - - decrRefCount(pat->pattern); - zfree(pat); -} - -static int listMatchPubsubPattern(void *a, void *b) { - pubsubPattern *pa = a, *pb = b; - - return (pa->client == pb->client) && - (equalStringObjects(pa->pattern,pb->pattern)); -} - -/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or - * 0 if the client was already subscribed to that channel. */ -static int pubsubSubscribeChannel(redisClient *c, robj *channel) { - struct dictEntry *de; - list *clients = NULL; - int retval = 0; - - /* Add the channel to the client -> channels hash table */ - if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) { - retval = 1; - incrRefCount(channel); - /* Add the client to the channel -> list of clients hash table */ - de = dictFind(server.pubsub_channels,channel); - if (de == NULL) { - clients = listCreate(); - dictAdd(server.pubsub_channels,channel,clients); - incrRefCount(channel); - } else { - clients = dictGetEntryVal(de); - } - listAddNodeTail(clients,c); - } - /* Notify the client */ - addReply(c,shared.mbulk3); - addReply(c,shared.subscribebulk); - addReplyBulk(c,channel); - addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); - return retval; -} - -/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or - * 0 if the client was not subscribed to the specified channel. 
*/ -static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) { - struct dictEntry *de; - list *clients; - listNode *ln; - int retval = 0; - - /* Remove the channel from the client -> channels hash table */ - incrRefCount(channel); /* channel may be just a pointer to the same object - we have in the hash tables. Protect it... */ - if (dictDelete(c->pubsub_channels,channel) == DICT_OK) { - retval = 1; - /* Remove the client from the channel -> clients list hash table */ - de = dictFind(server.pubsub_channels,channel); - assert(de != NULL); - clients = dictGetEntryVal(de); - ln = listSearchKey(clients,c); - assert(ln != NULL); - listDelNode(clients,ln); - if (listLength(clients) == 0) { - /* Free the list and associated hash entry at all if this was - * the latest client, so that it will be possible to abuse - * Redis PUBSUB creating millions of channels. */ - dictDelete(server.pubsub_channels,channel); - } - } - /* Notify the client */ - if (notify) { - addReply(c,shared.mbulk3); - addReply(c,shared.unsubscribebulk); - addReplyBulk(c,channel); - addReplyLongLong(c,dictSize(c->pubsub_channels)+ - listLength(c->pubsub_patterns)); - - } - decrRefCount(channel); /* it is finally safe to release it */ - return retval; -} - -/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. 
*/ -static int pubsubSubscribePattern(redisClient *c, robj *pattern) { - int retval = 0; - - if (listSearchKey(c->pubsub_patterns,pattern) == NULL) { - retval = 1; - pubsubPattern *pat; - listAddNodeTail(c->pubsub_patterns,pattern); - incrRefCount(pattern); - pat = zmalloc(sizeof(*pat)); - pat->pattern = getDecodedObject(pattern); - pat->client = c; - listAddNodeTail(server.pubsub_patterns,pat); - } - /* Notify the client */ - addReply(c,shared.mbulk3); - addReply(c,shared.psubscribebulk); - addReplyBulk(c,pattern); - addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); - return retval; -} - -/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or - * 0 if the client was not subscribed to the specified channel. */ -static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) { - listNode *ln; - pubsubPattern pat; - int retval = 0; - - incrRefCount(pattern); /* Protect the object. May be the same we remove */ - if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) { - retval = 1; - listDelNode(c->pubsub_patterns,ln); - pat.client = c; - pat.pattern = pattern; - ln = listSearchKey(server.pubsub_patterns,&pat); - listDelNode(server.pubsub_patterns,ln); - } - /* Notify the client */ - if (notify) { - addReply(c,shared.mbulk3); - addReply(c,shared.punsubscribebulk); - addReplyBulk(c,pattern); - addReplyLongLong(c,dictSize(c->pubsub_channels)+ - listLength(c->pubsub_patterns)); - } - decrRefCount(pattern); - return retval; -} - -/* Unsubscribe from all the channels. Return the number of channels the - * client was subscribed from. 
*/ -static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) { - dictIterator *di = dictGetIterator(c->pubsub_channels); - dictEntry *de; - int count = 0; - - while((de = dictNext(di)) != NULL) { - robj *channel = dictGetEntryKey(de); - - count += pubsubUnsubscribeChannel(c,channel,notify); - } - dictReleaseIterator(di); - return count; -} - -/* Unsubscribe from all the patterns. Return the number of patterns the - * client was subscribed from. */ -static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) { - listNode *ln; - listIter li; - int count = 0; - - listRewind(c->pubsub_patterns,&li); - while ((ln = listNext(&li)) != NULL) { - robj *pattern = ln->value; - - count += pubsubUnsubscribePattern(c,pattern,notify); - } - return count; -} - -/* Publish a message */ -static int pubsubPublishMessage(robj *channel, robj *message) { - int receivers = 0; - struct dictEntry *de; - listNode *ln; - listIter li; - - /* Send to clients listening for that channel */ - de = dictFind(server.pubsub_channels,channel); - if (de) { - list *list = dictGetEntryVal(de); - listNode *ln; - listIter li; - - listRewind(list,&li); - while ((ln = listNext(&li)) != NULL) { - redisClient *c = ln->value; - - addReply(c,shared.mbulk3); - addReply(c,shared.messagebulk); - addReplyBulk(c,channel); - addReplyBulk(c,message); - receivers++; - } - } - /* Send to clients listening to matching channels */ - if (listLength(server.pubsub_patterns)) { - listRewind(server.pubsub_patterns,&li); - channel = getDecodedObject(channel); - while ((ln = listNext(&li)) != NULL) { - pubsubPattern *pat = ln->value; - - if (stringmatchlen((char*)pat->pattern->ptr, - sdslen(pat->pattern->ptr), - (char*)channel->ptr, - sdslen(channel->ptr),0)) { - addReply(pat->client,shared.mbulk4); - addReply(pat->client,shared.pmessagebulk); - addReplyBulk(pat->client,pat->pattern); - addReplyBulk(pat->client,channel); - addReplyBulk(pat->client,message); - receivers++; - } - } - decrRefCount(channel); - } - 
return receivers; -} - -static void subscribeCommand(redisClient *c) { - int j; - - for (j = 1; j < c->argc; j++) - pubsubSubscribeChannel(c,c->argv[j]); -} - -static void unsubscribeCommand(redisClient *c) { - if (c->argc == 1) { - pubsubUnsubscribeAllChannels(c,1); - return; - } else { - int j; - - for (j = 1; j < c->argc; j++) - pubsubUnsubscribeChannel(c,c->argv[j],1); - } -} - -static void psubscribeCommand(redisClient *c) { - int j; - - for (j = 1; j < c->argc; j++) - pubsubSubscribePattern(c,c->argv[j]); -} - -static void punsubscribeCommand(redisClient *c) { - if (c->argc == 1) { - pubsubUnsubscribeAllPatterns(c,1); - return; - } else { - int j; - - for (j = 1; j < c->argc; j++) - pubsubUnsubscribePattern(c,c->argv[j],1); - } -} - -static void publishCommand(redisClient *c) { - int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]); - addReplyLongLong(c,receivers); -} - -/* ===================== WATCH (CAS alike for MULTI/EXEC) =================== - * - * The implementation uses a per-DB hash table mapping keys to list of clients - * WATCHing those keys, so that given a key that is going to be modified - * we can mark all the associated clients as dirty. - * - * Also every client contains a list of WATCHed keys so that's possible to - * un-watch such keys when the client is freed or when UNWATCH is called. 
*/ - -/* In the client->watched_keys list we need to use watchedKey structures - * as in order to identify a key in Redis we need both the key name and the - * DB */ -typedef struct watchedKey { - robj *key; - redisDb *db; -} watchedKey; - -/* Watch for the specified key */ -static void watchForKey(redisClient *c, robj *key) { - list *clients = NULL; - listIter li; - listNode *ln; - watchedKey *wk; - - /* Check if we are already watching for this key */ - listRewind(c->watched_keys,&li); - while((ln = listNext(&li))) { - wk = listNodeValue(ln); - if (wk->db == c->db && equalStringObjects(key,wk->key)) - return; /* Key already watched */ - } - /* This key is not already watched in this DB. Let's add it */ - clients = dictFetchValue(c->db->watched_keys,key); - if (!clients) { - clients = listCreate(); - dictAdd(c->db->watched_keys,key,clients); - incrRefCount(key); - } - listAddNodeTail(clients,c); - /* Add the new key to the lits of keys watched by this client */ - wk = zmalloc(sizeof(*wk)); - wk->key = key; - wk->db = c->db; - incrRefCount(key); - listAddNodeTail(c->watched_keys,wk); -} - -/* Unwatch all the keys watched by this client. To clean the EXEC dirty - * flag is up to the caller. 
*/ -static void unwatchAllKeys(redisClient *c) { - listIter li; - listNode *ln; - - if (listLength(c->watched_keys) == 0) return; - listRewind(c->watched_keys,&li); - while((ln = listNext(&li))) { - list *clients; - watchedKey *wk; - - /* Lookup the watched key -> clients list and remove the client - * from the list */ - wk = listNodeValue(ln); - clients = dictFetchValue(wk->db->watched_keys, wk->key); - assert(clients != NULL); - listDelNode(clients,listSearchKey(clients,c)); - /* Kill the entry at all if this was the only client */ - if (listLength(clients) == 0) - dictDelete(wk->db->watched_keys, wk->key); - /* Remove this watched key from the client->watched list */ - listDelNode(c->watched_keys,ln); - decrRefCount(wk->key); - zfree(wk); - } -} - -/* "Touch" a key, so that if this key is being WATCHed by some client the - * next EXEC will fail. */ -static void touchWatchedKey(redisDb *db, robj *key) { - list *clients; - listIter li; - listNode *ln; - - if (dictSize(db->watched_keys) == 0) return; - clients = dictFetchValue(db->watched_keys, key); - if (!clients) return; - - /* Mark all the clients watching this key as REDIS_DIRTY_CAS */ - /* Check if we are already watching for this key */ - listRewind(clients,&li); - while((ln = listNext(&li))) { - redisClient *c = listNodeValue(ln); - - c->flags |= REDIS_DIRTY_CAS; - } -} - -/* On FLUSHDB or FLUSHALL all the watched keys that are present before the - * flush but will be deleted as effect of the flushing operation should - * be touched. "dbid" is the DB that's getting the flush. -1 if it is - * a FLUSHALL operation (all the DBs flushed). 
*/ -static void touchWatchedKeysOnFlush(int dbid) { - listIter li1, li2; - listNode *ln; - - /* For every client, check all the waited keys */ - listRewind(server.clients,&li1); - while((ln = listNext(&li1))) { - redisClient *c = listNodeValue(ln); - listRewind(c->watched_keys,&li2); - while((ln = listNext(&li2))) { - watchedKey *wk = listNodeValue(ln); - - /* For every watched key matching the specified DB, if the - * key exists, mark the client as dirty, as the key will be - * removed. */ - if (dbid == -1 || wk->db->id == dbid) { - if (dictFind(wk->db->dict, wk->key->ptr) != NULL) - c->flags |= REDIS_DIRTY_CAS; - } - } - } -} - -static void watchCommand(redisClient *c) { - int j; - - if (c->flags & REDIS_MULTI) { - addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n")); - return; - } - for (j = 1; j < c->argc; j++) - watchForKey(c,c->argv[j]); - addReply(c,shared.ok); -} - -static void unwatchCommand(redisClient *c) { - unwatchAllKeys(c); - c->flags &= (~REDIS_DIRTY_CAS); - addReply(c,shared.ok); -} - -/* ================================= Debugging ============================== */ - -/* Compute the sha1 of string at 's' with 'len' bytes long. - * The SHA1 is then xored againt the string pointed by digest. - * Since xor is commutative, this operation is used in order to - * "add" digests relative to unordered elements. 
- * - * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */ -static void xorDigest(unsigned char *digest, void *ptr, size_t len) { - SHA1_CTX ctx; - unsigned char hash[20], *s = ptr; - int j; - - SHA1Init(&ctx); - SHA1Update(&ctx,s,len); - SHA1Final(hash,&ctx); - - for (j = 0; j < 20; j++) - digest[j] ^= hash[j]; -} - -static void xorObjectDigest(unsigned char *digest, robj *o) { - o = getDecodedObject(o); - xorDigest(digest,o->ptr,sdslen(o->ptr)); - decrRefCount(o); -} - -/* This function instead of just computing the SHA1 and xoring it - * against diget, also perform the digest of "digest" itself and - * replace the old value with the new one. - * - * So the final digest will be: - * - * digest = SHA1(digest xor SHA1(data)) - * - * This function is used every time we want to preserve the order so - * that digest(a,b,c,d) will be different than digest(b,c,d,a) - * - * Also note that mixdigest("foo") followed by mixdigest("bar") - * will lead to a different digest compared to "fo", "obar". - */ -static void mixDigest(unsigned char *digest, void *ptr, size_t len) { - SHA1_CTX ctx; - char *s = ptr; - - xorDigest(digest,s,len); - SHA1Init(&ctx); - SHA1Update(&ctx,digest,20); - SHA1Final(digest,&ctx); -} - -static void mixObjectDigest(unsigned char *digest, robj *o) { - o = getDecodedObject(o); - mixDigest(digest,o->ptr,sdslen(o->ptr)); - decrRefCount(o); -} - -/* Compute the dataset digest. Since keys, sets elements, hashes elements - * are not ordered, we use a trick: every aggregate digest is the xor - * of the digests of their elements. This way the order will not change - * the result. For list instead we use a feedback entering the output digest - * as input in order to ensure that a different ordered list will result in - * a different digest. 
*/ -static void computeDatasetDigest(unsigned char *final) { - unsigned char digest[20]; - char buf[128]; - dictIterator *di = NULL; - dictEntry *de; - int j; - uint32_t aux; - - memset(final,0,20); /* Start with a clean result */ - - for (j = 0; j < server.dbnum; j++) { - redisDb *db = server.db+j; - - if (dictSize(db->dict) == 0) continue; - di = dictGetIterator(db->dict); - - /* hash the DB id, so the same dataset moved in a different - * DB will lead to a different digest */ - aux = htonl(j); - mixDigest(final,&aux,sizeof(aux)); - - /* Iterate this DB writing every entry */ - while((de = dictNext(di)) != NULL) { - sds key; - robj *keyobj, *o; - time_t expiretime; - - memset(digest,0,20); /* This key-val digest */ - key = dictGetEntryKey(de); - keyobj = createStringObject(key,sdslen(key)); - - mixDigest(digest,key,sdslen(key)); - - /* Make sure the key is loaded if VM is active */ - o = lookupKeyRead(db,keyobj); - - aux = htonl(o->type); - mixDigest(digest,&aux,sizeof(aux)); - expiretime = getExpire(db,keyobj); - - /* Save the key and associated value */ - if (o->type == REDIS_STRING) { - mixObjectDigest(digest,o); - } else if (o->type == REDIS_LIST) { - listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL); - listTypeEntry entry; - while(listTypeNext(li,&entry)) { - robj *eleobj = listTypeGet(&entry); - mixObjectDigest(digest,eleobj); - decrRefCount(eleobj); - } - listTypeReleaseIterator(li); - } else if (o->type == REDIS_SET) { - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - robj *eleobj = dictGetEntryKey(de); - - xorObjectDigest(digest,eleobj); - } - dictReleaseIterator(di); - } else if (o->type == REDIS_ZSET) { - zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - while((de = dictNext(di)) != NULL) { - robj *eleobj = dictGetEntryKey(de); - double *score = dictGetEntryVal(de); - unsigned char eledigest[20]; - - 
snprintf(buf,sizeof(buf),"%.17g",*score); - memset(eledigest,0,20); - mixObjectDigest(eledigest,eleobj); - mixDigest(eledigest,buf,strlen(buf)); - xorDigest(digest,eledigest,20); - } - dictReleaseIterator(di); - } else if (o->type == REDIS_HASH) { - hashTypeIterator *hi; - robj *obj; - - hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != REDIS_ERR) { - unsigned char eledigest[20]; - - memset(eledigest,0,20); - obj = hashTypeCurrent(hi,REDIS_HASH_KEY); - mixObjectDigest(eledigest,obj); - decrRefCount(obj); - obj = hashTypeCurrent(hi,REDIS_HASH_VALUE); - mixObjectDigest(eledigest,obj); - decrRefCount(obj); - xorDigest(digest,eledigest,20); - } - hashTypeReleaseIterator(hi); - } else { - redisPanic("Unknown object type"); - } - /* If the key has an expire, add it to the mix */ - if (expiretime != -1) xorDigest(digest,"!!expire!!",10); - /* We can finally xor the key-val digest to the final digest */ - xorDigest(final,digest,20); - decrRefCount(keyobj); - } - dictReleaseIterator(di); - } -} - -static void debugCommand(redisClient *c) { - if (!strcasecmp(c->argv[1]->ptr,"segfault")) { - *((char*)-1) = 'x'; - } else if (!strcasecmp(c->argv[1]->ptr,"reload")) { - if (rdbSave(server.dbfilename) != REDIS_OK) { - addReply(c,shared.err); - return; - } - emptyDb(); - if (rdbLoad(server.dbfilename) != REDIS_OK) { - addReply(c,shared.err); - return; - } - redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD"); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) { - emptyDb(); - if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) { - addReply(c,shared.err); - return; - } - redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF"); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) { - dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr); - robj *val; - - if (!de) { - addReply(c,shared.nokeyerr); - return; - } - val = dictGetEntryVal(de); - if (!server.vm_enabled || (val->storage == 
REDIS_VM_MEMORY || - val->storage == REDIS_VM_SWAPPING)) { - char *strenc; - char buf[128]; - - if (val->encoding < (sizeof(strencoding)/sizeof(char*))) { - strenc = strencoding[val->encoding]; - } else { - snprintf(buf,64,"unknown encoding %d\n", val->encoding); - strenc = buf; - } - addReplySds(c,sdscatprintf(sdsempty(), - "+Value at:%p refcount:%d " - "encoding:%s serializedlength:%lld\r\n", - (void*)val, val->refcount, - strenc, (long long) rdbSavedObjectLen(val,NULL))); - } else { - vmpointer *vp = (vmpointer*) val; - addReplySds(c,sdscatprintf(sdsempty(), - "+Value swapped at: page %llu " - "using %llu pages\r\n", - (unsigned long long) vp->page, - (unsigned long long) vp->usedpages)); - } - } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) { - lookupKeyRead(c->db,c->argv[2]); - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) { - dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr); - robj *val; - vmpointer *vp; - - if (!server.vm_enabled) { - addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n")); - return; - } - if (!de) { - addReply(c,shared.nokeyerr); - return; - } - val = dictGetEntryVal(de); - /* Swap it */ - if (val->storage != REDIS_VM_MEMORY) { - addReplySds(c,sdsnew("-ERR This key is not in memory\r\n")); - } else if (val->refcount != 1) { - addReplySds(c,sdsnew("-ERR Object is shared\r\n")); - } else if ((vp = vmSwapObjectBlocking(val)) != NULL) { - dictGetEntryVal(de) = vp; - addReply(c,shared.ok); - } else { - addReply(c,shared.err); - } - } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) { - long keys, j; - robj *key, *val; - char buf[128]; - - if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK) - return; - for (j = 0; j < keys; j++) { - snprintf(buf,sizeof(buf),"key:%lu",j); - key = createStringObject(buf,strlen(buf)); - if (lookupKeyRead(c->db,key) != NULL) { - decrRefCount(key); - continue; - } - snprintf(buf,sizeof(buf),"value:%lu",j); - 
val = createStringObject(buf,strlen(buf)); - dbAdd(c->db,key,val); - decrRefCount(key); - } - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) { - unsigned char digest[20]; - sds d = sdsnew("+"); - int j; - - computeDatasetDigest(digest); - for (j = 0; j < 20; j++) - d = sdscatprintf(d, "%02x",digest[j]); - - d = sdscatlen(d,"\r\n",2); - addReplySds(c,d); - } else { - addReplySds(c,sdsnew( - "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT |SWAPIN |SWAPOUT |RELOAD]\r\n")); - } -} - -static void _redisAssert(char *estr, char *file, int line) { - redisLog(REDIS_WARNING,"=== ASSERTION FAILED ==="); - redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr); -#ifdef HAVE_BACKTRACE - redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); - *((char*)-1) = 'x'; -#endif -} - -static void _redisPanic(char *msg, char *file, int line) { - redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue"); - redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line); -#ifdef HAVE_BACKTRACE - redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); - *((char*)-1) = 'x'; -#endif -} - -/* =================================== Main! ================================ */ - -#ifdef __linux__ -int linuxOvercommitMemoryValue(void) { - FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r"); - char buf[64]; - - if (!fp) return -1; - if (fgets(buf,64,fp) == NULL) { - fclose(fp); - return -1; - } - fclose(fp); - - return atoi(buf); -} - -void linuxOvercommitMemoryWarning(void) { - if (linuxOvercommitMemoryValue() == 0) { - redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. 
To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect."); - } -} -#endif /* __linux__ */ - -static void daemonize(void) { - int fd; - FILE *fp; - - if (fork() != 0) exit(0); /* parent exits */ - setsid(); /* create a new session */ - - /* Every output goes to /dev/null. If Redis is daemonized but - * the 'logfile' is set to 'stdout' in the configuration file - * it will not log at all. */ - if ((fd = open("/dev/null", O_RDWR, 0)) != -1) { - dup2(fd, STDIN_FILENO); - dup2(fd, STDOUT_FILENO); - dup2(fd, STDERR_FILENO); - if (fd > STDERR_FILENO) close(fd); - } - /* Try to write the pid file */ - fp = fopen(server.pidfile,"w"); - if (fp) { - fprintf(fp,"%d\n",getpid()); - fclose(fp); - } -} - -static void version() { - printf("Redis server version %s (%s:%d)\n", REDIS_VERSION, - redisGitSHA1(), atoi(redisGitDirty()) > 0); - exit(0); -} - -static void usage() { - fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n"); - fprintf(stderr," ./redis-server - (read config from stdin)\n"); - exit(1); -} - -int main(int argc, char **argv) { - time_t start; - - initServerConfig(); - sortCommandTable(); - if (argc == 2) { - if (strcmp(argv[1], "-v") == 0 || - strcmp(argv[1], "--version") == 0) version(); - if (strcmp(argv[1], "--help") == 0) usage(); - resetServerSaveParams(); - loadServerConfig(argv[1]); - } else if ((argc > 2)) { - usage(); - } else { - redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. 
In order to specify a config file use 'redis-server /path/to/redis.conf'"); - } - if (server.daemonize) daemonize(); - initServer(); - redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION); -#ifdef __linux__ - linuxOvercommitMemoryWarning(); -#endif - start = time(NULL); - if (server.appendonly) { - if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK) - redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start); - } else { - if (rdbLoad(server.dbfilename) == REDIS_OK) - redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start); - } - redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); - aeSetBeforeSleepProc(server.el,beforeSleep); - aeMain(server.el); - aeDeleteEventLoop(server.el); - return 0; -} - -/* ============================= Backtrace support ========================= */ - -#ifdef HAVE_BACKTRACE -static char *findFuncName(void *pointer, unsigned long *offset); - -static void *getMcontextEip(ucontext_t *uc) { -#if defined(__FreeBSD__) - return (void*) uc->uc_mcontext.mc_eip; -#elif defined(__dietlibc__) - return (void*) uc->uc_mcontext.eip; -#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) - #if __x86_64__ - return (void*) uc->uc_mcontext->__ss.__rip; - #else - return (void*) uc->uc_mcontext->__ss.__eip; - #endif -#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6) - #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__) - return (void*) uc->uc_mcontext->__ss.__rip; - #else - return (void*) uc->uc_mcontext->__ss.__eip; - #endif -#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__) - return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */ -#elif defined(__ia64__) /* Linux IA64 */ - return (void*) uc->uc_mcontext.sc_ip; -#else - return NULL; -#endif -} - -static void segvHandler(int sig, siginfo_t *info, void *secret) { - void *trace[100]; - char **messages = NULL; - int i, trace_size = 0; - 
unsigned long offset=0; - ucontext_t *uc = (ucontext_t*) secret; - sds infostring; - REDIS_NOTUSED(info); - - redisLog(REDIS_WARNING, - "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig); - infostring = genRedisInfoString(); - redisLog(REDIS_WARNING, "%s",infostring); - /* It's not safe to sdsfree() the returned string under memory - * corruption conditions. Let it leak as we are going to abort */ - - trace_size = backtrace(trace, 100); - /* overwrite sigaction with caller's address */ - if (getMcontextEip(uc) != NULL) { - trace[1] = getMcontextEip(uc); - } - messages = backtrace_symbols(trace, trace_size); - - for (i=1; i= symsTable[i].pointer) { - off=lp-symsTable[i].pointer; - if (ret < 0 || off < minoff) { - minoff=off; - ret=i; - } - } - } - if (ret == -1) return NULL; - *offset = minoff; - return symsTable[ret].name; -} -#else /* HAVE_BACKTRACE */ -static void setupSigSegvAction(void) { -} -#endif /* HAVE_BACKTRACE */ - - - -/* The End */ - - - diff --git a/redis.h b/redis.h deleted file mode 100644 index 18816844f..000000000 --- a/redis.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __REDIS_H__ -#define __REDIS_H__ - -enum -{ - REG_GS = 0, -# define REG_GS REG_GS - REG_FS, -# define REG_FS REG_FS - REG_ES, -# define REG_ES REG_ES - REG_DS, -# define REG_DS REG_DS - REG_EDI, -# define REG_EDI REG_EDI - REG_ESI, -# define REG_ESI REG_ESI - REG_EBP, -# define REG_EBP REG_EBP - REG_ESP, -# define REG_ESP REG_ESP - REG_EBX, -# define REG_EBX REG_EBX - REG_EDX, -# define REG_EDX REG_EDX - REG_ECX, -# define REG_ECX REG_ECX - REG_EAX, -# define REG_EAX REG_EAX - REG_TRAPNO, -# define REG_TRAPNO REG_TRAPNO - REG_ERR, -# define REG_ERR REG_ERR - REG_EIP, -# define REG_EIP REG_EIP - REG_CS, -# define REG_CS REG_CS - REG_EFL, -# define REG_EFL REG_EFL - REG_UESP, -# define REG_UESP REG_UESP - REG_SS -# define REG_SS REG_SS -}; - -#endif diff --git a/release.c b/release.c deleted file mode 100644 index 64186ec4e..000000000 --- a/release.c +++ /dev/null @@ -1,13 +0,0 @@ -/* Every time the Redis Git SHA1 or Dirty status changes only this file - * small file is recompiled, as we access this information in all the other - * files using this functions. 
*/ - -#include "release.h" - -char *redisGitSHA1(void) { - return REDIS_GIT_SHA1; -} - -char *redisGitDirty(void) { - return REDIS_GIT_DIRTY; -} diff --git a/sds.c b/sds.c deleted file mode 100644 index feb1a6212..000000000 --- a/sds.c +++ /dev/null @@ -1,359 +0,0 @@ -/* SDSLib, A C dynamic strings library - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#define SDS_ABORT_ON_OOM - -#include "sds.h" -#include -#include -#include -#include -#include -#include "zmalloc.h" - -static void sdsOomAbort(void) { - fprintf(stderr,"SDS: Out Of Memory (SDS_ABORT_ON_OOM defined)\n"); - abort(); -} - -sds sdsnewlen(const void *init, size_t initlen) { - struct sdshdr *sh; - - sh = zmalloc(sizeof(struct sdshdr)+initlen+1); -#ifdef SDS_ABORT_ON_OOM - if (sh == NULL) sdsOomAbort(); -#else - if (sh == NULL) return NULL; -#endif - sh->len = initlen; - sh->free = 0; - if (initlen) { - if (init) memcpy(sh->buf, init, initlen); - else memset(sh->buf,0,initlen); - } - sh->buf[initlen] = '\0'; - return (char*)sh->buf; -} - -sds sdsempty(void) { - return sdsnewlen("",0); -} - -sds sdsnew(const char *init) { - size_t initlen = (init == NULL) ? 0 : strlen(init); - return sdsnewlen(init, initlen); -} - -size_t sdslen(const sds s) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - return sh->len; -} - -sds sdsdup(const sds s) { - return sdsnewlen(s, sdslen(s)); -} - -void sdsfree(sds s) { - if (s == NULL) return; - zfree(s-sizeof(struct sdshdr)); -} - -size_t sdsavail(sds s) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - return sh->free; -} - -void sdsupdatelen(sds s) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - int reallen = strlen(s); - sh->free += (sh->len-reallen); - sh->len = reallen; -} - -static sds sdsMakeRoomFor(sds s, size_t addlen) { - struct sdshdr *sh, *newsh; - size_t free = sdsavail(s); - size_t len, newlen; - - if (free >= addlen) return s; - len = sdslen(s); - sh = (void*) (s-(sizeof(struct sdshdr))); - newlen = (len+addlen)*2; - newsh = zrealloc(sh, sizeof(struct sdshdr)+newlen+1); -#ifdef SDS_ABORT_ON_OOM - if (newsh == NULL) sdsOomAbort(); -#else - if (newsh == NULL) return NULL; -#endif - - newsh->free = newlen - len; - return newsh->buf; -} - -sds sdscatlen(sds s, void *t, size_t len) { - struct sdshdr *sh; - size_t curlen = sdslen(s); - - s = sdsMakeRoomFor(s,len); - 
if (s == NULL) return NULL; - sh = (void*) (s-(sizeof(struct sdshdr))); - memcpy(s+curlen, t, len); - sh->len = curlen+len; - sh->free = sh->free-len; - s[curlen+len] = '\0'; - return s; -} - -sds sdscat(sds s, char *t) { - return sdscatlen(s, t, strlen(t)); -} - -sds sdscpylen(sds s, char *t, size_t len) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - size_t totlen = sh->free+sh->len; - - if (totlen < len) { - s = sdsMakeRoomFor(s,len-sh->len); - if (s == NULL) return NULL; - sh = (void*) (s-(sizeof(struct sdshdr))); - totlen = sh->free+sh->len; - } - memcpy(s, t, len); - s[len] = '\0'; - sh->len = len; - sh->free = totlen-len; - return s; -} - -sds sdscpy(sds s, char *t) { - return sdscpylen(s, t, strlen(t)); -} - -sds sdscatprintf(sds s, const char *fmt, ...) { - va_list ap; - char *buf, *t; - size_t buflen = 16; - - while(1) { - buf = zmalloc(buflen); -#ifdef SDS_ABORT_ON_OOM - if (buf == NULL) sdsOomAbort(); -#else - if (buf == NULL) return NULL; -#endif - buf[buflen-2] = '\0'; - va_start(ap, fmt); - vsnprintf(buf, buflen, fmt, ap); - va_end(ap); - if (buf[buflen-2] != '\0') { - zfree(buf); - buflen *= 2; - continue; - } - break; - } - t = sdscat(s, buf); - zfree(buf); - return t; -} - -sds sdstrim(sds s, const char *cset) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - char *start, *end, *sp, *ep; - size_t len; - - sp = start = s; - ep = end = s+sdslen(s)-1; - while(sp <= end && strchr(cset, *sp)) sp++; - while(ep > start && strchr(cset, *ep)) ep--; - len = (sp > ep) ? 
0 : ((ep-sp)+1); - if (sh->buf != sp) memmove(sh->buf, sp, len); - sh->buf[len] = '\0'; - sh->free = sh->free+(sh->len-len); - sh->len = len; - return s; -} - -sds sdsrange(sds s, long start, long end) { - struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); - size_t newlen, len = sdslen(s); - - if (len == 0) return s; - if (start < 0) { - start = len+start; - if (start < 0) start = 0; - } - if (end < 0) { - end = len+end; - if (end < 0) end = 0; - } - newlen = (start > end) ? 0 : (end-start)+1; - if (newlen != 0) { - if (start >= (signed)len) start = len-1; - if (end >= (signed)len) end = len-1; - newlen = (start > end) ? 0 : (end-start)+1; - } else { - start = 0; - } - if (start != 0) memmove(sh->buf, sh->buf+start, newlen); - sh->buf[newlen] = 0; - sh->free = sh->free+(sh->len-newlen); - sh->len = newlen; - return s; -} - -void sdstolower(sds s) { - int len = sdslen(s), j; - - for (j = 0; j < len; j++) s[j] = tolower(s[j]); -} - -void sdstoupper(sds s) { - int len = sdslen(s), j; - - for (j = 0; j < len; j++) s[j] = toupper(s[j]); -} - -int sdscmp(sds s1, sds s2) { - size_t l1, l2, minlen; - int cmp; - - l1 = sdslen(s1); - l2 = sdslen(s2); - minlen = (l1 < l2) ? l1 : l2; - cmp = memcmp(s1,s2,minlen); - if (cmp == 0) return l1-l2; - return cmp; -} - -/* Split 's' with separator in 'sep'. An array - * of sds strings is returned. *count will be set - * by reference to the number of tokens returned. - * - * On out of memory, zero length string, zero length - * separator, NULL is returned. - * - * Note that 'sep' is able to split a string using - * a multi-character separator. For example - * sdssplit("foo_-_bar","_-_"); will return two - * elements "foo" and "bar". - * - * This version of the function is binary-safe but - * requires length arguments. sdssplit() is just the - * same function but for zero-terminated strings. 
- */ -sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { - int elements = 0, slots = 5, start = 0, j; - - sds *tokens = zmalloc(sizeof(sds)*slots); -#ifdef SDS_ABORT_ON_OOM - if (tokens == NULL) sdsOomAbort(); -#endif - if (seplen < 1 || len < 0 || tokens == NULL) return NULL; - if (len == 0) { - *count = 0; - return tokens; - } - for (j = 0; j < (len-(seplen-1)); j++) { - /* make sure there is room for the next element and the final one */ - if (slots < elements+2) { - sds *newtokens; - - slots *= 2; - newtokens = zrealloc(tokens,sizeof(sds)*slots); - if (newtokens == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } - tokens = newtokens; - } - /* search the separator */ - if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) { - tokens[elements] = sdsnewlen(s+start,j-start); - if (tokens[elements] == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } - elements++; - start = j+seplen; - j = j+seplen-1; /* skip the separator */ - } - } - /* Add the final element. We are sure there is room in the tokens array. */ - tokens[elements] = sdsnewlen(s+start,len-start); - if (tokens[elements] == NULL) { -#ifdef SDS_ABORT_ON_OOM - sdsOomAbort(); -#else - goto cleanup; -#endif - } - elements++; - *count = elements; - return tokens; - -#ifndef SDS_ABORT_ON_OOM -cleanup: - { - int i; - for (i = 0; i < elements; i++) sdsfree(tokens[i]); - zfree(tokens); - return NULL; - } -#endif -} - -void sdsfreesplitres(sds *tokens, int count) { - if (!tokens) return; - while(count--) - sdsfree(tokens[count]); - zfree(tokens); -} - -sds sdsfromlonglong(long long value) { - char buf[32], *p; - unsigned long long v; - - v = (value < 0) ? 
-value : value; - p = buf+31; /* point to the last character */ - do { - *p-- = '0'+(v%10); - v /= 10; - } while(v); - if (value < 0) *p-- = '-'; - p++; - return sdsnewlen(p,32-(p-buf)); -} diff --git a/sds.h b/sds.h deleted file mode 100644 index 8b632ff92..000000000 --- a/sds.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SDSLib, A C dynamic strings library - * - * Copyright (c) 2006-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __SDS_H -#define __SDS_H - -#include <sys/types.h> - -typedef char *sds; - -struct sdshdr { - long len; - long free; - char buf[]; -}; - -sds sdsnewlen(const void *init, size_t initlen); -sds sdsnew(const char *init); -sds sdsempty(); -size_t sdslen(const sds s); -sds sdsdup(const sds s); -void sdsfree(sds s); -size_t sdsavail(sds s); -sds sdscatlen(sds s, void *t, size_t len); -sds sdscat(sds s, char *t); -sds sdscpylen(sds s, char *t, size_t len); -sds sdscpy(sds s, char *t); - -#ifdef __GNUC__ -sds sdscatprintf(sds s, const char *fmt, ...) - __attribute__((format(printf, 2, 3))); -#else -sds sdscatprintf(sds s, const char *fmt, ...); -#endif - -sds sdstrim(sds s, const char *cset); -sds sdsrange(sds s, long start, long end); -void sdsupdatelen(sds s); -int sdscmp(sds s1, sds s2); -sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count); -void sdsfreesplitres(sds *tokens, int count); -void sdstolower(sds s); -void sdstoupper(sds s); -sds sdsfromlonglong(long long value); - -#endif diff --git a/sha1.c b/sha1.c deleted file mode 100644 index 2c50433e8..000000000 --- a/sha1.c +++ /dev/null @@ -1,276 +0,0 @@ - -/* from valgrind tests */ - -/* ================ sha1.c ================ */ -/* -SHA-1 in C -By Steve Reid <steve@edoceo.com> -100% Public Domain - -Test Vectors (from FIPS PUB 180-1) -"abc" - A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D -"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" - 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 -A million repetitions of "a" - 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F -*/ - -/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ -/* #define SHA1HANDSOFF * Copies data before messing with it. 
*/ - -#define SHA1HANDSOFF - -#include -#include -#include /* for u_int*_t */ -#if defined(__sun) -#include "solarisfixes.h" -#endif -#include "sha1.h" - -#ifndef BYTE_ORDER -#if (BSD >= 199103) -# include -#else -#if defined(linux) || defined(__linux__) -# include -#else -#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */ -#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ -#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/ - -#if defined(vax) || defined(ns32000) || defined(sun386) || defined(__i386__) || \ - defined(MIPSEL) || defined(_MIPSEL) || defined(BIT_ZERO_ON_RIGHT) || \ - defined(__alpha__) || defined(__alpha) -#define BYTE_ORDER LITTLE_ENDIAN -#endif - -#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \ - defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \ - defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\ - defined(apollo) || defined(__convex__) || defined(_CRAY) || \ - defined(__hppa) || defined(__hp9000) || \ - defined(__hp9000s300) || defined(__hp9000s700) || \ - defined (BIT_ZERO_ON_LEFT) || defined(m68k) || defined(__sparc) -#define BYTE_ORDER BIG_ENDIAN -#endif -#endif /* linux */ -#endif /* BSD */ -#endif /* BYTE_ORDER */ - -#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER) -#if (__BYTE_ORDER == __LITTLE_ENDIAN) -#define BYTE_ORDER LITTLE_ENDIAN -#else -#define BYTE_ORDER BIG_ENDIAN -#endif -#endif - -#if !defined(BYTE_ORDER) || \ - (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \ - BYTE_ORDER != PDP_ENDIAN) - /* you must determine what the correct bit order is for - * your compiler - the next line is an intentional error - * which will force your compiles to bomb until you fix - * the above macros. - */ -#error "Undefined or invalid BYTE_ORDER" -#endif - -#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) - -/* blk0() and blk() perform the initial expand. 
*/ -/* I got the idea of expanding during the round function from SSLeay */ -#if BYTE_ORDER == LITTLE_ENDIAN -#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ - |(rol(block->l[i],8)&0x00FF00FF)) -#elif BYTE_ORDER == BIG_ENDIAN -#define blk0(i) block->l[i] -#else -#error "Endianness not defined!" -#endif -#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ - ^block->l[(i+2)&15]^block->l[i&15],1)) - -/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); -#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); -#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); - - -/* Hash a single 512-bit block. This is the core of the algorithm. */ - -void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]) -{ -u_int32_t a, b, c, d, e; -typedef union { - unsigned char c[64]; - u_int32_t l[16]; -} CHAR64LONG16; -#ifdef SHA1HANDSOFF -CHAR64LONG16 block[1]; /* use array to appear as a pointer */ - memcpy(block, buffer, 64); -#else - /* The following had better never be used because it causes the - * pointer-to-const buffer to be cast into a pointer to non-const. - * And the result is written through. I threw a "const" in, hoping - * this will cause a diagnostic. - */ -CHAR64LONG16* block = (const CHAR64LONG16*)buffer; -#endif - /* Copy context->state[] to working vars */ - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - /* 4 rounds of 20 operations each. Loop unrolled. 
*/ - R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); - R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); - R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); - R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); - R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); - R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); - R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); - R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); - R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); - R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); - R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); - R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); - R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); - R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); - R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); - R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); - R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); - R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); - R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); - R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); - /* Add the working vars back into context.state[] */ - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - /* Wipe variables */ - a = b = c = d = e = 0; -#ifdef SHA1HANDSOFF - memset(block, '\0', sizeof(block)); -#endif -} - - -/* SHA1Init - Initialize new context */ - -void SHA1Init(SHA1_CTX* context) -{ - /* SHA1 initialization constants */ - context->state[0] = 0x67452301; - context->state[1] = 0xEFCDAB89; - context->state[2] = 0x98BADCFE; - context->state[3] = 0x10325476; 
- context->state[4] = 0xC3D2E1F0; - context->count[0] = context->count[1] = 0; -} - - -/* Run your data through this. */ - -void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len) -{ -u_int32_t i; -u_int32_t j; - - j = context->count[0]; - if ((context->count[0] += len << 3) < j) - context->count[1]++; - context->count[1] += (len>>29); - j = (j >> 3) & 63; - if ((j + len) > 63) { - memcpy(&context->buffer[j], data, (i = 64-j)); - SHA1Transform(context->state, context->buffer); - for ( ; i + 63 < len; i += 64) { - SHA1Transform(context->state, &data[i]); - } - j = 0; - } - else i = 0; - memcpy(&context->buffer[j], &data[i], len - i); -} - - -/* Add padding and return the message digest. */ - -void SHA1Final(unsigned char digest[20], SHA1_CTX* context) -{ -unsigned i; -unsigned char finalcount[8]; -unsigned char c; - -#if 0 /* untested "improvement" by DHR */ - /* Convert context->count to a sequence of bytes - * in finalcount. Second element first, but - * big-endian order within element. - * But we do it all backwards. - */ - unsigned char *fcp = &finalcount[8]; - - for (i = 0; i < 2; i++) - { - u_int32_t t = context->count[i]; - int j; - - for (j = 0; j < 4; t >>= 8, j++) - *--fcp = (unsigned char) t - } -#else - for (i = 0; i < 8; i++) { - finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] - >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ - } -#endif - c = 0200; - SHA1Update(context, &c, 1); - while ((context->count[0] & 504) != 448) { - c = 0000; - SHA1Update(context, &c, 1); - } - SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ - for (i = 0; i < 20; i++) { - digest[i] = (unsigned char) - ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); - } - /* Wipe variables */ - memset(context, '\0', sizeof(*context)); - memset(&finalcount, '\0', sizeof(finalcount)); -} -/* ================ end of sha1.c ================ */ - -#if 0 -#define BUFSIZE 4096 - -int -main(int argc, char **argv) -{ - SHA1_CTX ctx; - unsigned char hash[20], buf[BUFSIZE]; - int i; - - for(i=0;i -100% Public Domain -*/ - -typedef struct { - u_int32_t state[5]; - u_int32_t count[2]; - unsigned char buffer[64]; -} SHA1_CTX; - -void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]); -void SHA1Init(SHA1_CTX* context); -void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len); -void SHA1Final(unsigned char digest[20], SHA1_CTX* context); diff --git a/solarisfixes.h b/solarisfixes.h deleted file mode 100644 index ce8e7b6fd..000000000 --- a/solarisfixes.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Solaris specific fixes */ - -#if defined(__GNUC__) -#undef isnan -#define isnan(x) \ - __extension__({ __typeof (x) __x_a = (x); \ - __builtin_expect(__x_a != __x_a, 0); }) - -#undef isfinite -#define isfinite(x) \ - __extension__ ({ __typeof (x) __x_f = (x); \ - __builtin_expect(!isnan(__x_f - __x_f), 1); }) - -#undef isinf -#define isinf(x) \ - __extension__ ({ __typeof (x) __x_i = (x); \ - __builtin_expect(!isnan(__x_i) && !isfinite(__x_i), 0); }) - -#define u_int uint -#define u_int32_t uint32_t -#endif /* __GNUC__ */ diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 000000000..3cba3c069 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,111 @@ +# Redis Makefile +# Copyright (C) 2009 Salvatore Sanfilippo +# This 
file is released under the BSD license, see the COPYING file + +release_hdr := $(shell sh -c './mkreleasehdr.sh') +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') +OPTIMIZATION?=-O2 +ifeq ($(uname_S),SunOS) + CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W -D__EXTENSIONS__ -D_XPG6 + CCLINK?= -ldl -lnsl -lsocket -lm -lpthread +else + CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W $(ARCH) $(PROF) + CCLINK?= -lm -pthread +endif +CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF) +DEBUG?= -g -rdynamic -ggdb + +OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o vm.o pubsub.o multi.o debug.o sort.o +BENCHOBJ = ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o +CLIOBJ = anet.o sds.o adlist.o redis-cli.o zmalloc.o linenoise.o +CHECKDUMPOBJ = redis-check-dump.o lzf_c.o lzf_d.o +CHECKAOFOBJ = redis-check-aof.o + +PRGNAME = redis-server +BENCHPRGNAME = redis-benchmark +CLIPRGNAME = redis-cli +CHECKDUMPPRGNAME = redis-check-dump +CHECKAOFPRGNAME = redis-check-aof + +all: redis-server redis-benchmark redis-cli redis-check-dump redis-check-aof + +# Deps (use make dep to generate this) +adlist.o: adlist.c adlist.h zmalloc.h +ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c +ae_epoll.o: ae_epoll.c +ae_kqueue.o: ae_kqueue.c +ae_select.o: ae_select.c +anet.o: anet.c fmacros.h anet.h +dict.o: dict.c fmacros.h dict.h zmalloc.h +linenoise.o: linenoise.c fmacros.h +lzf_c.o: lzf_c.c lzfP.h +lzf_d.o: lzf_d.c lzfP.h +pqsort.o: pqsort.c +redis-benchmark.o: redis-benchmark.c fmacros.h ae.h anet.h sds.h adlist.h \ + zmalloc.h +redis-check-aof.o: redis-check-aof.c fmacros.h config.h +redis-check-dump.o: redis-check-dump.c lzf.h +redis-cli.o: redis-cli.c fmacros.h anet.h sds.h adlist.h zmalloc.h \ + linenoise.h +redis.o: redis.c fmacros.h config.h redis.h ae.h sds.h anet.h dict.h \ + adlist.h 
zmalloc.h lzf.h pqsort.h zipmap.h ziplist.h sha1.h +release.o: release.c release.h +sds.o: sds.c sds.h zmalloc.h +sha1.o: sha1.c sha1.h +ziplist.o: ziplist.c zmalloc.h ziplist.h +zipmap.o: zipmap.c zmalloc.h +zmalloc.o: zmalloc.c config.h + +redis-server: $(OBJ) + $(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ) + @echo "" + @echo "Hint: To run 'make test' is a good idea ;)" + @echo "" + +redis-benchmark: $(BENCHOBJ) + $(CC) -o $(BENCHPRGNAME) $(CCOPT) $(DEBUG) $(BENCHOBJ) + +redis-cli: $(CLIOBJ) + $(CC) -o $(CLIPRGNAME) $(CCOPT) $(DEBUG) $(CLIOBJ) + +redis-check-dump: $(CHECKDUMPOBJ) + $(CC) -o $(CHECKDUMPPRGNAME) $(CCOPT) $(DEBUG) $(CHECKDUMPOBJ) + +redis-check-aof: $(CHECKAOFOBJ) + $(CC) -o $(CHECKAOFPRGNAME) $(CCOPT) $(DEBUG) $(CHECKAOFOBJ) + +.c.o: + $(CC) -c $(CFLAGS) $(DEBUG) $(COMPILE_TIME) $< + +clean: + rm -rf $(PRGNAME) $(BENCHPRGNAME) $(CLIPRGNAME) $(CHECKDUMPPRGNAME) $(CHECKAOFPRGNAME) *.o *.gcda *.gcno *.gcov + +dep: + $(CC) -MM *.c + +test: + (cd ..; tclsh8.5 tests/test_helper.tcl --tags "${TAGS}") + +bench: + ./redis-benchmark + +log: + git log '--pretty=format:%ad %s (%cn)' --date=short > Changelog + +32bit: + @echo "" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "" + make ARCH="-m32" + +gprof: + make PROF="-pg" + +gcov: + make PROF="-fprofile-arcs -ftest-coverage" + +noopt: + make OPTIMIZATION="" + +32bitgprof: + make PROF="-pg" ARCH="-arch i386" diff --git a/src/adlist.c b/src/adlist.c new file mode 100644 index 000000000..015012f5c --- /dev/null +++ b/src/adlist.c @@ -0,0 +1,325 @@ +/* adlist.c - A generic doubly linked list implementation + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include <stdlib.h> +#include "adlist.h" +#include "zmalloc.h" + +/* Create a new list. The created list can be freed with + * AlFreeList(), but private value of every node need to be freed + * by the user before to call AlFreeList(). + * + * On error, NULL is returned. Otherwise the pointer to the new list. 
*/ +list *listCreate(void) +{ + struct list *list; + + if ((list = zmalloc(sizeof(*list))) == NULL) + return NULL; + list->head = list->tail = NULL; + list->len = 0; + list->dup = NULL; + list->free = NULL; + list->match = NULL; + return list; +} + +/* Free the whole list. + * + * This function can't fail. */ +void listRelease(list *list) +{ + unsigned int len; + listNode *current, *next; + + current = list->head; + len = list->len; + while(len--) { + next = current->next; + if (list->free) list->free(current->value); + zfree(current); + current = next; + } + zfree(list); +} + +/* Add a new node to the list, to head, contaning the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. */ +list *listAddNodeHead(list *list, void *value) +{ + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + if (list->len == 0) { + list->head = list->tail = node; + node->prev = node->next = NULL; + } else { + node->prev = NULL; + node->next = list->head; + list->head->prev = node; + list->head = node; + } + list->len++; + return list; +} + +/* Add a new node to the list, to tail, contaning the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. 
*/
+list *listAddNodeTail(list *list, void *value)
+{
+    listNode *node = zmalloc(sizeof(*node));
+
+    if (node == NULL) return NULL;
+    node->value = value;
+    node->next = NULL;
+    if (list->len == 0) {
+        /* First element: it is both head and tail. */
+        node->prev = NULL;
+        list->head = node;
+    } else {
+        node->prev = list->tail;
+        list->tail->next = node;
+    }
+    list->tail = node;
+    list->len++;
+    return list;
+}
+
+/* Insert a new node holding 'value' next to 'old_node': right after it
+ * when 'after' is non-zero, right before it otherwise.
+ *
+ * Returns NULL when the node can't be allocated (the list is left
+ * untouched), otherwise the 'list' pointer received as argument. */
+list *listInsertNode(list *list, listNode *old_node, void *value, int after) {
+    listNode *node = zmalloc(sizeof(*node));
+
+    if (node == NULL) return NULL;
+    node->value = value;
+    if (!after) {
+        node->prev = old_node->prev;
+        node->next = old_node;
+        if (list->head == old_node) list->head = node;
+    } else {
+        node->prev = old_node;
+        node->next = old_node->next;
+        if (list->tail == old_node) list->tail = node;
+    }
+    /* Hook the neighbours (if any) to the new node. */
+    if (node->prev) node->prev->next = node;
+    if (node->next) node->next->prev = node;
+    list->len++;
+    return list;
+}
+
+/* Unlink 'node' from 'list' and release it.
+ * When the list has a 'free' method it is called on the node value;
+ * otherwise the value is left to the caller.
+ *
+ * This function can't fail. */
+void listDelNode(list *list, listNode *node)
+{
+    listNode *before = node->prev, *behind = node->next;
+
+    if (before != NULL)
+        before->next = behind;
+    else
+        list->head = behind;
+    if (behind != NULL)
+        behind->prev = before;
+    else
+        list->tail = before;
+    if (list->free) list->free(node->value);
+    zfree(node);
+    list->len--;
+}
+
+/* Returns a list iterator 'iter'. After the initialization every
+ * call to listNext() will return the next element of the list.
+ *
+ * NULL is returned if the iterator can't be allocated.
*/
+listIter *listGetIterator(list *list, int direction)
+{
+    listIter *iter = zmalloc(sizeof(*iter));
+
+    if (iter == NULL) return NULL;
+    iter->direction = direction;
+    /* AL_START_HEAD walks from the head; any other value from the tail. */
+    iter->next = (direction == AL_START_HEAD) ? list->head : list->tail;
+    return iter;
+}
+
+/* Free an iterator obtained with listGetIterator(). */
+void listReleaseIterator(listIter *iter) {
+    zfree(iter);
+}
+
+/* Point the caller-provided iterator 'li' at the head of 'list',
+ * walking towards the tail. Nothing is allocated. */
+void listRewind(list *list, listIter *li) {
+    li->direction = AL_START_HEAD;
+    li->next = list->head;
+}
+
+/* Point the caller-provided iterator 'li' at the tail of 'list',
+ * walking towards the head. Nothing is allocated. */
+void listRewindTail(list *list, listIter *li) {
+    li->direction = AL_START_TAIL;
+    li->next = list->tail;
+}
+
+/* Return the element the iterator points to and advance the iterator.
+ * The element just returned may be removed with listDelNode() (its
+ * successor is already saved in the iterator), but no other element
+ * may be removed while iterating.
+ *
+ * NULL is returned when there are no more elements, so the classical
+ * usage pattern is:
+ *
+ * iter = listGetIterator(list,<direction>);
+ * while ((node = listNext(iter)) != NULL) {
+ *     doSomethingWith(listNodeValue(node));
+ * }
+ *
+ * */
+listNode *listNext(listIter *iter)
+{
+    listNode *node = iter->next;
+
+    if (node == NULL) return NULL;
+    iter->next = (iter->direction == AL_START_HEAD) ? node->next : node->prev;
+    return node;
+}
+
+/* Duplicate the whole list. On out of memory NULL is returned.
+ * On success a copy of the original list is returned.
+ *
+ * The 'Dup' method set with listSetDupMethod() function is used
+ * to copy the node value. Otherwise the same pointer value of
+ * the original node is used as value of the copied node.
+ *
+ * The original list both on success or error is never modified.
*/
+list *listDup(list *orig)
+{
+    list *copy;
+    listIter iter;
+    listNode *node;
+
+    if ((copy = listCreate()) == NULL)
+        return NULL;
+    copy->dup = orig->dup;
+    copy->free = orig->free;
+    copy->match = orig->match;
+    /* Iterate with a stack iterator: listGetIterator() may return NULL
+     * and its result was previously passed to listNext() unchecked. */
+    listRewind(orig, &iter);
+    while((node = listNext(&iter)) != NULL) {
+        void *value;
+
+        if (copy->dup) {
+            value = copy->dup(node->value);
+            if (value == NULL) {
+                listRelease(copy);
+                return NULL;
+            }
+        } else
+            value = node->value;
+        if (listAddNodeTail(copy, value) == NULL) {
+            /* The value never reached the copy, so listRelease() will
+             * not see it: release a freshly duplicated value here. A
+             * value shared with the original list is left alone. */
+            if (copy->dup && copy->free) copy->free(value);
+            listRelease(copy);
+            return NULL;
+        }
+    }
+    return copy;
+}
+
+/* Search the list for a node matching a given key.
+ * The match is performed using the 'match' method
+ * set with listSetMatchMethod(). If no 'match' method
+ * is set, the 'value' pointer of every node is directly
+ * compared with the 'key' pointer.
+ *
+ * On success the first matching node pointer is returned
+ * (search starts from head). If no matching node exists
+ * NULL is returned. */
+listNode *listSearchKey(list *list, void *key)
+{
+    listIter iter;
+    listNode *node;
+
+    /* Stack iterator: no allocation, so the search itself can't fail. */
+    listRewind(list, &iter);
+    while((node = listNext(&iter)) != NULL) {
+        if (list->match) {
+            if (list->match(node->value, key)) return node;
+        } else {
+            if (key == node->value) return node;
+        }
+    }
+    return NULL;
+}
+
+/* Return the element at the specified zero-based index
+ * where 0 is the head, 1 is the element next to head
+ * and so on. Negative integers are used in order to count
+ * from the tail, -1 is the last element, -2 the penultimate
+ * and so on. If the index is out of range NULL is returned.
*/
+listNode *listIndex(list *list, int index) {
+    listNode *n;
+
+    if (index < 0) {
+        index = -(index+1); /* same as (-index)-1, without overflowing on INT_MIN */
+        n = list->tail;
+        while(index-- && n) n = n->prev;
+    } else {
+        n = list->head;
+        while(index-- && n) n = n->next;
+    }
+    return n;
+}
diff --git a/src/adlist.h b/src/adlist.h
new file mode 100644
index 000000000..a1209f62f
--- /dev/null
+++ b/src/adlist.h
@@ -0,0 +1,92 @@
+/* adlist.h - A generic doubly linked list implementation
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ADLIST_H__
+#define __ADLIST_H__
+
+/* Node, List, and Iterator are the only data structures used currently. */
+
+/* A single element of the list. */
+typedef struct listNode {
+    struct listNode *prev;  /* previous node, NULL for the head */
+    struct listNode *next;  /* next node, NULL for the tail */
+    void *value;            /* opaque user value stored in the node */
+} listNode;
+
+/* Iteration state used by listNext(). */
+typedef struct listIter {
+    listNode *next;         /* node the next listNext() call returns */
+    int direction;          /* AL_START_HEAD or AL_START_TAIL */
+} listIter;
+
+/* The list itself, with optional per-list value callbacks. */
+typedef struct list {
+    listNode *head;
+    listNode *tail;
+    void *(*dup)(void *ptr);            /* used by listDup() to copy a value */
+    void (*free)(void *ptr);            /* called on a value when its node is released */
+    int (*match)(void *ptr, void *key); /* used by listSearchKey(), non-zero means match */
+    unsigned int len;                   /* number of nodes */
+} list;
+
+/* Functions implemented as macros */
+#define listLength(l) ((l)->len)
+#define listFirst(l) ((l)->head)
+#define listLast(l) ((l)->tail)
+#define listPrevNode(n) ((n)->prev)
+#define listNextNode(n) ((n)->next)
+#define listNodeValue(n) ((n)->value)
+
+/* Setters for the optional value callbacks */
+#define listSetDupMethod(l,m) ((l)->dup = (m))
+#define listSetFreeMethod(l,m) ((l)->free = (m))
+#define listSetMatchMethod(l,m) ((l)->match = (m))
+
+/* Getters for the optional value callbacks (note: the 'free' getter is
+ * named listGetFree, without the Method suffix) */
+#define listGetDupMethod(l) ((l)->dup)
+#define listGetFree(l) ((l)->free)
+#define listGetMatchMethod(l) ((l)->match)
+
+/* Prototypes */
+list *listCreate(void);
+void listRelease(list *list);
+list *listAddNodeHead(list *list, void *value);
+list *listAddNodeTail(list *list, void *value);
+list *listInsertNode(list *list, listNode *old_node, void *value, int after);
+void listDelNode(list *list, listNode *node);
+listIter *listGetIterator(list *list, int direction);
+listNode *listNext(listIter *iter);
+void listReleaseIterator(listIter
*iter); +list *listDup(list *orig); +listNode *listSearchKey(list *list, void *key); +listNode *listIndex(list *list, int index); +void listRewind(list *list, listIter *li); +void listRewindTail(list *list, listIter *li); + +/* Directions for iterators */ +#define AL_START_HEAD 0 +#define AL_START_TAIL 1 + +#endif /* __ADLIST_H__ */ diff --git a/src/ae.c b/src/ae.c new file mode 100644 index 000000000..c7918ee1d --- /dev/null +++ b/src/ae.c @@ -0,0 +1,390 @@ +/* A simple event-driven programming library. Originally I wrote this code + * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated + * it in form of a library for easy reuse. + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "ae.h"
+#include "zmalloc.h"
+#include "config.h"
+
+/* Include the best multiplexing layer supported by this system.
+ * The following should be ordered by performances, descending. */
+#ifdef HAVE_EPOLL
+#include "ae_epoll.c"
+#else
+    #ifdef HAVE_KQUEUE
+    #include "ae_kqueue.c"
+    #else
+    #include "ae_select.c"
+    #endif
+#endif
+
+/* Allocate and initialize an event loop together with its polling
+ * backend. Returns NULL if either allocation or backend setup fails. */
+aeEventLoop *aeCreateEventLoop(void) {
+    aeEventLoop *eventLoop;
+    int i;
+
+    eventLoop = zmalloc(sizeof(*eventLoop));
+    if (!eventLoop) return NULL;
+    eventLoop->timeEventHead = NULL;
+    eventLoop->timeEventNextId = 0;
+    eventLoop->stop = 0;
+    eventLoop->maxfd = -1;
+    eventLoop->beforesleep = NULL;
+    if (aeApiCreate(eventLoop) == -1) {
+        zfree(eventLoop);
+        return NULL;
+    }
+    /* Events with mask == AE_NONE are not set. So let's initialize the
+     * vector with it. */
+    for (i = 0; i < AE_SETSIZE; i++)
+        eventLoop->events[i].mask = AE_NONE;
+    return eventLoop;
+}
+
+/* Release the polling backend, any time event still registered and the
+ * event loop itself. Time event finalizers are not invoked. */
+void aeDeleteEventLoop(aeEventLoop *eventLoop) {
+    aeTimeEvent *te = eventLoop->timeEventHead;
+
+    aeApiFree(eventLoop);
+    /* Pending time events are owned by the loop: without this they
+     * would be leaked. */
+    while (te) {
+        aeTimeEvent *next = te->next;
+
+        zfree(te);
+        te = next;
+    }
+    zfree(eventLoop);
+}
+
+/* Ask aeMain() to return after the current iteration. */
+void aeStop(aeEventLoop *eventLoop) {
+    eventLoop->stop = 1;
+}
+
+/* Register 'proc' as the handler for the events in 'mask'
+ * (AE_READABLE and/or AE_WRITABLE) on 'fd'. 'clientData' replaces the
+ * client data previously associated with the descriptor.
+ *
+ * Returns AE_OK, or AE_ERR if 'fd' is outside [0, AE_SETSIZE) or the
+ * polling backend refuses the descriptor. */
+int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+        aeFileProc *proc, void *clientData)
+{
+    /* A negative fd would index before the start of events[]. */
+    if (fd < 0 || fd >= AE_SETSIZE) return AE_ERR;
+    aeFileEvent *fe = &eventLoop->events[fd];
+
+    if (aeApiAddEvent(eventLoop, fd, mask) == -1)
+        return AE_ERR;
+    fe->mask |= mask;
+    if (mask & AE_READABLE) fe->rfileProc = proc;
+    if (mask & AE_WRITABLE) fe->wfileProc = proc;
+    fe->clientData = clientData;
+    if (fd > eventLoop->maxfd)
+        eventLoop->maxfd = fd;
+    return AE_OK;
+}
+
+/* Stop monitoring 'fd' for the events in 'mask'. Descriptors that are
+ * out of range or not registered are ignored. */
+void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
+{
+    /* A negative fd would index before the start of events[]. */
+    if (fd < 0 || fd >= AE_SETSIZE) return;
+    aeFileEvent *fe = &eventLoop->events[fd];
+
+    if (fe->mask == AE_NONE) return;
+    fe->mask = fe->mask & (~mask);
+    if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
+        /* Update the max fd */
+        int j;
+
+        for (j = eventLoop->maxfd-1; j >= 0; j--)
+            if (eventLoop->events[j].mask != AE_NONE) break;
+        eventLoop->maxfd = j;
+    }
+    aeApiDelEvent(eventLoop, fd, mask);
+}
+
+/* Store the current wall clock time, split in seconds and milliseconds. */
+static void aeGetTime(long *seconds, long *milliseconds)
+{
+    struct timeval tv;
+
+    gettimeofday(&tv, NULL);
+    *seconds = tv.tv_sec;
+    *milliseconds = tv.tv_usec/1000;
+}
+
+/* Store in *sec / *ms the wall clock time 'milliseconds' from now. */
+static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) {
+    long cur_sec, cur_ms, when_sec, when_ms;
+
+    aeGetTime(&cur_sec, &cur_ms);
+    when_sec = cur_sec + milliseconds/1000;
+    when_ms = cur_ms + milliseconds%1000;
+    if (when_ms >= 1000) {
+        when_sec ++;
+        when_ms -= 1000;
+    }
+    *sec = when_sec;
+    *ms = when_ms;
+}
+
+long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
+        aeTimeProc *proc, void *clientData,
+        aeEventFinalizerProc *finalizerProc)
+{
+    long long id = eventLoop->timeEventNextId++;
+    aeTimeEvent *te;
+
+    te = zmalloc(sizeof(*te));
+    if
(te == NULL) return AE_ERR;
+    te->id = id;
+    aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms);
+    te->timeProc = proc;
+    te->finalizerProc = finalizerProc;
+    te->clientData = clientData;
+    /* New events are pushed on the head of the list. */
+    te->next = eventLoop->timeEventHead;
+    eventLoop->timeEventHead = te;
+    return id;
+}
+
+/* Remove the time event identified by 'id'. Its finalizer, if any, is
+ * called with the event client data before the event is released.
+ *
+ * Returns AE_OK, or AE_ERR when no event has that id. */
+int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
+{
+    /* 'link' is the pointer that leads to the current event: either the
+     * list head or the 'next' field of the previous event. */
+    aeTimeEvent **link = &eventLoop->timeEventHead;
+
+    while (*link) {
+        aeTimeEvent *te = *link;
+
+        if (te->id == id) {
+            *link = te->next;
+            if (te->finalizerProc)
+                te->finalizerProc(eventLoop, te->clientData);
+            zfree(te);
+            return AE_OK;
+        }
+        link = &te->next;
+    }
+    return AE_ERR; /* NO event with the specified ID found */
+}
+
+/* Search the first timer to fire.
+ * This operation is useful to know how long the select can be
+ * put to sleep without delaying any event.
+ * If there are no timers NULL is returned.
+ *
+ * Note that's O(N) since time events are unsorted.
+ * Possible optimizations (not needed by Redis so far, but...):
+ * 1) Insert the event in order, so that the nearest is just the head.
+ *    Much better but still insertion or deletion of timers is O(N).
+ * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)).
+ */
+static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
+{
+    aeTimeEvent *te = eventLoop->timeEventHead;
+    aeTimeEvent *nearest = NULL;
+
+    while(te) {
+        /* Keep the event with the smallest (when_sec, when_ms) pair. */
+        if (!nearest || te->when_sec < nearest->when_sec ||
+                (te->when_sec == nearest->when_sec &&
+                 te->when_ms < nearest->when_ms))
+            nearest = te;
+        te = te->next;
+    }
+    return nearest;
+}
+
+/* Process time events: call the handler of every time event whose
+ * firing time has been reached. A handler returning AE_NOMORE has its
+ * event deleted; any other return value re-arms the event that many
+ * milliseconds from now.
+ *
+ * Returns the number of handlers called. */
+static int processTimeEvents(aeEventLoop *eventLoop) {
+    int processed = 0;
+    aeTimeEvent *te;
+    long long maxId;
+
+    te = eventLoop->timeEventHead;
+    /* Highest id existing right now: events created by the handlers
+     * below get a bigger id and are skipped during this call. */
+    maxId = eventLoop->timeEventNextId-1;
+    while(te) {
+        long now_sec, now_ms;
+        long long id;
+
+        if (te->id > maxId) {
+            te = te->next;
+            continue;
+        }
+        aeGetTime(&now_sec, &now_ms);
+        if (now_sec > te->when_sec ||
+            (now_sec == te->when_sec && now_ms >= te->when_ms))
+        {
+            int retval;
+
+            id = te->id;
+            retval = te->timeProc(eventLoop, id, te->clientData);
+            processed++;
+            /* After an event is processed our time event list may
+             * no longer be the same, so we restart from head.
+             * Still we make sure to don't process events registered
+             * by event handlers itself in order to don't loop forever.
+             * To do so we saved the max ID we want to handle.
+             *
+             * FUTURE OPTIMIZATIONS:
+             * Note that this is NOT great algorithmically. Redis uses
+             * a single time event so it's not a problem but the right
+             * way to do this is to add the new elements on head, and
+             * to flag deleted elements in a special way for later
+             * deletion (putting references to the nodes to delete into
+             * another linked list). */
+            /* NOTE(review): 'te' is dereferenced after the handler
+             * returned. If a handler deletes its own event and returns
+             * something other than AE_NOMORE this writes to freed
+             * memory - confirm no handler does that. */
+            if (retval != AE_NOMORE) {
+                aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
+            } else {
+                aeDeleteTimeEvent(eventLoop, id);
+            }
+            te = eventLoop->timeEventHead;
+        } else {
+            te = te->next;
+        }
+    }
+    return processed;
+}
+
+/* Process every pending file event, then every pending time event
+ * (this is the order in which the function body handles them).
+ * Without special flags the function sleeps until some file event
+ * fires, or when the next time event occurs (if any).
+ *
+ * If flags is 0, the function does nothing and returns.
+ * if flags has AE_ALL_EVENTS set, all the kind of events are processed.
+ * if flags has AE_FILE_EVENTS set, file events are processed.
+ * if flags has AE_TIME_EVENTS set, time events are processed.
+ * if flags has AE_DONT_WAIT set the function returns ASAP, as soon as
+ * all the events that can be processed without waiting are processed.
+ *
+ * The function returns the number of events processed. */
+int aeProcessEvents(aeEventLoop *eventLoop, int flags)
+{
+    int processed = 0, numevents;
+
+    /* Nothing to do? return ASAP */
+    if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;
+
+    /* Note that we want call select() even if there are no
+     * file events to process as long as we want to process time
+     * events, in order to sleep until the next time event is ready
+     * to fire. */
+    if (eventLoop->maxfd != -1 ||
+        ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
+        int j;
+        aeTimeEvent *shortest = NULL;
+        struct timeval tv, *tvp;
+
+        if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
+            shortest = aeSearchNearestTimer(eventLoop);
+        if (shortest) {
+            long now_sec, now_ms;
+
+            /* Calculate the time missing for the nearest
+             * timer to fire. */
+            aeGetTime(&now_sec, &now_ms);
+            tvp = &tv;
+            tvp->tv_sec = shortest->when_sec - now_sec;
+            if (shortest->when_ms < now_ms) {
+                tvp->tv_usec = ((shortest->when_ms+1000) - now_ms)*1000;
+                tvp->tv_sec --;
+            } else {
+                tvp->tv_usec = (shortest->when_ms - now_ms)*1000;
+            }
+            /* A negative tv_sec means the timer is already overdue:
+             * don't sleep at all. Clearing only tv_sec would still
+             * sleep for the leftover fraction of a second. */
+            if (tvp->tv_sec < 0) {
+                tvp->tv_sec = 0;
+                tvp->tv_usec = 0;
+            }
+            if (tvp->tv_usec < 0) tvp->tv_usec = 0;
+        } else {
+            /* If we have to check for events but need to return
+             * ASAP because of AE_DONT_WAIT we need to set the timeout
+             * to zero */
+            if (flags & AE_DONT_WAIT) {
+                tv.tv_sec = tv.tv_usec = 0;
+                tvp = &tv;
+            } else {
+                /* Otherwise we can block */
+                tvp = NULL; /* wait forever */
+            }
+        }
+
+        numevents = aeApiPoll(eventLoop, tvp);
+        for (j = 0; j < numevents; j++) {
+            aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
+            int mask = eventLoop->fired[j].mask;
+            int fd = eventLoop->fired[j].fd;
+            int rfired = 0;
+
+            /* note the fe->mask & mask & ... code: maybe an already processed
+             * event removed an element that fired and we still didn't
+             * processed, so we check if the event is still valid. */
+            if (fe->mask & mask & AE_READABLE) {
+                rfired = 1;
+                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
+            }
+            if (fe->mask & mask & AE_WRITABLE) {
+                /* Don't call the same handler twice for one event. */
+                if (!rfired || fe->wfileProc != fe->rfileProc)
+                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
+            }
+            processed++;
+        }
+    }
+    /* Check time events */
+    if (flags & AE_TIME_EVENTS)
+        processed += processTimeEvents(eventLoop);
+
+    return processed; /* return the number of processed file/time events */
+}
+
+/* Wait up to 'milliseconds' for the given file descriptor to become
+ * readable and/or writable, as requested by 'mask'.
+ *
+ * Returns the mask of the conditions that are true, 0 on timeout, or
+ * -1 on error (including a descriptor select() can't handle). */
+int aeWait(int fd, int mask, long long milliseconds) {
+    struct timeval tv;
+    fd_set rfds, wfds, efds;
+    int retmask = 0, retval;
+
+    /* FD_SET() on a descriptor outside [0, FD_SETSIZE) is undefined. */
+    if (fd < 0 || fd >= FD_SETSIZE) return -1;
+    tv.tv_sec = milliseconds/1000;
+    tv.tv_usec = (milliseconds%1000)*1000;
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_ZERO(&efds);
+
+    if (mask & AE_READABLE) FD_SET(fd,&rfds);
+    if (mask & AE_WRITABLE) FD_SET(fd,&wfds);
+    if ((retval = select(fd+1, &rfds, &wfds, &efds, &tv)) > 0) {
+        if (FD_ISSET(fd,&rfds)) retmask |= AE_READABLE;
+        if (FD_ISSET(fd,&wfds)) retmask |= AE_WRITABLE;
+        return retmask;
+    } else {
+        return retval;
+    }
+}
+
+/* Run the event loop until aeStop() is called. The 'beforesleep'
+ * callback, if set, runs before every iteration. */
+void aeMain(aeEventLoop *eventLoop) {
+    eventLoop->stop = 0;
+    while (!eventLoop->stop) {
+        if (eventLoop->beforesleep != NULL)
+            eventLoop->beforesleep(eventLoop);
+        aeProcessEvents(eventLoop, AE_ALL_EVENTS);
+    }
+}
+
+/* Name of the polling backend compiled in. */
+char *aeGetApiName(void) {
+    return aeApiName();
+}
+
+/* Set (or clear, passing NULL) the callback aeMain() runs before every
+ * iteration. */
+void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) {
+    eventLoop->beforesleep = beforesleep;
+}
diff --git a/src/ae.h b/src/ae.h
new file mode 100644
index 000000000..a9db18ed9
--- /dev/null
+++ b/src/ae.h
@@ -0,0 +1,117 @@
+/* A simple event-driven programming library. Originally I wrote this code
+ * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated
+ * it in form of a library for easy reuse.
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __AE_H__
+#define __AE_H__
+
+#define AE_SETSIZE (1024*10)    /* Max number of fd supported */
+
+/* Return codes */
+#define AE_OK 0
+#define AE_ERR -1
+
+/* File event mask bits */
+#define AE_NONE 0
+#define AE_READABLE 1
+#define AE_WRITABLE 2
+
+/* aeProcessEvents() flags */
+#define AE_FILE_EVENTS 1
+#define AE_TIME_EVENTS 2
+#define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS)
+#define AE_DONT_WAIT 4
+
+/* Returned by a time event handler that must not be fired again */
+#define AE_NOMORE -1
+
+/* Macros */
+#define AE_NOTUSED(V) ((void)(V))
+
+struct aeEventLoop;
+
+/* Types and data structures */
+typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);
+typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData);
+typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData);
+typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop);
+
+/* File event structure */
+typedef struct aeFileEvent {
+    int mask; /* bitwise OR of AE_READABLE / AE_WRITABLE, or AE_NONE */
+    aeFileProc *rfileProc; /* handler called when the fd is readable */
+    aeFileProc *wfileProc; /* handler called when the fd is writable */
+    void *clientData;
+} aeFileEvent;
+
+/* Time event structure */
+typedef struct aeTimeEvent {
+    long long id; /* time event identifier. */
+    long when_sec; /* seconds */
+    long when_ms; /* milliseconds */
+    aeTimeProc *timeProc;
+    aeEventFinalizerProc *finalizerProc; /* called when the event is deleted */
+    void *clientData;
+    struct aeTimeEvent *next;
+} aeTimeEvent;
+
+/* A fired event */
+typedef struct aeFiredEvent {
+    int fd;
+    int mask;
+} aeFiredEvent;
+
+/* State of an event based program */
+typedef struct aeEventLoop {
+    int maxfd; /* highest registered fd, -1 if none */
+    long long timeEventNextId;
+    aeFileEvent events[AE_SETSIZE]; /* Registered events */
+    aeFiredEvent fired[AE_SETSIZE]; /* Fired events */
+    aeTimeEvent *timeEventHead;
+    int stop;
+    void *apidata; /* This is used for polling API specific data */
+    aeBeforeSleepProc *beforesleep;
+} aeEventLoop;
+
+/* Prototypes */
+aeEventLoop *aeCreateEventLoop(void);
+void aeDeleteEventLoop(aeEventLoop *eventLoop);
+void aeStop(aeEventLoop *eventLoop);
+int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+        aeFileProc *proc, void *clientData);
+void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
+long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
+        aeTimeProc *proc, void *clientData,
+        aeEventFinalizerProc *finalizerProc);
+int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id);
+int aeProcessEvents(aeEventLoop *eventLoop, int flags);
+int aeWait(int fd, int mask, long long milliseconds);
+void aeMain(aeEventLoop *eventLoop);
+char *aeGetApiName(void);
+void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep);
+
+#endif
diff --git a/src/ae_epoll.c b/src/ae_epoll.c
new file mode 100644
index 000000000..d48977b65
--- /dev/null
+++ b/src/ae_epoll.c
@@ -0,0 +1,91 @@
+/* Linux epoll(2) based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info.
*/
+
+#include <sys/epoll.h>
+
+/* epoll specific state, stored in eventLoop->apidata */
+typedef struct aeApiState {
+    int epfd;
+    struct epoll_event events[AE_SETSIZE];
+} aeApiState;
+
+/* Create the epoll instance. Returns 0 on success, -1 on error. */
+static int aeApiCreate(aeEventLoop *eventLoop) {
+    aeApiState *state = zmalloc(sizeof(aeApiState));
+
+    if (!state) return -1;
+    state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+    if (state->epfd == -1) {
+        /* Nobody else holds a reference to 'state' yet: release it. */
+        zfree(state);
+        return -1;
+    }
+    eventLoop->apidata = state;
+    return 0;
+}
+
+/* Close the epoll instance and release the backend state. */
+static void aeApiFree(aeEventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+
+    close(state->epfd);
+    zfree(state);
+}
+
+/* Start monitoring 'fd' for the events in 'mask', in addition to the
+ * ones it is already monitored for. Returns 0 on success, -1 on error. */
+static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee;
+    /* If the fd was already monitored for some event, we need a MOD
+     * operation. Otherwise we need an ADD operation. */
+    int op = eventLoop->events[fd].mask == AE_NONE ?
+            EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+
+    ee.events = 0;
+    mask |= eventLoop->events[fd].mask; /* Merge old events */
+    if (mask & AE_READABLE) ee.events |= EPOLLIN;
+    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
+    return 0;
+}
+
+/* Stop monitoring 'fd' for the events in 'delmask'. */
+static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee;
+    int mask = eventLoop->events[fd].mask & (~delmask);
+
+    ee.events = 0;
+    if (mask & AE_READABLE) ee.events |= EPOLLIN;
+    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    if (mask != AE_NONE) {
+        epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee);
+    } else {
+        /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+         * EPOLL_CTL_DEL.
*/
+        epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee);
+    }
+}
+
+/* Wait for events for at most the time in 'tvp' (forever if NULL) and
+ * store the ones that fired into eventLoop->fired.
+ *
+ * Returns the number of fired events; 0 on timeout or error. */
+static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, numevents = 0;
+
+    retval = epoll_wait(state->epfd,state->events,AE_SETSIZE,
+            tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+    if (retval > 0) {
+        int j;
+
+        numevents = retval;
+        for (j = 0; j < numevents; j++) {
+            int mask = 0;
+            struct epoll_event *e = state->events+j;
+
+            if (e->events & EPOLLIN) mask |= AE_READABLE;
+            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
+            /* epoll always reports errors and hangups. Without a mask
+             * bit no handler would run and the same condition would be
+             * returned again by every following epoll_wait(). The
+             * caller only invokes the handlers actually registered. */
+            if (e->events & (EPOLLERR|EPOLLHUP))
+                mask |= AE_READABLE|AE_WRITABLE;
+            eventLoop->fired[j].fd = e->data.fd;
+            eventLoop->fired[j].mask = mask;
+        }
+    }
+    return numevents;
+}
+
+static char *aeApiName(void) {
+    return "epoll";
+}
diff --git a/src/ae_kqueue.c b/src/ae_kqueue.c
new file mode 100644
index 000000000..04c3536ba
--- /dev/null
+++ b/src/ae_kqueue.c
@@ -0,0 +1,93 @@
+/* Kqueue(2)-based ae.c module
+ * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com
+ * Released under the BSD license. See the COPYING file for more info.
*/
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+/* kqueue specific state, stored in eventLoop->apidata */
+typedef struct aeApiState {
+    int kqfd;
+    struct kevent events[AE_SETSIZE];
+} aeApiState;
+
+/* Create the kqueue. Returns 0 on success, -1 on error. */
+static int aeApiCreate(aeEventLoop *eventLoop) {
+    aeApiState *state = zmalloc(sizeof(aeApiState));
+
+    if (!state) return -1;
+    state->kqfd = kqueue();
+    if (state->kqfd == -1) {
+        /* Nobody else holds a reference to 'state' yet: release it. */
+        zfree(state);
+        return -1;
+    }
+    eventLoop->apidata = state;
+
+    return 0;
+}
+
+/* Close the kqueue and release the backend state. */
+static void aeApiFree(aeEventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+
+    close(state->kqfd);
+    zfree(state);
+}
+
+/* Register one filter per event in 'mask'. Returns 0 on success, -1 as
+ * soon as a registration fails. */
+static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke;
+
+    if (mask & AE_READABLE) {
+        EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    if (mask & AE_WRITABLE) {
+        EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    return 0;
+}
+
+/* Remove the filters for the events in 'mask'; errors are ignored. */
+static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke;
+
+    if (mask & AE_READABLE) {
+        EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+        kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
+    }
+    if (mask & AE_WRITABLE) {
+        EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+        kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
+    }
+}
+
+/* Wait for events for at most the time in 'tvp' (forever if NULL) and
+ * store the ones that fired into eventLoop->fired.
+ *
+ * Returns the number of fired events; 0 on timeout or error. */
+static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, numevents = 0;
+
+    if (tvp != NULL) {
+        struct timespec timeout;
+        timeout.tv_sec = tvp->tv_sec;
+        timeout.tv_nsec = tvp->tv_usec * 1000;
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout);
+    } else {
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL);
+    }
+
+    if (retval > 0) {
+        int j;
+
+        numevents = retval;
+        for(j = 0; j < numevents; j++) {
+            int mask = 0;
+            struct kevent *e = state->events+j;
+
+            if (e->filter == EVFILT_READ) mask |= AE_READABLE;
+
if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE;
+            eventLoop->fired[j].fd = e->ident;
+            eventLoop->fired[j].mask = mask;
+        }
+    }
+    return numevents;
+}
+
+static char *aeApiName(void) {
+    return "kqueue";
+}
diff --git a/src/ae_select.c b/src/ae_select.c
new file mode 100644
index 000000000..43f5867f3
--- /dev/null
+++ b/src/ae_select.c
@@ -0,0 +1,72 @@
+/* Select()-based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <string.h>
+
+/* select specific state, stored in eventLoop->apidata */
+typedef struct aeApiState {
+    fd_set rfds, wfds;
+    /* We need to have a copy of the fd sets as it's not safe to reuse
+     * FD sets after select(). */
+    fd_set _rfds, _wfds;
+} aeApiState;
+
+/* Allocate the backend state with empty fd sets. Returns 0 on success,
+ * -1 on error. */
+static int aeApiCreate(aeEventLoop *eventLoop) {
+    aeApiState *state = zmalloc(sizeof(aeApiState));
+
+    if (!state) return -1;
+    FD_ZERO(&state->rfds);
+    FD_ZERO(&state->wfds);
+    eventLoop->apidata = state;
+    return 0;
+}
+
+/* Release the backend state. */
+static void aeApiFree(aeEventLoop *eventLoop) {
+    zfree(eventLoop->apidata);
+}
+
+/* Add 'fd' to the sets selected by 'mask'. Returns 0 on success, -1 if
+ * the descriptor can't be stored in an fd_set. */
+static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+
+    /* The event table holds AE_SETSIZE descriptors, which may be more
+     * than an fd_set can: FD_SET() outside [0, FD_SETSIZE) is undefined. */
+    if (fd < 0 || fd >= FD_SETSIZE) return -1;
+    if (mask & AE_READABLE) FD_SET(fd,&state->rfds);
+    if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds);
+    return 0;
+}
+
+/* Remove 'fd' from the sets selected by 'mask'. */
+static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+
+    if (mask & AE_READABLE) FD_CLR(fd,&state->rfds);
+    if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds);
+}
+
+/* Wait for events for at most the time in 'tvp' (forever if NULL) and
+ * store the ones that fired into eventLoop->fired.
+ *
+ * Returns the number of entries stored; 0 on timeout or error. */
+static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, j, numevents = 0;
+
+    /* select() overwrites the sets it is given: work on copies. */
+    memcpy(&state->_rfds,&state->rfds,sizeof(fd_set));
+    memcpy(&state->_wfds,&state->wfds,sizeof(fd_set));
+
+    retval = select(eventLoop->maxfd+1,
+                &state->_rfds,&state->_wfds,NULL,tvp);
+    if (retval > 0) {
+        for (j = 0; j <= eventLoop->maxfd; j++) {
+            int mask = 0;
+            aeFileEvent *fe = &eventLoop->events[j];
+
+            if (fe->mask == AE_NONE) continue;
+            if
(fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds)) + mask |= AE_READABLE; + if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds)) + mask |= AE_WRITABLE; + eventLoop->fired[numevents].fd = j; + eventLoop->fired[numevents].mask = mask; + numevents++; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "select"; +} diff --git a/src/anet.c b/src/anet.c new file mode 100644 index 000000000..4fe811a11 --- /dev/null +++ b/src/anet.c @@ -0,0 +1,270 @@ +/* anet.c -- Basic TCP socket stuff made a bit less boring + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "anet.h" + +static void anetSetError(char *err, const char *fmt, ...) +{ + va_list ap; + + if (!err) return; + va_start(ap, fmt); + vsnprintf(err, ANET_ERR_LEN, fmt, ap); + va_end(ap); +} + +int anetNonBlock(char *err, int fd) +{ + int flags; + + /* Set the socket nonblocking. + * Note that fcntl(2) for F_GETFL and F_SETFL can't be + * interrupted by a signal. 
*/ + if ((flags = fcntl(fd, F_GETFL)) == -1) { + anetSetError(err, "fcntl(F_GETFL): %s\n", strerror(errno)); + return ANET_ERR; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s\n", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + +int anetTcpNoDelay(char *err, int fd) +{ /* Enable TCP_NODELAY on fd. Returns ANET_OK, or ANET_ERR after reporting the setsockopt() failure through anetSetError(). */ + int yes = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes)) == -1) + { + anetSetError(err, "setsockopt TCP_NODELAY: %s\n", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + +int anetSetSendBuffer(char *err, int fd, int buffsize) +{ /* Set SO_SNDBUF on fd to buffsize. Returns ANET_OK, or ANET_ERR after reporting the failure through anetSetError(). */ + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buffsize, sizeof(buffsize)) == -1) + { + anetSetError(err, "setsockopt SO_SNDBUF: %s\n", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + +int anetTcpKeepAlive(char *err, int fd) +{ /* Enable SO_KEEPALIVE on fd. Returns ANET_OK, or ANET_ERR after reporting the failure through anetSetError(). */ + int yes = 1; + if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes)) == -1) { + anetSetError(err, "setsockopt SO_KEEPALIVE: %s\n", strerror(errno)); + return ANET_ERR; + } + return ANET_OK; +} + +int anetResolve(char *err, char *host, char *ipbuf) +{ /* Write the dotted-quad IPv4 text for host into ipbuf. host is first parsed with inet_aton(); if that fails it is looked up with gethostbyname(). Returns ANET_OK, or ANET_ERR when the lookup fails. NOTE(review): the strcpy() below is unbounded, so the caller must supply an ipbuf large enough for the inet_ntoa() result. */ + struct sockaddr_in sa; + + sa.sin_family = AF_INET; + if (inet_aton(host, &sa.sin_addr) == 0) { + struct hostent *he; + + he = gethostbyname(host); + if (he == NULL) { + anetSetError(err, "can't resolve: %s\n", host); + return ANET_ERR; + } + memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); + } + strcpy(ipbuf,inet_ntoa(sa.sin_addr)); + return ANET_OK; +} + +#define ANET_CONNECT_NONE 0 /* flags value: plain blocking connect */ +#define ANET_CONNECT_NONBLOCK 1 /* flags bit: put the socket in non blocking mode before connect() */ +static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) +{ /* Shared implementation of the two public connect functions: creates an AF_INET stream socket and connects it to addr:port. NOTE(review): sa is not zeroed before its fields are filled in. */ + int s, on = 1; + struct sockaddr_in sa; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) { + anetSetError(err, "creating socket: %s\n", strerror(errno)); + return ANET_ERR; + } + /* Make sure connection-intensive things like the redis benchmark + * will be able to close/open sockets a zillion times. The result of + * this setsockopt() call is not checked. */ + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, 
sizeof(on)); + + sa.sin_family = AF_INET; + sa.sin_port = htons(port); + if (inet_aton(addr, &sa.sin_addr) == 0) { /* not a numeric address: fall back to a name lookup */ + struct hostent *he; + + he = gethostbyname(addr); + if (he == NULL) { + anetSetError(err, "can't resolve: %s\n", addr); + close(s); + return ANET_ERR; + } + memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr)); + } + if (flags & ANET_CONNECT_NONBLOCK) { + if (anetNonBlock(err,s) != ANET_OK) + return ANET_ERR; /* NOTE(review): unlike the other error paths in this function, this return does not close(s) */ + } + if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { + if (errno == EINPROGRESS && + flags & ANET_CONNECT_NONBLOCK) + return s; /* non blocking connect still in progress: the descriptor is returned as is */ + + anetSetError(err, "connect: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + return s; +} + +int anetTcpConnect(char *err, char *addr, int port) +{ /* Connect to addr:port with ANET_CONNECT_NONE; returns the socket or ANET_ERR. */ + return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONE); +} + +int anetTcpNonBlockConnect(char *err, char *addr, int port) +{ /* Connect to addr:port with ANET_CONNECT_NONBLOCK; returns the socket or ANET_ERR. */ + return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONBLOCK); +} + +/* Like read(2) but keeps calling read() until 'count' bytes were read. + * On EOF the number of bytes read so far is returned; on a read() error + * -1 is returned. */ +int anetRead(int fd, char *buf, int count) +{ + int nread, totlen = 0; + while(totlen != count) { + nread = read(fd,buf,count-totlen); + if (nread == 0) return totlen; + if (nread == -1) return -1; + totlen += nread; + buf += nread; + } + return totlen; +} + +/* Like write(2) but keeps calling write() until 'count' bytes were written. + * If write() returns 0 the number of bytes written so far is returned; on + * a write() error -1 is returned. */ +int anetWrite(int fd, char *buf, int count) +{ + int nwritten, totlen = 0; + while(totlen != count) { + nwritten = write(fd,buf,count-totlen); + if (nwritten == 0) return totlen; + if (nwritten == -1) return -1; + totlen += nwritten; + buf += nwritten; + } + return totlen; +} + +int anetTcpServer(char *err, int port, char *bindaddr) +{ + int s, on = 1; + struct sockaddr_in sa; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) { + anetSetError(err, "socket: %s\n", strerror(errno)); + return ANET_ERR; + } + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, 
sizeof(on)) == -1) { + anetSetError(err, "setsockopt SO_REUSEADDR: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + memset(&sa,0,sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_port = htons(port); + sa.sin_addr.s_addr = htonl(INADDR_ANY); + if (bindaddr) { + if (inet_aton(bindaddr, &sa.sin_addr) == 0) { + anetSetError(err, "Invalid bind address\n"); + close(s); + return ANET_ERR; + } + } + if (bind(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { + anetSetError(err, "bind: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + if (listen(s, 511) == -1) { /* the magic 511 constant is from nginx */ + anetSetError(err, "listen: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + return s; +} + +int anetAccept(char *err, int serversock, char *ip, int *port) +{ + int fd; + struct sockaddr_in sa; + unsigned int saLen; + + while(1) { + saLen = sizeof(sa); + fd = accept(serversock, (struct sockaddr*)&sa, &saLen); + if (fd == -1) { + if (errno == EINTR) + continue; + else { + anetSetError(err, "accept: %s\n", strerror(errno)); + return ANET_ERR; + } + } + break; + } + if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); + if (port) *port = ntohs(sa.sin_port); + return fd; +} diff --git a/src/anet.h b/src/anet.h new file mode 100644 index 000000000..ce0f47787 --- /dev/null +++ b/src/anet.h @@ -0,0 +1,49 @@ +/* anet.c -- Basic TCP socket stuff made a bit less boring + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ANET_H +#define ANET_H + +#define ANET_OK 0 +#define ANET_ERR -1 +#define ANET_ERR_LEN 256 + +int anetTcpConnect(char *err, char *addr, int port); +int anetTcpNonBlockConnect(char *err, char *addr, int port); +int anetRead(int fd, char *buf, int count); +int anetResolve(char *err, char *host, char *ipbuf); +int anetTcpServer(char *err, int port, char *bindaddr); +int anetAccept(char *err, int serversock, char *ip, int *port); +int anetWrite(int fd, char *buf, int count); +int anetNonBlock(char *err, int fd); +int anetTcpNoDelay(char *err, int fd); +int anetTcpKeepAlive(char *err, int fd); + +#endif diff --git a/src/aof.c b/src/aof.c new file mode 100644 index 000000000..51054b296 --- /dev/null +++ b/src/aof.c @@ -0,0 +1,694 @@ +#include "redis.h" + +#include +#include +#include + +/* Called when the user switches from "appendonly yes" to "appendonly no" + * at runtime using the CONFIG command. 
*/ +void stopAppendOnly(void) { + flushAppendOnlyFile(); + aof_fsync(server.appendfd); + close(server.appendfd); + + server.appendfd = -1; + server.appendseldb = -1; + server.appendonly = 0; + /* rewrite operation in progress? kill it, wait child exit. + * NOTE(review): the pid tested, killed and reset here is + * server.bgsavechildpid, while the buffer that gets reset is + * server.bgrewritebuf and the AOF rewrite child is recorded in + * server.bgrewritechildpid elsewhere in this file. Confirm which + * child is meant. */ + if (server.bgsavechildpid != -1) { + int statloc; + + if (kill(server.bgsavechildpid,SIGKILL) != -1) + wait3(&statloc,0,NULL); + /* reset the buffer accumulating changes while the child saves */ + sdsfree(server.bgrewritebuf); + server.bgrewritebuf = sdsempty(); + server.bgsavechildpid = -1; + } +} + +/* Called when the user switches from "appendonly no" to "appendonly yes" + * at runtime using the CONFIG command. + * + * Returns REDIS_OK when the file was opened and a background rewrite was + * started, REDIS_ERR otherwise. + * NOTE(review): when open() fails server.appendonly is left at 1; when the + * background rewrite fails server.appendfd is closed but not reset to -1. + * The second redisLog() call passes strerror(errno) although its format + * string has no conversion for it. */ +int startAppendOnly(void) { + server.appendonly = 1; + server.lastfsync = time(NULL); + server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); + if (server.appendfd == -1) { + redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno)); + return REDIS_ERR; + } + if (rewriteAppendOnlyFileBackground() == REDIS_ERR) { + server.appendonly = 0; + close(server.appendfd); + redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno)); + return REDIS_ERR; + } + return REDIS_OK; +} + +/* Write the append only file buffer on disk. + * + * Since we are required to write the AOF before replying to the client, + * and the only way the client socket can get a write is when entering + * the event loop, we accumulate all the AOF writes in a memory + * buffer and write it on disk using this function just before entering + * the event loop again. */ +void flushAppendOnlyFile(void) { + time_t now; + ssize_t nwritten; + + if (sdslen(server.aofbuf) == 0) return; + + /* We want to perform a single write. This should be guaranteed atomic + * at least if the filesystem we are writing is a real physical one. 
+ * While this will save us against the server being killed I don't think + * there is much to do about the whole server stopping for power problems + * or alike */ + nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); + if (nwritten != (signed)sdslen(server.aofbuf)) { + /* Ooops, we are in troubles. The best thing to do for now is + * aborting instead of giving the illusion that everything is + * working as expected. */ + if (nwritten == -1) { + redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); + } else { + redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); + } + exit(1); + } + sdsfree(server.aofbuf); + server.aofbuf = sdsempty(); + + /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have + * childs performing heavy I/O on disk. */ + if (server.no_appendfsync_on_rewrite && + (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) + return; + /* Fsync if needed */ + now = time(NULL); + if (server.appendfsync == APPENDFSYNC_ALWAYS || + (server.appendfsync == APPENDFSYNC_EVERYSEC && + now-server.lastfsync > 1)) + { + /* aof_fsync is defined as fdatasync() for Linux in order to avoid + * flushing metadata. 
*/ + aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ + server.lastfsync = now; + } +} + +sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { + int j; + buf = sdscatprintf(buf,"*%d\r\n",argc); + for (j = 0; j < argc; j++) { + robj *o = getDecodedObject(argv[j]); + buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); + buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); + buf = sdscatlen(buf,"\r\n",2); + decrRefCount(o); + } + return buf; +} + +sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { + int argc = 3; + long when; + robj *argv[3]; + + /* Make sure we can use strtol */ + seconds = getDecodedObject(seconds); + when = time(NULL)+strtol(seconds->ptr,NULL,10); + decrRefCount(seconds); + + argv[0] = createStringObject("EXPIREAT",8); + argv[1] = key; + argv[2] = createObject(REDIS_STRING, + sdscatprintf(sdsempty(),"%ld",when)); + buf = catAppendOnlyGenericCommand(buf, argc, argv); + decrRefCount(argv[0]); + decrRefCount(argv[2]); + return buf; +} + +void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { + sds buf = sdsempty(); + robj *tmpargv[3]; + + /* The DB this command was targetting is not the same as the last command + * we appendend. To issue a SELECT command is needed. 
*/ + if (dictid != server.appendseldb) { + char seldb[64]; + + snprintf(seldb,sizeof(seldb),"%d",dictid); + buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", + (unsigned long)strlen(seldb),seldb); + server.appendseldb = dictid; + } + + if (cmd->proc == expireCommand) { + /* Translate EXPIRE into EXPIREAT */ + buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); + } else if (cmd->proc == setexCommand) { + /* Translate SETEX to SET and EXPIREAT */ + tmpargv[0] = createStringObject("SET",3); + tmpargv[1] = argv[1]; + tmpargv[2] = argv[3]; + buf = catAppendOnlyGenericCommand(buf,3,tmpargv); + decrRefCount(tmpargv[0]); + buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); + } else { + buf = catAppendOnlyGenericCommand(buf,argc,argv); + } + + /* Append to the AOF buffer. This will be flushed on disk just before + * of re-entering the event loop, so before the client will get a + * positive reply about the operation performed. */ + server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); + + /* If a background append only file rewriting is in progress we want to + * accumulate the differences between the child DB and the current one + * in a buffer, so that when the child process will do its work we + * can append the differences to the new append only file. */ + if (server.bgrewritechildpid != -1) + server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); + + sdsfree(buf); +} + +/* In Redis commands are always executed in the context of a client, so in + * order to load the append only file we need to create a fake client. */ +struct redisClient *createFakeClient(void) { + struct redisClient *c = zmalloc(sizeof(*c)); + + selectDb(c,0); + c->fd = -1; + c->querybuf = sdsempty(); + c->argc = 0; + c->argv = NULL; + c->flags = 0; + /* We set the fake client as a slave waiting for the synchronization + * so that Redis will not try to send replies to this client. 
*/ + c->replstate = REDIS_REPL_WAIT_BGSAVE_START; + c->reply = listCreate(); + listSetFreeMethod(c->reply,decrRefCount); + listSetDupMethod(c->reply,dupClientReplyValue); + initClientMultiState(c); + return c; +} + +void freeFakeClient(struct redisClient *c) { /* Release a client built by createFakeClient(): query buffer, reply list, MULTI state, then the struct itself. */ + sdsfree(c->querybuf); + listRelease(c->reply); + freeClientMultiState(c); + zfree(c); +} + +/* Replay the append log file. On success REDIS_OK is returned. On non fatal + * error (the append only file is zero-length) REDIS_ERR is returned. On + * fatal error an error message is logged and the program exits. + * + * NOTE(review): fileno(fp) is evaluated before fp is compared against NULL, + * and the zero-length early return does not fclose(fp). */ +int loadAppendOnlyFile(char *filename) { + struct redisClient *fakeClient; + FILE *fp = fopen(filename,"r"); + struct redis_stat sb; + int appendonly = server.appendonly; /* saved here; set to 0 below while the file is replayed */ + + if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) + return REDIS_ERR; + + if (fp == NULL) { + redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno)); + exit(1); + } + + /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI + * to the same file we're about to read. 
*/ + server.appendonly = 0; + + fakeClient = createFakeClient(); + while(1) { + int argc, j; + unsigned long len; + robj **argv; + char buf[128]; + sds argsds; + struct redisCommand *cmd; + int force_swapout; + + if (fgets(buf,sizeof(buf),fp) == NULL) { + if (feof(fp)) + break; + else + goto readerr; + } + if (buf[0] != '*') goto fmterr; + argc = atoi(buf+1); + argv = zmalloc(sizeof(robj*)*argc); + for (j = 0; j < argc; j++) { + if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr; + if (buf[0] != '$') goto fmterr; + len = strtol(buf+1,NULL,10); + argsds = sdsnewlen(NULL,len); + if (len && fread(argsds,len,1,fp) == 0) goto fmterr; + argv[j] = createObject(REDIS_STRING,argsds); + if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */ + } + + /* Command lookup */ + cmd = lookupCommand(argv[0]->ptr); + if (!cmd) { + redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr); + exit(1); + } + /* Try object encoding */ + if (cmd->flags & REDIS_CMD_BULK) + argv[argc-1] = tryObjectEncoding(argv[argc-1]); + /* Run the command in the context of a fake client */ + fakeClient->argc = argc; + fakeClient->argv = argv; + cmd->proc(fakeClient); + /* Discard the reply objects list from the fake client */ + while(listLength(fakeClient->reply)) + listDelNode(fakeClient->reply,listFirst(fakeClient->reply)); + /* Clean up, ready for the next command */ + for (j = 0; j < argc; j++) decrRefCount(argv[j]); + zfree(argv); + /* Handle swapping while loading big datasets when VM is on */ + force_swapout = 0; + if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) + force_swapout = 1; + + if (server.vm_enabled && force_swapout) { + while (zmalloc_used_memory() > server.vm_max_memory) { + if (vmSwapOneObjectBlocking() == REDIS_ERR) break; + } + } + } + + /* This point can only be reached when EOF is reached without errors. + * If the client is in the middle of a MULTI/EXEC, log error and quit. 
*/ + if (fakeClient->flags & REDIS_MULTI) goto readerr; + + fclose(fp); + freeFakeClient(fakeClient); + server.appendonly = appendonly; /* put back the value saved at the top of the function */ + return REDIS_OK; + +readerr: /* both labels below log a warning and terminate the process */ + if (feof(fp)) { + redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file"); + } else { + redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno)); + } + exit(1); +fmterr: + redisLog(REDIS_WARNING,"Bad file format reading the append only file"); + exit(1); +} + +/* Write binary-safe string into a file in the bulk format + * $<count>\r\n<payload>\r\n + * The payload write is skipped when len is 0. Returns 1 on success and 0 as + * soon as one of the fwrite() calls fails. */ +int fwriteBulkString(FILE *fp, char *s, unsigned long len) { + char cbuf[128]; + int clen; + cbuf[0] = '$'; + clen = 1+ll2string(cbuf+1,sizeof(cbuf)-1,len); + cbuf[clen++] = '\r'; + cbuf[clen++] = '\n'; + if (fwrite(cbuf,clen,1,fp) == 0) return 0; + if (len > 0 && fwrite(s,len,1,fp) == 0) return 0; + if (fwrite("\r\n",2,1,fp) == 0) return 0; + return 1; +} + +/* Write a double value in bulk format $<count>\r\n<payload>\r\n, where the + * payload is the value printed with "%.17g". Returns 1 on success, 0 on a + * failed fwrite(). */ +int fwriteBulkDouble(FILE *fp, double d) { + char buf[128], dbuf[128]; + + snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d); + snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2); /* -2: the length prefix must not count the trailing CRLF already stored in dbuf */ + if (fwrite(buf,strlen(buf),1,fp) == 0) return 0; + if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0; + return 1; +} + +/* Write a long long value in bulk format $<count>\r\n<payload>\r\n with a + * single fwrite(). Returns 1 on success, 0 on a failed fwrite(). */ +int fwriteBulkLongLong(FILE *fp, long long l) { + char bbuf[128], lbuf[128]; + unsigned int blen, llen; + llen = ll2string(lbuf,32,l); + blen = snprintf(bbuf,sizeof(bbuf),"$%u\r\n%s\r\n",llen,lbuf); + if (fwrite(bbuf,blen,1,fp) == 0) return 0; + return 1; +} + +/* Delegate writing an object to writing a bulk string or bulk long long. */ +int fwriteBulkObject(FILE *fp, robj *obj) { + /* Avoid using getDecodedObject to help copy-on-write (we are often + * in a child process when this function is called). 
*/ + if (obj->encoding == REDIS_ENCODING_INT) { + return fwriteBulkLongLong(fp,(long)obj->ptr); + } else if (obj->encoding == REDIS_ENCODING_RAW) { + return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr)); + } else { + redisPanic("Unknown string encoding"); + } +} + +/* Write a sequence of commands able to fully rebuild the dataset into + * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */ +int rewriteAppendOnlyFile(char *filename) { + dictIterator *di = NULL; + dictEntry *de; + FILE *fp; + char tmpfile[256]; + int j; + time_t now = time(NULL); + + /* Note that we have to use a different temp name here compared to the + * one used by rewriteAppendOnlyFileBackground() function. */ + snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); + fp = fopen(tmpfile,"w"); + if (!fp) { + redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); + return REDIS_ERR; + } + for (j = 0; j < server.dbnum; j++) { + char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; + redisDb *db = server.db+j; + dict *d = db->dict; + if (dictSize(d) == 0) continue; + di = dictGetIterator(d); + if (!di) { + fclose(fp); + return REDIS_ERR; + } + + /* SELECT the new DB */ + if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkLongLong(fp,j) == 0) goto werr; + + /* Iterate this DB writing every entry */ + while((de = dictNext(di)) != NULL) { + sds keystr = dictGetEntryKey(de); + robj key, *o; + time_t expiretime; + int swapped; + + keystr = dictGetEntryKey(de); + o = dictGetEntryVal(de); + initStaticStringObject(key,keystr); + /* If the value for this key is swapped, load a preview in memory. 
+ * We use a "swapped" flag to remember if we need to free the + * value object instead to just increment the ref count anyway + * in order to avoid copy-on-write of pages if we are forked() */ + if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || + o->storage == REDIS_VM_SWAPPING) { + swapped = 0; + } else { + o = vmPreviewObject(o); + swapped = 1; + } + expiretime = getExpire(db,&key); + + /* Save the key and associated value */ + if (o->type == REDIS_STRING) { + /* Emit a SET command */ + char cmd[]="*3\r\n$3\r\nSET\r\n"; + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + /* Key and value */ + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,o) == 0) goto werr; + } else if (o->type == REDIS_LIST) { + /* Emit the RPUSHes needed to rebuild the list */ + char cmd[]="*3\r\n$5\r\nRPUSH\r\n"; + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *zl = o->ptr; + unsigned char *p = ziplistIndex(zl,0); + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + while(ziplistGet(p,&vstr,&vlen,&vlong)) { + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (vstr) { + if (fwriteBulkString(fp,(char*)vstr,vlen) == 0) + goto werr; + } else { + if (fwriteBulkLongLong(fp,vlong) == 0) + goto werr; + } + p = ziplistNext(zl,p); + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + list *list = o->ptr; + listNode *ln; + listIter li; + + listRewind(list,&li); + while((ln = listNext(&li))) { + robj *eleobj = listNodeValue(ln); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + } else { + redisPanic("Unknown list encoding"); + } + } else if (o->type == REDIS_SET) { + /* Emit the SADDs needed to rebuild the set */ + dict *set = o->ptr; + dictIterator *di = dictGetIterator(set); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + char 
cmd[]="*3\r\n$4\r\nSADD\r\n"; + robj *eleobj = dictGetEntryKey(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_ZSET) { + /* Emit the ZADDs needed to rebuild the sorted set. Each command is + * written as key, score, member. + * NOTE(review): the 'goto werr' exits inside the loop leave this + * block without reaching the dictReleaseIterator(di) that follows + * the loop for the iterator declared here. */ + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + char cmd[]="*4\r\n$4\r\nZADD\r\n"; + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkDouble(fp,*score) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_HASH) { + char cmd[]="*4\r\n$4\r\nHSET\r\n"; + + /* Emit the HSETs needed to rebuild the hash. Each command is + * written as key, field, value. + * NOTE(review): the field and value writes in both branches below + * compare the result against -1 and return -1, while every other + * write in this function compares against 0 and jumps to werr. The + * fwriteBulkString() and fwriteBulkLongLong() helpers in this file + * return 0 on failure and 1 on success, so these checks look like + * they can never fire. If they did, the direct return would skip + * the cleanup done at werr. */ + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char *p = zipmapRewind(o->ptr); + unsigned char *field, *val; + unsigned int flen, vlen; + + while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) { + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkString(fp,(char*)field,flen) == -1) + return -1; + if (fwriteBulkString(fp,(char*)val,vlen) == -1) + return -1; + } + } else { + dictIterator *di = dictGetIterator(o->ptr); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + robj *field = dictGetEntryKey(de); + robj *val = dictGetEntryVal(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,field) == -1) return -1; + if (fwriteBulkObject(fp,val) == -1) return -1; + } + dictReleaseIterator(di); + } + } else { + redisPanic("Unknown object type"); + } + /* Save the expire time */ + if (expiretime != -1) { + char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n"; + /* If this key is 
already expired skip it */ + if (expiretime < now) continue; /* NOTE(review): this continue also skips the 'if (swapped) decrRefCount(o)' below, and the commands rebuilding the value were already written above */ + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr; + } + if (swapped) decrRefCount(o); + } + dictReleaseIterator(di); + } + + /* Make sure data will not remain on the OS's output buffers. + * NOTE(review): the results of fflush(), aof_fsync() and fclose() are + * not checked. */ + fflush(fp); + aof_fsync(fileno(fp)); + fclose(fp); + + /* Use RENAME to make sure the DB file is changed atomically only + * if the generated DB file is ok. */ + if (rename(tmpfile,filename) == -1) { + redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); + unlink(tmpfile); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); + return REDIS_OK; + +werr: /* write failure: drop the temp file and release the iterator. NOTE(review): strerror(errno) is read after fclose() and unlink(), which may themselves change errno */ + fclose(fp); + unlink(tmpfile); + redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); + if (di) dictReleaseIterator(di); + return REDIS_ERR; +} + +/* This is how rewriting of the append only file in background works: + * + * 1) The user calls BGREWRITEAOF + * 2) Redis calls this function, that forks(): + * 2a) the child rewrite the append only file in a temp file. + * 2b) the parent accumulates differences in server.bgrewritebuf. + * 3) When the child finished '2a' exists. + * 4) The parent will trap the exit code, if it's OK, will append the + * data accumulated into server.bgrewritebuf into the temp file, and + * finally will rename(2) the temp file in the actual file name. + * The the new file is reopened as the new append only file. Profit! 
+ */ +int rewriteAppendOnlyFileBackground(void) { + pid_t childpid; + + if (server.bgrewritechildpid != -1) return REDIS_ERR; /* a rewrite child is already recorded */ + if (server.vm_enabled) waitEmptyIOJobsQueue(); + if ((childpid = fork()) == 0) { + /* Child: writes the rewrite into a temp file named after its own pid + * and reports the outcome through its exit status (0 when + * rewriteAppendOnlyFile() returned REDIS_OK, 1 otherwise). */ + char tmpfile[256]; + + if (server.vm_enabled) vmReopenSwapFile(); + close(server.fd); + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); + if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { + _exit(0); + } else { + _exit(1); + } + } else { + /* Parent: fork() returned either -1 (failure) or the child pid. */ + if (childpid == -1) { + redisLog(REDIS_WARNING, + "Can't rewrite append only file in background: fork: %s", + strerror(errno)); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE, + "Background append only file rewriting started by pid %d",childpid); /* NOTE(review): childpid is a pid_t passed to %d without the (int) cast used by the snprintf() calls in this file */ + server.bgrewritechildpid = childpid; + updateDictResizePolicy(); + /* We set appendseldb to -1 in order to force the next call to the + * feedAppendOnlyFile() to issue a SELECT command, so the differences + * accumulated by the parent into server.bgrewritebuf will start + * with a SELECT statement and it will be safe to merge. */ + server.appendseldb = -1; + return REDIS_OK; + } + return REDIS_OK; /* unreached */ +} + +void bgrewriteaofCommand(redisClient *c) { /* Command handler: replies with an error if a rewrite child is already recorded, otherwise starts a background rewrite and replies with a status line, or with shared.err if it could not be started. */ + if (server.bgrewritechildpid != -1) { + addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n")); + return; + } + if (rewriteAppendOnlyFileBackground() == REDIS_OK) { + char *status = "+Background append only file rewriting started\r\n"; + addReplySds(c,sdsnew(status)); + } else { + addReply(c,shared.err); + } +} + +void aofRemoveTempFile(pid_t childpid) { /* Rebuild the temp-rewriteaof-bg-<pid>.aof name used by the rewrite child and unlink it; the unlink() result is ignored. */ + char tmpfile[256]; + + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid); + unlink(tmpfile); +} + +/* A background append only file rewriting (BGREWRITEAOF) terminated its work. + * Handle this. 
*/
+void backgroundRewriteDoneHandler(int statloc) {
+    int exitcode = WEXITSTATUS(statloc);
+    int bysignal = WIFSIGNALED(statloc);
+
+    if (!bysignal && exitcode == 0) {
+        int fd;
+        char tmpfile[256];
+        /* Length of the diff accumulated by the parent while the child was
+         * rewriting: computed once and reused for the write and the log. */
+        size_t difflen = sdslen(server.bgrewritebuf);
+
+        redisLog(REDIS_NOTICE,
+            "Background append only file rewriting terminated with success");
+        /* Now it's time to flush the differences accumulated by the parent */
+        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
+        fd = open(tmpfile,O_WRONLY|O_APPEND);
+        if (fd == -1) {
+            redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
+            goto cleanup;
+        }
+        /* Flush our data... write() returns a ssize_t, so compare against
+         * that type instead of truncating the length to an int. */
+        if (write(fd,server.bgrewritebuf,difflen) != (ssize_t) difflen) {
+            redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
+            close(fd);
+            goto cleanup;
+        }
+        /* Cast so that the argument really is the unsigned long that the
+         * %lu conversion expects. */
+        redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",(unsigned long) difflen);
+        /* Now our work is to rename the temp file into the stable file. And
+         * switch the file descriptor used by the server for append only. */
+        if (rename(tmpfile,server.appendfilename) == -1) {
+            redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
+            close(fd);
+            goto cleanup;
+        }
+        /* Mission completed... almost */
+        redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
+        if (server.appendfd != -1) {
+            /* If append only is actually enabled... */
+            close(server.appendfd);
+            server.appendfd = fd;
+            if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
+            server.appendseldb = -1; /* Make sure it will issue SELECT */
+            redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
+        } else {
+            /* If append only is disabled we just generate a dump in this
+             * format. Why not? */
+            close(fd);
+        }
+    } else if (!bysignal && exitcode != 0) {
+        redisLog(REDIS_WARNING, "Background append only file rewriting error");
+    } else {
+        redisLog(REDIS_WARNING,
+            "Background append only file rewriting terminated by signal %d",
+            WTERMSIG(statloc));
+    }
+cleanup:
+    /* Reached on every path: drop the accumulated diff, remove the child
+     * temp file (already renamed away on success) and mark the rewrite as
+     * no longer in progress. */
+    sdsfree(server.bgrewritebuf);
+    server.bgrewritebuf = sdsempty();
+    aofRemoveTempFile(server.bgrewritechildpid);
+    server.bgrewritechildpid = -1;
+}
diff --git a/src/config.c b/src/config.c
new file mode 100644
index 000000000..6d946ee0c
--- /dev/null
+++ b/src/config.c
@@ -0,0 +1,438 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * Config file parsing
+ *----------------------------------------------------------------------------*/
+
+/* Map "yes"/"no" (case insensitive) to 1/0. Anything else returns -1. */
+int yesnotoi(char *s) {
+    if (!strcasecmp(s,"yes")) return 1;
+    else if (!strcasecmp(s,"no")) return 0;
+    else return -1;
+}
+
+/* Append a (seconds, changes) pair to the server.saveparams array. */
+void appendServerSaveParams(time_t seconds, int changes) {
+    server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
+    server.saveparams[server.saveparamslen].seconds = seconds;
+    server.saveparams[server.saveparamslen].changes = changes;
+    server.saveparamslen++;
+}
+
+/* Free the server.saveparams array and reset it to empty. */
+void resetServerSaveParams() {
+    zfree(server.saveparams);
+    server.saveparams = NULL;
+    server.saveparamslen = 0;
+}
+
+/* I agree, this is a very rudimental way to load a configuration... 
+ will improve later if the config gets more complex */
+void loadServerConfig(char *filename) {
+    FILE *fp;
+    char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
+    int linenum = 0;
+    sds line = NULL;
+
+    /* A file name of "-" means: read the configuration from stdin. */
+    if (filename[0] == '-' && filename[1] == '\0')
+        fp = stdin;
+    else {
+        if ((fp = fopen(filename,"r")) == NULL) {
+            redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
+            exit(1);
+        }
+    }
+
+    while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
+        sds *argv;
+        int argc, j;
+
+        linenum++;
+        line = sdsnew(buf);
+        line = sdstrim(line," \t\r\n");
+
+        /* Skip comments and blank lines*/
+        if (line[0] == '#' || line[0] == '\0') {
+            sdsfree(line);
+            continue;
+        }
+
+        /* Split into arguments */
+        argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
+        sdstolower(argv[0]);
+
+        /* Execute config directives */
+        if (!strcasecmp(argv[0],"timeout") && argc == 2) {
+            server.maxidletime = atoi(argv[1]);
+            if (server.maxidletime < 0) {
+                err = "Invalid timeout value"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"port") && argc == 2) {
+            server.port = atoi(argv[1]);
+            if (server.port < 1 || server.port > 65535) {
+                err = "Invalid port"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
+            server.bindaddr = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"save") && argc == 3) {
+            int seconds = atoi(argv[1]);
+            int changes = atoi(argv[2]);
+            if (seconds < 1 || changes < 0) {
+                err = "Invalid save parameters"; goto loaderr;
+            }
+            appendServerSaveParams(seconds,changes);
+        } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
+            if (chdir(argv[1]) == -1) {
+                redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
+                    argv[1], strerror(errno));
+                exit(1);
+            }
+        } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
+            if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
+            else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
+            else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
+            else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
+            else {
+                /* List every level accepted above, "verbose" included. */
+                err = "Invalid log level. Must be one of debug, verbose, notice, warning";
+                goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
+            FILE *logfp;
+
+            server.logfile = zstrdup(argv[1]);
+            if (!strcasecmp(server.logfile,"stdout")) {
+                zfree(server.logfile);
+                server.logfile = NULL;
+            }
+            if (server.logfile) {
+                /* Test if we are able to open the file. The server will not
+                 * be able to abort just for this problem later... */
+                logfp = fopen(server.logfile,"a");
+                if (logfp == NULL) {
+                    err = sdscatprintf(sdsempty(),
+                        "Can't open the log file: %s", strerror(errno));
+                    goto loaderr;
+                }
+                fclose(logfp);
+            }
+        } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
+            server.dbnum = atoi(argv[1]);
+            if (server.dbnum < 1) {
+                err = "Invalid number of databases"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"include") && argc == 2) {
+            /* Nested config file: parsed recursively, in place. */
+            loadServerConfig(argv[1]);
+        } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
+            server.maxclients = atoi(argv[1]);
+        } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
+            server.maxmemory = memtoll(argv[1],NULL);
+        } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
+            server.masterhost = sdsnew(argv[1]);
+            server.masterport = atoi(argv[2]);
+            server.replstate = REDIS_REPL_CONNECT;
+        } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
+            /* Release the previous value first (as CONFIG SET does) so a
+             * repeated or included directive does not leak it. */
+            zfree(server.masterauth);
+            server.masterauth = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
+            if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
+            if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
+            if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
+            if ((server.daemonize = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
+            if ((server.appendonly = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
+            zfree(server.appendfilename);
+            server.appendfilename = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
+                   && argc == 2) {
+            if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
+            if (!strcasecmp(argv[1],"no")) {
+                server.appendfsync = APPENDFSYNC_NO;
+            } else if (!strcasecmp(argv[1],"always")) {
+                server.appendfsync = APPENDFSYNC_ALWAYS;
+            } else if (!strcasecmp(argv[1],"everysec")) {
+                server.appendfsync = APPENDFSYNC_EVERYSEC;
+            } else {
+                err = "argument must be 'no', 'always' or 'everysec'";
+                goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
+            /* Same as masterauth: free the old password before replacing. */
+            zfree(server.requirepass);
+            server.requirepass = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
+            zfree(server.pidfile);
+            server.pidfile = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
+            zfree(server.dbfilename);
+            server.dbfilename = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
+            if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
+            zfree(server.vm_swap_file);
+            server.vm_swap_file = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
+            server.vm_max_memory = memtoll(argv[1],NULL);
+        } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
+            server.vm_page_size = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
+            server.vm_pages = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
+            server.vm_max_threads = strtoll(argv[1], NULL, 10);
+        } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
+            server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
+            server.hash_max_zipmap_value = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
+            server.list_max_ziplist_entries = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
+            server.list_max_ziplist_value = memtoll(argv[1], NULL);
+        } else {
+            err = "Bad directive or wrong number of arguments"; goto loaderr;
+        }
+        for (j = 0; j < argc; j++)
+            sdsfree(argv[j]);
+        zfree(argv);
+        sdsfree(line);
+    }
+    if (fp != stdin) fclose(fp);
+    return;
+
+loaderr:
+    /* Fatal: report the offending line and terminate the process. */
+    fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
+    fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
+    fprintf(stderr, ">>> '%s'\n", line);
+    fprintf(stderr, "%s\n", err);
+    exit(1);
+}
+
+/*-----------------------------------------------------------------------------
+ * CONFIG command for remote configuration
+ *----------------------------------------------------------------------------*/
+
+void configSetCommand(redisClient *c) {
+    robj *o = getDecodedObject(c->argv[3]);
+    long long ll;
+
+    if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
+        zfree(server.dbfilename);
+        server.dbfilename = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
+        zfree(server.requirepass);
+        server.requirepass = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
+        zfree(server.masterauth);
+        server.masterauth = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
+        if 
(getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0) goto badfmt; + server.maxmemory = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) { + if (getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0 || ll > LONG_MAX) goto badfmt; + server.maxidletime = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) { + if (!strcasecmp(o->ptr,"no")) { + server.appendfsync = APPENDFSYNC_NO; + } else if (!strcasecmp(o->ptr,"everysec")) { + server.appendfsync = APPENDFSYNC_EVERYSEC; + } else if (!strcasecmp(o->ptr,"always")) { + server.appendfsync = APPENDFSYNC_ALWAYS; + } else { + goto badfmt; + } + } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) { + int yn = yesnotoi(o->ptr); + + if (yn == -1) goto badfmt; + server.no_appendfsync_on_rewrite = yn; + } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) { + int old = server.appendonly; + int new = yesnotoi(o->ptr); + + if (new == -1) goto badfmt; + if (old != new) { + if (new == 0) { + stopAppendOnly(); + } else { + if (startAppendOnly() == REDIS_ERR) { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR Unable to turn on AOF. 
Check server logs.\r\n")); + decrRefCount(o); + return; + } + } + } + } else if (!strcasecmp(c->argv[2]->ptr,"save")) { + int vlen, j; + sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen); + + /* Perform sanity check before setting the new config: + * - Even number of args + * - Seconds >= 1, changes >= 0 */ + if (vlen & 1) { + sdsfreesplitres(v,vlen); + goto badfmt; + } + for (j = 0; j < vlen; j++) { + char *eptr; + long val; + + val = strtoll(v[j], &eptr, 10); + if (eptr[0] != '\0' || + ((j & 1) == 0 && val < 1) || + ((j & 1) == 1 && val < 0)) { + sdsfreesplitres(v,vlen); + goto badfmt; + } + } + /* Finally set the new config */ + resetServerSaveParams(); + for (j = 0; j < vlen; j += 2) { + time_t seconds; + int changes; + + seconds = strtoll(v[j],NULL,10); + changes = strtoll(v[j+1],NULL,10); + appendServerSaveParams(seconds, changes); + } + sdsfreesplitres(v,vlen); + } else { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR not supported CONFIG parameter %s\r\n", + (char*)c->argv[2]->ptr)); + decrRefCount(o); + return; + } + decrRefCount(o); + addReply(c,shared.ok); + return; + +badfmt: /* Bad format errors */ + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n", + (char*)o->ptr, + (char*)c->argv[2]->ptr)); + decrRefCount(o); +} + +void configGetCommand(redisClient *c) { + robj *o = getDecodedObject(c->argv[2]); + robj *lenobj = createObject(REDIS_STRING,NULL); + char *pattern = o->ptr; + int matches = 0; + + addReply(c,lenobj); + decrRefCount(lenobj); + + if (stringmatch(pattern,"dbfilename",0)) { + addReplyBulkCString(c,"dbfilename"); + addReplyBulkCString(c,server.dbfilename); + matches++; + } + if (stringmatch(pattern,"requirepass",0)) { + addReplyBulkCString(c,"requirepass"); + addReplyBulkCString(c,server.requirepass); + matches++; + } + if (stringmatch(pattern,"masterauth",0)) { + addReplyBulkCString(c,"masterauth"); + addReplyBulkCString(c,server.masterauth); + matches++; + } + if 
(stringmatch(pattern,"maxmemory",0)) { + char buf[128]; + + ll2string(buf,128,server.maxmemory); + addReplyBulkCString(c,"maxmemory"); + addReplyBulkCString(c,buf); + matches++; + } + if (stringmatch(pattern,"timeout",0)) { + char buf[128]; + + ll2string(buf,128,server.maxidletime); + addReplyBulkCString(c,"timeout"); + addReplyBulkCString(c,buf); + matches++; + } + if (stringmatch(pattern,"appendonly",0)) { + addReplyBulkCString(c,"appendonly"); + addReplyBulkCString(c,server.appendonly ? "yes" : "no"); + matches++; + } + if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) { + addReplyBulkCString(c,"no-appendfsync-on-rewrite"); + addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no"); + matches++; + } + if (stringmatch(pattern,"appendfsync",0)) { + char *policy; + + switch(server.appendfsync) { + case APPENDFSYNC_NO: policy = "no"; break; + case APPENDFSYNC_EVERYSEC: policy = "everysec"; break; + case APPENDFSYNC_ALWAYS: policy = "always"; break; + default: policy = "unknown"; break; /* too harmless to panic */ + } + addReplyBulkCString(c,"appendfsync"); + addReplyBulkCString(c,policy); + matches++; + } + if (stringmatch(pattern,"save",0)) { + sds buf = sdsempty(); + int j; + + for (j = 0; j < server.saveparamslen; j++) { + buf = sdscatprintf(buf,"%ld %d", + server.saveparams[j].seconds, + server.saveparams[j].changes); + if (j != server.saveparamslen-1) + buf = sdscatlen(buf," ",1); + } + addReplyBulkCString(c,"save"); + addReplyBulkCString(c,buf); + sdsfree(buf); + matches++; + } + decrRefCount(o); + lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2); +} + +void configCommand(redisClient *c) { + if (!strcasecmp(c->argv[1]->ptr,"set")) { + if (c->argc != 4) goto badarity; + configSetCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"get")) { + if (c->argc != 3) goto badarity; + configGetCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) { + if (c->argc != 2) goto badarity; + server.stat_numcommands = 0; + 
server.stat_numconnections = 0; + server.stat_expiredkeys = 0; + server.stat_starttime = time(NULL); + addReply(c,shared.ok); + } else { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n")); + } + return; + +badarity: + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR Wrong number of arguments for CONFIG %s\r\n", + (char*) c->argv[1]->ptr)); +} diff --git a/src/config.h b/src/config.h new file mode 100644 index 000000000..6e98fbb2c --- /dev/null +++ b/src/config.h @@ -0,0 +1,45 @@ +#ifndef __CONFIG_H +#define __CONFIG_H + +#ifdef __APPLE__ +#include +#endif + +/* test for malloc_size() */ +#ifdef __APPLE__ +#include +#define HAVE_MALLOC_SIZE 1 +#define redis_malloc_size(p) malloc_size(p) +#endif + +/* define redis_fstat to fstat or fstat64() */ +#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) +#define redis_fstat fstat64 +#define redis_stat stat64 +#else +#define redis_fstat fstat +#define redis_stat stat +#endif + +/* test for backtrace() */ +#if defined(__APPLE__) || defined(__linux__) +#define HAVE_BACKTRACE 1 +#endif + +/* test for polling API */ +#ifdef __linux__ +#define HAVE_EPOLL 1 +#endif + +#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) +#define HAVE_KQUEUE 1 +#endif + +/* define aof_fsync to fdatasync() in Linux and fsync() for all the rest */ +#ifdef __linux__ +#define aof_fsync fdatasync +#else +#define aof_fsync fsync +#endif + +#endif diff --git a/src/db.c b/src/db.c new file mode 100644 index 000000000..e1e82cb22 --- /dev/null +++ b/src/db.c @@ -0,0 +1,508 @@ +#include "redis.h" + +#include + +/*----------------------------------------------------------------------------- + * C-level DB API + *----------------------------------------------------------------------------*/ + +robj *lookupKey(redisDb *db, robj *key) { + dictEntry *de = dictFind(db->dict,key->ptr); + if (de) { + robj *val = 
dictGetEntryVal(de); + + if (server.vm_enabled) { + if (val->storage == REDIS_VM_MEMORY || + val->storage == REDIS_VM_SWAPPING) + { + /* If we were swapping the object out, cancel the operation */ + if (val->storage == REDIS_VM_SWAPPING) + vmCancelThreadedIOJob(val); + /* Update the access time for the aging algorithm. */ + val->lru = server.lruclock; + } else { + int notify = (val->storage == REDIS_VM_LOADING); + + /* Our value was swapped on disk. Bring it at home. */ + redisAssert(val->type == REDIS_VMPOINTER); + val = vmLoadObject(val); + dictGetEntryVal(de) = val; + + /* Clients blocked by the VM subsystem may be waiting for + * this key... */ + if (notify) handleClientsBlockedOnSwappedKey(db,key); + } + } + return val; + } else { + return NULL; + } +} + +robj *lookupKeyRead(redisDb *db, robj *key) { + expireIfNeeded(db,key); + return lookupKey(db,key); +} + +robj *lookupKeyWrite(redisDb *db, robj *key) { + deleteIfVolatile(db,key); + touchWatchedKey(db,key); + return lookupKey(db,key); +} + +robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) { + robj *o = lookupKeyRead(c->db, key); + if (!o) addReply(c,reply); + return o; +} + +robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) { + robj *o = lookupKeyWrite(c->db, key); + if (!o) addReply(c,reply); + return o; +} + +/* Add the key to the DB. If the key already exists REDIS_ERR is returned, + * otherwise REDIS_OK is returned, and the caller should increment the + * refcount of 'val'. */ +int dbAdd(redisDb *db, robj *key, robj *val) { + /* Perform a lookup before adding the key, as we need to copy the + * key value. */ + if (dictFind(db->dict, key->ptr) != NULL) { + return REDIS_ERR; + } else { + sds copy = sdsdup(key->ptr); + dictAdd(db->dict, copy, val); + return REDIS_OK; + } +} + +/* If the key does not exist, this is just like dbAdd(). Otherwise + * the value associated to the key is replaced with the new one. + * + * On update (key already existed) 0 is returned. 
Otherwise 1. */ +int dbReplace(redisDb *db, robj *key, robj *val) { + if (dictFind(db->dict,key->ptr) == NULL) { + sds copy = sdsdup(key->ptr); + dictAdd(db->dict, copy, val); + return 1; + } else { + dictReplace(db->dict, key->ptr, val); + return 0; + } +} + +int dbExists(redisDb *db, robj *key) { + return dictFind(db->dict,key->ptr) != NULL; +} + +/* Return a random key, in form of a Redis object. + * If there are no keys, NULL is returned. + * + * The function makes sure to return keys not already expired. */ +robj *dbRandomKey(redisDb *db) { + struct dictEntry *de; + + while(1) { + sds key; + robj *keyobj; + + de = dictGetRandomKey(db->dict); + if (de == NULL) return NULL; + + key = dictGetEntryKey(de); + keyobj = createStringObject(key,sdslen(key)); + if (dictFind(db->expires,key)) { + if (expireIfNeeded(db,keyobj)) { + decrRefCount(keyobj); + continue; /* search for another key. This expired. */ + } + } + return keyobj; + } +} + +/* Delete a key, value, and associated expiration entry if any, from the DB */ +int dbDelete(redisDb *db, robj *key) { + /* Deleting an entry from the expires dict will not free the sds of + * the key, because it is shared with the main dictionary. 
*/ + if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr); + return dictDelete(db->dict,key->ptr) == DICT_OK; +} + +/* Empty the whole database */ +long long emptyDb() { + int j; + long long removed = 0; + + for (j = 0; j < server.dbnum; j++) { + removed += dictSize(server.db[j].dict); + dictEmpty(server.db[j].dict); + dictEmpty(server.db[j].expires); + } + return removed; +} + +int selectDb(redisClient *c, int id) { + if (id < 0 || id >= server.dbnum) + return REDIS_ERR; + c->db = &server.db[id]; + return REDIS_OK; +} + +/*----------------------------------------------------------------------------- + * Type agnostic commands operating on the key space + *----------------------------------------------------------------------------*/ + +void flushdbCommand(redisClient *c) { + server.dirty += dictSize(c->db->dict); + touchWatchedKeysOnFlush(c->db->id); + dictEmpty(c->db->dict); + dictEmpty(c->db->expires); + addReply(c,shared.ok); +} + +void flushallCommand(redisClient *c) { + touchWatchedKeysOnFlush(-1); + server.dirty += emptyDb(); + addReply(c,shared.ok); + if (server.bgsavechildpid != -1) { + kill(server.bgsavechildpid,SIGKILL); + rdbRemoveTempFile(server.bgsavechildpid); + } + rdbSave(server.dbfilename); + server.dirty++; +} + +void delCommand(redisClient *c) { + int deleted = 0, j; + + for (j = 1; j < c->argc; j++) { + if (dbDelete(c->db,c->argv[j])) { + touchWatchedKey(c->db,c->argv[j]); + server.dirty++; + deleted++; + } + } + addReplyLongLong(c,deleted); +} + +void existsCommand(redisClient *c) { + expireIfNeeded(c->db,c->argv[1]); + if (dbExists(c->db,c->argv[1])) { + addReply(c, shared.cone); + } else { + addReply(c, shared.czero); + } +} + +void selectCommand(redisClient *c) { + int id = atoi(c->argv[1]->ptr); + + if (selectDb(c,id) == REDIS_ERR) { + addReplySds(c,sdsnew("-ERR invalid DB index\r\n")); + } else { + addReply(c,shared.ok); + } +} + +void randomkeyCommand(redisClient *c) { + robj *key; + + if ((key = dbRandomKey(c->db)) == NULL) { 
+ addReply(c,shared.nullbulk); + return; + } + + addReplyBulk(c,key); + decrRefCount(key); +} + +void keysCommand(redisClient *c) { + dictIterator *di; + dictEntry *de; + sds pattern = c->argv[1]->ptr; + int plen = sdslen(pattern); + unsigned long numkeys = 0; + robj *lenobj = createObject(REDIS_STRING,NULL); + + di = dictGetIterator(c->db->dict); + addReply(c,lenobj); + decrRefCount(lenobj); + while((de = dictNext(di)) != NULL) { + sds key = dictGetEntryKey(de); + robj *keyobj; + + if ((pattern[0] == '*' && pattern[1] == '\0') || + stringmatchlen(pattern,plen,key,sdslen(key),0)) { + keyobj = createStringObject(key,sdslen(key)); + if (expireIfNeeded(c->db,keyobj) == 0) { + addReplyBulk(c,keyobj); + numkeys++; + } + decrRefCount(keyobj); + } + } + dictReleaseIterator(di); + lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys); +} + +void dbsizeCommand(redisClient *c) { + addReplySds(c, + sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict))); +} + +void lastsaveCommand(redisClient *c) { + addReplySds(c, + sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave)); +} + +void typeCommand(redisClient *c) { + robj *o; + char *type; + + o = lookupKeyRead(c->db,c->argv[1]); + if (o == NULL) { + type = "+none"; + } else { + switch(o->type) { + case REDIS_STRING: type = "+string"; break; + case REDIS_LIST: type = "+list"; break; + case REDIS_SET: type = "+set"; break; + case REDIS_ZSET: type = "+zset"; break; + case REDIS_HASH: type = "+hash"; break; + default: type = "+unknown"; break; + } + } + addReplySds(c,sdsnew(type)); + addReply(c,shared.crlf); +} + +void saveCommand(redisClient *c) { + if (server.bgsavechildpid != -1) { + addReplySds(c,sdsnew("-ERR background save in progress\r\n")); + return; + } + if (rdbSave(server.dbfilename) == REDIS_OK) { + addReply(c,shared.ok); + } else { + addReply(c,shared.err); + } +} + +void bgsaveCommand(redisClient *c) { + if (server.bgsavechildpid != -1) { + addReplySds(c,sdsnew("-ERR background save already in progress\r\n")); + 
return; + } + if (rdbSaveBackground(server.dbfilename) == REDIS_OK) { + char *status = "+Background saving started\r\n"; + addReplySds(c,sdsnew(status)); + } else { + addReply(c,shared.err); + } +} + +void shutdownCommand(redisClient *c) { + if (prepareForShutdown() == REDIS_OK) + exit(0); + addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n")); +} + +void renameGenericCommand(redisClient *c, int nx) { + robj *o; + + /* To use the same key as src and dst is probably an error */ + if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) { + addReply(c,shared.sameobjecterr); + return; + } + + if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL) + return; + + incrRefCount(o); + deleteIfVolatile(c->db,c->argv[2]); + if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) { + if (nx) { + decrRefCount(o); + addReply(c,shared.czero); + return; + } + dbReplace(c->db,c->argv[2],o); + } + dbDelete(c->db,c->argv[1]); + touchWatchedKey(c->db,c->argv[2]); + server.dirty++; + addReply(c,nx ? shared.cone : shared.ok); +} + +void renameCommand(redisClient *c) { + renameGenericCommand(c,0); +} + +void renamenxCommand(redisClient *c) { + renameGenericCommand(c,1); +} + +void moveCommand(redisClient *c) { + robj *o; + redisDb *src, *dst; + int srcid; + + /* Obtain source and target DB pointers */ + src = c->db; + srcid = c->db->id; + if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) { + addReply(c,shared.outofrangeerr); + return; + } + dst = c->db; + selectDb(c,srcid); /* Back to the source DB */ + + /* If the user is moving using as target the same + * DB as the source DB it is probably an error. 
*/ + if (src == dst) { + addReply(c,shared.sameobjecterr); + return; + } + + /* Check if the element exists and get a reference */ + o = lookupKeyWrite(c->db,c->argv[1]); + if (!o) { + addReply(c,shared.czero); + return; + } + + /* Try to add the element to the target DB */ + deleteIfVolatile(dst,c->argv[1]); + if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) { + addReply(c,shared.czero); + return; + } + incrRefCount(o); + + /* OK! key moved, free the entry in the source DB */ + dbDelete(src,c->argv[1]); + server.dirty++; + addReply(c,shared.cone); +} + +/*----------------------------------------------------------------------------- + * Expires API + *----------------------------------------------------------------------------*/ + +int removeExpire(redisDb *db, robj *key) { + /* An expire may only be removed if there is a corresponding entry in the + * main dict. Otherwise, the key will never be freed. */ + redisAssert(dictFind(db->dict,key->ptr) != NULL); + if (dictDelete(db->expires,key->ptr) == DICT_OK) { + return 1; + } else { + return 0; + } +} + +int setExpire(redisDb *db, robj *key, time_t when) { + dictEntry *de; + + /* Reuse the sds from the main dict in the expire dict */ + redisAssert((de = dictFind(db->dict,key->ptr)) != NULL); + if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) { + return 0; + } else { + return 1; + } +} + +/* Return the expire time of the specified key, or -1 if no expire + * is associated with this key (i.e. the key is non volatile) */ +time_t getExpire(redisDb *db, robj *key) { + dictEntry *de; + + /* No expire? return ASAP */ + if (dictSize(db->expires) == 0 || + (de = dictFind(db->expires,key->ptr)) == NULL) return -1; + + /* The entry was found in the expire dict, this means it should also + * be present in the main dict (safety check). 
*/
+    redisAssert(dictFind(db->dict,key->ptr) != NULL);
+    return (time_t) dictGetEntryVal(de);
+}
+
+/* Delete 'key' from 'db' if it has an expire time that is already in the
+ * past. Returns 0 when the key has no expire or has not expired yet,
+ * otherwise the result of dbDelete(). */
+int expireIfNeeded(redisDb *db, robj *key) {
+    time_t when = getExpire(db,key);
+    if (when < 0) return 0;
+
+    /* Return when this key has not expired */
+    if (time(NULL) <= when) return 0;
+
+    /* Delete the key */
+    server.stat_expiredkeys++;
+    server.dirty++;
+    return dbDelete(db,key);
+}
+
+/* Delete 'key' from 'db' if it has an expire set at all, whether or not
+ * the expire time was reached. Returns 0 if the key has no expire,
+ * otherwise the result of dbDelete(). */
+int deleteIfVolatile(redisDb *db, robj *key) {
+    if (getExpire(db,key) < 0) return 0;
+
+    /* Delete the key */
+    server.stat_expiredkeys++;
+    server.dirty++;
+    return dbDelete(db,key);
+}
+
+/*-----------------------------------------------------------------------------
+ * Expires Commands
+ *----------------------------------------------------------------------------*/
+
+/* Shared implementation of EXPIRE and EXPIREAT. 'param' holds the numeric
+ * argument and 'offset' is subtracted from it, so that the result is a
+ * number of seconds from now (0 for EXPIRE, the current time for EXPIREAT).
+ * Replies 0 if the key does not exist or the expire could not be set,
+ * 1 otherwise. A non positive timeout deletes the key right away. */
+void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
+    dictEntry *de;
+    /* Must be a long, not a time_t: its address is handed to
+     * getLongFromObjectOrReply(), and a time_t* is an incompatible pointer
+     * on every platform where time_t is not a long. */
+    long seconds;
+
+    if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
+
+    seconds -= offset;
+
+    de = dictFind(c->db->dict,key->ptr);
+    if (de == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (seconds <= 0) {
+        if (dbDelete(c->db,key)) server.dirty++;
+        addReply(c, shared.cone);
+        return;
+    } else {
+        time_t when = time(NULL)+seconds;
+        if (setExpire(c->db,key,when)) {
+            addReply(c,shared.cone);
+            server.dirty++;
+        } else {
+            addReply(c,shared.czero);
+        }
+        return;
+    }
+}
+
+void expireCommand(redisClient *c) {
+    expireGenericCommand(c,c->argv[1],c->argv[2],0);
+}
+
+void expireatCommand(redisClient *c) {
+    expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
+}
+
+/* TTL: reply with the seconds left before the key expires, or -1 when the
+ * key has no expire or the computed value is negative. */
+void ttlCommand(redisClient *c) {
+    time_t expire;
+    int ttl = -1;
+
+    expire = getExpire(c->db,c->argv[1]);
+    if (expire != -1) {
+        ttl = (int) (expire-time(NULL));
+        if (ttl < 0) ttl = -1;
+    }
+    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
+}
+
+
diff --git a/src/debug.c b/src/debug.c
new file mode 100644
index 000000000..10b620d6f
--- /dev/null
+++ b/src/debug.c
@@ -0,0 +1,309 @@
+#include 
"redis.h" +#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */ + +/* ================================= Debugging ============================== */ + +/* Compute the sha1 of string at 's' with 'len' bytes long. + * The SHA1 is then xored againt the string pointed by digest. + * Since xor is commutative, this operation is used in order to + * "add" digests relative to unordered elements. + * + * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */ +void xorDigest(unsigned char *digest, void *ptr, size_t len) { + SHA1_CTX ctx; + unsigned char hash[20], *s = ptr; + int j; + + SHA1Init(&ctx); + SHA1Update(&ctx,s,len); + SHA1Final(hash,&ctx); + + for (j = 0; j < 20; j++) + digest[j] ^= hash[j]; +} + +void xorObjectDigest(unsigned char *digest, robj *o) { + o = getDecodedObject(o); + xorDigest(digest,o->ptr,sdslen(o->ptr)); + decrRefCount(o); +} + +/* This function instead of just computing the SHA1 and xoring it + * against diget, also perform the digest of "digest" itself and + * replace the old value with the new one. + * + * So the final digest will be: + * + * digest = SHA1(digest xor SHA1(data)) + * + * This function is used every time we want to preserve the order so + * that digest(a,b,c,d) will be different than digest(b,c,d,a) + * + * Also note that mixdigest("foo") followed by mixdigest("bar") + * will lead to a different digest compared to "fo", "obar". + */ +void mixDigest(unsigned char *digest, void *ptr, size_t len) { + SHA1_CTX ctx; + char *s = ptr; + + xorDigest(digest,s,len); + SHA1Init(&ctx); + SHA1Update(&ctx,digest,20); + SHA1Final(digest,&ctx); +} + +void mixObjectDigest(unsigned char *digest, robj *o) { + o = getDecodedObject(o); + mixDigest(digest,o->ptr,sdslen(o->ptr)); + decrRefCount(o); +} + +/* Compute the dataset digest. Since keys, sets elements, hashes elements + * are not ordered, we use a trick: every aggregate digest is the xor + * of the digests of their elements. This way the order will not change + * the result. 
For list instead we use a feedback entering the output digest
+ * as input in order to ensure that a different ordered list will result in
+ * a different digest. */
+void computeDatasetDigest(unsigned char *final) {
+    unsigned char digest[20];
+    char buf[128];
+    dictIterator *di = NULL;
+    dictEntry *de;
+    int j;
+    uint32_t aux;
+
+    memset(final,0,20); /* Start with a clean result */
+
+    for (j = 0; j < server.dbnum; j++) {
+        redisDb *db = server.db+j;
+
+        if (dictSize(db->dict) == 0) continue;
+        di = dictGetIterator(db->dict);
+
+        /* hash the DB id, so the same dataset moved in a different
+         * DB will lead to a different digest */
+        aux = htonl(j);
+        mixDigest(final,&aux,sizeof(aux));
+
+        /* Iterate this DB writing every entry */
+        while((de = dictNext(di)) != NULL) {
+            sds key;
+            robj *keyobj, *o;
+            time_t expiretime;
+
+            memset(digest,0,20); /* This key-val digest */
+            key = dictGetEntryKey(de);
+            keyobj = createStringObject(key,sdslen(key));
+
+            mixDigest(digest,key,sdslen(key));
+
+            /* Make sure the key is loaded if VM is active */
+            o = lookupKeyRead(db,keyobj);
+            if (o == NULL) {
+                /* lookupKeyRead() expires the key first, so an already
+                 * expired key is deleted and NULL comes back. Skip it
+                 * instead of dereferencing NULL: it is no longer part of
+                 * the dataset. NOTE(review): this relies on dictNext()
+                 * tolerating removal of the entry it just returned, as
+                 * the KEYS implementation already assumes. */
+                decrRefCount(keyobj);
+                continue;
+            }
+
+            aux = htonl(o->type);
+            mixDigest(digest,&aux,sizeof(aux));
+            expiretime = getExpire(db,keyobj);
+
+            /* Save the key and associated value */
+            if (o->type == REDIS_STRING) {
+                mixObjectDigest(digest,o);
+            } else if (o->type == REDIS_LIST) {
+                /* Ordered: feed every element through mixDigest. */
+                listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
+                listTypeEntry entry;
+                while(listTypeNext(li,&entry)) {
+                    robj *eleobj = listTypeGet(&entry);
+                    mixObjectDigest(digest,eleobj);
+                    decrRefCount(eleobj);
+                }
+                listTypeReleaseIterator(li);
+            } else if (o->type == REDIS_SET) {
+                /* Unordered: xor the element digests together. */
+                dict *set = o->ptr;
+                dictIterator *setdi = dictGetIterator(set);
+                dictEntry *setde;
+
+                while((setde = dictNext(setdi)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(setde);
+
+                    xorObjectDigest(digest,eleobj);
+                }
+                dictReleaseIterator(setdi);
+            } else if (o->type == REDIS_ZSET) {
+                /* Each (element, score) pair gets its own ordered digest,
+                 * then the pairs are xored so their order is irrelevant. */
+                zset *zs = o->ptr;
+                dictIterator *zdi = dictGetIterator(zs->dict);
+                dictEntry *zde;
+
+                while((zde = dictNext(zdi)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(zde);
+                    double *score = dictGetEntryVal(zde);
+                    unsigned char eledigest[20];
+
+                    snprintf(buf,sizeof(buf),"%.17g",*score);
+                    memset(eledigest,0,20);
+                    mixObjectDigest(eledigest,eleobj);
+                    mixDigest(eledigest,buf,strlen(buf));
+                    xorDigest(digest,eledigest,20);
+                }
+                dictReleaseIterator(zdi);
+            } else if (o->type == REDIS_HASH) {
+                /* Same scheme as sorted sets, with (field, value) pairs. */
+                hashTypeIterator *hi;
+                robj *obj;
+
+                hi = hashTypeInitIterator(o);
+                while (hashTypeNext(hi) != REDIS_ERR) {
+                    unsigned char eledigest[20];
+
+                    memset(eledigest,0,20);
+                    obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
+                    mixObjectDigest(eledigest,obj);
+                    decrRefCount(obj);
+                    obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
+                    mixObjectDigest(eledigest,obj);
+                    decrRefCount(obj);
+                    xorDigest(digest,eledigest,20);
+                }
+                hashTypeReleaseIterator(hi);
+            } else {
+                redisPanic("Unknown object type");
+            }
+            /* If the key has an expire, add it to the mix */
+            if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
+            /* We can finally xor the key-val digest to the final digest */
+            xorDigest(final,digest,20);
+            decrRefCount(keyobj);
+        }
+        dictReleaseIterator(di);
+    }
+}
+
+void debugCommand(redisClient *c) {
+    if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
+        *((char*)-1) = 'x';
+    } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
+        if (rdbSave(server.dbfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        emptyDb();
+        if (rdbLoad(server.dbfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
+        emptyDb();
+        if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
+        dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
+        robj *val;
+
+        if 
(!de) { + addReply(c,shared.nokeyerr); + return; + } + val = dictGetEntryVal(de); + if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY || + val->storage == REDIS_VM_SWAPPING)) { + char *strenc; + + strenc = strEncoding(val->encoding); + addReplySds(c,sdscatprintf(sdsempty(), + "+Value at:%p refcount:%d " + "encoding:%s serializedlength:%lld\r\n", + (void*)val, val->refcount, + strenc, (long long) rdbSavedObjectLen(val,NULL))); + } else { + vmpointer *vp = (vmpointer*) val; + addReplySds(c,sdscatprintf(sdsempty(), + "+Value swapped at: page %llu " + "using %llu pages\r\n", + (unsigned long long) vp->page, + (unsigned long long) vp->usedpages)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) { + lookupKeyRead(c->db,c->argv[2]); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) { + dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr); + robj *val; + vmpointer *vp; + + if (!server.vm_enabled) { + addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n")); + return; + } + if (!de) { + addReply(c,shared.nokeyerr); + return; + } + val = dictGetEntryVal(de); + /* Swap it */ + if (val->storage != REDIS_VM_MEMORY) { + addReplySds(c,sdsnew("-ERR This key is not in memory\r\n")); + } else if (val->refcount != 1) { + addReplySds(c,sdsnew("-ERR Object is shared\r\n")); + } else if ((vp = vmSwapObjectBlocking(val)) != NULL) { + dictGetEntryVal(de) = vp; + addReply(c,shared.ok); + } else { + addReply(c,shared.err); + } + } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) { + long keys, j; + robj *key, *val; + char buf[128]; + + if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK) + return; + for (j = 0; j < keys; j++) { + snprintf(buf,sizeof(buf),"key:%lu",j); + key = createStringObject(buf,strlen(buf)); + if (lookupKeyRead(c->db,key) != NULL) { + decrRefCount(key); + continue; + } + snprintf(buf,sizeof(buf),"value:%lu",j); + val = createStringObject(buf,strlen(buf)); 
+ dbAdd(c->db,key,val); + decrRefCount(key); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) { + unsigned char digest[20]; + sds d = sdsnew("+"); + int j; + + computeDatasetDigest(digest); + for (j = 0; j < 20; j++) + d = sdscatprintf(d, "%02x",digest[j]); + + d = sdscatlen(d,"\r\n",2); + addReplySds(c,d); + } else { + addReplySds(c,sdsnew( + "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT |SWAPIN |SWAPOUT |RELOAD]\r\n")); + } +} + +void _redisAssert(char *estr, char *file, int line) { + redisLog(REDIS_WARNING,"=== ASSERTION FAILED ==="); + redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr); +#ifdef HAVE_BACKTRACE + redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); + *((char*)-1) = 'x'; +#endif +} + +void _redisPanic(char *msg, char *file, int line) { + redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue"); + redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line); +#ifdef HAVE_BACKTRACE + redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); + *((char*)-1) = 'x'; +#endif +} diff --git a/src/dict.c b/src/dict.c new file mode 100644 index 000000000..d5010708c --- /dev/null +++ b/src/dict.c @@ -0,0 +1,727 @@ +/* Hash Tables Implementation. + * + * This file implements in memory hash tables with insert/del/replace/find/ + * get-random-element operations. Hash tables will auto resize if needed + * tables of power of two in size are used, collisions are handled by + * chaining. See the source code for more information... :) + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "dict.h" +#include "zmalloc.h" + +/* Using dictEnableResize() / dictDisableResize() we make possible to + * enable/disable resizing of the hash table as needed. This is very important + * for Redis, as we use copy-on-write and don't want to move too much memory + * around when there is a child performing saving operations. */ +static int dict_can_resize = 1; + +/* ---------------------------- Utility funcitons --------------------------- */ + +static void _dictPanic(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "\nDICT LIBRARY PANIC: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n\n"); + va_end(ap); +} + +/* ------------------------- Heap Management Wrappers------------------------ */ + +static void *_dictAlloc(size_t size) +{ + void *p = zmalloc(size); + if (p == NULL) + _dictPanic("Out of memory"); + return p; +} + +static void _dictFree(void *ptr) { + zfree(ptr); +} + +/* -------------------------- private prototypes ---------------------------- */ + +static int _dictExpandIfNeeded(dict *ht); +static unsigned long _dictNextPower(unsigned long size); +static int _dictKeyIndex(dict *ht, const void *key); +static int _dictInit(dict *ht, dictType *type, void *privDataPtr); + +/* -------------------------- hash functions -------------------------------- */ + +/* Thomas Wang's 32 bit Mix Function */ +unsigned int dictIntHashFunction(unsigned int key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +/* Identity hash function for integer keys */ +unsigned int dictIdentityHashFunction(unsigned int key) +{ + return key; +} + +/* Generic hash function (a popular one from Bernstein). + * I tested a few and this was the best. */ +unsigned int dictGenHashFunction(const unsigned char *buf, int len) { + unsigned int hash = 5381; + + while (len--) + hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */ + return hash; +} + +/* ----------------------------- API implementation ------------------------- */ + +/* Reset an hashtable already initialized with ht_init(). + * NOTE: This function should only called by ht_destroy(). 
*/ +static void _dictReset(dictht *ht) +{ + ht->table = NULL; + ht->size = 0; + ht->sizemask = 0; + ht->used = 0; +} + +/* Create a new hash table */ +dict *dictCreate(dictType *type, + void *privDataPtr) +{ + dict *d = _dictAlloc(sizeof(*d)); + + _dictInit(d,type,privDataPtr); + return d; +} + +/* Initialize the hash table */ +int _dictInit(dict *d, dictType *type, + void *privDataPtr) +{ + _dictReset(&d->ht[0]); + _dictReset(&d->ht[1]); + d->type = type; + d->privdata = privDataPtr; + d->rehashidx = -1; + d->iterators = 0; + return DICT_OK; +} + +/* Resize the table to the minimal size that contains all the elements, + * but with the invariant of a USER/BUCKETS ration near to <= 1 */ +int dictResize(dict *d) +{ + int minimal; + + if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; + minimal = d->ht[0].used; + if (minimal < DICT_HT_INITIAL_SIZE) + minimal = DICT_HT_INITIAL_SIZE; + return dictExpand(d, minimal); +} + +/* Expand or create the hashtable */ +int dictExpand(dict *d, unsigned long size) +{ + dictht n; /* the new hashtable */ + unsigned long realsize = _dictNextPower(size); + + /* the size is invalid if it is smaller than the number of + * elements already inside the hashtable */ + if (dictIsRehashing(d) || d->ht[0].used > size) + return DICT_ERR; + + n.size = realsize; + n.sizemask = realsize-1; + n.table = _dictAlloc(realsize*sizeof(dictEntry*)); + n.used = 0; + + /* Initialize all the pointers to NULL */ + memset(n.table, 0, realsize*sizeof(dictEntry*)); + + /* Is this the first initialization? If so it's not really a rehashing + * we just set the first hash table so that it can accept keys. */ + if (d->ht[0].table == NULL) { + d->ht[0] = n; + return DICT_OK; + } + + /* Prepare a second hash table for incremental rehashing */ + d->ht[1] = n; + d->rehashidx = 0; + return DICT_OK; +} + +/* Performs N steps of incremental rehashing. Returns 1 if there are still + * keys to move from the old to the new hash table, otherwise 0 is returned. 
+ * Note that a rehashing step consists in moving a bucket (that may have more
+ * thank one key as we use chaining) from the old to the new hash table. */
+int dictRehash(dict *d, int n) {
+    /* Nothing to do when no rehashing is in progress. */
+    if (!dictIsRehashing(d)) return 0;
+
+    while(n--) {
+        dictEntry *de, *nextde;
+
+        /* Check if we already rehashed the whole table... */
+        if (d->ht[0].used == 0) {
+            /* ...if so the old bucket array is released, ht[1] takes the
+             * place of ht[0] and the rehashing state is cleared. */
+            _dictFree(d->ht[0].table);
+            d->ht[0] = d->ht[1];
+            _dictReset(&d->ht[1]);
+            d->rehashidx = -1;
+            return 0;
+        }
+
+        /* Note that rehashidx can't overflow as we are sure there are more
+         * elements because ht[0].used != 0 */
+        while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
+        de = d->ht[0].table[d->rehashidx];
+        /* Move all the keys in this bucket from the old to the new hash HT */
+        while(de) {
+            unsigned int h;
+
+            /* Save the successor: de->next is overwritten below. */
+            nextde = de->next;
+            /* Get the index in the new hash table */
+            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
+            /* Push the entry on the head of the destination chain. */
+            de->next = d->ht[1].table[h];
+            d->ht[1].table[h] = de;
+            d->ht[0].used--;
+            d->ht[1].used++;
+            de = nextde;
+        }
+        d->ht[0].table[d->rehashidx] = NULL;
+        d->rehashidx++;
+    }
+    /* All the 'n' steps were used. Completion, if reached with the last
+     * step, is detected by the next call. */
+    return 1;
+}
+
+/* Return the current time of gettimeofday() expressed in milliseconds. */
+long long timeInMilliseconds(void) {
+    struct timeval tv;
+
+    gettimeofday(&tv,NULL);
+    return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000);
+}
+
+/* Rehash for an amount of time between ms milliseconds and ms+1 milliseconds */
+/* The work is done in batches of 100 steps, checking the elapsed time after
+ * every batch. The return value is 100 times the number of batches after
+ * which dictRehash() reported more work to do: the final batch, the one
+ * completing the rehashing, is not counted. */
+int dictRehashMilliseconds(dict *d, int ms) {
+    long long start = timeInMilliseconds();
+    int rehashes = 0;
+
+    while(dictRehash(d,100)) {
+        rehashes += 100;
+        if (timeInMilliseconds()-start > ms) break;
+    }
+    return rehashes;
+}
+
+/* This function performs just a step of rehashing, and only if there are
+ * not iterators bound to our hash table. When we have iterators in the middle
+ * of a rehashing we can't mess with the two hash tables otherwise some element
+ * can be missed or duplicated.
+ * + * This function is called by common lookup or update operations in the + * dictionary so that the hash table automatically migrates from H1 to H2 + * while it is actively used. */ +static void _dictRehashStep(dict *d) { + if (d->iterators == 0) dictRehash(d,1); +} + +/* Add an element to the target hash table */ +int dictAdd(dict *d, void *key, void *val) +{ + int index; + dictEntry *entry; + dictht *ht; + + if (dictIsRehashing(d)) _dictRehashStep(d); + + /* Get the index of the new element, or -1 if + * the element already exists. */ + if ((index = _dictKeyIndex(d, key)) == -1) + return DICT_ERR; + + /* Allocates the memory and stores key */ + ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; + entry = _dictAlloc(sizeof(*entry)); + entry->next = ht->table[index]; + ht->table[index] = entry; + ht->used++; + + /* Set the hash entry fields. */ + dictSetHashKey(d, entry, key); + dictSetHashVal(d, entry, val); + return DICT_OK; +} + +/* Add an element, discarding the old if the key already exists. + * Return 1 if the key was added from scratch, 0 if there was already an + * element with such key and dictReplace() just performed a value update + * operation. */ +int dictReplace(dict *d, void *key, void *val) +{ + dictEntry *entry, auxentry; + + /* Try to add the element. If the key + * does not exists dictAdd will suceed. */ + if (dictAdd(d, key, val) == DICT_OK) + return 1; + /* It already exists, get the entry */ + entry = dictFind(d, key); + /* Free the old value and set the new one */ + /* Set the new value and free the old one. Note that it is important + * to do that in this order, as the value may just be exactly the same + * as the previous one. In this context, think to reference counting, + * you want to increment (set), and then decrement (free), and not the + * reverse. 
*/ + auxentry = *entry; + dictSetHashVal(d, entry, val); + dictFreeEntryVal(d, &auxentry); + return 0; +} + +/* Search and remove an element */ +static int dictGenericDelete(dict *d, const void *key, int nofree) +{ + unsigned int h, idx; + dictEntry *he, *prevHe; + int table; + + if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + prevHe = NULL; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) { + /* Unlink the element from the list */ + if (prevHe) + prevHe->next = he->next; + else + d->ht[table].table[idx] = he->next; + if (!nofree) { + dictFreeEntryKey(d, he); + dictFreeEntryVal(d, he); + } + _dictFree(he); + d->ht[table].used--; + return DICT_OK; + } + prevHe = he; + he = he->next; + } + if (!dictIsRehashing(d)) break; + } + return DICT_ERR; /* not found */ +} + +int dictDelete(dict *ht, const void *key) { + return dictGenericDelete(ht,key,0); +} + +int dictDeleteNoFree(dict *ht, const void *key) { + return dictGenericDelete(ht,key,1); +} + +/* Destroy an entire dictionary */ +int _dictClear(dict *d, dictht *ht) +{ + unsigned long i; + + /* Free all the elements */ + for (i = 0; i < ht->size && ht->used > 0; i++) { + dictEntry *he, *nextHe; + + if ((he = ht->table[i]) == NULL) continue; + while(he) { + nextHe = he->next; + dictFreeEntryKey(d, he); + dictFreeEntryVal(d, he); + _dictFree(he); + ht->used--; + he = nextHe; + } + } + /* Free the table and the allocated cache structure */ + _dictFree(ht->table); + /* Re-initialize the table */ + _dictReset(ht); + return DICT_OK; /* never fails */ +} + +/* Clear & Release the hash table */ +void dictRelease(dict *d) +{ + _dictClear(d,&d->ht[0]); + _dictClear(d,&d->ht[1]); + _dictFree(d); +} + +dictEntry *dictFind(dict *d, const void *key) +{ + dictEntry *he; + unsigned int h, idx, table; + + if (d->ht[0].size 
== 0) return NULL; /* We don't have a table at all */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) + return he; + he = he->next; + } + if (!dictIsRehashing(d)) return NULL; + } + return NULL; +} + +void *dictFetchValue(dict *d, const void *key) { + dictEntry *he; + + he = dictFind(d,key); + return he ? dictGetEntryVal(he) : NULL; +} + +dictIterator *dictGetIterator(dict *d) +{ + dictIterator *iter = _dictAlloc(sizeof(*iter)); + + iter->d = d; + iter->table = 0; + iter->index = -1; + iter->entry = NULL; + iter->nextEntry = NULL; + return iter; +} + +dictEntry *dictNext(dictIterator *iter) +{ + while (1) { + if (iter->entry == NULL) { + dictht *ht = &iter->d->ht[iter->table]; + if (iter->index == -1 && iter->table == 0) iter->d->iterators++; + iter->index++; + if (iter->index >= (signed) ht->size) { + if (dictIsRehashing(iter->d) && iter->table == 0) { + iter->table++; + iter->index = 0; + ht = &iter->d->ht[1]; + } else { + break; + } + } + iter->entry = ht->table[iter->index]; + } else { + iter->entry = iter->nextEntry; + } + if (iter->entry) { + /* We need to save the 'next' here, the iterator user + * may delete the entry we are returning. */ + iter->nextEntry = iter->entry->next; + return iter->entry; + } + } + return NULL; +} + +void dictReleaseIterator(dictIterator *iter) +{ + if (!(iter->index == -1 && iter->table == 0)) iter->d->iterators--; + _dictFree(iter); +} + +/* Return a random entry from the hash table. Useful to + * implement randomized algorithms */ +dictEntry *dictGetRandomKey(dict *d) +{ + dictEntry *he, *orighe; + unsigned int h; + int listlen, listele; + + if (dictSize(d) == 0) return NULL; + if (dictIsRehashing(d)) _dictRehashStep(d); + if (dictIsRehashing(d)) { + do { + h = random() % (d->ht[0].size+d->ht[1].size); + he = (h >= d->ht[0].size) ? 
d->ht[1].table[h - d->ht[0].size] : + d->ht[0].table[h]; + } while(he == NULL); + } else { + do { + h = random() & d->ht[0].sizemask; + he = d->ht[0].table[h]; + } while(he == NULL); + } + + /* Now we found a non empty bucket, but it is a linked + * list and we need to get a random element from the list. + * The only sane way to do so is counting the elements and + * select a random index. */ + listlen = 0; + orighe = he; + while(he) { + he = he->next; + listlen++; + } + listele = random() % listlen; + he = orighe; + while(listele--) he = he->next; + return he; +} + +/* ------------------------- private functions ------------------------------ */ + +/* Expand the hash table if needed */ +static int _dictExpandIfNeeded(dict *d) +{ + /* If the hash table is empty expand it to the intial size, + * if the table is "full" dobule its size. */ + if (dictIsRehashing(d)) return DICT_OK; + if (d->ht[0].size == 0) + return dictExpand(d, DICT_HT_INITIAL_SIZE); + if (d->ht[0].used >= d->ht[0].size && dict_can_resize) + return dictExpand(d, ((d->ht[0].size > d->ht[0].used) ? + d->ht[0].size : d->ht[0].used)*2); + return DICT_OK; +} + +/* Our hash table capability is a power of two */ +static unsigned long _dictNextPower(unsigned long size) +{ + unsigned long i = DICT_HT_INITIAL_SIZE; + + if (size >= LONG_MAX) return LONG_MAX; + while(1) { + if (i >= size) + return i; + i *= 2; + } +} + +/* Returns the index of a free slot that can be populated with + * an hash entry for the given 'key'. + * If the key already exists, -1 is returned. + * + * Note that if we are in the process of rehashing the hash table, the + * index is always returned in the context of the second (new) hash table. 
*/ +static int _dictKeyIndex(dict *d, const void *key) +{ + unsigned int h, idx, table; + dictEntry *he; + + /* Expand the hashtable if needed */ + if (_dictExpandIfNeeded(d) == DICT_ERR) + return -1; + /* Compute the key hash value */ + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + /* Search if this slot does not already contain the given key */ + he = d->ht[table].table[idx]; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) + return -1; + he = he->next; + } + if (!dictIsRehashing(d)) break; + } + return idx; +} + +void dictEmpty(dict *d) { + _dictClear(d,&d->ht[0]); + _dictClear(d,&d->ht[1]); + d->rehashidx = -1; + d->iterators = 0; +} + +#define DICT_STATS_VECTLEN 50 +static void _dictPrintStatsHt(dictht *ht) { + unsigned long i, slots = 0, chainlen, maxchainlen = 0; + unsigned long totchainlen = 0; + unsigned long clvector[DICT_STATS_VECTLEN]; + + if (ht->used == 0) { + printf("No stats available for empty dictionaries\n"); + return; + } + + for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0; + for (i = 0; i < ht->size; i++) { + dictEntry *he; + + if (ht->table[i] == NULL) { + clvector[0]++; + continue; + } + slots++; + /* For each hash entry on this slot... */ + chainlen = 0; + he = ht->table[i]; + while(he) { + chainlen++; + he = he->next; + } + clvector[(chainlen < DICT_STATS_VECTLEN) ? 
chainlen : (DICT_STATS_VECTLEN-1)]++; + if (chainlen > maxchainlen) maxchainlen = chainlen; + totchainlen += chainlen; + } + printf("Hash table stats:\n"); + printf(" table size: %ld\n", ht->size); + printf(" number of elements: %ld\n", ht->used); + printf(" different slots: %ld\n", slots); + printf(" max chain length: %ld\n", maxchainlen); + printf(" avg chain length (counted): %.02f\n", (float)totchainlen/slots); + printf(" avg chain length (computed): %.02f\n", (float)ht->used/slots); + printf(" Chain length distribution:\n"); + for (i = 0; i < DICT_STATS_VECTLEN-1; i++) { + if (clvector[i] == 0) continue; + printf(" %s%ld: %ld (%.02f%%)\n",(i == DICT_STATS_VECTLEN-1)?">= ":"", i, clvector[i], ((float)clvector[i]/ht->size)*100); + } +} + +void dictPrintStats(dict *d) { + _dictPrintStatsHt(&d->ht[0]); + if (dictIsRehashing(d)) { + printf("-- Rehashing into ht[1]:\n"); + _dictPrintStatsHt(&d->ht[1]); + } +} + +void dictEnableResize(void) { + dict_can_resize = 1; +} + +void dictDisableResize(void) { + dict_can_resize = 0; +} + +/* ----------------------- StringCopy Hash Table Type ------------------------*/ + +static unsigned int _dictStringCopyHTHashFunction(const void *key) +{ + return dictGenHashFunction(key, strlen(key)); +} + +static void *_dictStringCopyHTKeyDup(void *privdata, const void *key) +{ + int len = strlen(key); + char *copy = _dictAlloc(len+1); + DICT_NOTUSED(privdata); + + memcpy(copy, key, len); + copy[len] = '\0'; + return copy; +} + +static void *_dictStringKeyValCopyHTValDup(void *privdata, const void *val) +{ + int len = strlen(val); + char *copy = _dictAlloc(len+1); + DICT_NOTUSED(privdata); + + memcpy(copy, val, len); + copy[len] = '\0'; + return copy; +} + +static int _dictStringCopyHTKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + DICT_NOTUSED(privdata); + + return strcmp(key1, key2) == 0; +} + +static void _dictStringCopyHTKeyDestructor(void *privdata, void *key) +{ + DICT_NOTUSED(privdata); + + 
_dictFree((void*)key); /* ATTENTION: const cast */ +} + +static void _dictStringKeyValCopyHTValDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + _dictFree((void*)val); /* ATTENTION: const cast */ +} + +dictType dictTypeHeapStringCopyKey = { + _dictStringCopyHTHashFunction, /* hash function */ + _dictStringCopyHTKeyDup, /* key dup */ + NULL, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* This is like StringCopy but does not auto-duplicate the key. + * It's used for intepreter's shared strings. */ +dictType dictTypeHeapStrings = { + _dictStringCopyHTHashFunction, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* This is like StringCopy but also automatically handle dynamic + * allocated C strings as values. */ +dictType dictTypeHeapStringCopyKeyValue = { + _dictStringCopyHTHashFunction, /* hash function */ + _dictStringCopyHTKeyDup, /* key dup */ + _dictStringKeyValCopyHTValDup, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + _dictStringKeyValCopyHTValDestructor, /* val destructor */ +}; diff --git a/src/dict.h b/src/dict.h new file mode 100644 index 000000000..30ace4db7 --- /dev/null +++ b/src/dict.h @@ -0,0 +1,151 @@ +/* Hash Tables Implementation. + * + * This file implements in memory hash tables with insert/del/replace/find/ + * get-random-element operations. Hash tables will auto resize if needed + * tables of power of two in size are used, collisions are handled by + * chaining. See the source code for more information... :) + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __DICT_H +#define __DICT_H + +#define DICT_OK 0 +#define DICT_ERR 1 + +/* Unused arguments generate annoying warnings... 
*/ +#define DICT_NOTUSED(V) ((void) V) + +typedef struct dictEntry { + void *key; + void *val; + struct dictEntry *next; +} dictEntry; + +typedef struct dictType { + unsigned int (*hashFunction)(const void *key); + void *(*keyDup)(void *privdata, const void *key); + void *(*valDup)(void *privdata, const void *obj); + int (*keyCompare)(void *privdata, const void *key1, const void *key2); + void (*keyDestructor)(void *privdata, void *key); + void (*valDestructor)(void *privdata, void *obj); +} dictType; + +/* This is our hash table structure. Every dictionary has two of this as we + * implement incremental rehashing, for the old to the new table. */ +typedef struct dictht { + dictEntry **table; + unsigned long size; + unsigned long sizemask; + unsigned long used; +} dictht; + +typedef struct dict { + dictType *type; + void *privdata; + dictht ht[2]; + int rehashidx; /* rehashing not in progress if rehashidx == -1 */ + int iterators; /* number of iterators currently running */ +} dict; + +typedef struct dictIterator { + dict *d; + int table; + int index; + dictEntry *entry, *nextEntry; +} dictIterator; + +/* This is the initial size of every hash table */ +#define DICT_HT_INITIAL_SIZE 4 + +/* ------------------------------- Macros ------------------------------------*/ +#define dictFreeEntryVal(d, entry) \ + if ((d)->type->valDestructor) \ + (d)->type->valDestructor((d)->privdata, (entry)->val) + +#define dictSetHashVal(d, entry, _val_) do { \ + if ((d)->type->valDup) \ + entry->val = (d)->type->valDup((d)->privdata, _val_); \ + else \ + entry->val = (_val_); \ +} while(0) + +#define dictFreeEntryKey(d, entry) \ + if ((d)->type->keyDestructor) \ + (d)->type->keyDestructor((d)->privdata, (entry)->key) + +#define dictSetHashKey(d, entry, _key_) do { \ + if ((d)->type->keyDup) \ + entry->key = (d)->type->keyDup((d)->privdata, _key_); \ + else \ + entry->key = (_key_); \ +} while(0) + +#define dictCompareHashKeys(d, key1, key2) \ + (((d)->type->keyCompare) ? 
\ + (d)->type->keyCompare((d)->privdata, key1, key2) : \ + (key1) == (key2)) + +#define dictHashKey(d, key) (d)->type->hashFunction(key) + +#define dictGetEntryKey(he) ((he)->key) +#define dictGetEntryVal(he) ((he)->val) +#define dictSlots(d) ((d)->ht[0].size+(d)->ht[1].size) +#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used) +#define dictIsRehashing(ht) ((ht)->rehashidx != -1) + +/* API */ +dict *dictCreate(dictType *type, void *privDataPtr); +int dictExpand(dict *d, unsigned long size); +int dictAdd(dict *d, void *key, void *val); +int dictReplace(dict *d, void *key, void *val); +int dictDelete(dict *d, const void *key); +int dictDeleteNoFree(dict *d, const void *key); +void dictRelease(dict *d); +dictEntry * dictFind(dict *d, const void *key); +void *dictFetchValue(dict *d, const void *key); +int dictResize(dict *d); +dictIterator *dictGetIterator(dict *d); +dictEntry *dictNext(dictIterator *iter); +void dictReleaseIterator(dictIterator *iter); +dictEntry *dictGetRandomKey(dict *d); +void dictPrintStats(dict *d); +unsigned int dictGenHashFunction(const unsigned char *buf, int len); +void dictEmpty(dict *d); +void dictEnableResize(void); +void dictDisableResize(void); +int dictRehash(dict *d, int n); +int dictRehashMilliseconds(dict *d, int ms); + +/* Hash table types */ +extern dictType dictTypeHeapStringCopyKey; +extern dictType dictTypeHeapStrings; +extern dictType dictTypeHeapStringCopyKeyValue; + +#endif /* __DICT_H */ diff --git a/src/fmacros.h b/src/fmacros.h new file mode 100644 index 000000000..38f46482a --- /dev/null +++ b/src/fmacros.h @@ -0,0 +1,15 @@ +#ifndef _REDIS_FMACRO_H +#define _REDIS_FMACRO_H + +#define _BSD_SOURCE + +#ifdef __linux__ +#define _XOPEN_SOURCE 700 +#else +#define _XOPEN_SOURCE +#endif + +#define _LARGEFILE_SOURCE +#define _FILE_OFFSET_BITS 64 + +#endif diff --git a/src/linenoise.c b/src/linenoise.c new file mode 100644 index 000000000..0c04d03fb --- /dev/null +++ b/src/linenoise.c @@ -0,0 +1,433 @@ +/* linenoise.c -- 
guerrilla line editing library against the idea that a + * line editing lib needs to be 20,000 lines of C code. + * + * You can find the latest source code at: + * + * http://github.com/antirez/linenoise + * + * Does a number of crazy assumptions that happen to be true in 99.9999% of + * the 2010 UNIX computers around. + * + * Copyright (c) 2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * References: + * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html + * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html + * + * Todo list: + * - Switch to gets() if $TERM is something we can't support. + * - Filter bogus Ctrl+ combinations. + * - Win32 support + * + * Bloat: + * - Completion? + * - History search like Ctrl+r in readline? + * + * List of escape sequences used by this program, we do everything just + * with three sequences. In order to be so cheap we may have some + * flickering effect with some slow terminal, but the lesser sequences + * the more compatible. + * + * CHA (Cursor Horizontal Absolute) + * Sequence: ESC [ n G + * Effect: moves cursor to column n + * + * EL (Erase Line) + * Sequence: ESC [ n K + * Effect: if n is 0 or missing, clear from cursor to end of line + * Effect: if n is 1, clear from beginning of line to cursor + * Effect: if n is 2, clear entire line + * + * CUF (CUrsor Forward) + * Sequence: ESC [ n C + * Effect: moves cursor forward of n chars + * + */ + +#include "fmacros.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LINENOISE_MAX_LINE 4096 +static char *unsupported_term[] = {"dumb","cons25",NULL}; + +static struct termios orig_termios; /* in order to restore at exit */ +static int rawmode = 0; /* for atexit() function to check if restore is needed*/ +static int atexit_registered = 0; /* register atexit just 1 time */ +static int history_max_len = 100; +static int history_len = 0; +char **history = NULL; + +static void linenoiseAtExit(void); +int linenoiseHistoryAdd(const char *line); + +static int isUnsupportedTerm(void) { + char *term = getenv("TERM"); + int j; + + if (term == NULL) return 0; + for (j = 0; unsupported_term[j]; j++) + if (!strcasecmp(term,unsupported_term[j])) return 1; + return 0; +} + +static void freeHistory(void) { + if (history) { + int j; + + for (j = 0; j < history_len; j++) + free(history[j]); + 
+ free(history); + } +} + +static int enableRawMode(int fd) { + struct termios raw; + + if (!isatty(STDIN_FILENO)) goto fatal; + if (!atexit_registered) { + atexit(linenoiseAtExit); + atexit_registered = 1; + } + if (tcgetattr(fd,&orig_termios) == -1) goto fatal; + + raw = orig_termios; /* start from a copy of the original mode */ + /* input modes: no break, no CR to NL, no parity check, no strip char, + * no start/stop output control. */ + raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); + /* output modes - disable post processing */ + raw.c_oflag &= ~(OPOST); + /* control modes - set 8 bit chars */ + raw.c_cflag |= (CS8); + /* local modes - echoing off, canonical off, no extended functions, + * no signal chars (^Z,^C) */ + raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG); + /* control chars - set return condition: min number of bytes and timer. + * We want read to return every single byte, without timeout. */ + raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ + + /* put terminal in raw mode after flushing */ + if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; + rawmode = 1; + return 0; + +fatal: + errno = ENOTTY; + return -1; +} + +static void disableRawMode(int fd) { + /* Best effort: if restoring fails we can only leave rawmode set. */ + if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) + rawmode = 0; +} + +/* At exit we'll try to fix the terminal to the initial conditions.
*/ +static void linenoiseAtExit(void) { + disableRawMode(STDIN_FILENO); + freeHistory(); +} + +static int getColumns(void) { + struct winsize ws; + + if (ioctl(1, TIOCGWINSZ, &ws) == -1) return 80; + return ws.ws_col; +} + +static void refreshLine(int fd, const char *prompt, char *buf, size_t len, size_t pos, size_t cols) { + char seq[64]; + size_t plen = strlen(prompt); + + while((plen+pos) >= cols) { + buf++; + len--; + pos--; + } + while (plen+len > cols) { + len--; + } + + /* Cursor to left edge */ + snprintf(seq,64,"\x1b[0G"); + if (write(fd,seq,strlen(seq)) == -1) return; + /* Write the prompt and the current buffer content */ + if (write(fd,prompt,strlen(prompt)) == -1) return; + if (write(fd,buf,len) == -1) return; + /* Erase to right */ + snprintf(seq,64,"\x1b[0K"); + if (write(fd,seq,strlen(seq)) == -1) return; + /* Move cursor to original position. */ + snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen)); + if (write(fd,seq,strlen(seq)) == -1) return; +} + +static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) { + size_t plen = strlen(prompt); + size_t pos = 0; + size_t len = 0; + size_t cols = getColumns(); + int history_index = 0; + + buf[0] = '\0'; + buflen--; /* Make sure there is always space for the nulterm */ + + /* The latest history entry is always our current buffer, that + * initially is just an empty string. */ + linenoiseHistoryAdd(""); + + if (write(fd,prompt,plen) == -1) return -1; + while(1) { + char c; + int nread; + char seq[2]; + + nread = read(fd,&c,1); + if (nread <= 0) return len; + switch(c) { + case 13: /* enter */ + history_len--; + return len; + case 4: /* ctrl-d */ + history_len--; + return (len == 0) ? 
-1 : (int)len; + case 3: /* ctrl-c */ + errno = EAGAIN; + return -1; + case 127: /* backspace */ + case 8: /* ctrl-h */ + if (pos > 0 && len > 0) { + memmove(buf+pos-1,buf+pos,len-pos); + pos--; + len--; + buf[len] = '\0'; + refreshLine(fd,prompt,buf,len,pos,cols); + } + break; + case 20: /* ctrl-t: swap the char before the cursor with the one under it */ + if (pos > 0 && pos < len) { + int aux = buf[pos-1]; + buf[pos-1] = buf[pos]; + buf[pos] = aux; + if (pos != len-1) pos++; + refreshLine(fd,prompt,buf,len,pos,cols); + } + break; + case 2: /* ctrl-b */ + goto left_arrow; + case 6: /* ctrl-f */ + goto right_arrow; + case 16: /* ctrl-p */ + seq[1] = 65; + goto up_down_arrow; + case 14: /* ctrl-n */ + seq[1] = 66; + goto up_down_arrow; + break; + case 27: /* escape sequence */ + if (read(fd,seq,2) == -1) break; + if (seq[0] == 91 && seq[1] == 68) { +left_arrow: + /* left arrow (ESC [ D) */ + if (pos > 0) { + pos--; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else if (seq[0] == 91 && seq[1] == 67) { +right_arrow: + /* right arrow (ESC [ C) */ + if (pos != len) { + pos++; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else if (seq[0] == 91 && (seq[1] == 65 || seq[1] == 66)) { +up_down_arrow: + /* up (ESC [ A) and down (ESC [ B) arrow: history */ + if (history_len > 1) { + /* Store the line being edited back into its history slot + * before the buffer is overwritten with the next entry. */ + free(history[history_len-1-history_index]); + history[history_len-1-history_index] = strdup(buf); + /* Show the new entry */ + history_index += (seq[1] == 65) ? 1 : -1; + if (history_index < 0) { + history_index = 0; + break; + } else if (history_index >= history_len) { + history_index = history_len-1; + break; + } + strncpy(buf,history[history_len-1-history_index],buflen); + buf[buflen] = '\0'; + len = pos = strlen(buf); + refreshLine(fd,prompt,buf,len,pos,cols); + } + } + break; + default: + if (len < buflen) { + if (len == pos) { + buf[pos] = c; + pos++; + len++; + buf[len] = '\0'; + if (plen+len < cols) { + /* Avoid a full update of the line in the + * trivial case.
*/ + if (write(fd,&c,1) == -1) return -1; + } else { + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else { + memmove(buf+pos+1,buf+pos,len-pos); + buf[pos] = c; + len++; + pos++; + buf[len] = '\0'; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } + break; + case 21: /* Ctrl+u, delete the whole line. */ + buf[0] = '\0'; + pos = len = 0; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 11: /* Ctrl+k, delete from current to end of line. */ + buf[pos] = '\0'; + len = pos; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 1: /* Ctrl+a, go to the start of the line */ + pos = 0; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 5: /* ctrl+e, go to the end of the line */ + pos = len; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + } + } + return len; +} + +static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { + int fd = STDIN_FILENO; + int count; + + if (buflen == 0) { + errno = EINVAL; + return -1; + } + if (!isatty(STDIN_FILENO)) { + if (fgets(buf, buflen, stdin) == NULL) return -1; + count = strlen(buf); + if (count && buf[count-1] == '\n') { + count--; + buf[count] = '\0'; + } + } else { + if (enableRawMode(fd) == -1) return -1; + count = linenoisePrompt(fd, buf, buflen, prompt); + disableRawMode(fd); + printf("\n"); + } + return count; +} + +char *linenoise(const char *prompt) { + char buf[LINENOISE_MAX_LINE]; + int count; + + if (isUnsupportedTerm()) { + size_t len; + + printf("%s",prompt); + fflush(stdout); + if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL; + len = strlen(buf); + while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) { + len--; + buf[len] = '\0'; + } + return strdup(buf); + } else { + count = linenoiseRaw(buf,LINENOISE_MAX_LINE,prompt); + if (count == -1) return NULL; + return strdup(buf); + } +} + +/* Using a circular buffer is smarter, but a bit more complex to handle. 
*/ +/* Append a private copy of 'line' as the newest history entry, evicting + * (and freeing) the oldest entry when the history is full. Returns 1 if the + * line was stored, 0 if history is disabled or an allocation failed. */ +int linenoiseHistoryAdd(const char *line) { + char *linecopy; + + if (history_max_len == 0) return 0; + if (history == 0) { + history = malloc(sizeof(char*)*history_max_len); + if (history == NULL) return 0; + memset(history,0,(sizeof(char*)*history_max_len)); + } + linecopy = strdup(line); + if (!linecopy) return 0; + if (history_len == history_max_len) { + /* Release the oldest entry before memmove() overwrites its + * pointer, otherwise the string is leaked. */ + free(history[0]); + memmove(history,history+1,sizeof(char*)*(history_max_len-1)); + history_len--; + } + history[history_len] = linecopy; + history_len++; + return 1; +} + +/* Set the maximum number of history entries to 'len' (must be >= 1). + * When the history holds more than 'len' lines the oldest ones are freed + * and only the most recent 'len' are kept. Returns 1 on success, 0 if 'len' + * is invalid or the new array cannot be allocated (history is untouched). */ +int linenoiseHistorySetMaxLen(int len) { + char **new; + + if (len < 1) return 0; + if (history) { + int tocopy = history_len; + + new = malloc(sizeof(char*)*len); + if (new == NULL) return 0; + + /* If we can't copy everything, free the oldest entries we drop. */ + if (len < tocopy) { + int j; + + for (j = 0; j < tocopy-len; j++) free(history[j]); + tocopy = len; + } + memset(new,0,sizeof(char*)*len); + /* Live entries are history[0..history_len-1]: the offset of the + * newest 'tocopy' entries is relative to history_len, not to + * history_max_len (which would copy unused slots instead). */ + memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy); + free(history); + history = new; + } + history_max_len = len; + if (history_len > history_max_len) + history_len = history_max_len; + return 1; +} diff --git a/src/linenoise.h b/src/linenoise.h new file mode 100644 index 000000000..ff45e2c47 --- /dev/null +++ b/src/linenoise.h @@ -0,0 +1,41 @@ +/* linenoise.h -- guerrilla line editing library against the idea that a + * line editing lib needs to be 20,000 lines of C code. + * + * See linenoise.c for more information. + * + * Copyright (c) 2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __LINENOISE_H +#define __LINENOISE_H + +char *linenoise(const char *prompt); +int linenoiseHistoryAdd(char *line); +int linenoiseHistorySetMaxLen(int len); + +#endif /* __LINENOISE_H */ diff --git a/src/lzf.h b/src/lzf.h new file mode 100644 index 000000000..919b6e6be --- /dev/null +++ b/src/lzf.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef LZF_H +#define LZF_H + +/*********************************************************************** +** +** lzf -- an extremely fast/free compression/decompression-method +** http://liblzf.plan9.de/ +** +** This algorithm is believed to be patent-free. +** +***********************************************************************/ + +#define LZF_VERSION 0x0105 /* 1.5, API version */ + +/* + * Compress in_len bytes stored at the memory block starting at + * in_data and write the result to out_data, up to a maximum length + * of out_len bytes. 
+ * + * If the output buffer is not large enough or any error occurs return 0, + * otherwise return the number of bytes used, which might be considerably + * more than in_len (but less than 104% of the original size), so it + * makes sense to always use out_len == in_len - 1), to ensure _some_ + * compression, and store the data uncompressed otherwise (with a flag, of + * course. + * + * lzf_compress might use different algorithms on different systems and + * even different runs, thus might result in different compressed strings + * depending on the phase of the moon or similar factors. However, all + * these strings are architecture-independent and will result in the + * original data when decompressed using lzf_decompress. + * + * The buffers must not be overlapping. + * + * If the option LZF_STATE_ARG is enabled, an extra argument must be + * supplied which is not reflected in this header file. Refer to lzfP.h + * and lzf_c.c. + * + */ +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +/* + * Decompress data compressed with some version of the lzf_compress + * function and stored at location in_data and length in_len. The result + * will be stored at out_data up to a maximum of out_len characters. + * + * If the output buffer is not large enough to hold the decompressed + * data, a 0 is returned and errno is set to E2BIG. Otherwise the number + * of decompressed bytes (i.e. the original length of the data) is + * returned. + * + * If an error in the compressed data is detected, a zero is returned and + * errno is set to EINVAL. + * + * This function is very fast, about as fast as a copying loop. 
+ */ +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +#endif + diff --git a/src/lzfP.h b/src/lzfP.h new file mode 100644 index 000000000..d533f1829 --- /dev/null +++ b/src/lzfP.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2000-2007 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. 
If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef LZFP_h +#define LZFP_h + +#define STANDALONE 1 /* at the moment, this is ok. */ + +#ifndef STANDALONE +# include "lzf.h" +#endif + +/* + * Size of hashtable is (1 << HLOG) * sizeof (char *) + * decompression is independent of the hash table size + * the difference between 15 and 14 is very small + * for small blocks (and 14 is usually a bit faster). + * For a low-memory/faster configuration, use HLOG == 13; + * For best compression, use 15 or 16 (or more, up to 23). + */ +#ifndef HLOG +# define HLOG 16 +#endif + +/* + * Sacrifice very little compression quality in favour of compression speed. + * This gives almost the same compression as the default code, and is + * (very roughly) 15% faster. This is the preferred mode of operation. + */ +#ifndef VERY_FAST +# define VERY_FAST 1 +#endif + +/* + * Sacrifice some more compression quality in favour of compression speed. + * (roughly 1-2% worse compression for large blocks and + * 9-10% for small, redundant, blocks and >>20% better speed in both cases) + * In short: when in need for speed, enable this for binary data, + * possibly disable this for text data. 
+ */ +#ifndef ULTRA_FAST +# define ULTRA_FAST 0 +#endif + +/* + * Unconditionally aligning does not cost very much, so do it if unsure. + * The platform test is done in an #if instead of inside the macro body: + * a "defined" operator produced by macro expansion is undefined behaviour + * in ISO C, so STRICT_ALIGN must expand to a plain 0 or 1. + */ +#ifndef STRICT_ALIGN +# if !(defined(__i386) || defined (__amd64)) +# define STRICT_ALIGN 1 +# else +# define STRICT_ALIGN 0 +# endif +#endif + +/* + * You may choose to pre-set the hash table (might be faster on some + * modern cpus and large (>>64k) blocks, and also makes compression + * deterministic/repeatable when the configuration otherwise is the same). + */ +#ifndef INIT_HTAB +# define INIT_HTAB 0 +#endif + +/* + * Avoid assigning values to errno variable? for some embedding purposes + * (linux kernel for example), this is necessary. NOTE: this breaks + * the documentation in lzf.h. + */ +#ifndef AVOID_ERRNO +# define AVOID_ERRNO 0 +#endif + +/* + * Whether to pass the LZF_STATE variable as argument, or allocate it + * on the stack. For small-stack environments, define this to 1. + * NOTE: this breaks the prototype in lzf.h. + */ +#ifndef LZF_STATE_ARG +# define LZF_STATE_ARG 0 +#endif + +/* + * Whether to add extra checks for input validity in lzf_decompress + * and return EINVAL if the input stream has been corrupted. This + * only shields against overflowing the input buffer and will not + * detect most corrupted streams. + * This check is not normally noticeable on modern hardware + * (<1% slowdown), but might slow down older cpus considerably. + */ +#ifndef CHECK_INPUT +# define CHECK_INPUT 1 +#endif + +/*****************************************************************************/ +/* nothing should be changed below */ + +typedef unsigned char u8; + +typedef const u8 *LZF_STATE[1 << (HLOG)]; + +#if !STRICT_ALIGN +/* for unaligned accesses we need a 16 bit datatype.
*/ +# include +# if USHRT_MAX == 65535 + typedef unsigned short u16; +# elif UINT_MAX == 65535 + typedef unsigned int u16; +# else +# undef STRICT_ALIGN +# define STRICT_ALIGN 1 +# endif +#endif + +#if ULTRA_FAST +# if defined(VERY_FAST) +# undef VERY_FAST +# endif +#endif + +#if INIT_HTAB +# ifdef __cplusplus +# include +# else +# include +# endif +#endif + +#endif + diff --git a/src/lzf_c.c b/src/lzf_c.c new file mode 100644 index 000000000..99dab091c --- /dev/null +++ b/src/lzf_c.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#include "lzfP.h" + +#define HSIZE (1 << (HLOG)) + +/* + * don't play with this unless you benchmark! + * decompression is not dependent on the hash function + * the hashing function might seem strange, just believe me + * it works ;) + */ +#ifndef FRST +# define FRST(p) (((p[0]) << 8) | p[1]) +# define NEXT(v,p) (((v) << 8) | p[2]) +# if ULTRA_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1)) +# elif VERY_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# else +# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# endif +#endif +/* + * IDX works because it is very similar to a multiplicative hash, e.g. + * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1)) + * the latter is also quite fast on newer CPUs, and compresses similarly. 
+ * + * the next one is also quite good, albeit slow ;) + * (int)(cos(h & 0xffffff) * 1e6) + */ + +#if 0 +/* original lzv-like hash function, much worse and thus slower */ +# define FRST(p) (p[0] << 5) ^ p[1] +# define NEXT(v,p) ((v) << 5) ^ p[2] +# define IDX(h) ((h) & (HSIZE - 1)) +#endif + +#define MAX_LIT (1 << 5) +#define MAX_OFF (1 << 13) +#define MAX_REF ((1 << 8) + (1 << 3)) + +#if __GNUC__ >= 3 +# define expect(expr,value) __builtin_expect ((expr),(value)) +# define inline inline +#else +# define expect(expr,value) (expr) +# define inline static +#endif + +#define expect_false(expr) expect ((expr) != 0, 0) +#define expect_true(expr) expect ((expr) != 0, 1) + +/* + * compressed format + * + * 000LLLLL ; literal + * LLLooooo oooooooo ; backref L + * 111ooooo LLLLLLLL oooooooo ; backref L+7 + * + */ + +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len +#if LZF_STATE_ARG + , LZF_STATE htab +#endif + ) +{ +#if !LZF_STATE_ARG + LZF_STATE htab; +#endif + const u8 **hslot; + const u8 *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + const u8 *in_end = ip + in_len; + u8 *out_end = op + out_len; + const u8 *ref; + + /* off requires a type wide enough to hold a general pointer difference. + * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only + * works for differences within a single object). We also assume that no + * no bit pattern traps. Since the only platform that is both non-POSIX + * and fails to support both assumptions is windows 64 bit, we make a + * special workaround for it. 
+ */ +#if defined (WIN32) && defined (_M_X64) + unsigned _int64 off; /* workaround for missing POSIX compliance */ +#else + unsigned long off; +#endif + unsigned int hval; + int lit; + + if (!in_len || !out_len) + return 0; + +#if INIT_HTAB + memset (htab, 0, sizeof (htab)); +# if 0 + for (hslot = htab; hslot < htab + HSIZE; hslot++) + *hslot++ = ip; +# endif +#endif + + lit = 0; op++; /* start run */ + + hval = FRST (ip); + while (ip < in_end - 2) + { + hval = NEXT (hval, ip); + hslot = htab + IDX (hval); + ref = *hslot; *hslot = ip; + + if (1 +#if INIT_HTAB + && ref < ip /* the next test will actually take care of this, but this is faster */ +#endif + && (off = ip - ref - 1) < MAX_OFF + && ip + 4 < in_end + && ref > (u8 *)in_data +#if STRICT_ALIGN + && ref[0] == ip[0] + && ref[1] == ip[1] + && ref[2] == ip[2] +#else + && *(u16 *)ref == *(u16 *)ip + && ref[2] == ip[2] +#endif + ) + { + /* match found at *ref++ */ + unsigned int len = 2; + unsigned int maxlen = in_end - ip - len; + maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; + + op [- lit - 1] = lit - 1; /* stop run */ + op -= !lit; /* undo run if length is zero */ + + if (expect_false (op + 3 + 1 >= out_end)) + return 0; + + for (;;) + { + if (expect_true (maxlen > 16)) + { + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + } + + do + len++; + while (len < maxlen && ref[len] == ip[len]); + + break; + } + + len -= 2; /* len is now #octets - 1 */ + ip++; + + if (len < 7) + { + *op++ = (off >> 8) + (len << 5); + } + else + { + *op++ = (off >> 8) + ( 7 << 5); + *op++ = len - 7; + } + + *op++ = off; + lit = 0; op++; /* start run */ + + ip += len + 1; + + if (expect_false (ip >= in_end - 2)) + break; + +#if ULTRA_FAST || VERY_FAST + --ip; +# if VERY_FAST && !ULTRA_FAST + --ip; +# endif + hval = FRST (ip); + + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + +# if VERY_FAST && !ULTRA_FAST + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; +# endif +#else + ip -= len + 1; + + do + { + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + } + while (len--); +#endif + } + else + { + /* one more literal byte we must copy */ + if (expect_false (op >= out_end)) + return 0; + + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + } + + if (op + 3 > out_end) /* at most 3 bytes can 
be missing here */ + return 0; + + while (ip < in_end) + { + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + + op [- lit - 1] = lit - 1; /* end run */ + op -= !lit; /* undo run if length is zero */ + + return op - (u8 *)out_data; +} + diff --git a/src/lzf_d.c b/src/lzf_d.c new file mode 100644 index 000000000..e7e48c138 --- /dev/null +++ b/src/lzf_d.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2000-2007 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#include "lzfP.h" + +#if AVOID_ERRNO +# define SET_ERRNO(n) +#else +# include +# define SET_ERRNO(n) errno = (n) +#endif + +/* +#if (__i386 || __amd64) && __GNUC__ >= 3 +# define lzf_movsb(dst, src, len) \ + asm ("rep movsb" \ + : "=D" (dst), "=S" (src), "=c" (len) \ + : "0" (dst), "1" (src), "2" (len)); +#endif +*/ + +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len) +{ + u8 const *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + u8 const *const in_end = ip + in_len; + u8 *const out_end = op + out_len; + + do + { + unsigned int ctrl = *ip++; + + if (ctrl < (1 << 5)) /* literal run */ + { + ctrl++; + + if (op + ctrl > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + +#if CHECK_INPUT + if (ip + ctrl > in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + +#ifdef lzf_movsb + lzf_movsb (op, ip, ctrl); +#else + do + *op++ = *ip++; + while (--ctrl); +#endif + } + else /* back reference */ + { + unsigned int len = ctrl >> 5; + + u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; + +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + if (len == 7) + { + len += *ip++; +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + } + + ref -= *ip++; + + if 
(op + len + 2 > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + + if (ref < (u8 *)out_data) + { + SET_ERRNO (EINVAL); + return 0; + } + +#ifdef lzf_movsb + len += 2; + lzf_movsb (op, ref, len); +#else + *op++ = *ref++; + *op++ = *ref++; + + do + *op++ = *ref++; + while (--len); +#endif + } + } + while (ip < in_end); + + return op - (u8 *)out_data; +} + diff --git a/src/mkreleasehdr.sh b/src/mkreleasehdr.sh new file mode 100755 index 000000000..30984160e --- /dev/null +++ b/src/mkreleasehdr.sh @@ -0,0 +1,9 @@ +#!/bin/sh +GIT_SHA1=`(git show-ref --head --hash=8 2> /dev/null || echo 00000000) | head -n1` +GIT_DIRTY=`git diff 2> /dev/null | wc -l` +test -f release.h || touch release.h +(cat release.h | grep SHA1 | grep $GIT_SHA1) && \ +(cat release.h | grep DIRTY | grep $GIT_DIRTY) && exit 0 # Already uptodate +echo "#define REDIS_GIT_SHA1 \"$GIT_SHA1\"" > release.h +echo "#define REDIS_GIT_DIRTY \"$GIT_DIRTY\"" >> release.h +touch release.c # Force recompile of release.c diff --git a/src/multi.c b/src/multi.c new file mode 100644 index 000000000..def1dd673 --- /dev/null +++ b/src/multi.c @@ -0,0 +1,266 @@ +#include "redis.h" + +/* ================================ MULTI/EXEC ============================== */ + +/* Client state initialization for MULTI/EXEC */ +void initClientMultiState(redisClient *c) { + c->mstate.commands = NULL; + c->mstate.count = 0; +} + +/* Release all the resources associated with MULTI/EXEC state */ +void freeClientMultiState(redisClient *c) { + int j; + + for (j = 0; j < c->mstate.count; j++) { + int i; + multiCmd *mc = c->mstate.commands+j; + + for (i = 0; i < mc->argc; i++) + decrRefCount(mc->argv[i]); + zfree(mc->argv); + } + zfree(c->mstate.commands); +} + +/* Add a new command into the MULTI commands queue */ +void queueMultiCommand(redisClient *c, struct redisCommand *cmd) { + multiCmd *mc; + int j; + + c->mstate.commands = zrealloc(c->mstate.commands, + sizeof(multiCmd)*(c->mstate.count+1)); + mc = 
c->mstate.commands+c->mstate.count; + mc->cmd = cmd; + mc->argc = c->argc; + mc->argv = zmalloc(sizeof(robj*)*c->argc); + memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc); + for (j = 0; j < c->argc; j++) + incrRefCount(mc->argv[j]); + c->mstate.count++; +} + +void multiCommand(redisClient *c) { + if (c->flags & REDIS_MULTI) { + addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n")); + return; + } + c->flags |= REDIS_MULTI; + addReply(c,shared.ok); +} + +void discardCommand(redisClient *c) { + if (!(c->flags & REDIS_MULTI)) { + addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n")); + return; + } + + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= (~REDIS_MULTI); + unwatchAllKeys(c); + addReply(c,shared.ok); +} + +/* Send a MULTI command to all the slaves and AOF file. Check the execCommand + * implementation for more information. */ +void execCommandReplicateMulti(redisClient *c) { + struct redisCommand *cmd; + robj *multistring = createStringObject("MULTI",5); + + cmd = lookupCommand("multi"); + if (server.appendonly) + feedAppendOnlyFile(cmd,c->db->id,&multistring,1); + if (listLength(server.slaves)) + replicationFeedSlaves(server.slaves,c->db->id,&multistring,1); + decrRefCount(multistring); +} + +void execCommand(redisClient *c) { + int j; + robj **orig_argv; + int orig_argc; + + if (!(c->flags & REDIS_MULTI)) { + addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n")); + return; + } + + /* Check if we need to abort the EXEC if some WATCHed key was touched. + * A failed EXEC will return a multi bulk nil object. */ + if (c->flags & REDIS_DIRTY_CAS) { + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); + unwatchAllKeys(c); + addReply(c,shared.nullmultibulk); + return; + } + + /* Replicate a MULTI request now that we are sure the block is executed.
+ * This way we'll deliver the MULTI/..../EXEC block as a whole and + * both the AOF and the replication link will have the same consistency + * and atomicity guarantees. */ + execCommandReplicateMulti(c); + + /* Exec all the queued commands */ + unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */ + orig_argv = c->argv; + orig_argc = c->argc; + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count)); + for (j = 0; j < c->mstate.count; j++) { + c->argc = c->mstate.commands[j].argc; + c->argv = c->mstate.commands[j].argv; + call(c,c->mstate.commands[j].cmd); + } + c->argv = orig_argv; + c->argc = orig_argc; + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); + /* Make sure the EXEC command is always replicated / AOF, since we + * always send the MULTI command (we can't know beforehand if the + * next operations will contain at least a modification to the DB). */ + server.dirty++; +} + +/* ===================== WATCH (CAS alike for MULTI/EXEC) =================== + * + * The implementation uses a per-DB hash table mapping keys to list of clients + * WATCHing those keys, so that given a key that is going to be modified + * we can mark all the associated clients as dirty. + * + * Also every client contains a list of WATCHed keys so that's possible to + * un-watch such keys when the client is freed or when UNWATCH is called. 
*/ + +/* In the client->watched_keys list we need to use watchedKey structures + * as in order to identify a key in Redis we need both the key name and the + * DB */ +typedef struct watchedKey { + robj *key; + redisDb *db; +} watchedKey; + +/* Watch for the specified key */ +void watchForKey(redisClient *c, robj *key) { + list *clients = NULL; + listIter li; + listNode *ln; + watchedKey *wk; + + /* Check if we are already watching for this key */ + listRewind(c->watched_keys,&li); + while((ln = listNext(&li))) { + wk = listNodeValue(ln); + if (wk->db == c->db && equalStringObjects(key,wk->key)) + return; /* Key already watched */ + } + /* This key is not already watched in this DB. Let's add it */ + clients = dictFetchValue(c->db->watched_keys,key); + if (!clients) { + clients = listCreate(); + dictAdd(c->db->watched_keys,key,clients); + incrRefCount(key); + } + listAddNodeTail(clients,c); + /* Add the new key to the list of keys watched by this client */ + wk = zmalloc(sizeof(*wk)); + wk->key = key; + wk->db = c->db; + incrRefCount(key); + listAddNodeTail(c->watched_keys,wk); +} + +/* Unwatch all the keys watched by this client. To clean the EXEC dirty + * flag is up to the caller.
*/ +void unwatchAllKeys(redisClient *c) { + listIter li; + listNode *ln; + + if (listLength(c->watched_keys) == 0) return; + listRewind(c->watched_keys,&li); + while((ln = listNext(&li))) { + list *clients; + watchedKey *wk; + + /* Lookup the watched key -> clients list and remove the client + * from the list */ + wk = listNodeValue(ln); + clients = dictFetchValue(wk->db->watched_keys, wk->key); + redisAssert(clients != NULL); + listDelNode(clients,listSearchKey(clients,c)); + /* Kill the entry at all if this was the only client */ + if (listLength(clients) == 0) + dictDelete(wk->db->watched_keys, wk->key); + /* Remove this watched key from the client->watched list */ + listDelNode(c->watched_keys,ln); + decrRefCount(wk->key); + zfree(wk); + } +} + +/* "Touch" a key, so that if this key is being WATCHed by some client the + * next EXEC will fail. */ +void touchWatchedKey(redisDb *db, robj *key) { + list *clients; + listIter li; + listNode *ln; + + if (dictSize(db->watched_keys) == 0) return; + clients = dictFetchValue(db->watched_keys, key); + if (!clients) return; + + /* Mark all the clients watching this key as REDIS_DIRTY_CAS */ + /* Check if we are already watching for this key */ + listRewind(clients,&li); + while((ln = listNext(&li))) { + redisClient *c = listNodeValue(ln); + + c->flags |= REDIS_DIRTY_CAS; + } +} + +/* On FLUSHDB or FLUSHALL all the watched keys that are present before the + * flush but will be deleted as effect of the flushing operation should + * be touched. "dbid" is the DB that's getting the flush. -1 if it is + * a FLUSHALL operation (all the DBs flushed). 
*/ +void touchWatchedKeysOnFlush(int dbid) { + listIter li1, li2; + listNode *ln; + + /* For every client, check all the waited keys */ + listRewind(server.clients,&li1); + while((ln = listNext(&li1))) { + redisClient *c = listNodeValue(ln); + listRewind(c->watched_keys,&li2); + while((ln = listNext(&li2))) { + watchedKey *wk = listNodeValue(ln); + + /* For every watched key matching the specified DB, if the + * key exists, mark the client as dirty, as the key will be + * removed. */ + if (dbid == -1 || wk->db->id == dbid) { + if (dictFind(wk->db->dict, wk->key->ptr) != NULL) + c->flags |= REDIS_DIRTY_CAS; + } + } + } +} + +void watchCommand(redisClient *c) { + int j; + + if (c->flags & REDIS_MULTI) { + addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n")); + return; + } + for (j = 1; j < c->argc; j++) + watchForKey(c,c->argv[j]); + addReply(c,shared.ok); +} + +void unwatchCommand(redisClient *c) { + unwatchAllKeys(c); + c->flags &= (~REDIS_DIRTY_CAS); + addReply(c,shared.ok); +} diff --git a/src/networking.c b/src/networking.c new file mode 100644 index 000000000..31844a09f --- /dev/null +++ b/src/networking.c @@ -0,0 +1,589 @@ +#include "redis.h" + +#include <sys/uio.h> + +void *dupClientReplyValue(void *o) { + incrRefCount((robj*)o); + return o; +} + +int listMatchObjects(void *a, void *b) { + return equalStringObjects(a,b); +} + +redisClient *createClient(int fd) { + redisClient *c = zmalloc(sizeof(*c)); + + anetNonBlock(NULL,fd); + anetTcpNoDelay(NULL,fd); + if (!c) return NULL; + selectDb(c,0); + c->fd = fd; + c->querybuf = sdsempty(); + c->argc = 0; + c->argv = NULL; + c->bulklen = -1; + c->multibulk = 0; + c->mbargc = 0; + c->mbargv = NULL; + c->sentlen = 0; + c->flags = 0; + c->lastinteraction = time(NULL); + c->authenticated = 0; + c->replstate = REDIS_REPL_NONE; + c->reply = listCreate(); + listSetFreeMethod(c->reply,decrRefCount); + listSetDupMethod(c->reply,dupClientReplyValue); + c->blocking_keys = NULL; + c->blocking_keys_num = 0; + c->io_keys =
listCreate(); + c->watched_keys = listCreate(); + listSetFreeMethod(c->io_keys,decrRefCount); + c->pubsub_channels = dictCreate(&setDictType,NULL); + c->pubsub_patterns = listCreate(); + listSetFreeMethod(c->pubsub_patterns,decrRefCount); + listSetMatchMethod(c->pubsub_patterns,listMatchObjects); + if (aeCreateFileEvent(server.el, c->fd, AE_READABLE, + readQueryFromClient, c) == AE_ERR) { + freeClient(c); + return NULL; + } + listAddNodeTail(server.clients,c); + initClientMultiState(c); + return c; +} + +void addReply(redisClient *c, robj *obj) { + if (listLength(c->reply) == 0 && + (c->replstate == REDIS_REPL_NONE || + c->replstate == REDIS_REPL_ONLINE) && + aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, + sendReplyToClient, c) == AE_ERR) return; + + if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) { + obj = dupStringObject(obj); + obj->refcount = 0; /* getDecodedObject() will increment the refcount */ + } + listAddNodeTail(c->reply,getDecodedObject(obj)); +} + +void addReplySds(redisClient *c, sds s) { + robj *o = createObject(REDIS_STRING,s); + addReply(c,o); + decrRefCount(o); +} + +void addReplyDouble(redisClient *c, double d) { + char buf[128]; + + snprintf(buf,sizeof(buf),"%.17g",d); + addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n", + (unsigned long) strlen(buf),buf)); +} + +void addReplyLongLong(redisClient *c, long long ll) { + char buf[128]; + size_t len; + + if (ll == 0) { + addReply(c,shared.czero); + return; + } else if (ll == 1) { + addReply(c,shared.cone); + return; + } + buf[0] = ':'; + len = ll2string(buf+1,sizeof(buf)-1,ll); + buf[len+1] = '\r'; + buf[len+2] = '\n'; + addReplySds(c,sdsnewlen(buf,len+3)); +} + +void addReplyUlong(redisClient *c, unsigned long ul) { + char buf[128]; + size_t len; + + if (ul == 0) { + addReply(c,shared.czero); + return; + } else if (ul == 1) { + addReply(c,shared.cone); + return; + } + len = snprintf(buf,sizeof(buf),":%lu\r\n",ul); + addReplySds(c,sdsnewlen(buf,len)); +} + +void 
addReplyBulkLen(redisClient *c, robj *obj) { + size_t len, intlen; + char buf[128]; + + if (obj->encoding == REDIS_ENCODING_RAW) { + len = sdslen(obj->ptr); + } else { + long n = (long)obj->ptr; + + /* Compute how many bytes will take this integer as a radix 10 string */ + len = 1; + if (n < 0) { + len++; + n = -n; + } + while((n = n/10) != 0) { + len++; + } + } + buf[0] = '$'; + intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len); + buf[intlen+1] = '\r'; + buf[intlen+2] = '\n'; + addReplySds(c,sdsnewlen(buf,intlen+3)); +} + +void addReplyBulk(redisClient *c, robj *obj) { + addReplyBulkLen(c,obj); + addReply(c,obj); + addReply(c,shared.crlf); +} + +/* In the CONFIG command we need to add vanilla C string as bulk replies */ +void addReplyBulkCString(redisClient *c, char *s) { + if (s == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *o = createStringObject(s,strlen(s)); + addReplyBulk(c,o); + decrRefCount(o); + } +} + +void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd; + char cip[128]; + redisClient *c; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + REDIS_NOTUSED(privdata); + + cfd = anetAccept(server.neterr, fd, cip, &cport); + if (cfd == AE_ERR) { + redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr); + return; + } + redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport); + if ((c = createClient(cfd)) == NULL) { + redisLog(REDIS_WARNING,"Error allocating resoures for the client"); + close(cfd); /* May be already closed, just ingore errors */ + return; + } + /* If maxclient directive is set and this is one client more... close the + * connection. 
Note that we create the client instead to check before + * for this condition, since now the socket is already set in nonblocking + * mode and we can send an error for free using the Kernel I/O */ + if (server.maxclients && listLength(server.clients) > server.maxclients) { + char *err = "-ERR max number of clients reached\r\n"; + + /* That's a best effort error message, don't check write errors */ + if (write(c->fd,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... */ + } + freeClient(c); + return; + } + server.stat_numconnections++; +} + +static void freeClientArgv(redisClient *c) { + int j; + + for (j = 0; j < c->argc; j++) + decrRefCount(c->argv[j]); + for (j = 0; j < c->mbargc; j++) + decrRefCount(c->mbargv[j]); + c->argc = 0; + c->mbargc = 0; +} + +void freeClient(redisClient *c) { + listNode *ln; + + /* Note that if the client we are freeing is blocked into a blocking + * call, we have to set querybuf to NULL *before* to call + * unblockClientWaitingData() to avoid processInputBuffer() will get + * called. Also it is important to remove the file events after + * this, because this call adds the READABLE event. 
*/ + sdsfree(c->querybuf); + c->querybuf = NULL; + if (c->flags & REDIS_BLOCKED) + unblockClientWaitingData(c); + + /* UNWATCH all the keys */ + unwatchAllKeys(c); + listRelease(c->watched_keys); + /* Unsubscribe from all the pubsub channels */ + pubsubUnsubscribeAllChannels(c,0); + pubsubUnsubscribeAllPatterns(c,0); + dictRelease(c->pubsub_channels); + listRelease(c->pubsub_patterns); + /* Obvious cleanup */ + aeDeleteFileEvent(server.el,c->fd,AE_READABLE); + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + listRelease(c->reply); + freeClientArgv(c); + close(c->fd); + /* Remove from the list of clients */ + ln = listSearchKey(server.clients,c); + redisAssert(ln != NULL); + listDelNode(server.clients,ln); + /* Remove from the list of clients that are now ready to be restarted + * after waiting for swapped keys */ + if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) { + ln = listSearchKey(server.io_ready_clients,c); + if (ln) { + listDelNode(server.io_ready_clients,ln); + server.vm_blocked_clients--; + } + } + /* Remove from the list of clients waiting for swapped keys */ + while (server.vm_enabled && listLength(c->io_keys)) { + ln = listFirst(c->io_keys); + dontWaitForSwappedKey(c,ln->value); + } + listRelease(c->io_keys); + /* Master/slave cleanup */ + if (c->flags & REDIS_SLAVE) { + if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1) + close(c->repldbfd); + list *l = (c->flags & REDIS_MONITOR) ? 
server.monitors : server.slaves; + ln = listSearchKey(l,c); + redisAssert(ln != NULL); + listDelNode(l,ln); + } + if (c->flags & REDIS_MASTER) { + server.master = NULL; + server.replstate = REDIS_REPL_CONNECT; + } + /* Release memory */ + zfree(c->argv); + zfree(c->mbargv); + freeClientMultiState(c); + zfree(c); +} + +#define GLUEREPLY_UP_TO (1024) +static void glueReplyBuffersIfNeeded(redisClient *c) { + int copylen = 0; + char buf[GLUEREPLY_UP_TO]; + listNode *ln; + listIter li; + robj *o; + + listRewind(c->reply,&li); + while((ln = listNext(&li))) { + int objlen; + + o = ln->value; + objlen = sdslen(o->ptr); + if (copylen + objlen <= GLUEREPLY_UP_TO) { + memcpy(buf+copylen,o->ptr,objlen); + copylen += objlen; + listDelNode(c->reply,ln); + } else { + if (copylen == 0) return; + break; + } + } + /* Now the output buffer is empty, add the new single element */ + o = createObject(REDIS_STRING,sdsnewlen(buf,copylen)); + listAddNodeHead(c->reply,o); +} + +void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { + redisClient *c = privdata; + int nwritten = 0, totwritten = 0, objlen; + robj *o; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + /* Use writev() if we have enough buffers to send */ + if (!server.glueoutputbuf && + listLength(c->reply) > REDIS_WRITEV_THRESHOLD && + !(c->flags & REDIS_MASTER)) + { + sendReplyToClientWritev(el, fd, privdata, mask); + return; + } + + while(listLength(c->reply)) { + if (server.glueoutputbuf && listLength(c->reply) > 1) + glueReplyBuffersIfNeeded(c); + + o = listNodeValue(listFirst(c->reply)); + objlen = sdslen(o->ptr); + + if (objlen == 0) { + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + if (c->flags & REDIS_MASTER) { + /* Don't reply to a master */ + nwritten = objlen - c->sentlen; + } else { + nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen); + if (nwritten <= 0) break; + } + c->sentlen += nwritten; + totwritten += nwritten; + /* If we fully sent the object on head go 
to the next one */ + if (c->sentlen == objlen) { + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + } + /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interfae) */ + if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break; + } + if (nwritten == -1) { + if (errno == EAGAIN) { + nwritten = 0; + } else { + redisLog(REDIS_VERBOSE, + "Error writing to client: %s", strerror(errno)); + freeClient(c); + return; + } + } + if (totwritten > 0) c->lastinteraction = time(NULL); + if (listLength(c->reply) == 0) { + c->sentlen = 0; + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + } +} + +void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask) +{ + redisClient *c = privdata; + int nwritten = 0, totwritten = 0, objlen, willwrite; + robj *o; + struct iovec iov[REDIS_WRITEV_IOVEC_COUNT]; + int offset, ion = 0; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + listNode *node; + while (listLength(c->reply)) { + offset = c->sentlen; + ion = 0; + willwrite = 0; + + /* fill-in the iov[] array */ + for(node = listFirst(c->reply); node; node = listNextNode(node)) { + o = listNodeValue(node); + objlen = sdslen(o->ptr); + + if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT) + break; + + if(ion == REDIS_WRITEV_IOVEC_COUNT) + break; /* no more iovecs */ + + iov[ion].iov_base = ((char*)o->ptr) + offset; + iov[ion].iov_len = objlen - offset; + willwrite += objlen - offset; + offset = 0; /* just for the first item */ + ion++; + } + + if(willwrite == 0) + break; + + /* write all collected blocks at once */ + if((nwritten = writev(fd, iov, ion)) < 0) { + if (errno != EAGAIN) { + redisLog(REDIS_VERBOSE, + "Error writing to client: %s", strerror(errno)); + freeClient(c); + return; + } + 
break; + } + + totwritten += nwritten; + offset = c->sentlen; + + /* remove written robjs from c->reply */ + while (nwritten && listLength(c->reply)) { + o = listNodeValue(listFirst(c->reply)); + objlen = sdslen(o->ptr); + + if(nwritten >= objlen - offset) { + listDelNode(c->reply, listFirst(c->reply)); + nwritten -= objlen - offset; + c->sentlen = 0; + } else { + /* partial write */ + c->sentlen += nwritten; + break; + } + offset = 0; + } + } + + if (totwritten > 0) + c->lastinteraction = time(NULL); + + if (listLength(c->reply) == 0) { + c->sentlen = 0; + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + } +} + +/* resetClient prepare the client to process the next command */ +void resetClient(redisClient *c) { + freeClientArgv(c); + c->bulklen = -1; + c->multibulk = 0; +} + +void closeTimedoutClients(void) { + redisClient *c; + listNode *ln; + time_t now = time(NULL); + listIter li; + + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + c = listNodeValue(ln); + if (server.maxidletime && + !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ + !(c->flags & REDIS_MASTER) && /* no timeout for masters */ + dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */ + listLength(c->pubsub_patterns) == 0 && + (now - c->lastinteraction > server.maxidletime)) + { + redisLog(REDIS_VERBOSE,"Closing idle client"); + freeClient(c); + } else if (c->flags & REDIS_BLOCKED) { + if (c->blockingto != 0 && c->blockingto < now) { + addReply(c,shared.nullmultibulk); + unblockClientWaitingData(c); + } + } + } +} + +void processInputBuffer(redisClient *c) { +again: + /* Before to process the input buffer, make sure the client is not + * waitig for a blocking operation such as BLPOP. Note that the first + * iteration the client is never blocked, otherwise the processInputBuffer + * would not be called at all, but after the execution of the first commands + * in the input buffer the client may be blocked, and the "goto again" + * will try to reiterate. 
The following line will make it return asap. */ + if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return; + if (c->bulklen == -1) { + /* Read the first line of the query */ + char *p = strchr(c->querybuf,'\n'); + size_t querylen; + + if (p) { + sds query, *argv; + int argc, j; + + query = c->querybuf; + c->querybuf = sdsempty(); + querylen = 1+(p-(query)); + if (sdslen(query) > querylen) { + /* leave data after the first line of the query in the buffer */ + c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen); + } + *p = '\0'; /* remove "\n" */ + if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */ + sdsupdatelen(query); + + /* Now we can split the query in arguments */ + argv = sdssplitlen(query,sdslen(query)," ",1,&argc); + sdsfree(query); + + if (c->argv) zfree(c->argv); + c->argv = zmalloc(sizeof(robj*)*argc); + + for (j = 0; j < argc; j++) { + if (sdslen(argv[j])) { + c->argv[c->argc] = createObject(REDIS_STRING,argv[j]); + c->argc++; + } else { + sdsfree(argv[j]); + } + } + zfree(argv); + if (c->argc) { + /* Execute the command. If the client is still valid + * after processCommand() return and there is something + * on the query buffer try to process the next command. */ + if (processCommand(c) && sdslen(c->querybuf)) goto again; + } else { + /* Nothing to process, argc == 0. Just process the query + * buffer if it's not empty or return to the caller */ + if (sdslen(c->querybuf)) goto again; + } + return; + } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) { + redisLog(REDIS_VERBOSE, "Client protocol error"); + freeClient(c); + return; + } + } else { + /* Bulk read handling. Note that if we are at this point + the client already sent a command terminated with a newline, + we are reading the bulk data that is actually the last + argument of the command. 
*/ + int qbl = sdslen(c->querybuf); + + if (c->bulklen <= qbl) { + /* Copy everything but the final CRLF as final argument */ + c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); + c->argc++; + c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); + /* Process the command. If the client is still valid after + * the processing and there is more data in the buffer + * try to parse it. */ + if (processCommand(c) && sdslen(c->querybuf)) goto again; + return; + } + } +} + +void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { + redisClient *c = (redisClient*) privdata; + char buf[REDIS_IOBUF_LEN]; + int nread; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + nread = read(fd, buf, REDIS_IOBUF_LEN); + if (nread == -1) { + if (errno == EAGAIN) { + nread = 0; + } else { + redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno)); + freeClient(c); + return; + } + } else if (nread == 0) { + redisLog(REDIS_VERBOSE, "Client closed connection"); + freeClient(c); + return; + } + if (nread) { + c->querybuf = sdscatlen(c->querybuf, buf, nread); + c->lastinteraction = time(NULL); + } else { + return; + } + processInputBuffer(c); +} diff --git a/src/object.c b/src/object.c new file mode 100644 index 000000000..4854909e0 --- /dev/null +++ b/src/object.c @@ -0,0 +1,405 @@ +#include "redis.h" +#include <pthread.h> + +robj *createObject(int type, void *ptr) { + robj *o; + + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist)) { + listNode *head = listFirst(server.objfreelist); + o = listNodeValue(head); + listDelNode(server.objfreelist,head); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + } else { + if (server.vm_enabled) + pthread_mutex_unlock(&server.obj_freelist_mutex); + o = zmalloc(sizeof(*o)); + } + o->type = type; + o->encoding = REDIS_ENCODING_RAW; + o->ptr = ptr; + o->refcount = 1; + if (server.vm_enabled) { + /* Note that this code may run in the context of an
I/O thread + * and accessing server.lruclock in theory is an error + * (no locks). But in practice this is safe, and even if we read + * garbage Redis will not fail. */ + o->lru = server.lruclock; + o->storage = REDIS_VM_MEMORY; + } + return o; +} + +robj *createStringObject(char *ptr, size_t len) { + return createObject(REDIS_STRING,sdsnewlen(ptr,len)); +} + +robj *createStringObjectFromLongLong(long long value) { + robj *o; + if (value >= 0 && value < REDIS_SHARED_INTEGERS) { + incrRefCount(shared.integers[value]); + o = shared.integers[value]; + } else { + if (value >= LONG_MIN && value <= LONG_MAX) { + o = createObject(REDIS_STRING, NULL); + o->encoding = REDIS_ENCODING_INT; + o->ptr = (void*)((long)value); + } else { + o = createObject(REDIS_STRING,sdsfromlonglong(value)); + } + } + return o; +} + +robj *dupStringObject(robj *o) { + redisAssert(o->encoding == REDIS_ENCODING_RAW); + return createStringObject(o->ptr,sdslen(o->ptr)); +} + +robj *createListObject(void) { + list *l = listCreate(); + robj *o = createObject(REDIS_LIST,l); + listSetFreeMethod(l,decrRefCount); + o->encoding = REDIS_ENCODING_LINKEDLIST; + return o; +} + +robj *createZiplistObject(void) { + unsigned char *zl = ziplistNew(); + robj *o = createObject(REDIS_LIST,zl); + o->encoding = REDIS_ENCODING_ZIPLIST; + return o; +} + +robj *createSetObject(void) { + dict *d = dictCreate(&setDictType,NULL); + return createObject(REDIS_SET,d); +} + +robj *createHashObject(void) { + /* All the Hashes start as zipmaps. Will be automatically converted + * into hash tables if there are enough elements or big elements + * inside. 
*/ + unsigned char *zm = zipmapNew(); + robj *o = createObject(REDIS_HASH,zm); + o->encoding = REDIS_ENCODING_ZIPMAP; + return o; +} + +robj *createZsetObject(void) { + zset *zs = zmalloc(sizeof(*zs)); + + zs->dict = dictCreate(&zsetDictType,NULL); + zs->zsl = zslCreate(); + return createObject(REDIS_ZSET,zs); +} + +void freeStringObject(robj *o) { + if (o->encoding == REDIS_ENCODING_RAW) { + sdsfree(o->ptr); + } +} + +void freeListObject(robj *o) { + switch (o->encoding) { + case REDIS_ENCODING_LINKEDLIST: + listRelease((list*) o->ptr); + break; + case REDIS_ENCODING_ZIPLIST: + zfree(o->ptr); + break; + default: + redisPanic("Unknown list encoding type"); + } +} + +void freeSetObject(robj *o) { + dictRelease((dict*) o->ptr); +} + +void freeZsetObject(robj *o) { + zset *zs = o->ptr; + + dictRelease(zs->dict); + zslFree(zs->zsl); + zfree(zs); +} + +void freeHashObject(robj *o) { + switch (o->encoding) { + case REDIS_ENCODING_HT: + dictRelease((dict*) o->ptr); + break; + case REDIS_ENCODING_ZIPMAP: + zfree(o->ptr); + break; + default: + redisPanic("Unknown hash encoding type"); + break; + } +} + +void incrRefCount(robj *o) { + o->refcount++; +} + +void decrRefCount(void *obj) { + robj *o = obj; + + /* Object is a swapped out value, or in the process of being loaded. */ + if (server.vm_enabled && + (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING)) + { + vmpointer *vp = obj; + if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o); + vmMarkPagesFree(vp->page,vp->usedpages); + server.vm_stats_swapped_objects--; + zfree(vp); + return; + } + + if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0"); + /* Object is in memory, or in the process of being swapped out. + * + * If the object is being swapped out, abort the operation on + * decrRefCount even if the refcount does not drop to 0: the object + * is referenced at least two times, as value of the key AND as + * job->val in the iojob. 
So if we don't invalidate the iojob, when it is + * done but the relevant key was removed in the meantime, the + * complete jobs handler will not find the key about the job and the + * assert will fail. */ + if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING) + vmCancelThreadedIOJob(o); + if (--(o->refcount) == 0) { + switch(o->type) { + case REDIS_STRING: freeStringObject(o); break; + case REDIS_LIST: freeListObject(o); break; + case REDIS_SET: freeSetObject(o); break; + case REDIS_ZSET: freeZsetObject(o); break; + case REDIS_HASH: freeHashObject(o); break; + default: redisPanic("Unknown object type"); break; + } + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX || + !listAddNodeHead(server.objfreelist,o)) + zfree(o); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + } +} + +int checkType(redisClient *c, robj *o, int type) { + if (o->type != type) { + addReply(c,shared.wrongtypeerr); + return 1; + } + return 0; +} + +/* Try to encode a string object in order to save space */ +robj *tryObjectEncoding(robj *o) { + long value; + sds s = o->ptr; + + if (o->encoding != REDIS_ENCODING_RAW) + return o; /* Already encoded */ + + /* It's not safe to encode shared objects: shared objects can be shared + * everywhere in the "object space" of Redis. 
Encoded objects can only + * appear as "values" (and not, for instance, as keys) */ + if (o->refcount > 1) return o; + + /* Currently we try to encode only strings */ + redisAssert(o->type == REDIS_STRING); + + /* Check if we can represent this string as a long integer */ + if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o; + + /* Ok, this object can be encoded */ + if (value >= 0 && value < REDIS_SHARED_INTEGERS) { + decrRefCount(o); + incrRefCount(shared.integers[value]); + return shared.integers[value]; + } else { + o->encoding = REDIS_ENCODING_INT; + sdsfree(o->ptr); + o->ptr = (void*) value; + return o; + } +} + +/* Get a decoded version of an encoded object (returned as a new object). + * If the object is already raw-encoded just increment the ref count. */ +robj *getDecodedObject(robj *o) { + robj *dec; + + if (o->encoding == REDIS_ENCODING_RAW) { + incrRefCount(o); + return o; + } + if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) { + char buf[32]; + + ll2string(buf,32,(long)o->ptr); + dec = createStringObject(buf,strlen(buf)); + return dec; + } else { + redisPanic("Unknown encoding type"); + } +} + +/* Compare two string objects via strcmp() or alike. + * Note that the objects may be integer-encoded. In such a case we + * use ll2string() to get a string representation of the numbers on the stack + * and compare the strings, it's much faster than calling getDecodedObject(). + * + * Important note: if objects are not integer encoded, but binary-safe strings, + * sdscmp() from sds.c will apply memcmp() so this function ca be considered + * binary safe. 
*/ +int compareStringObjects(robj *a, robj *b) { + redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING); + char bufa[128], bufb[128], *astr, *bstr; + int bothsds = 1; + + if (a == b) return 0; + if (a->encoding != REDIS_ENCODING_RAW) { + ll2string(bufa,sizeof(bufa),(long) a->ptr); + astr = bufa; + bothsds = 0; + } else { + astr = a->ptr; + } + if (b->encoding != REDIS_ENCODING_RAW) { + ll2string(bufb,sizeof(bufb),(long) b->ptr); + bstr = bufb; + bothsds = 0; + } else { + bstr = b->ptr; + } + return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr); +} + +/* Equal string objects return 1 if the two objects are the same from the + * point of view of a string comparison, otherwise 0 is returned. Note that + * this function is faster then checking for (compareStringObject(a,b) == 0) + * because it can perform some more optimization. */ +int equalStringObjects(robj *a, robj *b) { + if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){ + return a->ptr == b->ptr; + } else { + return compareStringObjects(a,b) == 0; + } +} + +size_t stringObjectLen(robj *o) { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + return sdslen(o->ptr); + } else { + char buf[32]; + + return ll2string(buf,32,(long)o->ptr); + } +} + +int getDoubleFromObject(robj *o, double *target) { + double value; + char *eptr; + + if (o == NULL) { + value = 0; + } else { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + value = strtod(o->ptr, &eptr); + if (eptr[0] != '\0') return REDIS_ERR; + } else if (o->encoding == REDIS_ENCODING_INT) { + value = (long)o->ptr; + } else { + redisPanic("Unknown string encoding"); + } + } + + *target = value; + return REDIS_OK; +} + +int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) { + double value; + if (getDoubleFromObject(o, &value) != REDIS_OK) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } 
else { + addReplySds(c, sdsnew("-ERR value is not a double\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +int getLongLongFromObject(robj *o, long long *target) { + long long value; + char *eptr; + + if (o == NULL) { + value = 0; + } else { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + value = strtoll(o->ptr, &eptr, 10); + if (eptr[0] != '\0') return REDIS_ERR; + } else if (o->encoding == REDIS_ENCODING_INT) { + value = (long)o->ptr; + } else { + redisPanic("Unknown string encoding"); + } + } + + *target = value; + return REDIS_OK; +} + +int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) { + long long value; + if (getLongLongFromObject(o, &value) != REDIS_OK) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } else { + addReplySds(c, sdsnew("-ERR value is not an integer\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) { + long long value; + + if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR; + if (value < LONG_MIN || value > LONG_MAX) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } else { + addReplySds(c, sdsnew("-ERR value is out of range\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +char *strEncoding(int encoding) { + switch(encoding) { + case REDIS_ENCODING_RAW: return "raw"; + case REDIS_ENCODING_INT: return "int"; + case REDIS_ENCODING_HT: return "hashtable"; + case REDIS_ENCODING_ZIPMAP: return "zipmap"; + case REDIS_ENCODING_LINKEDLIST: return "linkedlist"; + case REDIS_ENCODING_ZIPLIST: return "ziplist"; + default: return "unknown"; + } +} diff --git a/src/pqsort.c b/src/pqsort.c new file mode 100644 index 000000000..257756376 --- /dev/null +++ b/src/pqsort.c @@ -0,0 +1,197 @@ 
+/* The following is the NetBSD libc qsort implementation modified in order to + * support partial sorting of ranges for Redis. + * + * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. + * + * The original copyright notice follows. */ + + +/* $NetBSD: qsort.c,v 1.19 2009/01/30 23:38:44 lukem Exp $ */ + +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include + +static inline char *med3 (char *, char *, char *, + int (*)(const void *, const void *)); +static inline void swapfunc (char *, char *, size_t, int); + +#define min(a, b) (a) < (b) ? a : b + +/* + * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". + */ +#define swapcode(TYPE, parmi, parmj, n) { \ + size_t i = (n) / sizeof (TYPE); \ + TYPE *pi = (TYPE *)(void *)(parmi); \ + TYPE *pj = (TYPE *)(void *)(parmj); \ + do { \ + TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} + +#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ + es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1; + +static inline void +swapfunc(char *a, char *b, size_t n, int swaptype) +{ + + if (swaptype <= 1) + swapcode(long, a, b, n) + else + swapcode(char, a, b, n) +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(void *)(a); \ + *(long *)(void *)(a) = *(long *)(void *)(b); \ + *(long *)(void *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype) + +static inline char * +med3(char *a, char *b, char *c, + int (*cmp) (const void *, const void *)) +{ + + return cmp(a, b) < 0 ? + (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a )) + :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? 
a : c )); +} + +static void +_pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), void *lrange, void *rrange) +{ + char *pa, *pb, *pc, *pd, *pl, *pm, *pn; + size_t d, r; + int swaptype, swap_cnt, cmp_result; + +loop: SWAPINIT(a, es); + swap_cnt = 0; + if (n < 7) { + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + pm = (char *) a + (n / 2) * es; + if (n > 7) { + pl = (char *) a; + pn = (char *) a + (n - 1) * es; + if (n > 40) { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, cmp); + pm = med3(pm - d, pm, pm + d, cmp); + pn = med3(pn - 2 * d, pn - d, pn, cmp); + } + pm = med3(pl, pm, pn, cmp); + } + swap(a, pm); + pa = pb = (char *) a + es; + + pc = pd = (char *) a + (n - 1) * es; + for (;;) { + while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) { + if (cmp_result == 0) { + swap_cnt = 1; + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (cmp_result = cmp(pc, a)) >= 0) { + if (cmp_result == 0) { + swap_cnt = 1; + swap(pc, pd); + pd -= es; + } + pc -= es; + } + if (pb > pc) + break; + swap(pb, pc); + swap_cnt = 1; + pb += es; + pc -= es; + } + if (swap_cnt == 0) { /* Switch to insertion sort */ + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + + pn = (char *) a + n * es; + r = min(pa - (char *) a, pb - pa); + vecswap(a, pb - r, r); + r = min((size_t)(pd - pc), pn - pd - es); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > es) { + void *_l = a, *_r = ((unsigned char*)a)+r-1; + if (!((lrange < _l && rrange < _l) || + (lrange > _r && rrange > _r))) + _pqsort(a, r / es, es, cmp, lrange, rrange); + } + if ((r = pd - pc) > es) { + void *_l, *_r; + + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + + _l = a; + _r = ((unsigned char*)a)+r-1; + if 
(!((lrange < _l && rrange < _l) || + (lrange > _r && rrange > _r))) + goto loop; + } +/* qsort(pn - r, r / es, es, cmp);*/ +} + +void +pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), size_t lrange, size_t rrange) +{ + _pqsort(a,n,es,cmp,((unsigned char*)a)+(lrange*es), + ((unsigned char*)a)+((rrange+1)*es)-1); +} diff --git a/src/pqsort.h b/src/pqsort.h new file mode 100644 index 000000000..5054d5209 --- /dev/null +++ b/src/pqsort.h @@ -0,0 +1,15 @@ +/* The following is the NetBSD libc qsort implementation modified in order to + * support partial sorting of ranges for Redis. + * + * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. + * + * See the pqsort.c file for the original copyright notice. */ + +#ifndef __PQSORT_H +#define __PQSORT_H + +void +pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), size_t lrange, size_t rrange); + +#endif diff --git a/src/pubsub.c b/src/pubsub.c new file mode 100644 index 000000000..c9f5f310e --- /dev/null +++ b/src/pubsub.c @@ -0,0 +1,259 @@ +#include "redis.h" + +void freePubsubPattern(void *p) { + pubsubPattern *pat = p; + + decrRefCount(pat->pattern); + zfree(pat); +} + +int listMatchPubsubPattern(void *a, void *b) { + pubsubPattern *pa = a, *pb = b; + + return (pa->client == pb->client) && + (equalStringObjects(pa->pattern,pb->pattern)); +} + +/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or + * 0 if the client was already subscribed to that channel. 
*/ +int pubsubSubscribeChannel(redisClient *c, robj *channel) { + struct dictEntry *de; + list *clients = NULL; + int retval = 0; + + /* Add the channel to the client -> channels hash table */ + if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) { + retval = 1; + incrRefCount(channel); + /* Add the client to the channel -> list of clients hash table */ + de = dictFind(server.pubsub_channels,channel); + if (de == NULL) { + clients = listCreate(); + dictAdd(server.pubsub_channels,channel,clients); + incrRefCount(channel); + } else { + clients = dictGetEntryVal(de); + } + listAddNodeTail(clients,c); + } + /* Notify the client */ + addReply(c,shared.mbulk3); + addReply(c,shared.subscribebulk); + addReplyBulk(c,channel); + addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); + return retval; +} + +/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or + * 0 if the client was not subscribed to the specified channel. */ +int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) { + struct dictEntry *de; + list *clients; + listNode *ln; + int retval = 0; + + /* Remove the channel from the client -> channels hash table */ + incrRefCount(channel); /* channel may be just a pointer to the same object + we have in the hash tables. Protect it... */ + if (dictDelete(c->pubsub_channels,channel) == DICT_OK) { + retval = 1; + /* Remove the client from the channel -> clients list hash table */ + de = dictFind(server.pubsub_channels,channel); + redisAssert(de != NULL); + clients = dictGetEntryVal(de); + ln = listSearchKey(clients,c); + redisAssert(ln != NULL); + listDelNode(clients,ln); + if (listLength(clients) == 0) { + /* Free the list and associated hash entry at all if this was + * the latest client, so that it will be possible to abuse + * Redis PUBSUB creating millions of channels. 
*/ + dictDelete(server.pubsub_channels,channel); + } + } + /* Notify the client */ + if (notify) { + addReply(c,shared.mbulk3); + addReply(c,shared.unsubscribebulk); + addReplyBulk(c,channel); + addReplyLongLong(c,dictSize(c->pubsub_channels)+ + listLength(c->pubsub_patterns)); + + } + decrRefCount(channel); /* it is finally safe to release it */ + return retval; +} + +/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */ +int pubsubSubscribePattern(redisClient *c, robj *pattern) { + int retval = 0; + + if (listSearchKey(c->pubsub_patterns,pattern) == NULL) { + retval = 1; + pubsubPattern *pat; + listAddNodeTail(c->pubsub_patterns,pattern); + incrRefCount(pattern); + pat = zmalloc(sizeof(*pat)); + pat->pattern = getDecodedObject(pattern); + pat->client = c; + listAddNodeTail(server.pubsub_patterns,pat); + } + /* Notify the client */ + addReply(c,shared.mbulk3); + addReply(c,shared.psubscribebulk); + addReplyBulk(c,pattern); + addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); + return retval; +} + +/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or + * 0 if the client was not subscribed to the specified channel. */ +int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) { + listNode *ln; + pubsubPattern pat; + int retval = 0; + + incrRefCount(pattern); /* Protect the object. 
May be the same we remove */ + if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) { + retval = 1; + listDelNode(c->pubsub_patterns,ln); + pat.client = c; + pat.pattern = pattern; + ln = listSearchKey(server.pubsub_patterns,&pat); + listDelNode(server.pubsub_patterns,ln); + } + /* Notify the client */ + if (notify) { + addReply(c,shared.mbulk3); + addReply(c,shared.punsubscribebulk); + addReplyBulk(c,pattern); + addReplyLongLong(c,dictSize(c->pubsub_channels)+ + listLength(c->pubsub_patterns)); + } + decrRefCount(pattern); + return retval; +} + +/* Unsubscribe from all the channels. Return the number of channels the + * client was subscribed from. */ +int pubsubUnsubscribeAllChannels(redisClient *c, int notify) { + dictIterator *di = dictGetIterator(c->pubsub_channels); + dictEntry *de; + int count = 0; + + while((de = dictNext(di)) != NULL) { + robj *channel = dictGetEntryKey(de); + + count += pubsubUnsubscribeChannel(c,channel,notify); + } + dictReleaseIterator(di); + return count; +} + +/* Unsubscribe from all the patterns. Return the number of patterns the + * client was subscribed from. 
*/ +int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) { + listNode *ln; + listIter li; + int count = 0; + + listRewind(c->pubsub_patterns,&li); + while ((ln = listNext(&li)) != NULL) { + robj *pattern = ln->value; + + count += pubsubUnsubscribePattern(c,pattern,notify); + } + return count; +} + +/* Publish a message */ +int pubsubPublishMessage(robj *channel, robj *message) { + int receivers = 0; + struct dictEntry *de; + listNode *ln; + listIter li; + + /* Send to clients listening for that channel */ + de = dictFind(server.pubsub_channels,channel); + if (de) { + list *list = dictGetEntryVal(de); + listNode *ln; + listIter li; + + listRewind(list,&li); + while ((ln = listNext(&li)) != NULL) { + redisClient *c = ln->value; + + addReply(c,shared.mbulk3); + addReply(c,shared.messagebulk); + addReplyBulk(c,channel); + addReplyBulk(c,message); + receivers++; + } + } + /* Send to clients listening to matching channels */ + if (listLength(server.pubsub_patterns)) { + listRewind(server.pubsub_patterns,&li); + channel = getDecodedObject(channel); + while ((ln = listNext(&li)) != NULL) { + pubsubPattern *pat = ln->value; + + if (stringmatchlen((char*)pat->pattern->ptr, + sdslen(pat->pattern->ptr), + (char*)channel->ptr, + sdslen(channel->ptr),0)) { + addReply(pat->client,shared.mbulk4); + addReply(pat->client,shared.pmessagebulk); + addReplyBulk(pat->client,pat->pattern); + addReplyBulk(pat->client,channel); + addReplyBulk(pat->client,message); + receivers++; + } + } + decrRefCount(channel); + } + return receivers; +} + +void subscribeCommand(redisClient *c) { + int j; + + for (j = 1; j < c->argc; j++) + pubsubSubscribeChannel(c,c->argv[j]); +} + +void unsubscribeCommand(redisClient *c) { + if (c->argc == 1) { + pubsubUnsubscribeAllChannels(c,1); + return; + } else { + int j; + + for (j = 1; j < c->argc; j++) + pubsubUnsubscribeChannel(c,c->argv[j],1); + } +} + +void psubscribeCommand(redisClient *c) { + int j; + + for (j = 1; j < c->argc; j++) + 
pubsubSubscribePattern(c,c->argv[j]); +} + +void punsubscribeCommand(redisClient *c) { + if (c->argc == 1) { + pubsubUnsubscribeAllPatterns(c,1); + return; + } else { + int j; + + for (j = 1; j < c->argc; j++) + pubsubUnsubscribePattern(c,c->argv[j],1); + } +} + +void publishCommand(redisClient *c) { + int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]); + addReplyLongLong(c,receivers); +} diff --git a/src/rdb.c b/src/rdb.c new file mode 100644 index 000000000..5bda5e565 --- /dev/null +++ b/src/rdb.c @@ -0,0 +1,886 @@ +#include "redis.h" +#include "lzf.h" /* LZF compression library */ + +#include + +int rdbSaveType(FILE *fp, unsigned char type) { + if (fwrite(&type,1,1,fp) == 0) return -1; + return 0; +} + +int rdbSaveTime(FILE *fp, time_t t) { + int32_t t32 = (int32_t) t; + if (fwrite(&t32,4,1,fp) == 0) return -1; + return 0; +} + +/* check rdbLoadLen() comments for more info */ +int rdbSaveLen(FILE *fp, uint32_t len) { + unsigned char buf[2]; + + if (len < (1<<6)) { + /* Save a 6 bit len */ + buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6); + if (fwrite(buf,1,1,fp) == 0) return -1; + } else if (len < (1<<14)) { + /* Save a 14 bit len */ + buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6); + buf[1] = len&0xFF; + if (fwrite(buf,2,1,fp) == 0) return -1; + } else { + /* Save a 32 bit len */ + buf[0] = (REDIS_RDB_32BITLEN<<6); + if (fwrite(buf,1,1,fp) == 0) return -1; + len = htonl(len); + if (fwrite(&len,4,1,fp) == 0) return -1; + } + return 0; +} + +/* Encode 'value' as an integer if possible (if integer will fit the + * supported range). If the function sucessful encoded the integer + * then the (up to 5 bytes) encoded representation is written in the + * string pointed by 'enc' and the length is returned. Otherwise + * 0 is returned. 
*/ +int rdbEncodeInteger(long long value, unsigned char *enc) { + /* Finally check if it fits in our ranges */ + if (value >= -(1<<7) && value <= (1<<7)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8; + enc[1] = value&0xFF; + return 2; + } else if (value >= -(1<<15) && value <= (1<<15)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16; + enc[1] = value&0xFF; + enc[2] = (value>>8)&0xFF; + return 3; + } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32; + enc[1] = value&0xFF; + enc[2] = (value>>8)&0xFF; + enc[3] = (value>>16)&0xFF; + enc[4] = (value>>24)&0xFF; + return 5; + } else { + return 0; + } +} + +/* String objects in the form "2391" "-100" without any space and with a + * range of values that can fit in an 8, 16 or 32 bit signed value can be + * encoded as integers to save space */ +int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) { + long long value; + char *endptr, buf[32]; + + /* Check if it's possible to encode this value as a number */ + value = strtoll(s, &endptr, 10); + if (endptr[0] != '\0') return 0; + ll2string(buf,32,value); + + /* If the number converted back into a string is not identical + * then it's not possible to encode the string as integer */ + if (strlen(buf) != len || memcmp(buf,s,len)) return 0; + + return rdbEncodeInteger(value,enc); +} + +int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) { + size_t comprlen, outlen; + unsigned char byte; + void *out; + + /* We require at least four bytes compression for this to be worth it */ + if (len <= 4) return 0; + outlen = len-4; + if ((out = zmalloc(outlen+1)) == NULL) return 0; + comprlen = lzf_compress(s, len, out, outlen); + if (comprlen == 0) { + zfree(out); + return 0; + } + /* Data compressed! 
Let's save it on disk */ + byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF; + if (fwrite(&byte,1,1,fp) == 0) goto writeerr; + if (rdbSaveLen(fp,comprlen) == -1) goto writeerr; + if (rdbSaveLen(fp,len) == -1) goto writeerr; + if (fwrite(out,comprlen,1,fp) == 0) goto writeerr; + zfree(out); + return comprlen; + +writeerr: + zfree(out); + return -1; +} + +/* Save a string objet as [len][data] on disk. If the object is a string + * representation of an integer value we try to safe it in a special form */ +int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) { + int enclen; + + /* Try integer encoding */ + if (len <= 11) { + unsigned char buf[5]; + if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) { + if (fwrite(buf,enclen,1,fp) == 0) return -1; + return 0; + } + } + + /* Try LZF compression - under 20 bytes it's unable to compress even + * aaaaaaaaaaaaaaaaaa so skip it */ + if (server.rdbcompression && len > 20) { + int retval; + + retval = rdbSaveLzfStringObject(fp,s,len); + if (retval == -1) return -1; + if (retval > 0) return 0; + /* retval == 0 means data can't be compressed, save the old way */ + } + + /* Store verbatim */ + if (rdbSaveLen(fp,len) == -1) return -1; + if (len && fwrite(s,len,1,fp) == 0) return -1; + return 0; +} + +/* Save a long long value as either an encoded string or a string. */ +int rdbSaveLongLongAsStringObject(FILE *fp, long long value) { + unsigned char buf[32]; + int enclen = rdbEncodeInteger(value,buf); + if (enclen > 0) { + if (fwrite(buf,enclen,1,fp) == 0) return -1; + } else { + /* Encode as string */ + enclen = ll2string((char*)buf,32,value); + redisAssert(enclen < 32); + if (rdbSaveLen(fp,enclen) == -1) return -1; + if (fwrite(buf,enclen,1,fp) == 0) return -1; + } + return 0; +} + +/* Like rdbSaveStringObjectRaw() but handle encoded objects */ +int rdbSaveStringObject(FILE *fp, robj *obj) { + /* Avoid to decode the object, then encode it again, if the + * object is alrady integer encoded. 
*/ + if (obj->encoding == REDIS_ENCODING_INT) { + return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr); + } else { + redisAssert(obj->encoding == REDIS_ENCODING_RAW); + return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr)); + } +} + +/* Save a double value. Doubles are saved as strings prefixed by an unsigned + * 8 bit integer specifing the length of the representation. + * This 8 bit integer has special values in order to specify the following + * conditions: + * 253: not a number + * 254: + inf + * 255: - inf + */ +int rdbSaveDoubleValue(FILE *fp, double val) { + unsigned char buf[128]; + int len; + + if (isnan(val)) { + buf[0] = 253; + len = 1; + } else if (!isfinite(val)) { + len = 1; + buf[0] = (val < 0) ? 255 : 254; + } else { +#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL) + /* Check if the float is in a safe range to be casted into a + * long long. We are assuming that long long is 64 bit here. + * Also we are assuming that there are no implementations around where + * double has precision < 52 bit. + * + * Under this assumptions we test if a double is inside an interval + * where casting to long long is safe. Then using two castings we + * make sure the decimal part is zero. If all this is true we use + * integer printing function that is much faster. */ + double min = -4503599627370495; /* (2^52)-1 */ + double max = 4503599627370496; /* -(2^52) */ + if (val > min && val < max && val == ((double)((long long)val))) + ll2string((char*)buf+1,sizeof(buf),(long long)val); + else +#endif + snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val); + buf[0] = strlen((char*)buf+1); + len = buf[0]+1; + } + if (fwrite(buf,len,1,fp) == 0) return -1; + return 0; +} + +/* Save a Redis object. 
*/ +int rdbSaveObject(FILE *fp, robj *o) { + if (o->type == REDIS_STRING) { + /* Save a string value */ + if (rdbSaveStringObject(fp,o) == -1) return -1; + } else if (o->type == REDIS_LIST) { + /* Save a list value */ + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *p; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1; + p = ziplistIndex(o->ptr,0); + while(ziplistGet(p,&vstr,&vlen,&vlong)) { + if (vstr) { + if (rdbSaveRawString(fp,vstr,vlen) == -1) + return -1; + } else { + if (rdbSaveLongLongAsStringObject(fp,vlong) == -1) + return -1; + } + p = ziplistNext(o->ptr,p); + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + list *list = o->ptr; + listIter li; + listNode *ln; + + if (rdbSaveLen(fp,listLength(list)) == -1) return -1; + listRewind(list,&li); + while((ln = listNext(&li))) { + robj *eleobj = listNodeValue(ln); + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + } + } else { + redisPanic("Unknown list encoding"); + } + } else if (o->type == REDIS_SET) { + /* Save a set value */ + dict *set = o->ptr; + dictIterator *di = dictGetIterator(set); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize(set)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_ZSET) { + /* Save a set value */ + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + if (rdbSaveDoubleValue(fp,*score) == -1) return -1; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_HASH) { + /* Save a hash value */ + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char 
*p = zipmapRewind(o->ptr); + unsigned int count = zipmapLen(o->ptr); + unsigned char *key, *val; + unsigned int klen, vlen; + + if (rdbSaveLen(fp,count) == -1) return -1; + while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) { + if (rdbSaveRawString(fp,key,klen) == -1) return -1; + if (rdbSaveRawString(fp,val,vlen) == -1) return -1; + } + } else { + dictIterator *di = dictGetIterator(o->ptr); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *key = dictGetEntryKey(de); + robj *val = dictGetEntryVal(de); + + if (rdbSaveStringObject(fp,key) == -1) return -1; + if (rdbSaveStringObject(fp,val) == -1) return -1; + } + dictReleaseIterator(di); + } + } else { + redisPanic("Unknown object type"); + } + return 0; +} + +/* Return the length the object will have on disk if saved with + * the rdbSaveObject() function. Currently we use a trick to get + * this length with very little changes to the code. In the future + * we could switch to a faster solution. */ +off_t rdbSavedObjectLen(robj *o, FILE *fp) { + if (fp == NULL) fp = server.devnull; + rewind(fp); + redisAssert(rdbSaveObject(fp,o) != 1); + return ftello(fp); +} + +/* Return the number of pages required to save this object in the swap file */ +off_t rdbSavedObjectPages(robj *o, FILE *fp) { + off_t bytes = rdbSavedObjectLen(o,fp); + + return (bytes+(server.vm_page_size-1))/server.vm_page_size; +} + +/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */ +int rdbSave(char *filename) { + dictIterator *di = NULL; + dictEntry *de; + FILE *fp; + char tmpfile[256]; + int j; + time_t now = time(NULL); + + /* Wait for I/O therads to terminate, just in case this is a + * foreground-saving, to avoid seeking the swap file descriptor at the + * same time. 
*/ + if (server.vm_enabled) + waitEmptyIOJobsQueue(); + + snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); + fp = fopen(tmpfile,"w"); + if (!fp) { + redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno)); + return REDIS_ERR; + } + if (fwrite("REDIS0001",9,1,fp) == 0) goto werr; + for (j = 0; j < server.dbnum; j++) { + redisDb *db = server.db+j; + dict *d = db->dict; + if (dictSize(d) == 0) continue; + di = dictGetIterator(d); + if (!di) { + fclose(fp); + return REDIS_ERR; + } + + /* Write the SELECT DB opcode */ + if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr; + if (rdbSaveLen(fp,j) == -1) goto werr; + + /* Iterate this DB writing every entry */ + while((de = dictNext(di)) != NULL) { + sds keystr = dictGetEntryKey(de); + robj key, *o = dictGetEntryVal(de); + time_t expiretime; + + initStaticStringObject(key,keystr); + expiretime = getExpire(db,&key); + + /* Save the expire time */ + if (expiretime != -1) { + /* If this key is already expired skip it */ + if (expiretime < now) continue; + if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr; + if (rdbSaveTime(fp,expiretime) == -1) goto werr; + } + /* Save the key and associated value. This requires special + * handling if the value is swapped out. 
*/ + if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || + o->storage == REDIS_VM_SWAPPING) { + /* Save type, key, value */ + if (rdbSaveType(fp,o->type) == -1) goto werr; + if (rdbSaveStringObject(fp,&key) == -1) goto werr; + if (rdbSaveObject(fp,o) == -1) goto werr; + } else { + /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */ + robj *po; + /* Get a preview of the object in memory */ + po = vmPreviewObject(o); + /* Save type, key, value */ + if (rdbSaveType(fp,po->type) == -1) goto werr; + if (rdbSaveStringObject(fp,&key) == -1) goto werr; + if (rdbSaveObject(fp,po) == -1) goto werr; + /* Remove the loaded object from memory */ + decrRefCount(po); + } + } + dictReleaseIterator(di); + } + /* EOF opcode */ + if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr; + + /* Make sure data will not remain on the OS's output buffers */ + fflush(fp); + fsync(fileno(fp)); + fclose(fp); + + /* Use RENAME to make sure the DB file is changed atomically only + * if the generate DB file is ok. */ + if (rename(tmpfile,filename) == -1) { + redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno)); + unlink(tmpfile); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE,"DB saved on disk"); + server.dirty = 0; + server.lastsave = time(NULL); + return REDIS_OK; + +werr: + fclose(fp); + unlink(tmpfile); + redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno)); + if (di) dictReleaseIterator(di); + return REDIS_ERR; +} + +int rdbSaveBackground(char *filename) { + pid_t childpid; + + if (server.bgsavechildpid != -1) return REDIS_ERR; + if (server.vm_enabled) waitEmptyIOJobsQueue(); + if ((childpid = fork()) == 0) { + /* Child */ + if (server.vm_enabled) vmReopenSwapFile(); + close(server.fd); + if (rdbSave(filename) == REDIS_OK) { + _exit(0); + } else { + _exit(1); + } + } else { + /* Parent */ + if (childpid == -1) { + redisLog(REDIS_WARNING,"Can't save in background: fork: %s", + strerror(errno)); + return REDIS_ERR; + } + 
redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid); + server.bgsavechildpid = childpid; + updateDictResizePolicy(); + return REDIS_OK; + } + return REDIS_OK; /* unreached */ +} + +void rdbRemoveTempFile(pid_t childpid) { + char tmpfile[256]; + + snprintf(tmpfile,256,"temp-%d.rdb", (int) childpid); + unlink(tmpfile); +} + +int rdbLoadType(FILE *fp) { + unsigned char type; + if (fread(&type,1,1,fp) == 0) return -1; + return type; +} + +time_t rdbLoadTime(FILE *fp) { + int32_t t32; + if (fread(&t32,4,1,fp) == 0) return -1; + return (time_t) t32; +} + +/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top + * of this file for a description of how this are stored on disk. + * + * isencoded is set to 1 if the readed length is not actually a length but + * an "encoding type", check the above comments for more info */ +uint32_t rdbLoadLen(FILE *fp, int *isencoded) { + unsigned char buf[2]; + uint32_t len; + int type; + + if (isencoded) *isencoded = 0; + if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR; + type = (buf[0]&0xC0)>>6; + if (type == REDIS_RDB_6BITLEN) { + /* Read a 6 bit len */ + return buf[0]&0x3F; + } else if (type == REDIS_RDB_ENCVAL) { + /* Read a 6 bit len encoding type */ + if (isencoded) *isencoded = 1; + return buf[0]&0x3F; + } else if (type == REDIS_RDB_14BITLEN) { + /* Read a 14 bit len */ + if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR; + return ((buf[0]&0x3F)<<8)|buf[1]; + } else { + /* Read a 32 bit len */ + if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR; + return ntohl(len); + } +} + +/* Load an integer-encoded object from file 'fp', with the specified + * encoding type 'enctype'. If encode is true the function may return + * an integer-encoded object as reply, otherwise the returned object + * will always be encoded as a raw string. 
*/ +robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) { + unsigned char enc[4]; + long long val; + + if (enctype == REDIS_RDB_ENC_INT8) { + if (fread(enc,1,1,fp) == 0) return NULL; + val = (signed char)enc[0]; + } else if (enctype == REDIS_RDB_ENC_INT16) { + uint16_t v; + if (fread(enc,2,1,fp) == 0) return NULL; + v = enc[0]|(enc[1]<<8); + val = (int16_t)v; + } else if (enctype == REDIS_RDB_ENC_INT32) { + uint32_t v; + if (fread(enc,4,1,fp) == 0) return NULL; + v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); + val = (int32_t)v; + } else { + val = 0; /* anti-warning */ + redisPanic("Unknown RDB integer encoding type"); + } + if (encode) + return createStringObjectFromLongLong(val); + else + return createObject(REDIS_STRING,sdsfromlonglong(val)); +} + +robj *rdbLoadLzfStringObject(FILE*fp) { + unsigned int len, clen; + unsigned char *c = NULL; + sds val = NULL; + + if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + if ((c = zmalloc(clen)) == NULL) goto err; + if ((val = sdsnewlen(NULL,len)) == NULL) goto err; + if (fread(c,clen,1,fp) == 0) goto err; + if (lzf_decompress(c,clen,val,len) == 0) goto err; + zfree(c); + return createObject(REDIS_STRING,val); +err: + zfree(c); + sdsfree(val); + return NULL; +} + +robj *rdbGenericLoadStringObject(FILE*fp, int encode) { + int isencoded; + uint32_t len; + sds val; + + len = rdbLoadLen(fp,&isencoded); + if (isencoded) { + switch(len) { + case REDIS_RDB_ENC_INT8: + case REDIS_RDB_ENC_INT16: + case REDIS_RDB_ENC_INT32: + return rdbLoadIntegerObject(fp,len,encode); + case REDIS_RDB_ENC_LZF: + return rdbLoadLzfStringObject(fp); + default: + redisPanic("Unknown RDB encoding type"); + } + } + + if (len == REDIS_RDB_LENERR) return NULL; + val = sdsnewlen(NULL,len); + if (len && fread(val,len,1,fp) == 0) { + sdsfree(val); + return NULL; + } + return createObject(REDIS_STRING,val); +} + +robj *rdbLoadStringObject(FILE *fp) { + return 
rdbGenericLoadStringObject(fp,0); +} + +robj *rdbLoadEncodedStringObject(FILE *fp) { + return rdbGenericLoadStringObject(fp,1); +} + +/* For information about double serialization check rdbSaveDoubleValue() */ +int rdbLoadDoubleValue(FILE *fp, double *val) { + char buf[128]; + unsigned char len; + + if (fread(&len,1,1,fp) == 0) return -1; + switch(len) { + case 255: *val = R_NegInf; return 0; + case 254: *val = R_PosInf; return 0; + case 253: *val = R_Nan; return 0; + default: + if (fread(buf,len,1,fp) == 0) return -1; + buf[len] = '\0'; + sscanf(buf, "%lg", val); + return 0; + } +} + +/* Load a Redis object of the specified type from the specified file. + * On success a newly allocated object is returned, otherwise NULL. */ +robj *rdbLoadObject(int type, FILE *fp) { + robj *o, *ele, *dec; + size_t len; + + redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp)); + if (type == REDIS_STRING) { + /* Read string value */ + if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + o = tryObjectEncoding(o); + } else if (type == REDIS_LIST) { + /* Read list value */ + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + + /* Use a real list when there are too many entries */ + if (len > server.list_max_ziplist_entries) { + o = createListObject(); + } else { + o = createZiplistObject(); + } + + /* Load every single element of the list */ + while(len--) { + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + + /* If we are using a ziplist and the value is too big, convert + * the object to a real list. 
*/ + if (o->encoding == REDIS_ENCODING_ZIPLIST && + ele->encoding == REDIS_ENCODING_RAW && + sdslen(ele->ptr) > server.list_max_ziplist_value) + listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); + + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + dec = getDecodedObject(ele); + o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL); + decrRefCount(dec); + decrRefCount(ele); + } else { + ele = tryObjectEncoding(ele); + listAddNodeTail(o->ptr,ele); + } + } + } else if (type == REDIS_SET) { + /* Read list/set value */ + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createSetObject(); + /* It's faster to expand the dict to the right size asap in order + * to avoid rehashing */ + if (len > DICT_HT_INITIAL_SIZE) + dictExpand(o->ptr,len); + /* Load every single element of the list/set */ + while(len--) { + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + ele = tryObjectEncoding(ele); + dictAdd((dict*)o->ptr,ele,NULL); + } + } else if (type == REDIS_ZSET) { + /* Read list/set value */ + size_t zsetlen; + zset *zs; + + if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createZsetObject(); + zs = o->ptr; + /* Load every single element of the list/set */ + while(zsetlen--) { + robj *ele; + double *score = zmalloc(sizeof(double)); + + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + ele = tryObjectEncoding(ele); + if (rdbLoadDoubleValue(fp,score) == -1) return NULL; + dictAdd(zs->dict,ele,score); + zslInsert(zs->zsl,*score,ele); + incrRefCount(ele); /* added to skiplist */ + } + } else if (type == REDIS_HASH) { + size_t hashlen; + + if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createHashObject(); + /* Too many entries? Use an hash table. */ + if (hashlen > server.hash_max_zipmap_entries) + convertToRealHash(o); + /* Load every key/value, then set it into the zipmap or hash + * table, as needed. 
*/ + while(hashlen--) { + robj *key, *val; + + if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + /* If we are using a zipmap and there are too big values + * the object is converted to real hash table encoding. */ + if (o->encoding != REDIS_ENCODING_HT && + ((key->encoding == REDIS_ENCODING_RAW && + sdslen(key->ptr) > server.hash_max_zipmap_value) || + (val->encoding == REDIS_ENCODING_RAW && + sdslen(val->ptr) > server.hash_max_zipmap_value))) + { + convertToRealHash(o); + } + + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char *zm = o->ptr; + robj *deckey, *decval; + + /* We need raw string objects to add them to the zipmap */ + deckey = getDecodedObject(key); + decval = getDecodedObject(val); + zm = zipmapSet(zm,deckey->ptr,sdslen(deckey->ptr), + decval->ptr,sdslen(decval->ptr),NULL); + o->ptr = zm; + decrRefCount(deckey); + decrRefCount(decval); + decrRefCount(key); + decrRefCount(val); + } else { + key = tryObjectEncoding(key); + val = tryObjectEncoding(val); + dictAdd((dict*)o->ptr,key,val); + } + } + } else { + redisPanic("Unknown object type"); + } + return o; +} + +int rdbLoad(char *filename) { + FILE *fp; + uint32_t dbid; + int type, retval, rdbver; + int swap_all_values = 0; + redisDb *db = server.db+0; + char buf[1024]; + time_t expiretime, now = time(NULL); + + fp = fopen(filename,"r"); + if (!fp) return REDIS_ERR; + if (fread(buf,9,1,fp) == 0) goto eoferr; + buf[9] = '\0'; + if (memcmp(buf,"REDIS",5) != 0) { + fclose(fp); + redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file"); + return REDIS_ERR; + } + rdbver = atoi(buf+5); + if (rdbver != 1) { + fclose(fp); + redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver); + return REDIS_ERR; + } + while(1) { + robj *key, *val; + int force_swapout; + + expiretime = -1; + /* Read type. 
*/ + if ((type = rdbLoadType(fp)) == -1) goto eoferr; + if (type == REDIS_EXPIRETIME) { + if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr; + /* We read the time so we need to read the object type again */ + if ((type = rdbLoadType(fp)) == -1) goto eoferr; + } + if (type == REDIS_EOF) break; + /* Handle SELECT DB opcode as a special case */ + if (type == REDIS_SELECTDB) { + if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) + goto eoferr; + if (dbid >= (unsigned)server.dbnum) { + redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum); + exit(1); + } + db = server.db+dbid; + continue; + } + /* Read key */ + if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr; + /* Read value */ + if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr; + /* Check if the key already expired */ + if (expiretime != -1 && expiretime < now) { + decrRefCount(key); + decrRefCount(val); + continue; + } + /* Add the new object in the hash table */ + retval = dbAdd(db,key,val); + if (retval == REDIS_ERR) { + redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr); + exit(1); + } + /* Set the expire time if needed */ + if (expiretime != -1) setExpire(db,key,expiretime); + + /* Handle swapping while loading big datasets when VM is on */ + + /* If we detecter we are hopeless about fitting something in memory + * we just swap every new key on disk. Directly... + * Note that's important to check for this condition before resorting + * to random sampling, otherwise we may try to swap already + * swapped keys. 
*/ + if (swap_all_values) { + dictEntry *de = dictFind(db->dict,key->ptr); + + /* de may be NULL since the key already expired */ + if (de) { + vmpointer *vp; + val = dictGetEntryVal(de); + + if (val->refcount == 1 && + (vp = vmSwapObjectBlocking(val)) != NULL) + dictGetEntryVal(de) = vp; + } + decrRefCount(key); + continue; + } + decrRefCount(key); + + /* Flush data on disk once 32 MB of additional RAM are used... */ + force_swapout = 0; + if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) + force_swapout = 1; + + /* If we have still some hope of having some value fitting memory + * then we try random sampling. */ + if (!swap_all_values && server.vm_enabled && force_swapout) { + while (zmalloc_used_memory() > server.vm_max_memory) { + if (vmSwapOneObjectBlocking() == REDIS_ERR) break; + } + if (zmalloc_used_memory() > server.vm_max_memory) + swap_all_values = 1; /* We are already using too much mem */ + } + } + fclose(fp); + return REDIS_OK; + +eoferr: /* unexpected end of file is handled here with a fatal exit */ + redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now."); + exit(1); + return REDIS_ERR; /* Just to avoid warning */ +} + +/* A background saving child (BGSAVE) terminated its work. Handle this. 
*/ +void backgroundSaveDoneHandler(int statloc) { + int exitcode = WEXITSTATUS(statloc); + int bysignal = WIFSIGNALED(statloc); + + if (!bysignal && exitcode == 0) { + redisLog(REDIS_NOTICE, + "Background saving terminated with success"); + server.dirty = 0; + server.lastsave = time(NULL); + } else if (!bysignal && exitcode != 0) { + redisLog(REDIS_WARNING, "Background saving error"); + } else { + redisLog(REDIS_WARNING, + "Background saving terminated by signal %d", WTERMSIG(statloc)); + rdbRemoveTempFile(server.bgsavechildpid); + } + server.bgsavechildpid = -1; + /* Possibly there are slaves waiting for a BGSAVE in order to be served + * (the first stage of SYNC is a bulk transfer of dump.rdb) */ + updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR); +} diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c new file mode 100644 index 000000000..123d81180 --- /dev/null +++ b/src/redis-benchmark.c @@ -0,0 +1,665 @@ +/* Redis benchmark utility. + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ae.h" +#include "anet.h" +#include "sds.h" +#include "adlist.h" +#include "zmalloc.h" + +#define REPLY_INT 0 +#define REPLY_RETCODE 1 +#define REPLY_BULK 2 +#define REPLY_MBULK 3 + +#define CLIENT_CONNECTING 0 +#define CLIENT_SENDQUERY 1 +#define CLIENT_READREPLY 2 + +#define MAX_LATENCY 5000 + +#define REDIS_NOTUSED(V) ((void) V) + +static struct config { + int debug; + int numclients; + int requests; + int liveclients; + int donerequests; + int keysize; + int datasize; + int randomkeys; + int randomkeys_keyspacelen; + aeEventLoop *el; + char *hostip; + int hostport; + int keepalive; + long long start; + long long totlatency; + int *latency; + list *clients; + int quiet; + int loop; + int idlemode; +} config; + +typedef struct _client { + int state; + int fd; + sds obuf; + sds ibuf; + int mbulk; /* Number of elements in an mbulk reply */ + int readlen; /* readlen == -1 means read a single line */ + int totreceived; + unsigned int written; /* bytes of 'obuf' already written */ + int replytype; + long long start; /* start time in milliseconds */ +} *client; 
+ +/* Prototypes */ +static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask); +static void createMissingClients(client c); + +/* Implementation */ +static long long mstime(void) { + struct timeval tv; + long long mst; + + gettimeofday(&tv, NULL); + mst = ((long)tv.tv_sec)*1000; + mst += tv.tv_usec/1000; + return mst; +} + +static void freeClient(client c) { + listNode *ln; + + aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); + aeDeleteFileEvent(config.el,c->fd,AE_READABLE); + sdsfree(c->ibuf); + sdsfree(c->obuf); + close(c->fd); + zfree(c); + config.liveclients--; + ln = listSearchKey(config.clients,c); + assert(ln != NULL); + listDelNode(config.clients,ln); +} + +static void freeAllClients(void) { + listNode *ln = config.clients->head, *next; + + while(ln) { + next = ln->next; + freeClient(ln->value); + ln = next; + } +} + +static void resetClient(client c) { + aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); + aeDeleteFileEvent(config.el,c->fd,AE_READABLE); + aeCreateFileEvent(config.el,c->fd, AE_WRITABLE,writeHandler,c); + sdsfree(c->ibuf); + c->ibuf = sdsempty(); + c->readlen = (c->replytype == REPLY_BULK || + c->replytype == REPLY_MBULK) ? 
-1 : 0; + c->mbulk = -1; + c->written = 0; + c->totreceived = 0; + c->state = CLIENT_SENDQUERY; + c->start = mstime(); + createMissingClients(c); +} + +static void randomizeClientKey(client c) { + char *p; + char buf[32]; + long r; + + p = strstr(c->obuf, "_rand"); + if (!p) return; + p += 5; + r = random() % config.randomkeys_keyspacelen; + sprintf(buf,"%ld",r); + memcpy(p,buf,strlen(buf)); +} + +static void prepareClientForReply(client c, int type) { + if (type == REPLY_BULK) { + c->replytype = REPLY_BULK; + c->readlen = -1; + } else if (type == REPLY_MBULK) { + c->replytype = REPLY_MBULK; + c->readlen = -1; + c->mbulk = -1; + } else { + c->replytype = type; + c->readlen = 0; + } +} + +static void clientDone(client c) { + static int last_tot_received = 1; + + long long latency; + config.donerequests ++; + latency = mstime() - c->start; + if (latency > MAX_LATENCY) latency = MAX_LATENCY; + config.latency[latency]++; + + if (config.debug && last_tot_received != c->totreceived) { + printf("Tot bytes received: %d\n", c->totreceived); + last_tot_received = c->totreceived; + } + if (config.donerequests == config.requests) { + freeClient(c); + aeStop(config.el); + return; + } + if (config.keepalive) { + resetClient(c); + if (config.randomkeys) randomizeClientKey(c); + } else { + config.liveclients--; + createMissingClients(c); + config.liveclients++; + freeClient(c); + } +} + +static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) +{ + char buf[1024]; + int nread; + client c = privdata; + REDIS_NOTUSED(el); + REDIS_NOTUSED(fd); + REDIS_NOTUSED(mask); + + nread = read(c->fd, buf, 1024); + if (nread == -1) { + fprintf(stderr, "Reading from socket: %s\n", strerror(errno)); + freeClient(c); + return; + } + if (nread == 0) { + fprintf(stderr, "EOF from client\n"); + freeClient(c); + return; + } + c->totreceived += nread; + c->ibuf = sdscatlen(c->ibuf,buf,nread); + +processdata: + /* Are we waiting for the first line of the command of for sdf + * count in 
bulk or multi bulk operations? */ + if (c->replytype == REPLY_INT || + c->replytype == REPLY_RETCODE || + (c->replytype == REPLY_BULK && c->readlen == -1) || + (c->replytype == REPLY_MBULK && c->readlen == -1) || + (c->replytype == REPLY_MBULK && c->mbulk == -1)) { + char *p; + + /* Check if the first line is complete. This is only true if + * there is a newline inside the buffer. */ + if ((p = strchr(c->ibuf,'\n')) != NULL) { + if (c->replytype == REPLY_BULK || + (c->replytype == REPLY_MBULK && c->mbulk != -1)) + { + /* Read the count of a bulk reply (being it a single bulk or + * a multi bulk reply). "$" for the protocol spec. */ + *p = '\0'; + *(p-1) = '\0'; + c->readlen = atoi(c->ibuf+1)+2; + // printf("BULK ATOI: %s\n", c->ibuf+1); + /* Handle null bulk reply "$-1" */ + if (c->readlen-2 == -1) { + clientDone(c); + return; + } + /* Leave all the rest in the input buffer */ + c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1); + /* fall through to reach the point where the code will try + * to check if the bulk reply is complete. */ + } else if (c->replytype == REPLY_MBULK && c->mbulk == -1) { + /* Read the count of a multi bulk reply. That is, how many + * bulk replies we have to read next. "*" protocol. */ + *p = '\0'; + *(p-1) = '\0'; + c->mbulk = atoi(c->ibuf+1); + /* Handle null bulk reply "*-1" */ + if (c->mbulk == -1) { + clientDone(c); + return; + } + // printf("%p) %d elements list\n", c, c->mbulk); + /* Leave all the rest in the input buffer */ + c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1); + goto processdata; + } else { + c->ibuf = sdstrim(c->ibuf,"\r\n"); + clientDone(c); + return; + } + } + } + /* bulk read, did we read everything? 
*/ + if (((c->replytype == REPLY_MBULK && c->mbulk != -1) || + (c->replytype == REPLY_BULK)) && c->readlen != -1 && + (unsigned)c->readlen <= sdslen(c->ibuf)) + { + // printf("BULKSTATUS mbulk:%d readlen:%d sdslen:%d\n", + // c->mbulk,c->readlen,sdslen(c->ibuf)); + if (c->replytype == REPLY_BULK) { + clientDone(c); + } else if (c->replytype == REPLY_MBULK) { + // printf("%p) %d (%d)) ",c, c->mbulk, c->readlen); + // fwrite(c->ibuf,c->readlen,1,stdout); + // printf("\n"); + if (--c->mbulk == 0) { + clientDone(c); + } else { + c->ibuf = sdsrange(c->ibuf,c->readlen,-1); + c->readlen = -1; + goto processdata; + } + } + } +} + +static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask) +{ + client c = privdata; + REDIS_NOTUSED(el); + REDIS_NOTUSED(fd); + REDIS_NOTUSED(mask); + + if (c->state == CLIENT_CONNECTING) { + c->state = CLIENT_SENDQUERY; + c->start = mstime(); + } + if (sdslen(c->obuf) > c->written) { + void *ptr = c->obuf+c->written; + int len = sdslen(c->obuf) - c->written; + int nwritten = write(c->fd, ptr, len); + if (nwritten == -1) { + if (errno != EPIPE) + fprintf(stderr, "Writing to socket: %s\n", strerror(errno)); + freeClient(c); + return; + } + c->written += nwritten; + if (sdslen(c->obuf) == c->written) { + aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE); + aeCreateFileEvent(config.el,c->fd,AE_READABLE,readHandler,c); + c->state = CLIENT_READREPLY; + } + } +} + +static client createClient(void) { + client c = zmalloc(sizeof(struct _client)); + char err[ANET_ERR_LEN]; + + c->fd = anetTcpNonBlockConnect(err,config.hostip,config.hostport); + if (c->fd == ANET_ERR) { + zfree(c); + fprintf(stderr,"Connect: %s\n",err); + return NULL; + } + anetTcpNoDelay(NULL,c->fd); + c->obuf = sdsempty(); + c->ibuf = sdsempty(); + c->mbulk = -1; + c->readlen = 0; + c->written = 0; + c->totreceived = 0; + c->state = CLIENT_CONNECTING; + aeCreateFileEvent(config.el, c->fd, AE_WRITABLE, writeHandler, c); + config.liveclients++; + 
listAddNodeTail(config.clients,c); + return c; +} + +static void createMissingClients(client c) { + while(config.liveclients < config.numclients) { + client new = createClient(); + if (!new) continue; + sdsfree(new->obuf); + new->obuf = sdsdup(c->obuf); + if (config.randomkeys) randomizeClientKey(c); + prepareClientForReply(new,c->replytype); + } +} + +static void showLatencyReport(char *title) { + int j, seen = 0; + float perc, reqpersec; + + reqpersec = (float)config.donerequests/((float)config.totlatency/1000); + if (!config.quiet) { + printf("====== %s ======\n", title); + printf(" %d requests completed in %.2f seconds\n", config.donerequests, + (float)config.totlatency/1000); + printf(" %d parallel clients\n", config.numclients); + printf(" %d bytes payload\n", config.datasize); + printf(" keep alive: %d\n", config.keepalive); + printf("\n"); + for (j = 0; j <= MAX_LATENCY; j++) { + if (config.latency[j]) { + seen += config.latency[j]; + perc = ((float)seen*100)/config.donerequests; + printf("%.2f%% <= %d milliseconds\n", perc, j); + } + } + printf("%.2f requests per second\n\n", reqpersec); + } else { + printf("%s: %.2f requests per second\n", title, reqpersec); + } +} + +static void prepareForBenchmark(void) +{ + memset(config.latency,0,sizeof(int)*(MAX_LATENCY+1)); + config.start = mstime(); + config.donerequests = 0; +} + +static void endBenchmark(char *title) { + config.totlatency = mstime()-config.start; + showLatencyReport(title); + freeAllClients(); +} + +void parseOptions(int argc, char **argv) { + int i; + + for (i = 1; i < argc; i++) { + int lastarg = i==argc-1; + + if (!strcmp(argv[i],"-c") && !lastarg) { + config.numclients = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-n") && !lastarg) { + config.requests = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-k") && !lastarg) { + config.keepalive = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-h") && !lastarg) { + char *ip = zmalloc(32); + if (anetResolve(NULL,argv[i+1],ip) 
== ANET_ERR) { + printf("Can't resolve %s\n", argv[i]); + exit(1); + } + config.hostip = ip; + i++; + } else if (!strcmp(argv[i],"-p") && !lastarg) { + config.hostport = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-d") && !lastarg) { + config.datasize = atoi(argv[i+1]); + i++; + if (config.datasize < 1) config.datasize=1; + if (config.datasize > 1024*1024) config.datasize = 1024*1024; + } else if (!strcmp(argv[i],"-r") && !lastarg) { + config.randomkeys = 1; + config.randomkeys_keyspacelen = atoi(argv[i+1]); + if (config.randomkeys_keyspacelen < 0) + config.randomkeys_keyspacelen = 0; + i++; + } else if (!strcmp(argv[i],"-q")) { + config.quiet = 1; + } else if (!strcmp(argv[i],"-l")) { + config.loop = 1; + } else if (!strcmp(argv[i],"-D")) { + config.debug = 1; + } else if (!strcmp(argv[i],"-I")) { + config.idlemode = 1; + } else { + printf("Wrong option '%s' or option argument missing\n\n",argv[i]); + printf("Usage: redis-benchmark [-h ] [-p ] [-c ] [-n [-k ]\n\n"); + printf(" -h Server hostname (default 127.0.0.1)\n"); + printf(" -p Server port (default 6379)\n"); + printf(" -c Number of parallel connections (default 50)\n"); + printf(" -n Total number of requests (default 10000)\n"); + printf(" -d Data size of SET/GET value in bytes (default 2)\n"); + printf(" -k 1=keep alive 0=reconnect (default 1)\n"); + printf(" -r Use random keys for SET/GET/INCR, random values for SADD\n"); + printf(" Using this option the benchmark will get/set keys\n"); + printf(" in the form mykey_rand000000012456 instead of constant\n"); + printf(" keys, the argument determines the max\n"); + printf(" number of values for the random number. For instance\n"); + printf(" if set to 10 only rand000000000000 - rand000000000009\n"); + printf(" range will be allowed.\n"); + printf(" -q Quiet. Just show query/sec values\n"); + printf(" -l Loop. Run the tests forever\n"); + printf(" -I Idle mode. Just open N idle connections and wait.\n"); + printf(" -D Debug mode. 
more verbose.\n"); + exit(1); + } + } +} + +int main(int argc, char **argv) { + client c; + + signal(SIGHUP, SIG_IGN); + signal(SIGPIPE, SIG_IGN); + + config.debug = 0; + config.numclients = 50; + config.requests = 10000; + config.liveclients = 0; + config.el = aeCreateEventLoop(); + config.keepalive = 1; + config.donerequests = 0; + config.datasize = 3; + config.randomkeys = 0; + config.randomkeys_keyspacelen = 0; + config.quiet = 0; + config.loop = 0; + config.idlemode = 0; + config.latency = NULL; + config.clients = listCreate(); + config.latency = zmalloc(sizeof(int)*(MAX_LATENCY+1)); + + config.hostip = "127.0.0.1"; + config.hostport = 6379; + + parseOptions(argc,argv); + + if (config.keepalive == 0) { + printf("WARNING: keepalive disabled, you probably need 'echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse' for Linux and 'sudo sysctl -w net.inet.tcp.msl=1000' for Mac OS X in order to use a lot of clients/requests\n"); + } + + if (config.idlemode) { + printf("Creating %d idle connections and waiting forever (Ctrl+C when done)\n", config.numclients); + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdsempty(); + prepareClientForReply(c,REPLY_RETCODE); /* will never receive it */ + createMissingClients(c); + aeMain(config.el); + /* and will wait for every */ + } + + do { + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"PING\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("PING"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"*1\r\n$4\r\nPING\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("PING (multi bulk)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscatprintf(c->obuf,"SET foo_rand000000000000 %d\r\n",config.datasize); + { + char *data = zmalloc(config.datasize+2); + 
memset(data,'x',config.datasize); + data[config.datasize] = '\r'; + data[config.datasize+1] = '\n'; + c->obuf = sdscatlen(c->obuf,data,config.datasize+2); + } + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SET"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"GET foo_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("GET"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"INCR counter_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_INT); + createMissingClients(c); + aeMain(config.el); + endBenchmark("INCR"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); + prepareClientForReply(c,REPLY_INT); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPUSH"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPOP mylist\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPOP"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"SADD myset 24\r\ncounter_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SADD"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"SPOP myset\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SPOP"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPUSH (again, in order to bench LRANGE)"); + + prepareForBenchmark(); 
+ c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 99\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 100 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 299\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 300 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 449\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 450 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 599\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 600 elements)"); + + printf("\n"); + } while(config.loop); + + return 0; +} diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c new file mode 100644 index 000000000..ff0d1f82c --- /dev/null +++ b/src/redis-check-aof.c @@ -0,0 +1,185 @@ +#include "fmacros.h" +#include +#include +#include +#include +#include +#include "config.h" + +#define ERROR(...) 
{ \ + char __buf[1024]; \ + sprintf(__buf, __VA_ARGS__); \ + sprintf(error, "0x%08lx: %s", epos, __buf); \ +} + +static char error[1024]; +static long epos; + +int consumeNewline(char *buf) { + if (strncmp(buf,"\r\n",2) != 0) { + ERROR("Expected \\r\\n, got: %02x%02x",buf[0],buf[1]); + return 0; + } + return 1; +} + +int readLong(FILE *fp, char prefix, long *target) { + char buf[128], *eptr; + epos = ftell(fp); + if (fgets(buf,sizeof(buf),fp) == NULL) { + return 0; + } + if (buf[0] != prefix) { + ERROR("Expected prefix '%c', got: '%c'",buf[0],prefix); + return 0; + } + *target = strtol(buf+1,&eptr,10); + return consumeNewline(eptr); +} + +int readBytes(FILE *fp, char *target, long length) { + long real; + epos = ftell(fp); + real = fread(target,1,length,fp); + if (real != length) { + ERROR("Expected to read %ld bytes, got %ld bytes",length,real); + return 0; + } + return 1; +} + +int readString(FILE *fp, char** target) { + long len; + *target = NULL; + if (!readLong(fp,'$',&len)) { + return 0; + } + + /* Increase length to also consume \r\n */ + len += 2; + *target = (char*)malloc(len); + if (!readBytes(fp,*target,len)) { + return 0; + } + if (!consumeNewline(*target+len-2)) { + return 0; + } + (*target)[len-2] = '\0'; + return 1; +} + +int readArgc(FILE *fp, long *target) { + return readLong(fp,'*',target); +} + +long process(FILE *fp) { + long argc, pos = 0; + int i, multi = 0; + char *str; + + while(1) { + if (!multi) pos = ftell(fp); + if (!readArgc(fp, &argc)) break; + + for (i = 0; i < argc; i++) { + if (!readString(fp,&str)) break; + if (i == 0) { + if (strcasecmp(str, "multi") == 0) { + if (multi++) { + ERROR("Unexpected MULTI"); + break; + } + } else if (strcasecmp(str, "exec") == 0) { + if (--multi) { + ERROR("Unexpected EXEC"); + break; + } + } + } + free(str); + } + + /* Stop if the loop did not finish */ + if (i < argc) { + if (str) free(str); + break; + } + } + + if (feof(fp) && multi && strlen(error) == 0) { + ERROR("Reached EOF before reading EXEC 
for MULTI"); + } + if (strlen(error) > 0) { + printf("%s\n", error); + } + return pos; +} + +int main(int argc, char **argv) { + char *filename; + int fix = 0; + + if (argc < 2) { + printf("Usage: %s [--fix] \n", argv[0]); + exit(1); + } else if (argc == 2) { + filename = argv[1]; + } else if (argc == 3) { + if (strcmp(argv[1],"--fix") != 0) { + printf("Invalid argument: %s\n", argv[1]); + exit(1); + } + filename = argv[2]; + fix = 1; + } else { + printf("Invalid arguments\n"); + exit(1); + } + + FILE *fp = fopen(filename,"r+"); + if (fp == NULL) { + printf("Cannot open file: %s\n", filename); + exit(1); + } + + struct redis_stat sb; + if (redis_fstat(fileno(fp),&sb) == -1) { + printf("Cannot stat file: %s\n", filename); + exit(1); + } + + long size = sb.st_size; + if (size == 0) { + printf("Empty file: %s\n", filename); + exit(1); + } + + long pos = process(fp); + long diff = size-pos; + if (diff > 0) { + if (fix) { + char buf[2]; + printf("This will shrink the AOF from %ld bytes, with %ld bytes, to %ld bytes\n",size,diff,pos); + printf("Continue? [y/N]: "); + if (fgets(buf,sizeof(buf),stdin) == NULL || + strncasecmp(buf,"y",1) != 0) { + printf("Aborting...\n"); + exit(1); + } + if (ftruncate(fileno(fp), pos) == -1) { + printf("Failed to truncate AOF\n"); + exit(1); + } else { + printf("Successfully truncated AOF\n"); + } + } else { + printf("AOF is not valid\n"); + exit(1); + } + } else { + printf("AOF is valid\n"); + } + + fclose(fp); + return 0; +} diff --git a/src/redis-check-dump.c b/src/redis-check-dump.c new file mode 100644 index 000000000..0b002790d --- /dev/null +++ b/src/redis-check-dump.c @@ -0,0 +1,671 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lzf.h" + +/* Object types */ +#define REDIS_STRING 0 +#define REDIS_LIST 1 +#define REDIS_SET 2 +#define REDIS_ZSET 3 +#define REDIS_HASH 4 + +/* Objects encoding. 
Some kind of objects like Strings and Hashes can be + * internally represented in multiple ways. The 'encoding' field of the object + * is set to one of this fields for this object. */ +#define REDIS_ENCODING_RAW 0 /* Raw representation */ +#define REDIS_ENCODING_INT 1 /* Encoded as integer */ +#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */ +#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */ + +/* Object types only used for dumping to disk */ +#define REDIS_EXPIRETIME 253 +#define REDIS_SELECTDB 254 +#define REDIS_EOF 255 + +/* Defines related to the dump file format. To store 32 bits lengths for short + * keys requires a lot of space, so we check the most significant 2 bits of + * the first byte to interpreter the length: + * + * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte + * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte + * 10|000000 [32 bit integer] => if it's 01, a full 32 bit len will follow + * 11|000000 this means: specially encoded object will follow. The six bits + * number specify the kind of object that follows. + * See the REDIS_RDB_ENC_* defines. + * + * Lenghts up to 63 are stored using a single byte, most DB keys, and may + * values, will fit inside. */ +#define REDIS_RDB_6BITLEN 0 +#define REDIS_RDB_14BITLEN 1 +#define REDIS_RDB_32BITLEN 2 +#define REDIS_RDB_ENCVAL 3 +#define REDIS_RDB_LENERR UINT_MAX + +/* When a length of a string object stored on disk has the first two bits + * set, the remaining two bits specify a special encoding for the object + * accordingly to the following defines: */ +#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ +#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ +#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ +#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */ + +#define ERROR(...) 
{ \ + printf(__VA_ARGS__); \ + exit(1); \ +} + +/* data type to hold offset in file and size */ +typedef struct { + void *data; + unsigned long size; + unsigned long offset; +} pos; + +static unsigned char level = 0; +static pos positions[16]; + +#define CURR_OFFSET (positions[level].offset) + +/* Hold a stack of errors */ +typedef struct { + char error[16][1024]; + unsigned long offset[16]; + unsigned int level; +} errors_t; +static errors_t errors; + +#define SHIFT_ERROR(provided_offset, ...) { \ + sprintf(errors.error[errors.level], __VA_ARGS__); \ + errors.offset[errors.level] = provided_offset; \ + errors.level++; \ +} + +/* Data type to hold opcode with optional key name an success status */ +typedef struct { + char* key; + int type; + char success; +} entry; + +/* Global vars that are actally used as constants. The following double + * values are used for double on-disk serialization, and are initialized + * at runtime to avoid strange compiler optimizations. */ +static double R_Zero, R_PosInf, R_NegInf, R_Nan; + +/* store string types for output */ +static char types[256][16]; + +/* when number of bytes to read is negative, do a peek */ +int readBytes(void *target, long num) { + char peek = (num < 0) ? 1 : 0; + num = (num < 0) ? 
-num : num; + + pos p = positions[level]; + if (p.offset + num > p.size) { + return 0; + } else { + memcpy(target, (void*)((unsigned long)p.data + p.offset), num); + if (!peek) positions[level].offset += num; + } + return 1; +} + +int processHeader() { + char buf[10] = "_________"; + int dump_version; + + if (!readBytes(buf, 9)) { + ERROR("Cannot read header\n"); + } + + /* expect the first 5 bytes to equal REDIS */ + if (memcmp(buf,"REDIS",5) != 0) { + ERROR("Wrong signature in header\n"); + } + + dump_version = (int)strtol(buf + 5, NULL, 10); + if (dump_version != 1) { + ERROR("Unknown RDB format version: %d\n", dump_version); + } + return 1; +} + +int loadType(entry *e) { + uint32_t offset = CURR_OFFSET; + + /* this byte needs to qualify as type */ + unsigned char t; + if (readBytes(&t, 1)) { + if (t <= 4 || t >= 253) { + e->type = t; + return 1; + } else { + SHIFT_ERROR(offset, "Unknown type (0x%02x)", t); + } + } else { + SHIFT_ERROR(offset, "Could not read type"); + } + + /* failure */ + return 0; +} + +int peekType() { + unsigned char t; + if (readBytes(&t, -1) && (t <= 4 || t >= 253)) return t; + return -1; +} + +/* discard time, just consume the bytes */ +int processTime() { + uint32_t offset = CURR_OFFSET; + unsigned char t[4]; + if (readBytes(t, 4)) { + return 1; + } else { + SHIFT_ERROR(offset, "Could not read time"); + } + + /* failure */ + return 0; +} + +uint32_t loadLength(int *isencoded) { + unsigned char buf[2]; + uint32_t len; + int type; + + if (isencoded) *isencoded = 0; + if (!readBytes(buf, 1)) return REDIS_RDB_LENERR; + type = (buf[0] & 0xC0) >> 6; + if (type == REDIS_RDB_6BITLEN) { + /* Read a 6 bit len */ + return buf[0] & 0x3F; + } else if (type == REDIS_RDB_ENCVAL) { + /* Read a 6 bit len encoding type */ + if (isencoded) *isencoded = 1; + return buf[0] & 0x3F; + } else if (type == REDIS_RDB_14BITLEN) { + /* Read a 14 bit len */ + if (!readBytes(buf+1,1)) return REDIS_RDB_LENERR; + return ((buf[0] & 0x3F) << 8) | buf[1]; + } else { + /* 
Read a 32 bit len */ + if (!readBytes(&len, 4)) return REDIS_RDB_LENERR; + return (unsigned int)ntohl(len); + } +} + +char *loadIntegerObject(int enctype) { + uint32_t offset = CURR_OFFSET; + unsigned char enc[4]; + long long val; + + if (enctype == REDIS_RDB_ENC_INT8) { + uint8_t v; + if (!readBytes(enc, 1)) return NULL; + v = enc[0]; + val = (int8_t)v; + } else if (enctype == REDIS_RDB_ENC_INT16) { + uint16_t v; + if (!readBytes(enc, 2)) return NULL; + v = enc[0]|(enc[1]<<8); + val = (int16_t)v; + } else if (enctype == REDIS_RDB_ENC_INT32) { + uint32_t v; + if (!readBytes(enc, 4)) return NULL; + v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); + val = (int32_t)v; + } else { + SHIFT_ERROR(offset, "Unknown integer encoding (0x%02x)", enctype); + return NULL; + } + + /* convert val into string */ + char *buf; + buf = malloc(sizeof(char) * 128); + sprintf(buf, "%lld", val); + return buf; +} + +char* loadLzfStringObject() { + unsigned int slen, clen; + char *c, *s; + + if ((clen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; + if ((slen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; + + c = malloc(clen); + if (!readBytes(c, clen)) { + free(c); + return NULL; + } + + s = malloc(slen+1); + if (lzf_decompress(c,clen,s,slen) == 0) { + free(c); free(s); + return NULL; + } + + free(c); + return s; +} + +/* returns NULL when not processable, char* when valid */ +char* loadStringObject() { + uint32_t offset = CURR_OFFSET; + int isencoded; + uint32_t len; + + len = loadLength(&isencoded); + if (isencoded) { + switch(len) { + case REDIS_RDB_ENC_INT8: + case REDIS_RDB_ENC_INT16: + case REDIS_RDB_ENC_INT32: + return loadIntegerObject(len); + case REDIS_RDB_ENC_LZF: + return loadLzfStringObject(); + default: + /* unknown encoding */ + SHIFT_ERROR(offset, "Unknown string encoding (0x%02x)", len); + return NULL; + } + } + + if (len == REDIS_RDB_LENERR) return NULL; + + char *buf = malloc(sizeof(char) * (len+1)); + buf[len] = '\0'; + if (!readBytes(buf, len)) { + 
free(buf); + return NULL; + } + return buf; +} + +int processStringObject(char** store) { + unsigned long offset = CURR_OFFSET; + char *key = loadStringObject(); + if (key == NULL) { + SHIFT_ERROR(offset, "Error reading string object"); + free(key); + return 0; + } + + if (store != NULL) { + *store = key; + } else { + free(key); + } + return 1; +} + +double* loadDoubleValue() { + char buf[256]; + unsigned char len; + double* val; + + if (!readBytes(&len,1)) return NULL; + + val = malloc(sizeof(double)); + switch(len) { + case 255: *val = R_NegInf; return val; + case 254: *val = R_PosInf; return val; + case 253: *val = R_Nan; return val; + default: + if (!readBytes(buf, len)) { + free(val); + return NULL; + } + buf[len] = '\0'; + sscanf(buf, "%lg", val); + return val; + } +} + +int processDoubleValue(double** store) { + unsigned long offset = CURR_OFFSET; + double *val = loadDoubleValue(); + if (val == NULL) { + SHIFT_ERROR(offset, "Error reading double value"); + free(val); + return 0; + } + + if (store != NULL) { + *store = val; + } else { + free(val); + } + return 1; +} + +int loadPair(entry *e) { + uint32_t offset = CURR_OFFSET; + uint32_t i; + + /* read key first */ + char *key; + if (processStringObject(&key)) { + e->key = key; + } else { + SHIFT_ERROR(offset, "Error reading entry key"); + return 0; + } + + uint32_t length = 0; + if (e->type == REDIS_LIST || + e->type == REDIS_SET || + e->type == REDIS_ZSET || + e->type == REDIS_HASH) { + if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { + SHIFT_ERROR(offset, "Error reading %s length", types[e->type]); + return 0; + } + } + + switch(e->type) { + case REDIS_STRING: + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading entry value"); + return 0; + } + break; + case REDIS_LIST: + case REDIS_SET: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element at index %d (length: %d)", i, length); + return 0; + } + } 
+ break; + case REDIS_ZSET: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); + return 0; + } + offset = CURR_OFFSET; + if (!processDoubleValue(NULL)) { + SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); + return 0; + } + } + break; + case REDIS_HASH: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); + return 0; + } + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); + return 0; + } + } + break; + default: + SHIFT_ERROR(offset, "Type not implemented"); + return 0; + } + /* because we're done, we assume success */ + e->success = 1; + return 1; +} + +entry loadEntry() { + entry e = { NULL, -1, 0 }; + uint32_t length, offset[4]; + + /* reset error container */ + errors.level = 0; + + offset[0] = CURR_OFFSET; + if (!loadType(&e)) { + return e; + } + + offset[1] = CURR_OFFSET; + if (e.type == REDIS_SELECTDB) { + if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { + SHIFT_ERROR(offset[1], "Error reading database number"); + return e; + } + if (length > 63) { + SHIFT_ERROR(offset[1], "Database number out of range (%d)", length); + return e; + } + } else if (e.type == REDIS_EOF) { + if (positions[level].offset < positions[level].size) { + SHIFT_ERROR(offset[0], "Unexpected EOF"); + } else { + e.success = 1; + } + return e; + } else { + /* optionally consume expire */ + if (e.type == REDIS_EXPIRETIME) { + if (!processTime()) return e; + if (!loadType(&e)) return e; + } + + offset[1] = CURR_OFFSET; + if (!loadPair(&e)) { + SHIFT_ERROR(offset[1], "Error for type %s", types[e.type]); + return e; + } + } + + /* all entries are followed by a valid type: + * e.g. 
a new entry, SELECTDB, EXPIRE, EOF */ + offset[2] = CURR_OFFSET; + if (peekType() == -1) { + SHIFT_ERROR(offset[2], "Followed by invalid type"); + SHIFT_ERROR(offset[0], "Error for type %s", types[e.type]); + e.success = 0; + } else { + e.success = 1; + } + + return e; +} + +void printCentered(int indent, int width, char* body) { + char head[256], tail[256]; + memset(head, '\0', 256); + memset(tail, '\0', 256); + + memset(head, '=', indent); + memset(tail, '=', width - 2 - indent - strlen(body)); + printf("%s %s %s\n", head, body, tail); +} + +void printValid(int ops, int bytes) { + char body[80]; + sprintf(body, "Processed %d valid opcodes (in %d bytes)", ops, bytes); + printCentered(4, 80, body); +} + +void printSkipped(int bytes, int offset) { + char body[80]; + sprintf(body, "Skipped %d bytes (resuming at 0x%08x)", bytes, offset); + printCentered(4, 80, body); +} + +void printErrorStack(entry *e) { + unsigned int i; + char body[64]; + + if (e->type == -1) { + sprintf(body, "Error trace"); + } else if (e->type >= 253) { + sprintf(body, "Error trace (%s)", types[e->type]); + } else if (!e->key) { + sprintf(body, "Error trace (%s: (unknown))", types[e->type]); + } else { + char tmp[41]; + strncpy(tmp, e->key, 40); + + /* display truncation at the last 3 chars */ + if (strlen(e->key) > 40) { + memset(&tmp[37], '.', 3); + } + + /* display unprintable characters as ? 
*/ + for (i = 0; i < strlen(tmp); i++) { + if (tmp[i] <= 32) tmp[i] = '?'; + } + sprintf(body, "Error trace (%s: %s)", types[e->type], tmp); + } + + printCentered(4, 80, body); + + /* display error stack */ + for (i = 0; i < errors.level; i++) { + printf("0x%08lx - %s\n", errors.offset[i], errors.error[i]); + } +} + +void process() { + int i, num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0; + entry entry; + processHeader(); + + level = 1; + while(positions[0].offset < positions[0].size) { + positions[1] = positions[0]; + + entry = loadEntry(); + if (!entry.success) { + printValid(num_valid_ops, num_valid_bytes); + printErrorStack(&entry); + num_errors++; + num_valid_ops = 0; + num_valid_bytes = 0; + + /* search for next valid entry */ + unsigned long offset = positions[0].offset + 1; + while (!entry.success && offset < positions[0].size) { + positions[1].offset = offset; + + /* find 3 consecutive valid entries */ + for (i = 0; i < 3; i++) { + entry = loadEntry(); + if (!entry.success) break; + } + /* check if we found 3 consecutive valid entries */ + if (i < 3) { + offset++; + } + } + + /* print how many bytes we have skipped to find a new valid opcode */ + if (offset < positions[0].size) { + printSkipped(offset - positions[0].offset, offset); + } + + positions[0].offset = offset; + } else { + num_valid_ops++; + num_valid_bytes += positions[1].offset - positions[0].offset; + + /* advance position */ + positions[0] = positions[1]; + } + } + + /* because there is another potential error, + * print how many valid ops we have processed */ + printValid(num_valid_ops, num_valid_bytes); + + /* expect an eof */ + if (entry.type != REDIS_EOF) { + /* last byte should be EOF, add error */ + errors.level = 0; + SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s", types[entry.type]); + + /* this is an EOF error so reset type */ + entry.type = -1; + printErrorStack(&entry); + + num_errors++; + } + + /* print summary on errors */ + if (num_errors > 0) { + printf("\n"); 
+ printf("Total unprocessable opcodes: %d\n", num_errors); + } +} + +int main(int argc, char **argv) { + /* expect the first argument to be the dump file */ + if (argc <= 1) { + printf("Usage: %s \n", argv[0]); + exit(0); + } + + int fd; + unsigned long size; + struct stat stat; + void *data; + + fd = open(argv[1], O_RDONLY); + if (fd < 1) { + ERROR("Cannot open file: %s\n", argv[1]); + } + if (fstat(fd, &stat) == -1) { + ERROR("Cannot stat: %s\n", argv[1]); + } else { + size = stat.st_size; + } + + data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (data == MAP_FAILED) { + ERROR("Cannot mmap: %s\n", argv[1]); + } + + /* Initialize static vars */ + positions[0].data = data; + positions[0].size = size; + positions[0].offset = 0; + errors.level = 0; + + /* Object types */ + sprintf(types[REDIS_STRING], "STRING"); + sprintf(types[REDIS_LIST], "LIST"); + sprintf(types[REDIS_SET], "SET"); + sprintf(types[REDIS_ZSET], "ZSET"); + sprintf(types[REDIS_HASH], "HASH"); + + /* Object types only used for dumping to disk */ + sprintf(types[REDIS_EXPIRETIME], "EXPIRETIME"); + sprintf(types[REDIS_SELECTDB], "SELECTDB"); + sprintf(types[REDIS_EOF], "EOF"); + + /* Double constants initialization */ + R_Zero = 0.0; + R_PosInf = 1.0/R_Zero; + R_NegInf = -1.0/R_Zero; + R_Nan = R_Zero/R_Zero; + + process(); + + munmap(data, size); + close(fd); + return 0; +} diff --git a/src/redis-cli.c b/src/redis-cli.c new file mode 100644 index 000000000..2daa7c461 --- /dev/null +++ b/src/redis-cli.c @@ -0,0 +1,493 @@ +/* Redis CLI (command line interface) + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include + +#include "anet.h" +#include "sds.h" +#include "adlist.h" +#include "zmalloc.h" +#include "linenoise.h" + +#define REDIS_CMD_INLINE 1 +#define REDIS_CMD_BULK 2 +#define REDIS_CMD_MULTIBULK 4 + +#define REDIS_NOTUSED(V) ((void) V) + +static struct config { + char *hostip; + int hostport; + long repeat; + int dbnum; + int argn_from_stdin; + int interactive; + int shutdown; + int monitor_mode; + int pubsub_mode; + int raw_output; + char *auth; +} config; + +static int cliReadReply(int fd); +static void usage(); + +static int cliConnect(void) { + char err[ANET_ERR_LEN]; + static int fd = ANET_ERR; + + if (fd == ANET_ERR) { + fd = anetTcpConnect(err,config.hostip,config.hostport); + if (fd == ANET_ERR) { + fprintf(stderr, "Could not connect to Redis at %s:%d: %s", config.hostip, config.hostport, err); + return -1; + } + anetTcpNoDelay(NULL,fd); + } + return fd; +} + +static sds cliReadLine(int fd) { + sds line = sdsempty(); + + while(1) { + char c; + ssize_t ret; + + ret = read(fd,&c,1); + if (ret == -1) { + sdsfree(line); + return NULL; + } else if ((ret == 0) || (c == '\n')) { + break; + } else { + line = sdscatlen(line,&c,1); + } + } + return sdstrim(line,"\r\n"); +} + +static int cliReadSingleLineReply(int fd, int quiet) { + sds reply = cliReadLine(fd); + + if (reply == NULL) return 1; + if (!quiet) + printf("%s\n", reply); + sdsfree(reply); + return 0; +} + +static void printStringRepr(char *s, int len) { + printf("\""); + while(len--) { + switch(*s) { + case '\\': + case '"': + printf("\\%c",*s); + break; + case '\n': printf("\\n"); break; + case '\r': printf("\\r"); break; + case '\t': printf("\\t"); break; + case '\a': printf("\\a"); break; + case '\b': printf("\\b"); break; + default: + if (isprint(*s)) + printf("%c",*s); + else + printf("\\x%02x",(unsigned char)*s); + break; + } + s++; + } + printf("\"\n"); +} + +static int cliReadBulkReply(int fd) { + sds replylen = 
cliReadLine(fd); + char *reply, crlf[2]; + int bulklen; + + if (replylen == NULL) return 1; + bulklen = atoi(replylen); + if (bulklen == -1) { + sdsfree(replylen); + printf("(nil)\n"); + return 0; + } + reply = zmalloc(bulklen); + anetRead(fd,reply,bulklen); + anetRead(fd,crlf,2); + if (config.raw_output || !isatty(fileno(stdout))) { + if (bulklen && fwrite(reply,bulklen,1,stdout) == 0) { + zfree(reply); + return 1; + } + } else { + /* If you are producing output for the standard output we want + * a more interesting output with quoted characters and so forth */ + printStringRepr(reply,bulklen); + } + zfree(reply); + return 0; +} + +static int cliReadMultiBulkReply(int fd) { + sds replylen = cliReadLine(fd); + int elements, c = 1; + + if (replylen == NULL) return 1; + elements = atoi(replylen); + if (elements == -1) { + sdsfree(replylen); + printf("(nil)\n"); + return 0; + } + if (elements == 0) { + printf("(empty list or set)\n"); + } + while(elements--) { + printf("%d. ", c); + if (cliReadReply(fd)) return 1; + c++; + } + return 0; +} + +static int cliReadReply(int fd) { + char type; + + if (anetRead(fd,&type,1) <= 0) { + if (config.shutdown) return 0; + exit(1); + } + switch(type) { + case '-': + printf("(error) "); + cliReadSingleLineReply(fd,0); + return 1; + case '+': + return cliReadSingleLineReply(fd,0); + case ':': + printf("(integer) "); + return cliReadSingleLineReply(fd,0); + case '$': + return cliReadBulkReply(fd); + case '*': + return cliReadMultiBulkReply(fd); + default: + printf("protocol error, got '%c' as reply type byte\n", type); + return 1; + } +} + +static int selectDb(int fd) { + int retval; + sds cmd; + char type; + + if (config.dbnum == 0) + return 0; + + cmd = sdsempty(); + cmd = sdscatprintf(cmd,"SELECT %d\r\n",config.dbnum); + anetWrite(fd,cmd,sdslen(cmd)); + anetRead(fd,&type,1); + if (type <= 0 || type != '+') return 1; + retval = cliReadSingleLineReply(fd,1); + if (retval) { + return retval; + } + return 0; +} + +static int 
cliSendCommand(int argc, char **argv, int repeat) { + char *command = argv[0]; + int fd, j, retval = 0; + sds cmd; + + config.raw_output = !strcasecmp(command,"info"); + if (!strcasecmp(command,"shutdown")) config.shutdown = 1; + if (!strcasecmp(command,"monitor")) config.monitor_mode = 1; + if (!strcasecmp(command,"subscribe") || + !strcasecmp(command,"psubscribe")) config.pubsub_mode = 1; + if ((fd = cliConnect()) == -1) return 1; + + /* Select db number */ + retval = selectDb(fd); + if (retval) { + fprintf(stderr,"Error setting DB num\n"); + return 1; + } + + /* Build the command to send */ + cmd = sdscatprintf(sdsempty(),"*%d\r\n",argc); + for (j = 0; j < argc; j++) { + cmd = sdscatprintf(cmd,"$%lu\r\n", + (unsigned long)sdslen(argv[j])); + cmd = sdscatlen(cmd,argv[j],sdslen(argv[j])); + cmd = sdscatlen(cmd,"\r\n",2); + } + + while(repeat--) { + anetWrite(fd,cmd,sdslen(cmd)); + while (config.monitor_mode) { + cliReadSingleLineReply(fd,0); + } + + if (config.pubsub_mode) { + printf("Reading messages... 
(press Ctrl-c to quit)\n"); + while (1) { + cliReadReply(fd); + printf("\n"); + } + } + + retval = cliReadReply(fd); + if (retval) { + return retval; + } + } + return 0; +} + +static int parseOptions(int argc, char **argv) { + int i; + + for (i = 1; i < argc; i++) { + int lastarg = i==argc-1; + + if (!strcmp(argv[i],"-h") && !lastarg) { + char *ip = zmalloc(32); + if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) { + printf("Can't resolve %s\n", argv[i]); + exit(1); + } + config.hostip = ip; + i++; + } else if (!strcmp(argv[i],"-h") && lastarg) { + usage(); + } else if (!strcmp(argv[i],"-p") && !lastarg) { + config.hostport = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-r") && !lastarg) { + config.repeat = strtoll(argv[i+1],NULL,10); + i++; + } else if (!strcmp(argv[i],"-n") && !lastarg) { + config.dbnum = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-a") && !lastarg) { + config.auth = argv[i+1]; + i++; + } else if (!strcmp(argv[i],"-i")) { + config.interactive = 1; + } else if (!strcmp(argv[i],"-c")) { + config.argn_from_stdin = 1; + } else { + break; + } + } + return i; +} + +static sds readArgFromStdin(void) { + char buf[1024]; + sds arg = sdsempty(); + + while(1) { + int nread = read(fileno(stdin),buf,1024); + + if (nread == 0) break; + else if (nread == -1) { + perror("Reading from standard input"); + exit(1); + } + arg = sdscatlen(arg,buf,nread); + } + return arg; +} + +static void usage() { + fprintf(stderr, "usage: redis-cli [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] [-i] cmd arg1 arg2 arg3 ... argN\n"); + fprintf(stderr, "usage: echo \"argN\" | redis-cli -c [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] cmd arg1 arg2 ... 
arg(N-1)\n"); + fprintf(stderr, "\nIf a pipe from standard input is detected this data is used as last argument.\n\n"); + fprintf(stderr, "example: cat /etc/passwd | redis-cli set my_passwd\n"); + fprintf(stderr, "example: redis-cli get my_passwd\n"); + fprintf(stderr, "example: redis-cli -r 100 lpush mylist x\n"); + fprintf(stderr, "\nRun in interactive mode: redis-cli -i or just don't pass any command\n"); + exit(1); +} + +/* Turn the plain C strings into Sds strings */ +static char **convertToSds(int count, char** args) { + int j; + char **sds = zmalloc(sizeof(char*)*count); + + for(j = 0; j < count; j++) + sds[j] = sdsnew(args[j]); + + return sds; +} + +static char **splitArguments(char *line, int *argc) { + char *p = line; + char *current = NULL; + char **vector = NULL; + + *argc = 0; + while(1) { + /* skip blanks */ + while(*p && isspace(*p)) p++; + if (*p) { + /* get a token */ + int inq=0; /* set to 1 if we are in "quotes" */ + int done = 0; + + if (current == NULL) current = sdsempty(); + while(!done) { + if (inq) { + if (*p == '\\' && *(p+1)) { + char c; + + p++; + switch(*p) { + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'b': c = '\b'; break; + case 'a': c = '\a'; break; + default: c = *p; break; + } + current = sdscatlen(current,&c,1); + } else if (*p == '"') { + done = 1; + } else { + current = sdscatlen(current,p,1); + } + } else { + switch(*p) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\0': + done=1; + break; + case '"': + inq=1; + break; + default: + current = sdscatlen(current,p,1); + break; + } + } + if (*p) p++; + } + /* add the token to the vector */ + vector = zrealloc(vector,((*argc)+1)*sizeof(char*)); + vector[*argc] = current; + (*argc)++; + current = NULL; + } else { + return vector; + } + } +} + +#define LINE_BUFLEN 4096 +static void repl() { + int argc, j; + char *line, **argv; + + while((line = linenoise("redis> ")) != NULL) { + if (line[0] != '\0') { + argv = 
splitArguments(line,&argc); + linenoiseHistoryAdd(line); + if (argc > 0) { + if (strcasecmp(argv[0],"quit") == 0 || + strcasecmp(argv[0],"exit") == 0) + exit(0); + else + cliSendCommand(argc, argv, 1); + } + /* Free the argument vector */ + for (j = 0; j < argc; j++) + sdsfree(argv[j]); + zfree(argv); + } + /* linenoise() returns malloc-ed lines like readline() */ + free(line); + } + exit(0); +} + +int main(int argc, char **argv) { + int firstarg; + char **argvcopy; + + config.hostip = "127.0.0.1"; + config.hostport = 6379; + config.repeat = 1; + config.dbnum = 0; + config.argn_from_stdin = 0; + config.shutdown = 0; + config.interactive = 0; + config.monitor_mode = 0; + config.pubsub_mode = 0; + config.raw_output = 0; + config.auth = NULL; + + firstarg = parseOptions(argc,argv); + argc -= firstarg; + argv += firstarg; + + if (config.auth != NULL) { + char *authargv[2]; + + authargv[0] = "AUTH"; + authargv[1] = config.auth; + cliSendCommand(2, convertToSds(2, authargv), 1); + } + + if (argc == 0 || config.interactive == 1) repl(); + + argvcopy = convertToSds(argc+1, argv); + if (config.argn_from_stdin) { + sds lastarg = readArgFromStdin(); + argvcopy[argc] = lastarg; + argc++; + } + return cliSendCommand(argc, argvcopy, config.repeat); +} diff --git a/src/redis.c b/src/redis.c new file mode 100644 index 000000000..5f539216f --- /dev/null +++ b/src/redis.c @@ -0,0 +1,1516 @@ +/* + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "redis.h" + +#ifdef HAVE_BACKTRACE +#include +#include +#endif /* HAVE_BACKTRACE */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Our shared "common" objects */ + +struct sharedObjectsStruct shared; + +/* Global vars that are actally used as constants. The following double + * values are used for double on-disk serialization, and are initialized + * at runtime to avoid strange compiler optimizations. 
*/ + +double R_Zero, R_PosInf, R_NegInf, R_Nan; + +/*================================= Globals ================================= */ + +/* Global vars */ +struct redisServer server; /* server global state */ +struct redisCommand *commandTable; +struct redisCommand readonlyCommandTable[] = { + {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1}, + {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1}, + 
{"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1}, + {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1}, + {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, + {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, + {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + 
{"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, + {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, + {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0}, + 
{"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0}, + {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0}, + {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0}, + {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0} +}; + +/*============================ Utility functions ============================ */ + +void redisLog(int level, const char *fmt, ...) { + va_list ap; + FILE *fp; + + fp = (server.logfile == NULL) ? 
stdout : fopen(server.logfile,"a"); + if (!fp) return; + + va_start(ap, fmt); + if (level >= server.verbosity) { + char *c = ".-*#"; + char buf[64]; + time_t now; + + now = time(NULL); + strftime(buf,64,"%d %b %H:%M:%S",localtime(&now)); + fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]); + vfprintf(fp, fmt, ap); + fprintf(fp,"\n"); + fflush(fp); + } + va_end(ap); + + if (server.logfile) fclose(fp); +} + +/* Redis generally does not try to recover from out of memory conditions + * when allocating objects or strings, it is not clear if it will be possible + * to report this condition to the client since the networking layer itself + * is based on heap allocation for send buffers, so we simply abort. + * At least the code will be simpler to read... */ +void oom(const char *msg) { + redisLog(REDIS_WARNING, "%s: Out of memory\n",msg); + sleep(1); + abort(); +} + +/*====================== Hash table type implementation ==================== */ + +/* This is an hash table type that uses the SDS dynamic strings libary as + * keys and radis objects as values (objects can hold SDS strings, + * lists, sets). 
*/ + +void dictVanillaFree(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + zfree(val); +} + +void dictListDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + listRelease((list*)val); +} + +int dictSdsKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + int l1,l2; + DICT_NOTUSED(privdata); + + l1 = sdslen((sds)key1); + l2 = sdslen((sds)key2); + if (l1 != l2) return 0; + return memcmp(key1, key2, l1) == 0; +} + +void dictRedisObjectDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + if (val == NULL) return; /* Values of swapped out keys as set to NULL */ + decrRefCount(val); +} + +void dictSdsDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + sdsfree(val); +} + +int dictObjKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + const robj *o1 = key1, *o2 = key2; + return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); +} + +unsigned int dictObjHash(const void *key) { + const robj *o = key; + return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); +} + +unsigned int dictSdsHash(const void *key) { + return dictGenHashFunction((unsigned char*)key, sdslen((char*)key)); +} + +int dictEncObjKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + robj *o1 = (robj*) key1, *o2 = (robj*) key2; + int cmp; + + if (o1->encoding == REDIS_ENCODING_INT && + o2->encoding == REDIS_ENCODING_INT) + return o1->ptr == o2->ptr; + + o1 = getDecodedObject(o1); + o2 = getDecodedObject(o2); + cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); + decrRefCount(o1); + decrRefCount(o2); + return cmp; +} + +unsigned int dictEncObjHash(const void *key) { + robj *o = (robj*) key; + + if (o->encoding == REDIS_ENCODING_RAW) { + return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); + } else { + if (o->encoding == REDIS_ENCODING_INT) { + char buf[32]; + int len; + + len = ll2string(buf,32,(long)o->ptr); + return dictGenHashFunction((unsigned char*)buf, len); + } else { + unsigned int hash; + + o 
= getDecodedObject(o); + hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); + decrRefCount(o); + return hash; + } + } +} + +/* Sets type */ +dictType setDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ +dictType zsetDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictVanillaFree /* val destructor of malloc(sizeof(double)) */ +}; + +/* Db->dict, keys are sds strings, vals are Redis objects. */ +dictType dbDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + dictRedisObjectDestructor /* val destructor */ +}; + +/* Db->expires */ +dictType keyptrDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL /* val destructor */ +}; + +/* Hash type hash table (note that small hashes are represented with zimpaps) */ +dictType hashDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictRedisObjectDestructor /* val destructor */ +}; + +/* Keylist hash table type has unencoded redis objects as keys and + * lists as values. It's used for blocking operations (BLPOP) and to + * map swapped keys to a list of clients waiting for this keys to be loaded. 
*/ +dictType keylistDictType = { + dictObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictListDestructor /* val destructor */ +}; + +int htNeedsResize(dict *dict) { + long long size, used; + + size = dictSlots(dict); + used = dictSize(dict); + return (size && used && size > DICT_HT_INITIAL_SIZE && + (used*100/size < REDIS_HT_MINFILL)); +} + +/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL + * we resize the hash table to save memory */ +void tryResizeHashTables(void) { + int j; + + for (j = 0; j < server.dbnum; j++) { + if (htNeedsResize(server.db[j].dict)) + dictResize(server.db[j].dict); + if (htNeedsResize(server.db[j].expires)) + dictResize(server.db[j].expires); + } +} + +/* Our hash table implementation performs rehashing incrementally while + * we write/read from the hash table. Still if the server is idle, the hash + * table will use two tables for a long time. So we try to use 1 millisecond + * of CPU time at every serverCron() loop in order to rehash some key. */ +void incrementallyRehash(void) { + int j; + + for (j = 0; j < server.dbnum; j++) { + if (dictIsRehashing(server.db[j].dict)) { + dictRehashMilliseconds(server.db[j].dict,1); + break; /* already used our millisecond for this loop... */ + } + } +} + +/* This function is called once a background process of some kind terminates, + * as we want to avoid resizing the hash tables when there is a child in order + * to play well with copy-on-write (otherwise when a resize happens lots of + * memory pages are copied). The goal of this function is to update the ability + * for dict.c to resize the hash tables accordingly to the fact we have o not + * running childs. 
*/ +void updateDictResizePolicy(void) { + if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) + dictEnableResize(); + else + dictDisableResize(); +} + +/* ======================= Cron: called every 100 ms ======================== */ + +int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { + int j, loops = server.cronloops++; + REDIS_NOTUSED(eventLoop); + REDIS_NOTUSED(id); + REDIS_NOTUSED(clientData); + + /* We take a cached value of the unix time in the global state because + * with virtual memory and aging there is to store the current time + * in objects at every object access, and accuracy is not needed. + * To access a global var is faster than calling time(NULL) */ + server.unixtime = time(NULL); + /* We have just 21 bits per object for LRU information. + * So we use an (eventually wrapping) LRU clock with minutes resolution. + * + * When we need to select what object to swap, we compute the minimum + * time distance between the current lruclock and the object last access + * lruclock info. Even if clocks will wrap on overflow, there is + * the interesting property that we are sure that at least + * ABS(A-B) minutes passed between current time and timestamp B. + * + * This is not precise but we don't need at all precision, but just + * something statistically reasonable. + */ + server.lruclock = (time(NULL)/60)&((1<<21)-1); + + /* We received a SIGTERM, shutting down here in a safe way, as it is + * not ok doing so inside the signal handler. 
*/ + if (server.shutdown_asap) { + if (prepareForShutdown() == REDIS_OK) exit(0); + redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information"); + } + + /* Show some info about non-empty databases */ + for (j = 0; j < server.dbnum; j++) { + long long size, used, vkeys; + + size = dictSlots(server.db[j].dict); + used = dictSize(server.db[j].dict); + vkeys = dictSize(server.db[j].expires); + if (!(loops % 50) && (used || vkeys)) { + redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size); + /* dictPrintStats(server.dict); */ + } + } + + /* We don't want to resize the hash tables while a bacground saving + * is in progress: the saving child is created using fork() that is + * implemented with a copy-on-write semantic in most modern systems, so + * if we resize the HT while there is the saving child at work actually + * a lot of memory movements in the parent will cause a lot of pages + * copied. 
*/ + if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) { + if (!(loops % 10)) tryResizeHashTables(); + if (server.activerehashing) incrementallyRehash(); + } + + /* Show information about connected clients */ + if (!(loops % 50)) { + redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use", + listLength(server.clients)-listLength(server.slaves), + listLength(server.slaves), + zmalloc_used_memory()); + } + + /* Close connections of timedout clients */ + if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients) + closeTimedoutClients(); + + /* Check if a background saving or AOF rewrite in progress terminated */ + if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) { + int statloc; + pid_t pid; + + if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { + if (pid == server.bgsavechildpid) { + backgroundSaveDoneHandler(statloc); + } else { + backgroundRewriteDoneHandler(statloc); + } + updateDictResizePolicy(); + } + } else { + /* If there is not a background saving in progress check if + * we have to save now */ + time_t now = time(NULL); + for (j = 0; j < server.saveparamslen; j++) { + struct saveparam *sp = server.saveparams+j; + + if (server.dirty >= sp->changes && + now-server.lastsave > sp->seconds) { + redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...", + sp->changes, sp->seconds); + rdbSaveBackground(server.dbfilename); + break; + } + } + } + + /* Try to expire a few timed out keys. The algorithm used is adaptive and + * will use few CPU cycles if there are few expiring keys, otherwise + * it will get more aggressive to avoid that too much memory is used by + * keys that can be removed from the keyspace. */ + for (j = 0; j < server.dbnum; j++) { + int expired; + redisDb *db = server.db+j; + + /* Continue to expire if at the end of the cycle more than 25% + * of the keys were expired. 
*/ + do { + long num = dictSize(db->expires); + time_t now = time(NULL); + + expired = 0; + if (num > REDIS_EXPIRELOOKUPS_PER_CRON) + num = REDIS_EXPIRELOOKUPS_PER_CRON; + while (num--) { + dictEntry *de; + time_t t; + + if ((de = dictGetRandomKey(db->expires)) == NULL) break; + t = (time_t) dictGetEntryVal(de); + if (now > t) { + sds key = dictGetEntryKey(de); + robj *keyobj = createStringObject(key,sdslen(key)); + + dbDelete(db,keyobj); + decrRefCount(keyobj); + expired++; + server.stat_expiredkeys++; + } + } + } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); + } + + /* Swap a few keys on disk if we are over the memory limit and VM + * is enbled. Try to free objects from the free list first. */ + if (vmCanSwapOut()) { + while (server.vm_enabled && zmalloc_used_memory() > + server.vm_max_memory) + { + int retval; + + if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; + retval = (server.vm_max_threads == 0) ? + vmSwapOneObjectBlocking() : + vmSwapOneObjectThreaded(); + if (retval == REDIS_ERR && !(loops % 300) && + zmalloc_used_memory() > + (server.vm_max_memory+server.vm_max_memory/10)) + { + redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!"); + } + /* Note that when using threade I/O we free just one object, + * because anyway when the I/O thread in charge to swap this + * object out will finish, the handler of completed jobs + * will try to swap more objects if we are still out of memory. 
*/ + if (retval == REDIS_ERR || server.vm_max_threads > 0) break; + } + } + + /* Check if we should connect to a MASTER */ + if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) { + redisLog(REDIS_NOTICE,"Connecting to MASTER..."); + if (syncWithMaster() == REDIS_OK) { + redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded"); + if (server.appendonly) rewriteAppendOnlyFileBackground(); + } + } + return 100; +} + +/* This function gets called every time Redis is entering the + * main loop of the event driven library, that is, before to sleep + * for ready file descriptors. */ +void beforeSleep(struct aeEventLoop *eventLoop) { + REDIS_NOTUSED(eventLoop); + + /* Awake clients that got all the swapped keys they requested */ + if (server.vm_enabled && listLength(server.io_ready_clients)) { + listIter li; + listNode *ln; + + listRewind(server.io_ready_clients,&li); + while((ln = listNext(&li))) { + redisClient *c = ln->value; + struct redisCommand *cmd; + + /* Resume the client. */ + listDelNode(server.io_ready_clients,ln); + c->flags &= (~REDIS_IO_WAIT); + server.vm_blocked_clients--; + aeCreateFileEvent(server.el, c->fd, AE_READABLE, + readQueryFromClient, c); + cmd = lookupCommand(c->argv[0]->ptr); + redisAssert(cmd != NULL); + call(c,cmd); + resetClient(c); + /* There may be more data to process in the input buffer. 
*/ + if (c->querybuf && sdslen(c->querybuf) > 0) + processInputBuffer(c); + } + } + /* Write the AOF buffer on disk */ + flushAppendOnlyFile(); +} + +/* =========================== Server initialization ======================== */ + +void createSharedObjects(void) { + int j; + + shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n")); + shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n")); + shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n")); + shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n")); + shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n")); + shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n")); + shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n")); + shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n")); + shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n")); + shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n")); + shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n")); + shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n")); + shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew( + "-ERR Operation against a key holding the wrong kind of value\r\n")); + shared.nokeyerr = createObject(REDIS_STRING,sdsnew( + "-ERR no such key\r\n")); + shared.syntaxerr = createObject(REDIS_STRING,sdsnew( + "-ERR syntax error\r\n")); + shared.sameobjecterr = createObject(REDIS_STRING,sdsnew( + "-ERR source and destination objects are the same\r\n")); + shared.outofrangeerr = createObject(REDIS_STRING,sdsnew( + "-ERR index out of range\r\n")); + shared.space = createObject(REDIS_STRING,sdsnew(" ")); + shared.colon = createObject(REDIS_STRING,sdsnew(":")); + shared.plus = createObject(REDIS_STRING,sdsnew("+")); + shared.select0 = createStringObject("select 0\r\n",10); + shared.select1 = createStringObject("select 1\r\n",10); + shared.select2 = createStringObject("select 2\r\n",10); + shared.select3 = createStringObject("select 3\r\n",10); + shared.select4 = 
createStringObject("select 4\r\n",10); + shared.select5 = createStringObject("select 5\r\n",10); + shared.select6 = createStringObject("select 6\r\n",10); + shared.select7 = createStringObject("select 7\r\n",10); + shared.select8 = createStringObject("select 8\r\n",10); + shared.select9 = createStringObject("select 9\r\n",10); + shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13); + shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14); + shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15); + shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18); + shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17); + shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19); + shared.mbulk3 = createStringObject("*3\r\n",4); + shared.mbulk4 = createStringObject("*4\r\n",4); + for (j = 0; j < REDIS_SHARED_INTEGERS; j++) { + shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j); + shared.integers[j]->encoding = REDIS_ENCODING_INT; + } +} + +void initServerConfig() { + server.dbnum = REDIS_DEFAULT_DBNUM; + server.port = REDIS_SERVERPORT; + server.verbosity = REDIS_VERBOSE; + server.maxidletime = REDIS_MAXIDLETIME; + server.saveparams = NULL; + server.logfile = NULL; /* NULL = log on standard output */ + server.bindaddr = NULL; + server.glueoutputbuf = 1; + server.daemonize = 0; + server.appendonly = 0; + server.appendfsync = APPENDFSYNC_EVERYSEC; + server.no_appendfsync_on_rewrite = 0; + server.lastfsync = time(NULL); + server.appendfd = -1; + server.appendseldb = -1; /* Make sure the first time will not match */ + server.pidfile = zstrdup("/var/run/redis.pid"); + server.dbfilename = zstrdup("dump.rdb"); + server.appendfilename = zstrdup("appendonly.aof"); + server.requirepass = NULL; + server.rdbcompression = 1; + server.activerehashing = 1; + server.maxclients = 0; + server.blpop_blocked_clients = 0; + server.maxmemory = 0; + server.vm_enabled = 0; + server.vm_swap_file = 
zstrdup("/tmp/redis-%p.vm"); + server.vm_page_size = 256; /* 256 bytes per page */ + server.vm_pages = 1024*1024*100; /* 104 millions of pages */ + server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */ + server.vm_max_threads = 4; + server.vm_blocked_clients = 0; + server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES; + server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE; + server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES; + server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE; + server.shutdown_asap = 0; + + resetServerSaveParams(); + + appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */ + appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */ + appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */ + /* Replication related */ + server.isslave = 0; + server.masterauth = NULL; + server.masterhost = NULL; + server.masterport = 6379; + server.master = NULL; + server.replstate = REDIS_REPL_NONE; + + /* Double constants initialization */ + R_Zero = 0.0; + R_PosInf = 1.0/R_Zero; + R_NegInf = -1.0/R_Zero; + R_Nan = R_Zero/R_Zero; +} + +void initServer() { + int j; + + signal(SIGHUP, SIG_IGN); + signal(SIGPIPE, SIG_IGN); + setupSigSegvAction(); + + server.devnull = fopen("/dev/null","w"); + if (server.devnull == NULL) { + redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr); + exit(1); + } + server.clients = listCreate(); + server.slaves = listCreate(); + server.monitors = listCreate(); + server.objfreelist = listCreate(); + createSharedObjects(); + server.el = aeCreateEventLoop(); + server.db = zmalloc(sizeof(redisDb)*server.dbnum); + server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr); + if (server.fd == -1) { + redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr); + exit(1); + } + for (j = 0; j < server.dbnum; j++) { + server.db[j].dict = dictCreate(&dbDictType,NULL); + server.db[j].expires = 
dictCreate(&keyptrDictType,NULL); + server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL); + server.db[j].watched_keys = dictCreate(&keylistDictType,NULL); + if (server.vm_enabled) + server.db[j].io_keys = dictCreate(&keylistDictType,NULL); + server.db[j].id = j; + } + server.pubsub_channels = dictCreate(&keylistDictType,NULL); + server.pubsub_patterns = listCreate(); + listSetFreeMethod(server.pubsub_patterns,freePubsubPattern); + listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern); + server.cronloops = 0; + server.bgsavechildpid = -1; + server.bgrewritechildpid = -1; + server.bgrewritebuf = sdsempty(); + server.aofbuf = sdsempty(); + server.lastsave = time(NULL); + server.dirty = 0; + server.stat_numcommands = 0; + server.stat_numconnections = 0; + server.stat_expiredkeys = 0; + server.stat_starttime = time(NULL); + server.unixtime = time(NULL); + aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL); + if (aeCreateFileEvent(server.el, server.fd, AE_READABLE, + acceptHandler, NULL) == AE_ERR) oom("creating file event"); + + if (server.appendonly) { + server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); + if (server.appendfd == -1) { + redisLog(REDIS_WARNING, "Can't open the append-only file: %s", + strerror(errno)); + exit(1); + } + } + + if (server.vm_enabled) vmInit(); +} + +int qsortRedisCommands(const void *r1, const void *r2) { + return strcasecmp( + ((struct redisCommand*)r1)->name, + ((struct redisCommand*)r2)->name); +} + +void sortCommandTable() { + /* Copy and sort the read-only version of the command table */ + commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable)); + memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable)); + qsort(commandTable, + sizeof(readonlyCommandTable)/sizeof(struct redisCommand), + sizeof(struct redisCommand),qsortRedisCommands); +} + +/* ====================== Commands lookup and execution ===================== */ + +struct redisCommand 
*lookupCommand(char *name) { + struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0}; + return bsearch( + &tmp, + commandTable, + sizeof(readonlyCommandTable)/sizeof(struct redisCommand), + sizeof(struct redisCommand), + qsortRedisCommands); +} + +/* Call() is the core of Redis execution of a command */ +void call(redisClient *c, struct redisCommand *cmd) { + long long dirty; + + dirty = server.dirty; + cmd->proc(c); + dirty = server.dirty-dirty; + + if (server.appendonly && dirty) + feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc); + if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) && + listLength(server.slaves)) + replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc); + if (listLength(server.monitors)) + replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc); + server.stat_numcommands++; +} + +/* If this function gets called we already read a whole + * command, argments are in the client argv/argc fields. + * processCommand() execute the command or prepare the + * server for a bulk read from the client. + * + * If 1 is returned the client is still alive and valid and + * and other operations can be performed by the caller. Otherwise + * if 0 is returned the client was destroied (i.e. after QUIT). */ +int processCommand(redisClient *c) { + struct redisCommand *cmd; + + /* Free some memory if needed (maxmemory setting) */ + if (server.maxmemory) freeMemoryIfNeeded(); + + /* Handle the multi bulk command type. This is an alternative protocol + * supported by Redis in order to receive commands that are composed of + * multiple binary-safe "bulk" arguments. The latency of processing is + * a bit higher but this allows things like multi-sets, so if this + * protocol is used only for MSET and similar commands this is a big win. 
*/ + if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') { + c->multibulk = atoi(((char*)c->argv[0]->ptr)+1); + if (c->multibulk <= 0) { + resetClient(c); + return 1; + } else { + decrRefCount(c->argv[c->argc-1]); + c->argc--; + return 1; + } + } else if (c->multibulk) { + if (c->bulklen == -1) { + if (((char*)c->argv[0]->ptr)[0] != '$') { + addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n")); + resetClient(c); + return 1; + } else { + int bulklen = atoi(((char*)c->argv[0]->ptr)+1); + decrRefCount(c->argv[0]); + if (bulklen < 0 || bulklen > 1024*1024*1024) { + c->argc--; + addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); + resetClient(c); + return 1; + } + c->argc--; + c->bulklen = bulklen+2; /* add two bytes for CR+LF */ + return 1; + } + } else { + c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1)); + c->mbargv[c->mbargc] = c->argv[0]; + c->mbargc++; + c->argc--; + c->multibulk--; + if (c->multibulk == 0) { + robj **auxargv; + int auxargc; + + /* Here we need to swap the multi-bulk argc/argv with the + * normal argc/argv of the client structure. */ + auxargv = c->argv; + c->argv = c->mbargv; + c->mbargv = auxargv; + + auxargc = c->argc; + c->argc = c->mbargc; + c->mbargc = auxargc; + + /* We need to set bulklen to something different than -1 + * in order for the code below to process the command without + * to try to read the last argument of a bulk command as + * a special argument. */ + c->bulklen = 0; + /* continue below and process the command */ + } else { + c->bulklen = -1; + return 1; + } + } + } + /* -- end of multi bulk commands processing -- */ + + /* The QUIT command is handled as a special case. Normal command + * procs are unable to close the client connection safely */ + if (!strcasecmp(c->argv[0]->ptr,"quit")) { + freeClient(c); + return 0; + } + + /* Now lookup the command and check ASAP about trivial error conditions + * such wrong arity, bad command name and so forth. 
*/ + cmd = lookupCommand(c->argv[0]->ptr); + if (!cmd) { + addReplySds(c, + sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n", + (char*)c->argv[0]->ptr)); + resetClient(c); + return 1; + } else if ((cmd->arity > 0 && cmd->arity != c->argc) || + (c->argc < -cmd->arity)) { + addReplySds(c, + sdscatprintf(sdsempty(), + "-ERR wrong number of arguments for '%s' command\r\n", + cmd->name)); + resetClient(c); + return 1; + } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) { + /* This is a bulk command, we have to read the last argument yet. */ + int bulklen = atoi(c->argv[c->argc-1]->ptr); + + decrRefCount(c->argv[c->argc-1]); + if (bulklen < 0 || bulklen > 1024*1024*1024) { + c->argc--; + addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); + resetClient(c); + return 1; + } + c->argc--; + c->bulklen = bulklen+2; /* add two bytes for CR+LF */ + /* It is possible that the bulk read is already in the + * buffer. Check this condition and handle it accordingly. + * This is just a fast path, alternative to call processInputBuffer(). + * It's a good idea since the code is small and this condition + * happens most of the times. */ + if ((signed)sdslen(c->querybuf) >= c->bulklen) { + c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); + c->argc++; + c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); + } else { + /* Otherwise return... there is to read the last argument + * from the socket. */ + return 1; + } + } + /* Let's try to encode the bulk object to save space. 
*/ + if (cmd->flags & REDIS_CMD_BULK) + c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]); + + /* Check if the user is authenticated */ + if (server.requirepass && !c->authenticated && cmd->proc != authCommand) { + addReplySds(c,sdsnew("-ERR operation not permitted\r\n")); + resetClient(c); + return 1; + } + + /* Handle the maxmemory directive */ + if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) && + zmalloc_used_memory() > server.maxmemory) + { + addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n")); + resetClient(c); + return 1; + } + + /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */ + if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0) + && + cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand && + cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) { + addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n")); + resetClient(c); + return 1; + } + + /* Exec the command */ + if (c->flags & REDIS_MULTI && + cmd->proc != execCommand && cmd->proc != discardCommand && + cmd->proc != multiCommand && cmd->proc != watchCommand) + { + queueMultiCommand(c,cmd); + addReply(c,shared.queued); + } else { + if (server.vm_enabled && server.vm_max_threads > 0 && + blockClientOnSwappedKeys(c,cmd)) return 1; + call(c,cmd); + } + + /* Prepare the client for the next command */ + resetClient(c); + return 1; +} + +/*================================== Shutdown =============================== */ + +int prepareForShutdown() { + redisLog(REDIS_WARNING,"User requested shutdown, saving DB..."); + /* Kill the saving child if there is a background saving in progress. + We want to avoid race conditions, for instance our saving child may + overwrite the synchronous saving did by SHUTDOWN. */ + if (server.bgsavechildpid != -1) { + redisLog(REDIS_WARNING,"There is a live saving child. 
Killing it!"); + kill(server.bgsavechildpid,SIGKILL); + rdbRemoveTempFile(server.bgsavechildpid); + } + if (server.appendonly) { + /* Append only file: fsync() the AOF and exit */ + aof_fsync(server.appendfd); + if (server.vm_enabled) unlink(server.vm_swap_file); + } else { + /* Snapshotting. Perform a SYNC SAVE and exit */ + if (rdbSave(server.dbfilename) == REDIS_OK) { + if (server.daemonize) + unlink(server.pidfile); + redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory()); + } else { + /* Ooops.. error saving! The best we can do is to continue + * operating. Note that if there was a background saving process, + * in the next cron() Redis will be notified that the background + * saving aborted, handling special stuff like slaves pending for + * synchronization... */ + redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit"); + return REDIS_ERR; + } + } + redisLog(REDIS_WARNING,"Server exit now, bye bye..."); + return REDIS_OK; +} + +/*================================== Commands =============================== */ + +void authCommand(redisClient *c) { + if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) { + c->authenticated = 1; + addReply(c,shared.ok); + } else { + c->authenticated = 0; + addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n")); + } +} + +void pingCommand(redisClient *c) { + addReply(c,shared.pong); +} + +void echoCommand(redisClient *c) { + addReplyBulk(c,c->argv[1]); +} + +/* Convert an amount of bytes into a human readable string in the form + * of 100B, 2G, 100M, 4K, and so forth. 
*/ +void bytesToHuman(char *s, unsigned long long n) { + double d; + + if (n < 1024) { + /* Bytes */ + sprintf(s,"%lluB",n); + return; + } else if (n < (1024*1024)) { + d = (double)n/(1024); + sprintf(s,"%.2fK",d); + } else if (n < (1024LL*1024*1024)) { + d = (double)n/(1024*1024); + sprintf(s,"%.2fM",d); + } else if (n < (1024LL*1024*1024*1024)) { + d = (double)n/(1024LL*1024*1024); + sprintf(s,"%.2fG",d); + } +} + +/* Create the string returned by the INFO command. This is decoupled + * by the INFO command itself as we need to report the same information + * on memory corruption problems. */ +sds genRedisInfoString(void) { + sds info; + time_t uptime = time(NULL)-server.stat_starttime; + int j; + char hmem[64]; + + bytesToHuman(hmem,zmalloc_used_memory()); + info = sdscatprintf(sdsempty(), + "redis_version:%s\r\n" + "redis_git_sha1:%s\r\n" + "redis_git_dirty:%d\r\n" + "arch_bits:%s\r\n" + "multiplexing_api:%s\r\n" + "process_id:%ld\r\n" + "uptime_in_seconds:%ld\r\n" + "uptime_in_days:%ld\r\n" + "connected_clients:%d\r\n" + "connected_slaves:%d\r\n" + "blocked_clients:%d\r\n" + "used_memory:%zu\r\n" + "used_memory_human:%s\r\n" + "changes_since_last_save:%lld\r\n" + "bgsave_in_progress:%d\r\n" + "last_save_time:%ld\r\n" + "bgrewriteaof_in_progress:%d\r\n" + "total_connections_received:%lld\r\n" + "total_commands_processed:%lld\r\n" + "expired_keys:%lld\r\n" + "hash_max_zipmap_entries:%zu\r\n" + "hash_max_zipmap_value:%zu\r\n" + "pubsub_channels:%ld\r\n" + "pubsub_patterns:%u\r\n" + "vm_enabled:%d\r\n" + "role:%s\r\n" + ,REDIS_VERSION, + redisGitSHA1(), + strtol(redisGitDirty(),NULL,10) > 0, + (sizeof(long) == 8) ? 
"64" : "32", + aeGetApiName(), + (long) getpid(), + uptime, + uptime/(3600*24), + listLength(server.clients)-listLength(server.slaves), + listLength(server.slaves), + server.blpop_blocked_clients, + zmalloc_used_memory(), + hmem, + server.dirty, + server.bgsavechildpid != -1, + server.lastsave, + server.bgrewritechildpid != -1, + server.stat_numconnections, + server.stat_numcommands, + server.stat_expiredkeys, + server.hash_max_zipmap_entries, + server.hash_max_zipmap_value, + dictSize(server.pubsub_channels), + listLength(server.pubsub_patterns), + server.vm_enabled != 0, + server.masterhost == NULL ? "master" : "slave" + ); + if (server.masterhost) { + info = sdscatprintf(info, + "master_host:%s\r\n" + "master_port:%d\r\n" + "master_link_status:%s\r\n" + "master_last_io_seconds_ago:%d\r\n" + ,server.masterhost, + server.masterport, + (server.replstate == REDIS_REPL_CONNECTED) ? + "up" : "down", + server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1 + ); + } + if (server.vm_enabled) { + lockThreadedIO(); + info = sdscatprintf(info, + "vm_conf_max_memory:%llu\r\n" + "vm_conf_page_size:%llu\r\n" + "vm_conf_pages:%llu\r\n" + "vm_stats_used_pages:%llu\r\n" + "vm_stats_swapped_objects:%llu\r\n" + "vm_stats_swappin_count:%llu\r\n" + "vm_stats_swappout_count:%llu\r\n" + "vm_stats_io_newjobs_len:%lu\r\n" + "vm_stats_io_processing_len:%lu\r\n" + "vm_stats_io_processed_len:%lu\r\n" + "vm_stats_io_active_threads:%lu\r\n" + "vm_stats_blocked_clients:%lu\r\n" + ,(unsigned long long) server.vm_max_memory, + (unsigned long long) server.vm_page_size, + (unsigned long long) server.vm_pages, + (unsigned long long) server.vm_stats_used_pages, + (unsigned long long) server.vm_stats_swapped_objects, + (unsigned long long) server.vm_stats_swapins, + (unsigned long long) server.vm_stats_swapouts, + (unsigned long) listLength(server.io_newjobs), + (unsigned long) listLength(server.io_processing), + (unsigned long) listLength(server.io_processed), + (unsigned long) 
server.io_active_threads, + (unsigned long) server.vm_blocked_clients + ); + unlockThreadedIO(); + } + for (j = 0; j < server.dbnum; j++) { + long long keys, vkeys; + + keys = dictSize(server.db[j].dict); + vkeys = dictSize(server.db[j].expires); + if (keys || vkeys) { + info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n", + j, keys, vkeys); + } + } + return info; +} + +void infoCommand(redisClient *c) { + sds info = genRedisInfoString(); + addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", + (unsigned long)sdslen(info))); + addReplySds(c,info); + addReply(c,shared.crlf); +} + +void monitorCommand(redisClient *c) { + /* ignore MONITOR if aleady slave or in monitor mode */ + if (c->flags & REDIS_SLAVE) return; + + c->flags |= (REDIS_SLAVE|REDIS_MONITOR); + c->slaveseldb = 0; + listAddNodeTail(server.monitors,c); + addReply(c,shared.ok); +} + +/* ============================ Maxmemory directive ======================== */ + +/* Try to free one object form the pre-allocated objects free list. + * This is useful under low mem conditions as by default we take 1 million + * free objects allocated. On success REDIS_OK is returned, otherwise + * REDIS_ERR. */ +int tryFreeOneObjectFromFreelist(void) { + robj *o; + + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist)) { + listNode *head = listFirst(server.objfreelist); + o = listNodeValue(head); + listDelNode(server.objfreelist,head); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + zfree(o); + return REDIS_OK; + } else { + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + return REDIS_ERR; + } +} + +/* This function gets called when 'maxmemory' is set on the config file to limit + * the max memory used by the server, and we are out of memory. 
+ * This function will try to, in order: + * + * - Free objects from the free list + * - Try to remove keys with an EXPIRE set + * + * It is not possible to free enough memory to reach used-memory < maxmemory + * the server will start refusing commands that will enlarge even more the + * memory usage. + */ +void freeMemoryIfNeeded(void) { + while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) { + int j, k, freed = 0; + + if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; + for (j = 0; j < server.dbnum; j++) { + int minttl = -1; + robj *minkey = NULL; + struct dictEntry *de; + + if (dictSize(server.db[j].expires)) { + freed = 1; + /* From a sample of three keys drop the one nearest to + * the natural expire */ + for (k = 0; k < 3; k++) { + time_t t; + + de = dictGetRandomKey(server.db[j].expires); + t = (time_t) dictGetEntryVal(de); + if (minttl == -1 || t < minttl) { + minkey = dictGetEntryKey(de); + minttl = t; + } + } + dbDelete(server.db+j,minkey); + } + } + if (!freed) return; /* nothing to free... */ + } +} + +/* =================================== Main! ================================ */ + +#ifdef __linux__ +int linuxOvercommitMemoryValue(void) { + FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r"); + char buf[64]; + + if (!fp) return -1; + if (fgets(buf,64,fp) == NULL) { + fclose(fp); + return -1; + } + fclose(fp); + + return atoi(buf); +} + +void linuxOvercommitMemoryWarning(void) { + if (linuxOvercommitMemoryValue() == 0) { + redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect."); + } +} +#endif /* __linux__ */ + +void daemonize(void) { + int fd; + FILE *fp; + + if (fork() != 0) exit(0); /* parent exits */ + setsid(); /* create a new session */ + + /* Every output goes to /dev/null. 
If Redis is daemonized but + * the 'logfile' is set to 'stdout' in the configuration file + * it will not log at all. */ + if ((fd = open("/dev/null", O_RDWR, 0)) != -1) { + dup2(fd, STDIN_FILENO); + dup2(fd, STDOUT_FILENO); + dup2(fd, STDERR_FILENO); + if (fd > STDERR_FILENO) close(fd); + } + /* Try to write the pid file */ + fp = fopen(server.pidfile,"w"); + if (fp) { + fprintf(fp,"%d\n",getpid()); + fclose(fp); + } +} + +void version() { + printf("Redis server version %s (%s:%d)\n", REDIS_VERSION, + redisGitSHA1(), atoi(redisGitDirty()) > 0); + exit(0); +} + +void usage() { + fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n"); + fprintf(stderr," ./redis-server - (read config from stdin)\n"); + exit(1); +} + +int main(int argc, char **argv) { + time_t start; + + initServerConfig(); + sortCommandTable(); + if (argc == 2) { + if (strcmp(argv[1], "-v") == 0 || + strcmp(argv[1], "--version") == 0) version(); + if (strcmp(argv[1], "--help") == 0) usage(); + resetServerSaveParams(); + loadServerConfig(argv[1]); + } else if ((argc > 2)) { + usage(); + } else { + redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. 
In order to specify a config file use 'redis-server /path/to/redis.conf'"); + } + if (server.daemonize) daemonize(); + initServer(); + redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION); +#ifdef __linux__ + linuxOvercommitMemoryWarning(); +#endif + start = time(NULL); + if (server.appendonly) { + if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK) + redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start); + } else { + if (rdbLoad(server.dbfilename) == REDIS_OK) + redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start); + } + redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); + aeSetBeforeSleepProc(server.el,beforeSleep); + aeMain(server.el); + aeDeleteEventLoop(server.el); + return 0; +} + +/* ============================= Backtrace support ========================= */ + +#ifdef HAVE_BACKTRACE +void *getMcontextEip(ucontext_t *uc) { +#if defined(__FreeBSD__) + return (void*) uc->uc_mcontext.mc_eip; +#elif defined(__dietlibc__) + return (void*) uc->uc_mcontext.eip; +#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) + #if __x86_64__ + return (void*) uc->uc_mcontext->__ss.__rip; + #else + return (void*) uc->uc_mcontext->__ss.__eip; + #endif +#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6) + #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__) + return (void*) uc->uc_mcontext->__ss.__rip; + #else + return (void*) uc->uc_mcontext->__ss.__eip; + #endif +#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__) + return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */ +#elif defined(__ia64__) /* Linux IA64 */ + return (void*) uc->uc_mcontext.sc_ip; +#else + return NULL; +#endif +} + +void segvHandler(int sig, siginfo_t *info, void *secret) { + void *trace[100]; + char **messages = NULL; + int i, trace_size = 0; + ucontext_t *uc = (ucontext_t*) secret; + sds infostring; + REDIS_NOTUSED(info); + + 
redisLog(REDIS_WARNING, + "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig); + infostring = genRedisInfoString(); + redisLog(REDIS_WARNING, "%s",infostring); + /* It's not safe to sdsfree() the returned string under memory + * corruption conditions. Let it leak as we are going to abort */ + + trace_size = backtrace(trace, 100); + /* overwrite sigaction with caller's address */ + if (getMcontextEip(uc) != NULL) { + trace[1] = getMcontextEip(uc); + } + messages = backtrace_symbols(trace, trace_size); + + for (i=1; i +#include +#include +#include +#include +#include +#include + +#include "ae.h" /* Event driven programming library */ +#include "sds.h" /* Dynamic safe strings */ +#include "dict.h" /* Hash tables */ +#include "adlist.h" /* Linked lists */ +#include "zmalloc.h" /* total memory usage aware version of malloc/free */ +#include "anet.h" /* Networking the easy way */ +#include "zipmap.h" /* Compact string -> string data structure */ +#include "ziplist.h" /* Compact list data structure */ +#include "version.h" + +/* Error codes */ +#define REDIS_OK 0 +#define REDIS_ERR -1 + +/* Static server configuration */ +#define REDIS_SERVERPORT 6379 /* TCP port */ +#define REDIS_MAXIDLETIME (60*5) /* default client timeout */ +#define REDIS_IOBUF_LEN 1024 +#define REDIS_LOADBUF_LEN 1024 +#define REDIS_STATIC_ARGS 8 +#define REDIS_DEFAULT_DBNUM 16 +#define REDIS_CONFIGLINE_MAX 1024 +#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */ +#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */ +#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */ +#define REDIS_MAX_WRITE_PER_EVENT (1024*64) +#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */ +#define REDIS_SHARED_INTEGERS 10000 + +/* If more then REDIS_WRITEV_THRESHOLD write packets are pending use writev */ +#define REDIS_WRITEV_THRESHOLD 3 +/* Max number of iovecs used for each writev call */ +#define 
REDIS_WRITEV_IOVEC_COUNT 256 + +/* Hash table parameters */ +#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */ + +/* Command flags */ +#define REDIS_CMD_BULK 1 /* Bulk write command */ +#define REDIS_CMD_INLINE 2 /* Inline command */ +/* REDIS_CMD_DENYOOM reserves a longer comment: all the commands marked with + this flags will return an error when the 'maxmemory' option is set in the + config file and the server is using more than maxmemory bytes of memory. + In short this commands are denied on low memory conditions. */ +#define REDIS_CMD_DENYOOM 4 +#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */ + +/* Object types */ +#define REDIS_STRING 0 +#define REDIS_LIST 1 +#define REDIS_SET 2 +#define REDIS_ZSET 3 +#define REDIS_HASH 4 +#define REDIS_VMPOINTER 8 + +/* Objects encoding. Some kind of objects like Strings and Hashes can be + * internally represented in multiple ways. The 'encoding' field of the object + * is set to one of this fields for this object. */ +#define REDIS_ENCODING_RAW 0 /* Raw representation */ +#define REDIS_ENCODING_INT 1 /* Encoded as integer */ +#define REDIS_ENCODING_HT 2 /* Encoded as hash table */ +#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */ +#define REDIS_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */ +#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */ + +/* Object types only used for dumping to disk */ +#define REDIS_EXPIRETIME 253 +#define REDIS_SELECTDB 254 +#define REDIS_EOF 255 + +/* Defines related to the dump file format. 
To store 32 bits lengths for short + * keys requires a lot of space, so we check the most significant 2 bits of + * the first byte to interpret the length: + * + * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte + * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte + * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow + * 11|000000 this means: specially encoded object will follow. The six bits + * number specify the kind of object that follows. + * See the REDIS_RDB_ENC_* defines. + * + * Lengths up to 63 are stored using a single byte, most DB keys, and many + * values, will fit inside. */ +#define REDIS_RDB_6BITLEN 0 +#define REDIS_RDB_14BITLEN 1 +#define REDIS_RDB_32BITLEN 2 +#define REDIS_RDB_ENCVAL 3 +#define REDIS_RDB_LENERR UINT_MAX + +/* When a length of a string object stored on disk has the first two bits + * set, the remaining six bits specify a special encoding for the object + * according to the following defines: */ +#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ +#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ +#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ +#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */ + +/* Virtual memory object->storage field. */ +#define REDIS_VM_MEMORY 0 /* The object is on memory */ +#define REDIS_VM_SWAPPED 1 /* The object is on disk */ +#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */ +#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */ + +/* Virtual memory static configuration stuff. + * Check vmFindContiguousPages() to know more about these magic numbers. */ +#define REDIS_VM_MAX_NEAR_PAGES 65536 +#define REDIS_VM_MAX_RANDOM_JUMP 4096 +#define REDIS_VM_MAX_THREADS 32 +#define REDIS_THREAD_STACK_SIZE (1024*1024*4) +/* The following is the *percentage* of completed I/O jobs to process when the + * 
While Virtual Memory I/O operations are performed by + * threads, this operations must be processed by the main thread when completed + * in order to take effect. */ +#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1 + +/* Client flags */ +#define REDIS_SLAVE 1 /* This client is a slave server */ +#define REDIS_MASTER 2 /* This client is a master server */ +#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */ +#define REDIS_MULTI 8 /* This client is in a MULTI context */ +#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */ +#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */ +#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */ + +/* Slave replication state - slave side */ +#define REDIS_REPL_NONE 0 /* No active replication */ +#define REDIS_REPL_CONNECT 1 /* Must connect to master */ +#define REDIS_REPL_CONNECTED 2 /* Connected to master */ + +/* Slave replication state - from the point of view of master + * Note that in SEND_BULK and ONLINE state the slave receives new updates + * in its output queue. In the WAIT_BGSAVE state instead the server is waiting + * to start the next background saving in order to send updates to it. */ +#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */ +#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */ +#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */ +#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */ + +/* List related stuff */ +#define REDIS_HEAD 0 +#define REDIS_TAIL 1 + +/* Sort operations */ +#define REDIS_SORT_GET 0 +#define REDIS_SORT_ASC 1 +#define REDIS_SORT_DESC 2 +#define REDIS_SORTKEY_MAX 1024 + +/* Log levels */ +#define REDIS_DEBUG 0 +#define REDIS_VERBOSE 1 +#define REDIS_NOTICE 2 +#define REDIS_WARNING 3 + +/* Anti-warning macro... 
*/ +#define REDIS_NOTUSED(V) ((void) V) + +#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */ +#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */ + +/* Append only defines */ +#define APPENDFSYNC_NO 0 +#define APPENDFSYNC_ALWAYS 1 +#define APPENDFSYNC_EVERYSEC 2 + +/* Zip structure related defaults */ +#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64 +#define REDIS_HASH_MAX_ZIPMAP_VALUE 512 +#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024 +#define REDIS_LIST_MAX_ZIPLIST_VALUE 32 + +/* Sets operations codes */ +#define REDIS_OP_UNION 0 +#define REDIS_OP_DIFF 1 +#define REDIS_OP_INTER 2 + +/* We can print the stacktrace, so our assert is defined this way: */ +#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1))) +#define redisPanic(_e) _redisPanic(#_e,__FILE__,__LINE__),_exit(1) +void _redisAssert(char *estr, char *file, int line); +void _redisPanic(char *msg, char *file, int line); + +/*----------------------------------------------------------------------------- + * Data types + *----------------------------------------------------------------------------*/ + +/* A redis object, that is a type able to hold a string / list / set */ + +/* The actual Redis Object */ +typedef struct redisObject { + unsigned type:4; + unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */ + unsigned encoding:4; + unsigned lru:22; /* lru time (relative to server.lruclock) */ + int refcount; + void *ptr; + /* VM fields are only allocated if VM is active, otherwise the + * object allocation function will just allocate + * sizeof(redisObjct) minus sizeof(redisObjectVM), so using + * Redis without VM active will not have any overhead. */ +} robj; + +/* The VM pointer structure - identifies an object in the swap file. + * + * This object is stored in place of the value + * object in the main key->value hash table representing a database. 
+ * Note that the first fields (type, storage) are the same as the redisObject + * structure so that vmPointer structures can be accessed even when casted + * as redisObject structures. + * + * This is useful as we don't know if a value object is on disk or not, but we + * are always able to read obj->storage to check this. For vmPointer + * structures "type" is set to REDIS_VMPOINTER (even if without this field + * is still possible to check the kind of object from the value of 'storage').*/ +typedef struct vmPointer { + unsigned type:4; + unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */ + unsigned notused:26; + unsigned int vtype; /* type of the object stored in the swap file */ + off_t page; /* the page at which the object is stored on disk */ + off_t usedpages; /* number of pages used on disk */ +} vmpointer; + +/* Macro used to initialize a Redis object allocated on the stack. + * Note that this macro is taken near the structure definition to make sure + * we'll update it when the structure is changed, to avoid bugs like + * bug #85 introduced exactly in this way. */ +#define initStaticStringObject(_var,_ptr) do { \ + _var.refcount = 1; \ + _var.type = REDIS_STRING; \ + _var.encoding = REDIS_ENCODING_RAW; \ + _var.ptr = _ptr; \ + _var.storage = REDIS_VM_MEMORY; \ +} while(0); + +typedef struct redisDb { + dict *dict; /* The keyspace for this DB */ + dict *expires; /* Timeout of keys with a timeout set */ + dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */ + dict *io_keys; /* Keys with clients waiting for VM I/O */ + dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */ + int id; +} redisDb; + +/* Client MULTI/EXEC state */ +typedef struct multiCmd { + robj **argv; + int argc; + struct redisCommand *cmd; +} multiCmd; + +typedef struct multiState { + multiCmd *commands; /* Array of MULTI commands */ + int count; /* Total number of MULTI commands */ +} multiState; + +/* With multiplexing we need to take per-client state. 
+ * Clients are taken in a linked list. */
+typedef struct redisClient {
+    int fd;                 /* socket file descriptor of the client */
+    redisDb *db;
+    int dictid;
+    sds querybuf;
+    robj **argv, **mbargv;
+    int argc, mbargc;
+    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
+    int multibulk;          /* multi bulk command format active */
+    list *reply;            /* replies still to be sent to the client */
+    int sentlen;
+    time_t lastinteraction; /* time of the last interaction, used for timeout */
+    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
+    int slaveseldb;         /* slave selected db, if this client is a slave */
+    int authenticated;      /* when requirepass is non-NULL */
+    int replstate;          /* replication state if this is a slave */
+    int repldbfd;           /* replication DB file descriptor */
+    long repldboff;         /* replication DB file offset */
+    off_t repldbsize;       /* replication DB file size */
+    multiState mstate;      /* MULTI/EXEC state */
+    robj **blocking_keys;   /* The key we are waiting to terminate a blocking
+                             * operation such as BLPOP. Otherwise NULL. */
+    int blocking_keys_num;  /* Number of blocking keys */
+    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
+                             * is >= blockingto then the operation timed out. */
+    list *io_keys;          /* Keys this client is waiting to be loaded from the
+                             * swap file in order to continue. */
+    list *watched_keys;     /* Keys WATCHED for MULTI/EXEC CAS */
+    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
+    list *pubsub_patterns;  /* patterns a client is interested in (PSUBSCRIBE) */
+} redisClient;
+
+struct saveparam {
+    time_t seconds;
+    int changes;
+};
+
+struct sharedObjectsStruct {
+    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
+    *colon, *nullbulk, *nullmultibulk, *queued,
+    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
+    *outofrangeerr, *plus,
+    *select0, *select1, *select2, *select3, *select4,
+    *select5, *select6, *select7, *select8, *select9,
+    *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
+    *mbulk4, *psubscribebulk, *punsubscribebulk,
+    *integers[REDIS_SHARED_INTEGERS];
+};
+
+/* Global server state structure */
+struct redisServer {
+    int port;
+    int fd;
+    redisDb *db;
+    long long dirty;            /* changes to DB from the last save */
+    list *clients;
+    list *slaves, *monitors;
+    char neterr[ANET_ERR_LEN];
+    aeEventLoop *el;
+    int cronloops;              /* number of times the cron function ran */
+    list *objfreelist;          /* A list of freed objects to avoid malloc() */
+    time_t lastsave;            /* Unix time of the last successful save */
+    /* Fields used only for stats */
+    time_t stat_starttime;         /* server start time */
+    long long stat_numcommands;    /* number of processed commands */
+    long long stat_numconnections; /* number of connections received */
+    long long stat_expiredkeys;    /* number of expired keys */
+    /* Configuration */
+    int verbosity;
+    int glueoutputbuf;
+    int maxidletime;
+    int dbnum;
+    int daemonize;
+    int appendonly;
+    int appendfsync;
+    int no_appendfsync_on_rewrite;
+    int shutdown_asap;
+    time_t lastfsync;
+    int appendfd;
+    int appendseldb;
+    char *pidfile;
+    pid_t bgsavechildpid;
+    pid_t bgrewritechildpid;
+    sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
+    sds aofbuf;       /* AOF buffer, written before entering the event loop */
+    struct saveparam *saveparams;
+    int saveparamslen;
+    char *logfile;
+    char *bindaddr;
+    char *dbfilename;
+    char *appendfilename;
+    char *requirepass;
+    int rdbcompression;
+    int activerehashing;
+    /* Replication related */
+    int isslave;
+    char *masterauth;
+    char *masterhost;
+    int masterport;
+    redisClient *master;    /* client that is master for this slave */
+    int replstate;
+    unsigned int maxclients;
+    unsigned long long maxmemory;
+    unsigned int blpop_blocked_clients;
+    unsigned int vm_blocked_clients;
+    /* Sort parameters - qsort_r() is only available under BSD so we
+     * have to take this state global, in order to pass it to sortCompare() */
+    int sort_desc;
+    int sort_alpha;
+    int sort_bypattern;
+    /* Virtual memory configuration */
+    int vm_enabled;
+    char *vm_swap_file;
+    off_t vm_page_size;
+    off_t vm_pages;
+    unsigned long long vm_max_memory;
+    /* Zip structure config */
+    size_t hash_max_zipmap_entries;
+    size_t hash_max_zipmap_value;
+    size_t list_max_ziplist_entries;
+    size_t list_max_ziplist_value;
+    /* Virtual memory state */
+    FILE *vm_fp;
+    int vm_fd;
+    off_t vm_next_page; /* Next probably empty page */
+    off_t vm_near_pages; /* Number of pages allocated sequentially */
+    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
+    time_t unixtime;    /* Unix time sampled every second. */
+    /* Virtual memory I/O threads stuff */
+    /* An I/O thread processes an element taken from the io_newjobs queue and
+     * puts the result of the operation in the io_processed list. While the
+     * job is being processed, it's put on the io_processing queue. */
+    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
+    list *io_processing; /* List of VM I/O jobs being processed */
+    list *io_processed; /* List of VM I/O jobs already processed */
+    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
+    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
+    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
+    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
+    pthread_attr_t io_threads_attr; /* attributes for threads creation */
+    int io_active_threads; /* Number of running I/O threads */
+    int vm_max_threads; /* Max number of I/O threads running at the same time */
+    /* Our main thread is blocked on the event loop, looking for sockets ready
+     * to be read or written, so when a threaded I/O operation is ready to be
+     * processed by the main thread, the I/O thread will use a unix pipe to
+     * awake the main thread. The following are the two pipe FDs. */
+    int io_ready_pipe_read;
+    int io_ready_pipe_write;
+    /* Virtual memory stats */
+    unsigned long long vm_stats_used_pages;
+    unsigned long long vm_stats_swapped_objects;
+    unsigned long long vm_stats_swapouts;
+    unsigned long long vm_stats_swapins;
+    /* Pubsub */
+    dict *pubsub_channels; /* Map channels to list of subscribed clients */
+    list *pubsub_patterns; /* A list of pubsub_patterns */
+    /* Misc */
+    FILE *devnull;
+    unsigned lruclock:22;        /* clock incrementing every minute, for LRU */
+    unsigned lruclock_padding:10;
+};
+
+typedef struct pubsubPattern {
+    redisClient *client;
+    robj *pattern;
+} pubsubPattern;
+
+typedef void redisCommandProc(redisClient *c);
+typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+struct redisCommand {
+    char *name;
+    redisCommandProc *proc;
+    int arity;
+    int flags;
+    /* Use a function to determine which keys need to be loaded
+     * in the background prior to executing this command. Takes precedence
+     * over vm_firstkey and others, ignored when NULL */
+    redisVmPreloadProc *vm_preload_proc;
+    /* What keys should be loaded in background when calling this command? */
+    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
+    int vm_lastkey;  /* The last argument that's a key */
+    int vm_keystep;  /* The step between first and last key */
+};
+
+struct redisFunctionSym {
+    char *name;
+    unsigned long pointer;
+};
+
+typedef struct _redisSortObject {
+    robj *obj;
+    union {
+        double score;
+        robj *cmpobj;
+    } u;
+} redisSortObject;
+
+typedef struct _redisSortOperation {
+    int type;
+    robj *pattern;
+} redisSortOperation;
+
+/* ZSETs use a specialized version of Skiplists */
+
+typedef struct zskiplistNode {
+    struct zskiplistNode **forward;
+    struct zskiplistNode *backward;
+    unsigned int *span;
+    double score;
+    robj *obj;
+} zskiplistNode;
+
+typedef struct zskiplist {
+    struct zskiplistNode *header, *tail;
+    unsigned long length;
+    int level;
+} zskiplist;
+
+typedef struct zset {
+    dict *dict;
+    zskiplist *zsl;
+} zset;
+
+/* VM threaded I/O request message */
+#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
+#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
+#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
+typedef struct iojob {
+    int type;   /* Request type, REDIS_IOJOB_* */
+    redisDb *db;/* Redis database */
+    robj *key;  /* This I/O request is about swapping this key */
+    robj *id;   /* Unique identifier of this job:
+                   this is the object to swap for REDIS_IOJOB_*_SWAP, or the
+                   vmpointer object for REDIS_IOJOB_LOAD. */
+    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
+                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
+    off_t page; /* Swap page where to read/write the object */
+    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
+    int canceled; /* True if this command was canceled by blocking side of VM */
+    pthread_t thread; /* ID of the thread processing this entry */
+} iojob;
+
+/* Structure to hold list iteration abstraction.
*/
+typedef struct {
+    robj *subject;
+    unsigned char encoding;
+    unsigned char direction; /* Iteration direction */
+    unsigned char *zi;
+    listNode *ln;
+} listTypeIterator;
+
+/* Structure for an entry while iterating over a list. */
+typedef struct {
+    listTypeIterator *li;
+    unsigned char *zi;  /* Entry in ziplist */
+    listNode *ln;       /* Entry in linked list */
+} listTypeEntry;
+
+/* Structure to hold hash iteration abstraction. Note that iteration over
+ * hashes involves both fields and values. Because it is possible that
+ * not both are required, store pointers in the iterator to avoid
+ * unnecessary memory allocation for fields/values. */
+typedef struct {
+    int encoding;
+    unsigned char *zi;
+    unsigned char *zk, *zv;
+    unsigned int zklen, zvlen;
+
+    dictIterator *di;
+    dictEntry *de;
+} hashTypeIterator;
+
+#define REDIS_HASH_KEY 1
+#define REDIS_HASH_VALUE 2
+
+/*-----------------------------------------------------------------------------
+ * Extern declarations
+ *----------------------------------------------------------------------------*/
+
+extern struct redisServer server;
+extern struct sharedObjectsStruct shared;
+extern dictType setDictType;
+extern dictType zsetDictType;
+extern double R_Zero, R_PosInf, R_NegInf, R_Nan;
+extern dictType hashDictType; /* 'extern': a header must only declare it */
+
+/*-----------------------------------------------------------------------------
+ * Functions prototypes
+ *----------------------------------------------------------------------------*/
+
+/* networking.c -- Networking and Client related operations */
+redisClient *createClient(int fd);
+void closeTimedoutClients(void);
+void freeClient(redisClient *c);
+void resetClient(redisClient *c);
+void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReply(redisClient *c, robj *obj);
+void addReplySds(redisClient *c, sds s);
+void processInputBuffer(redisClient *c);
+void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
+void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReplyBulk(redisClient *c, robj *obj);
+void addReplyBulkCString(redisClient *c, char *s);
+void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); /* repeated declaration */
+void addReply(redisClient *c, robj *obj); /* repeated declaration */
+void addReplySds(redisClient *c, sds s); /* repeated declaration */
+void addReplyDouble(redisClient *c, double d);
+void addReplyLongLong(redisClient *c, long long ll);
+void addReplyUlong(redisClient *c, unsigned long ul);
+void *dupClientReplyValue(void *o);
+
+/* List data type */
+void listTypeTryConversion(robj *subject, robj *value);
+void listTypePush(robj *subject, robj *value, int where);
+robj *listTypePop(robj *subject, int where);
+unsigned long listTypeLength(robj *subject);
+listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction);
+void listTypeReleaseIterator(listTypeIterator *li);
+int listTypeNext(listTypeIterator *li, listTypeEntry *entry);
+robj *listTypeGet(listTypeEntry *entry);
+void listTypeInsert(listTypeEntry *entry, robj *value, int where);
+int listTypeEqual(listTypeEntry *entry, robj *o);
+void listTypeDelete(listTypeEntry *entry);
+void listTypeConvert(robj *subject, int enc);
+void unblockClientWaitingData(redisClient *c);
+int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
+void popGenericCommand(redisClient *c, int where);
+
+/* MULTI/EXEC/WATCH... */
+void unwatchAllKeys(redisClient *c);
+void initClientMultiState(redisClient *c);
+void freeClientMultiState(redisClient *c);
+void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
+void touchWatchedKey(redisDb *db, robj *key);
+void touchWatchedKeysOnFlush(int dbid);
+
+/* Redis object implementation */
+void decrRefCount(void *o);
+void incrRefCount(robj *o);
+void freeStringObject(robj *o);
+void freeListObject(robj *o);
+void freeSetObject(robj *o);
+void freeZsetObject(robj *o);
+void freeHashObject(robj *o);
+robj *createObject(int type, void *ptr);
+robj *createStringObject(char *ptr, size_t len);
+robj *dupStringObject(robj *o);
+robj *tryObjectEncoding(robj *o);
+robj *getDecodedObject(robj *o);
+size_t stringObjectLen(robj *o);
+int tryFreeOneObjectFromFreelist(void);
+robj *createStringObjectFromLongLong(long long value);
+robj *createListObject(void);
+robj *createZiplistObject(void);
+robj *createSetObject(void);
+robj *createHashObject(void);
+robj *createZsetObject(void);
+int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg);
+int checkType(redisClient *c, robj *o, int type);
+int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg);
+int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg);
+int getLongLongFromObject(robj *o, long long *target);
+char *strEncoding(int encoding);
+int compareStringObjects(robj *a, robj *b);
+int equalStringObjects(robj *a, robj *b);
+
+/* Replication */
+void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
+void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
+int syncWithMaster(void);
+void updateSlavesWaitingBgsave(int bgsaveerr);
+
+/* RDB persistence */
+int rdbLoad(char *filename);
+int rdbSaveBackground(char *filename);
+void rdbRemoveTempFile(pid_t childpid);
+int rdbSave(char *filename);
+int rdbSaveObject(FILE *fp, robj *o);
+off_t rdbSavedObjectPages(robj *o, FILE *fp);
+off_t rdbSavedObjectLen(robj *o, FILE *fp);
+robj *rdbLoadObject(int type, FILE *fp);
+void backgroundSaveDoneHandler(int statloc);
+
+/* AOF persistence */
+void flushAppendOnlyFile(void);
+void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
+void aofRemoveTempFile(pid_t childpid);
+int rewriteAppendOnlyFileBackground(void);
+int loadAppendOnlyFile(char *filename);
+void stopAppendOnly(void);
+int startAppendOnly(void);
+void backgroundRewriteDoneHandler(int statloc);
+
+/* Sorted sets data type */
+zskiplist *zslCreate(void);
+void zslFree(zskiplist *zsl);
+void zslInsert(zskiplist *zsl, double score, robj *obj);
+
+/* Core functions */
+void freeMemoryIfNeeded(void);
+int processCommand(redisClient *c);
+void setupSigSegvAction(void);
+struct redisCommand *lookupCommand(char *name);
+void call(redisClient *c, struct redisCommand *cmd);
+int prepareForShutdown(void);
+void redisLog(int level, const char *fmt, ...);
+void usage(void);
+void updateDictResizePolicy(void);
+int htNeedsResize(dict *dict);
+void oom(const char *msg);
+
+/* Virtual Memory */
+void vmInit(void);
+void vmMarkPagesFree(off_t page, off_t count);
+robj *vmLoadObject(robj *o);
+robj *vmPreviewObject(robj *o);
+int vmSwapOneObjectBlocking(void);
+int vmSwapOneObjectThreaded(void);
+int vmCanSwapOut(void);
+void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
+void vmCancelThreadedIOJob(robj *o);
+void lockThreadedIO(void);
+void unlockThreadedIO(void);
+int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
+void freeIOJob(iojob *j);
+void queueIOJob(iojob *j);
+int vmWriteObjectOnSwap(robj *o, off_t page);
+robj *vmReadObjectFromSwap(off_t page, int type);
+void waitEmptyIOJobsQueue(void);
+void vmReopenSwapFile(void);
+int vmFreePage(off_t page);
+void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
+int dontWaitForSwappedKey(redisClient *c, robj *key);
+void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
+vmpointer *vmSwapObjectBlocking(robj *val);
+
+/* Hash data type */
+void convertToRealHash(robj *o);
+void hashTypeTryConversion(robj *subject, robj **argv, int start, int end);
+void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2);
+robj *hashTypeGet(robj *o, robj *key);
+int hashTypeExists(robj *o, robj *key);
+int hashTypeSet(robj *o, robj *key, robj *value);
+int hashTypeDelete(robj *o, robj *key);
+unsigned long hashTypeLength(robj *o);
+hashTypeIterator *hashTypeInitIterator(robj *subject);
+void hashTypeReleaseIterator(hashTypeIterator *hi);
+int hashTypeNext(hashTypeIterator *hi);
+robj *hashTypeCurrent(hashTypeIterator *hi, int what);
+robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key);
+
+/* Pub / Sub */
+int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
+int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
+void freePubsubPattern(void *p);
+int listMatchPubsubPattern(void *a, void *b);
+
+/* Utility functions */
+int stringmatchlen(const char *pattern, int patternLen,
+        const char *string, int stringLen, int nocase);
+int stringmatch(const char *pattern, const char *string, int nocase);
+long long memtoll(const char *p, int *err);
+int ll2string(char *s, size_t len, long long value);
+int isStringRepresentableAsLong(sds s, long *longval);
+
+/* Configuration */
+void loadServerConfig(char *filename);
+void appendServerSaveParams(time_t seconds, int changes);
+void resetServerSaveParams(void);
+
+/* db.c -- Keyspace access API */
+int removeExpire(redisDb *db, robj *key);
+int expireIfNeeded(redisDb *db, robj *key);
+int deleteIfVolatile(redisDb *db, robj *key);
+time_t getExpire(redisDb *db, robj *key);
+int setExpire(redisDb *db, robj *key, time_t when);
+robj *lookupKey(redisDb *db, robj *key);
+robj *lookupKeyRead(redisDb *db, robj *key);
+robj *lookupKeyWrite(redisDb *db, robj *key);
+robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply);
+robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply);
+int dbAdd(redisDb *db, robj *key, robj *val);
+int dbReplace(redisDb *db, robj *key, robj *val);
+int dbExists(redisDb *db, robj *key);
+robj *dbRandomKey(redisDb *db);
+int dbDelete(redisDb *db, robj *key);
+long long emptyDb(void);
+int selectDb(redisClient *c, int id);
+
+/* Git SHA1 */
+char *redisGitSHA1(void);
+char *redisGitDirty(void);
+
+/* Commands prototypes */
+void authCommand(redisClient *c);
+void pingCommand(redisClient *c);
+void echoCommand(redisClient *c);
+void setCommand(redisClient *c);
+void setnxCommand(redisClient *c);
+void setexCommand(redisClient *c);
+void getCommand(redisClient *c);
+void delCommand(redisClient *c);
+void existsCommand(redisClient *c);
+void incrCommand(redisClient *c);
+void decrCommand(redisClient *c);
+void incrbyCommand(redisClient *c);
+void decrbyCommand(redisClient *c);
+void selectCommand(redisClient *c);
+void randomkeyCommand(redisClient *c);
+void keysCommand(redisClient *c);
+void dbsizeCommand(redisClient *c);
+void lastsaveCommand(redisClient *c);
+void saveCommand(redisClient *c);
+void bgsaveCommand(redisClient *c);
+void bgrewriteaofCommand(redisClient *c);
+void shutdownCommand(redisClient *c);
+void moveCommand(redisClient *c);
+void renameCommand(redisClient *c);
+void renamenxCommand(redisClient *c);
+void lpushCommand(redisClient *c);
+void rpushCommand(redisClient *c);
+void lpushxCommand(redisClient *c);
+void rpushxCommand(redisClient *c);
+void linsertCommand(redisClient *c);
+void lpopCommand(redisClient *c);
+void rpopCommand(redisClient *c);
+void llenCommand(redisClient *c);
+void lindexCommand(redisClient *c);
+void lrangeCommand(redisClient *c);
+void ltrimCommand(redisClient *c);
+void typeCommand(redisClient *c);
+void lsetCommand(redisClient *c);
+void saddCommand(redisClient *c);
+void sremCommand(redisClient *c);
+void smoveCommand(redisClient *c);
+void sismemberCommand(redisClient *c);
+void scardCommand(redisClient *c);
+void spopCommand(redisClient *c);
+void srandmemberCommand(redisClient *c);
+void sinterCommand(redisClient *c);
+void sinterstoreCommand(redisClient *c);
+void sunionCommand(redisClient *c);
+void sunionstoreCommand(redisClient *c);
+void sdiffCommand(redisClient *c);
+void sdiffstoreCommand(redisClient *c);
+void syncCommand(redisClient *c);
+void flushdbCommand(redisClient *c);
+void flushallCommand(redisClient *c);
+void sortCommand(redisClient *c);
+void lremCommand(redisClient *c);
+void rpoplpushcommand(redisClient *c);
+void infoCommand(redisClient *c);
+void mgetCommand(redisClient *c);
+void monitorCommand(redisClient *c);
+void expireCommand(redisClient *c);
+void expireatCommand(redisClient *c);
+void getsetCommand(redisClient *c);
+void ttlCommand(redisClient *c);
+void slaveofCommand(redisClient *c);
+void debugCommand(redisClient *c);
+void msetCommand(redisClient *c);
+void msetnxCommand(redisClient *c);
+void zaddCommand(redisClient *c);
+void zincrbyCommand(redisClient *c);
+void zrangeCommand(redisClient *c);
+void zrangebyscoreCommand(redisClient *c);
+void zcountCommand(redisClient *c);
+void zrevrangeCommand(redisClient *c);
+void zcardCommand(redisClient *c);
+void zremCommand(redisClient *c);
+void zscoreCommand(redisClient *c);
+void zremrangebyscoreCommand(redisClient *c);
+void multiCommand(redisClient *c);
+void execCommand(redisClient *c);
+void discardCommand(redisClient *c);
+void blpopCommand(redisClient *c);
+void brpopCommand(redisClient *c);
+void appendCommand(redisClient *c);
+void substrCommand(redisClient *c);
+void zrankCommand(redisClient *c);
+void zrevrankCommand(redisClient *c);
+void hsetCommand(redisClient *c);
+void hsetnxCommand(redisClient *c);
+void hgetCommand(redisClient
*c);
+void hmsetCommand(redisClient *c);
+void hmgetCommand(redisClient *c);
+void hdelCommand(redisClient *c);
+void hlenCommand(redisClient *c);
+void zremrangebyrankCommand(redisClient *c);
+void zunionstoreCommand(redisClient *c);
+void zinterstoreCommand(redisClient *c);
+void hkeysCommand(redisClient *c);
+void hvalsCommand(redisClient *c);
+void hgetallCommand(redisClient *c);
+void hexistsCommand(redisClient *c);
+void configCommand(redisClient *c);
+void hincrbyCommand(redisClient *c);
+void subscribeCommand(redisClient *c);
+void unsubscribeCommand(redisClient *c);
+void psubscribeCommand(redisClient *c);
+void punsubscribeCommand(redisClient *c);
+void publishCommand(redisClient *c);
+void watchCommand(redisClient *c);
+void unwatchCommand(redisClient *c);
+
+#endif
diff --git a/src/release.c b/src/release.c
new file mode 100644
index 000000000..64186ec4e
--- /dev/null
+++ b/src/release.c
@@ -0,0 +1,13 @@
+/* Every time the Redis Git SHA1 or Dirty status changes only this
+ * small file is recompiled, as we access this information in all the other
+ * files using these functions. */
+
+#include "release.h"
+
+char *redisGitSHA1(void) {
+    return REDIS_GIT_SHA1;
+}
+
+char *redisGitDirty(void) {
+    return REDIS_GIT_DIRTY;
+}
diff --git a/src/replication.c b/src/replication.c
new file mode 100644
index 000000000..ecb04ce1a
--- /dev/null
+++ b/src/replication.c
@@ -0,0 +1,534 @@
+#include "redis.h"
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+/* Feed every slave in 'slaves' with the command argv[0..argc-1], encoded in
+ * the multi bulk protocol. A SELECT for 'dictid' is sent first to each slave
+ * whose last selected DB differs. */
+void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
+    listNode *ln;
+    listIter li;
+    int outc = 0, j;
+    robj **outv;
+    /* We need 1+(ARGS*3) objects since commands are using the new protocol
+     * and we need 1 object for the first "*<count>\r\n" multibulk count, then
+     * for every additional object we have "$<count>\r\n" + object + "\r\n". */
+    robj *static_outv[REDIS_STATIC_ARGS*3+1];
+    robj *lenobj;
+
+    if (argc <= REDIS_STATIC_ARGS) {
+        outv = static_outv;
+    } else {
+        outv = zmalloc(sizeof(robj*)*(argc*3+1));
+    }
+
+    lenobj = createObject(REDIS_STRING,
+            sdscatprintf(sdsempty(), "*%d\r\n", argc));
+    lenobj->refcount = 0;
+    outv[outc++] = lenobj;
+    for (j = 0; j < argc; j++) {
+        lenobj = createObject(REDIS_STRING,
+            sdscatprintf(sdsempty(),"$%lu\r\n",
+                (unsigned long) stringObjectLen(argv[j])));
+        lenobj->refcount = 0;
+        outv[outc++] = lenobj;
+        outv[outc++] = argv[j];
+        outv[outc++] = shared.crlf;
+    }
+
+    /* Increment all the refcounts at start and decrement at end in order to
+     * be sure to free objects if there is no slave in a replication state
+     * able to be fed with commands */
+    for (j = 0; j < outc; j++) incrRefCount(outv[j]);
+    listRewind(slaves,&li);
+    while((ln = listNext(&li))) {
+        redisClient *slave = ln->value;
+
+        /* Don't feed slaves that are still waiting for BGSAVE to start */
+        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
+
+        /* Feed all the other slaves, MONITORs and so on */
+        if (slave->slaveseldb != dictid) {
+            robj *selectcmd;
+
+            switch(dictid) {
+            case 0: selectcmd = shared.select0; break;
+            case 1: selectcmd = shared.select1; break;
+            case 2: selectcmd = shared.select2; break;
+            case 3: selectcmd = shared.select3; break;
+            case 4: selectcmd = shared.select4; break;
+            case 5: selectcmd = shared.select5; break;
+            case 6: selectcmd = shared.select6; break;
+            case 7: selectcmd = shared.select7; break;
+            case 8: selectcmd = shared.select8; break;
+            case 9: selectcmd = shared.select9; break;
+            default:
+                selectcmd = createObject(REDIS_STRING,
+                    sdscatprintf(sdsempty(),"select %d\r\n",dictid));
+                selectcmd->refcount = 0;
+                break;
+            }
+            addReply(slave,selectcmd);
+            slave->slaveseldb = dictid;
+        }
+        for (j = 0; j < outc; j++) addReply(slave,outv[j]);
+    }
+    for (j = 0; j < outc; j++) decrRefCount(outv[j]);
+    if (outv != static_outv) zfree(outv);
+}
+
+/* Send a textual representation of the command, prefixed by a timestamp and
+ * (for DBs other than 0) the DB number, to every client in 'monitors'. */
+void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
+    listNode *ln;
+    listIter li;
+    int j;
+    sds cmdrepr = sdsnew("+");
+    robj *cmdobj;
+    struct timeval tv;
+
+    gettimeofday(&tv,NULL);
+    /* Zero pad the microseconds, or 1.000005 would be printed as "1.5". */
+    cmdrepr = sdscatprintf(cmdrepr,"%ld.%06ld ",(long)tv.tv_sec,(long)tv.tv_usec);
+    if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
+
+    for (j = 0; j < argc; j++) {
+        if (argv[j]->encoding == REDIS_ENCODING_INT) {
+            cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
+        } else {
+            cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
+                        sdslen(argv[j]->ptr));
+        }
+        if (j != argc-1)
+            cmdrepr = sdscatlen(cmdrepr," ",1);
+    }
+    cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
+    cmdobj = createObject(REDIS_STRING,cmdrepr);
+
+    listRewind(monitors,&li);
+    while((ln = listNext(&li))) {
+        redisClient *monitor = ln->value;
+        addReply(monitor,cmdobj);
+    }
+    decrRefCount(cmdobj);
+}
+
+/* Write 'size' bytes from 'ptr' to 'fd', waiting for the descriptor to become
+ * writable. Returns 'size' on success, -1 on write error or when the timeout
+ * (in seconds) expires: errno is set to ETIMEDOUT in the latter case. */
+int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nwritten, ret = size;
+    time_t start = time(NULL);
+
+    timeout++;
+    while(size) {
+        if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
+            nwritten = write(fd,ptr,size);
+            if (nwritten == -1) return -1;
+            ptr += nwritten;
+            size -= nwritten;
+        }
+        if ((time(NULL)-start) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return ret;
+}
+
+/* Read exactly 'size' bytes from 'fd' into 'ptr'. Returns the number of bytes
+ * read on success, -1 on read error, when the peer closes the connection
+ * before 'size' bytes arrived (errno set to ECONNRESET), or when the timeout
+ * (in seconds) expires (errno set to ETIMEDOUT). */
+int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread, totread = 0;
+    time_t start = time(NULL);
+
+    timeout++;
+    while(size) {
+        if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
+            nread = read(fd,ptr,size);
+            if (nread == -1) return -1;
+            if (nread == 0) {
+                /* EOF: without this check we would keep reading 0 bytes
+                 * from a readable socket until the timeout expires. */
+                errno = ECONNRESET;
+                return -1;
+            }
+            ptr += nread;
+            size -= nread;
+            totread += nread;
+        }
+        if ((time(NULL)-start) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return totread;
+}
+
+/* Read a line terminated by '\n' from 'fd' into 'ptr', storing at most
+ * 'size'-1 bytes plus the nul terminator. A "\r" just before the newline is
+ * replaced by the terminator. Returns the number of bytes read before the
+ * '\n', or -1 on error. */
+int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread = 0;
+
+    size--;
+    while(size) {
+        char c;
+
+        if (syncRead(fd,&c,1,timeout) == -1) return -1;
+        if (c == '\n') {
+            *ptr = '\0';
+            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
+            return nread;
+        } else {
+            *ptr++ = c;
+            *ptr = '\0';
+            nread++;
+            size--; /* one byte less available: never write past the buffer */
+        }
+    }
+    return nread;
+}
+
+/* SYNC: register the client as a slave. The client is attached to a BGSAVE
+ * already in progress when another slave is accumulating the differences,
+ * otherwise it waits for the next BGSAVE or a new one is started. */
+void syncCommand(redisClient *c) {
+    /* ignore SYNC if already slave or in monitor mode */
+    if (c->flags & REDIS_SLAVE) return;
+
+    /* SYNC can't be issued when the server has pending data to send to
+     * the client about already issued commands. We need a fresh reply
+     * buffer registering the differences between the BGSAVE and the current
+     * dataset, so that we can copy to other slaves if needed. */
+    if (listLength(c->reply) != 0) {
+        addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
+        return;
+    }
+
+    redisLog(REDIS_NOTICE,"Slave ask for synchronization");
+    /* Here we need to check if there is a background saving operation
+     * in progress, or if it is required to start one */
+    if (server.bgsavechildpid != -1) {
+        /* Ok a background save is in progress. Let's check if it is a good
+         * one for replication, i.e. if there is another slave that is
+         * registering differences since the server forked to save */
+        redisClient *slave;
+        listNode *ln;
+        listIter li;
+
+        listRewind(server.slaves,&li);
+        while((ln = listNext(&li))) {
+            slave = ln->value;
+            if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
+        }
+        if (ln) {
+            /* Perfect, the server is already registering differences for
+             * another slave. Set the right state, and copy the buffer. */
+            listRelease(c->reply);
+            c->reply = listDup(slave->reply);
+            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
+        } else {
+            /* No way, we need to wait for the next BGSAVE in order to
+             * register differences */
+            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
+            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
+        }
+    } else {
+        /* Ok we don't have a BGSAVE in progress, let's start one */
+        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
+        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
+            addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
+            return;
+        }
+        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+    }
+    c->repldbfd = -1;
+    c->flags |= REDIS_SLAVE;
+    c->slaveseldb = 0;
+    listAddNodeTail(server.slaves,c);
+    return;
+}
+
+/* Writable event handler sending the DB file to the slave passed as
+ * 'privdata', one chunk of at most REDIS_IOBUF_LEN bytes per call. When the
+ * whole file was sent the slave is put in REDIS_REPL_ONLINE state. */
+void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
+    redisClient *slave = privdata;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(mask);
+    char buf[REDIS_IOBUF_LEN];
+    ssize_t nwritten, buflen;
+
+    if (slave->repldboff == 0) {
+        /* Write the bulk write count before to transfer the DB. In theory here
+         * we don't know how much room there is in the output buffer of the
+         * socket, but in practice SO_SNDLOWAT (the minimum count for output
+         * operations) will never be smaller than the few bytes we need. */
+        sds bulkcount;
+
+        bulkcount = sdscatprintf(sdsempty(),"$%llu\r\n",(unsigned long long)
+            slave->repldbsize);
+        if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
+        {
+            sdsfree(bulkcount);
+            freeClient(slave);
+            return;
+        }
+        sdsfree(bulkcount);
+    }
+    lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
+    buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
+    if (buflen <= 0) {
+        redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
+            (buflen == 0) ? "premature EOF" : strerror(errno));
+        freeClient(slave);
+        return;
+    }
+    if ((nwritten = write(fd,buf,buflen)) == -1) {
+        redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
+            strerror(errno));
+        freeClient(slave);
+        return;
+    }
+    slave->repldboff += nwritten;
+    if (slave->repldboff == slave->repldbsize) {
+        close(slave->repldbfd);
+        slave->repldbfd = -1;
+        aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+        slave->replstate = REDIS_REPL_ONLINE;
+        if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
+            sendReplyToClient, slave) == AE_ERR) {
+            freeClient(slave);
+            return;
+        }
+        addReplySds(slave,sdsempty());
+        redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
+    }
+}
+
+/* This function is called at the end of every background saving.
+ * The argument bgsaveerr is REDIS_OK if the background saving succeeded
+ * otherwise REDIS_ERR is passed to the function.
+ *
+ * The goal of this function is to handle slaves waiting for a successful
+ * background saving in order to perform non-blocking synchronization. */
+void updateSlavesWaitingBgsave(int bgsaveerr) {
+    listNode *ln;
+    int startbgsave = 0;
+    listIter li;
+
+    listRewind(server.slaves,&li);
+    while((ln = listNext(&li))) {
+        redisClient *slave = ln->value;
+
+        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
+            startbgsave = 1;
+            slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+        } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
+            struct redis_stat buf;
+
+            if (bgsaveerr != REDIS_OK) {
+                freeClient(slave);
+                redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
+                continue;
+            }
+            if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
+                redis_fstat(slave->repldbfd,&buf) == -1) {
+                /* Log first: close() and freeClient() may overwrite errno. */
+                redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
+                /* fstat() may have failed after a successful open(). */
+                if (slave->repldbfd != -1) {
+                    close(slave->repldbfd);
+                    slave->repldbfd = -1;
+                }
+                freeClient(slave);
+                continue;
+            }
+            slave->repldboff = 0;
+            slave->repldbsize = buf.st_size;
+            slave->replstate = REDIS_REPL_SEND_BULK;
+            aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+            if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
+                freeClient(slave);
+                continue;
+            }
+        }
+    }
+    if (startbgsave) {
+        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+            listIter li;
+
+            listRewind(server.slaves,&li);
+            redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
+            while((ln = listNext(&li))) {
+                redisClient *slave = ln->value;
+
+                if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
+                    freeClient(slave);
+            }
+        }
+    }
+}
+
+/* Synchronously connect to the master, authenticate if needed, issue SYNC,
+ * receive the DB dump into a temp file, rename it to server.dbfilename and
+ * load it. Returns REDIS_OK on success, REDIS_ERR on failure: in that case
+ * the socket is closed and a partially written temp file is removed. */
+int syncWithMaster(void) {
+    char buf[1024], tmpfile[256], authcmd[1024];
+    long dumpsize;
+    int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
+    int dfd = -1, maxtries = 5;
+
+    if (fd == -1) {
+        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+
+    /* AUTH with the master if required. */
+    if(server.masterauth) {
+        int authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",
+            server.masterauth);
+
+        /* Never send more bytes than what snprintf() stored in authcmd. */
+        if (authlen < 0 || authlen >= (int)sizeof(authcmd)) {
+            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: masterauth is too long");
+            goto error;
+        }
+        if (syncWrite(fd,authcmd,authlen,5) == -1) {
+            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
+                strerror(errno));
+            goto error;
+        }
+        /* Read the AUTH result.  */
+        if (syncReadLine(fd,buf,1024,3600) == -1) {
+            redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
+                strerror(errno));
+            goto error;
+        }
+        if (buf[0] != '+') {
+            redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
+            goto error;
+        }
+    }
+
+    /* Issue the SYNC command */
+    if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
+        redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
+            strerror(errno));
+        goto error;
+    }
+    /* Read the bulk write count */
+    if (syncReadLine(fd,buf,1024,3600) == -1) {
+        redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
+            strerror(errno));
+        goto error;
+    }
+    if (buf[0] != '$') {
+        redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
+        goto error;
+    }
+    dumpsize = strtol(buf+1,NULL,10);
+    if (dumpsize < 0) {
+        /* A negative count would be used as read() length below. */
+        redisLog(REDIS_WARNING,"Bad protocol from MASTER, invalid bulk count %ld",dumpsize);
+        goto error;
+    }
+    redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
+    /* Read the bulk write data on a temp file */
+    while(maxtries--) {
+        snprintf(tmpfile,256,
+            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
+        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
+        if (dfd != -1) break;
+        sleep(1);
+    }
+    if (dfd == -1) {
+        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
+        goto error;
+    }
+    while(dumpsize) {
+        int nread, nwritten, done = 0;
+
+        nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
+        if (nread <= 0) {
+            /* read() returning 0 means the master closed the connection:
+             * without this check the loop would never terminate. */
+            redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
+                (nread == -1) ? strerror(errno) : "connection lost");
+            goto error;
+        }
+        /* write() may store less than requested: loop until done. */
+        while(done < nread) {
+            nwritten = write(dfd,buf+done,nread-done);
+            if (nwritten == -1) {
+                redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
+                goto error;
+            }
+            done += nwritten;
+        }
+        dumpsize -= nread;
+    }
+    close(dfd);
+    dfd = -1;
+    if (rename(tmpfile,server.dbfilename) == -1) {
+        redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
+        unlink(tmpfile);
+        close(fd);
+        return REDIS_ERR;
+    }
+    emptyDb();
+    if (rdbLoad(server.dbfilename) != REDIS_OK) {
+        redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
+        close(fd);
+        return REDIS_ERR;
+    }
+    server.master = createClient(fd);
+    server.master->flags |= REDIS_MASTER;
+    server.master->authenticated = 1;
+    server.replstate = REDIS_REPL_CONNECTED;
+    return REDIS_OK;
+
+error:
+    /* Release the socket and, if it was created, the partial temp file. */
+    close(fd);
+    if (dfd != -1) {
+        close(dfd);
+        unlink(tmpfile);
+    }
+    return REDIS_ERR;
+}
+
+/* SLAVEOF <host> <port>: make this server a slave of the given master.
+ * SLAVEOF NO ONE: stop replicating and switch back to master mode. */
+void slaveofCommand(redisClient *c) {
+    if (!strcasecmp(c->argv[1]->ptr,"no") &&
+        !strcasecmp(c->argv[2]->ptr,"one")) {
+        if (server.masterhost) {
+            sdsfree(server.masterhost);
+            server.masterhost = NULL;
+            if (server.master) freeClient(server.master);
+            server.replstate = REDIS_REPL_NONE;
+            redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
+        }
+    } else {
+        sdsfree(server.masterhost);
+        server.masterhost = sdsdup(c->argv[1]->ptr);
+        server.masterport = atoi(c->argv[2]->ptr);
+        if (server.master) freeClient(server.master);
+        server.replstate = REDIS_REPL_CONNECT;
+        redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
+            server.masterhost, server.masterport);
+    }
+    addReply(c,shared.ok);
+}
diff --git a/src/sds.c b/src/sds.c
new file mode 100644
index 000000000..5e67f0443
--- /dev/null
+++ b/src/sds.c
@@ -0,0 +1,384 @@
+/* SDSLib, A C dynamic strings library
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define SDS_ABORT_ON_OOM + +#include "sds.h" +#include +#include +#include +#include +#include +#include "zmalloc.h" + +static void sdsOomAbort(void) { + fprintf(stderr,"SDS: Out Of Memory (SDS_ABORT_ON_OOM defined)\n"); + abort(); +} + +sds sdsnewlen(const void *init, size_t initlen) { + struct sdshdr *sh; + + sh = zmalloc(sizeof(struct sdshdr)+initlen+1); +#ifdef SDS_ABORT_ON_OOM + if (sh == NULL) sdsOomAbort(); +#else + if (sh == NULL) return NULL; +#endif + sh->len = initlen; + sh->free = 0; + if (initlen) { + if (init) memcpy(sh->buf, init, initlen); + else memset(sh->buf,0,initlen); + } + sh->buf[initlen] = '\0'; + return (char*)sh->buf; +} + +sds sdsempty(void) { + return sdsnewlen("",0); +} + +sds sdsnew(const char *init) { + size_t initlen = (init == NULL) ? 
0 : strlen(init); + return sdsnewlen(init, initlen); +} + +size_t sdslen(const sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + return sh->len; +} + +sds sdsdup(const sds s) { + return sdsnewlen(s, sdslen(s)); +} + +void sdsfree(sds s) { + if (s == NULL) return; + zfree(s-sizeof(struct sdshdr)); +} + +size_t sdsavail(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + return sh->free; +} + +void sdsupdatelen(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + int reallen = strlen(s); + sh->free += (sh->len-reallen); + sh->len = reallen; +} + +static sds sdsMakeRoomFor(sds s, size_t addlen) { + struct sdshdr *sh, *newsh; + size_t free = sdsavail(s); + size_t len, newlen; + + if (free >= addlen) return s; + len = sdslen(s); + sh = (void*) (s-(sizeof(struct sdshdr))); + newlen = (len+addlen)*2; + newsh = zrealloc(sh, sizeof(struct sdshdr)+newlen+1); +#ifdef SDS_ABORT_ON_OOM + if (newsh == NULL) sdsOomAbort(); +#else + if (newsh == NULL) return NULL; +#endif + + newsh->free = newlen - len; + return newsh->buf; +} + +sds sdscatlen(sds s, void *t, size_t len) { + struct sdshdr *sh; + size_t curlen = sdslen(s); + + s = sdsMakeRoomFor(s,len); + if (s == NULL) return NULL; + sh = (void*) (s-(sizeof(struct sdshdr))); + memcpy(s+curlen, t, len); + sh->len = curlen+len; + sh->free = sh->free-len; + s[curlen+len] = '\0'; + return s; +} + +sds sdscat(sds s, char *t) { + return sdscatlen(s, t, strlen(t)); +} + +sds sdscpylen(sds s, char *t, size_t len) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + size_t totlen = sh->free+sh->len; + + if (totlen < len) { + s = sdsMakeRoomFor(s,len-sh->len); + if (s == NULL) return NULL; + sh = (void*) (s-(sizeof(struct sdshdr))); + totlen = sh->free+sh->len; + } + memcpy(s, t, len); + s[len] = '\0'; + sh->len = len; + sh->free = totlen-len; + return s; +} + +sds sdscpy(sds s, char *t) { + return sdscpylen(s, t, strlen(t)); +} + +sds sdscatprintf(sds s, const char *fmt, ...) 
{ + va_list ap; + char *buf, *t; + size_t buflen = 16; + + while(1) { + buf = zmalloc(buflen); +#ifdef SDS_ABORT_ON_OOM + if (buf == NULL) sdsOomAbort(); +#else + if (buf == NULL) return NULL; +#endif + buf[buflen-2] = '\0'; + va_start(ap, fmt); + vsnprintf(buf, buflen, fmt, ap); + va_end(ap); + if (buf[buflen-2] != '\0') { + zfree(buf); + buflen *= 2; + continue; + } + break; + } + t = sdscat(s, buf); + zfree(buf); + return t; +} + +sds sdstrim(sds s, const char *cset) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + char *start, *end, *sp, *ep; + size_t len; + + sp = start = s; + ep = end = s+sdslen(s)-1; + while(sp <= end && strchr(cset, *sp)) sp++; + while(ep > start && strchr(cset, *ep)) ep--; + len = (sp > ep) ? 0 : ((ep-sp)+1); + if (sh->buf != sp) memmove(sh->buf, sp, len); + sh->buf[len] = '\0'; + sh->free = sh->free+(sh->len-len); + sh->len = len; + return s; +} + +sds sdsrange(sds s, int start, int end) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + size_t newlen, len = sdslen(s); + + if (len == 0) return s; + if (start < 0) { + start = len+start; + if (start < 0) start = 0; + } + if (end < 0) { + end = len+end; + if (end < 0) end = 0; + } + newlen = (start > end) ? 0 : (end-start)+1; + if (newlen != 0) { + if (start >= (signed)len) start = len-1; + if (end >= (signed)len) end = len-1; + newlen = (start > end) ? 0 : (end-start)+1; + } else { + start = 0; + } + if (start != 0) memmove(sh->buf, sh->buf+start, newlen); + sh->buf[newlen] = 0; + sh->free = sh->free+(sh->len-newlen); + sh->len = newlen; + return s; +} + +void sdstolower(sds s) { + int len = sdslen(s), j; + + for (j = 0; j < len; j++) s[j] = tolower(s[j]); +} + +void sdstoupper(sds s) { + int len = sdslen(s), j; + + for (j = 0; j < len; j++) s[j] = toupper(s[j]); +} + +int sdscmp(sds s1, sds s2) { + size_t l1, l2, minlen; + int cmp; + + l1 = sdslen(s1); + l2 = sdslen(s2); + minlen = (l1 < l2) ? 
l1 : l2; + cmp = memcmp(s1,s2,minlen); + if (cmp == 0) return l1-l2; + return cmp; +} + +/* Split 's' with separator in 'sep'. An array + * of sds strings is returned. *count will be set + * by reference to the number of tokens returned. + * + * On out of memory, zero length string, zero length + * separator, NULL is returned. + * + * Note that 'sep' is able to split a string using + * a multi-character separator. For example + * sdssplit("foo_-_bar","_-_"); will return two + * elements "foo" and "bar". + * + * This version of the function is binary-safe but + * requires length arguments. sdssplit() is just the + * same function but for zero-terminated strings. + */ +sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { + int elements = 0, slots = 5, start = 0, j; + + sds *tokens = zmalloc(sizeof(sds)*slots); +#ifdef SDS_ABORT_ON_OOM + if (tokens == NULL) sdsOomAbort(); +#endif + if (seplen < 1 || len < 0 || tokens == NULL) return NULL; + if (len == 0) { + *count = 0; + return tokens; + } + for (j = 0; j < (len-(seplen-1)); j++) { + /* make sure there is room for the next element and the final one */ + if (slots < elements+2) { + sds *newtokens; + + slots *= 2; + newtokens = zrealloc(tokens,sizeof(sds)*slots); + if (newtokens == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + tokens = newtokens; + } + /* search the separator */ + if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) { + tokens[elements] = sdsnewlen(s+start,j-start); + if (tokens[elements] == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + elements++; + start = j+seplen; + j = j+seplen-1; /* skip the separator */ + } + } + /* Add the final element. We are sure there is room in the tokens array. 
*/ + tokens[elements] = sdsnewlen(s+start,len-start); + if (tokens[elements] == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + elements++; + *count = elements; + return tokens; + +#ifndef SDS_ABORT_ON_OOM +cleanup: + { + int i; + for (i = 0; i < elements; i++) sdsfree(tokens[i]); + zfree(tokens); + return NULL; + } +#endif +} + +void sdsfreesplitres(sds *tokens, int count) { + if (!tokens) return; + while(count--) + sdsfree(tokens[count]); + zfree(tokens); +} + +sds sdsfromlonglong(long long value) { + char buf[32], *p; + unsigned long long v; + + v = (value < 0) ? -value : value; + p = buf+31; /* point to the last character */ + do { + *p-- = '0'+(v%10); + v /= 10; + } while(v); + if (value < 0) *p-- = '-'; + p++; + return sdsnewlen(p,32-(p-buf)); +} + +sds sdscatrepr(sds s, char *p, size_t len) { + s = sdscatlen(s,"\"",1); + while(len--) { + switch(*p) { + case '\\': + case '"': + s = sdscatprintf(s,"\\%c",*p); + break; + case '\n': s = sdscatlen(s,"\\n",1); break; + case '\r': s = sdscatlen(s,"\\r",1); break; + case '\t': s = sdscatlen(s,"\\t",1); break; + case '\a': s = sdscatlen(s,"\\a",1); break; + case '\b': s = sdscatlen(s,"\\b",1); break; + default: + if (isprint(*p)) + s = sdscatprintf(s,"%c",*p); + else + s = sdscatprintf(s,"\\x%02x",(unsigned char)*p); + break; + } + p++; + } + return sdscatlen(s,"\"",1); +} diff --git a/src/sds.h b/src/sds.h new file mode 100644 index 000000000..ef3a418f2 --- /dev/null +++ b/src/sds.h @@ -0,0 +1,74 @@ +/* SDSLib, A C dynamic strings library + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __SDS_H +#define __SDS_H + +#include + +typedef char *sds; + +struct sdshdr { + int len; + int free; + char buf[]; +}; + +sds sdsnewlen(const void *init, size_t initlen); +sds sdsnew(const char *init); +sds sdsempty(); +size_t sdslen(const sds s); +sds sdsdup(const sds s); +void sdsfree(sds s); +size_t sdsavail(sds s); +sds sdscatlen(sds s, void *t, size_t len); +sds sdscat(sds s, char *t); +sds sdscpylen(sds s, char *t, size_t len); +sds sdscpy(sds s, char *t); + +#ifdef __GNUC__ +sds sdscatprintf(sds s, const char *fmt, ...) 
+ __attribute__((format(printf, 2, 3))); +#else +sds sdscatprintf(sds s, const char *fmt, ...); +#endif + +sds sdstrim(sds s, const char *cset); +sds sdsrange(sds s, int start, int end); +void sdsupdatelen(sds s); +int sdscmp(sds s1, sds s2); +sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count); +void sdsfreesplitres(sds *tokens, int count); +void sdstolower(sds s); +void sdstoupper(sds s); +sds sdsfromlonglong(long long value); +sds sdscatrepr(sds s, char *p, size_t len); + +#endif diff --git a/src/sha1.c b/src/sha1.c new file mode 100644 index 000000000..2c50433e8 --- /dev/null +++ b/src/sha1.c @@ -0,0 +1,276 @@ + +/* from valgrind tests */ + +/* ================ sha1.c ================ */ +/* +SHA-1 in C +By Steve Reid +100% Public Domain + +Test Vectors (from FIPS PUB 180-1) +"abc" + A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D +"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 +A million repetitions of "a" + 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F +*/ + +/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ +/* #define SHA1HANDSOFF * Copies data before messing with it. 
*/ + +#define SHA1HANDSOFF + +#include +#include +#include /* for u_int*_t */ +#if defined(__sun) +#include "solarisfixes.h" +#endif +#include "sha1.h" + +#ifndef BYTE_ORDER +#if (BSD >= 199103) +# include +#else +#if defined(linux) || defined(__linux__) +# include +#else +#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */ +#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ +#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/ + +#if defined(vax) || defined(ns32000) || defined(sun386) || defined(__i386__) || \ + defined(MIPSEL) || defined(_MIPSEL) || defined(BIT_ZERO_ON_RIGHT) || \ + defined(__alpha__) || defined(__alpha) +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \ + defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \ + defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\ + defined(apollo) || defined(__convex__) || defined(_CRAY) || \ + defined(__hppa) || defined(__hp9000) || \ + defined(__hp9000s300) || defined(__hp9000s700) || \ + defined (BIT_ZERO_ON_LEFT) || defined(m68k) || defined(__sparc) +#define BYTE_ORDER BIG_ENDIAN +#endif +#endif /* linux */ +#endif /* BSD */ +#endif /* BYTE_ORDER */ + +#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER) +#if (__BYTE_ORDER == __LITTLE_ENDIAN) +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN +#endif +#endif + +#if !defined(BYTE_ORDER) || \ + (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \ + BYTE_ORDER != PDP_ENDIAN) + /* you must determine what the correct bit order is for + * your compiler - the next line is an intentional error + * which will force your compiles to bomb until you fix + * the above macros. + */ +#error "Undefined or invalid BYTE_ORDER" +#endif + +#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) + +/* blk0() and blk() perform the initial expand. 
*/ +/* I got the idea of expanding during the round function from SSLeay */ +#if BYTE_ORDER == LITTLE_ENDIAN +#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ + |(rol(block->l[i],8)&0x00FF00FF)) +#elif BYTE_ORDER == BIG_ENDIAN +#define blk0(i) block->l[i] +#else +#error "Endianness not defined!" +#endif +#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ + ^block->l[(i+2)&15]^block->l[i&15],1)) + +/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ +#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); +#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); +#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); + + +/* Hash a single 512-bit block. This is the core of the algorithm. */ + +void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]) +{ +u_int32_t a, b, c, d, e; +typedef union { + unsigned char c[64]; + u_int32_t l[16]; +} CHAR64LONG16; +#ifdef SHA1HANDSOFF +CHAR64LONG16 block[1]; /* use array to appear as a pointer */ + memcpy(block, buffer, 64); +#else + /* The following had better never be used because it causes the + * pointer-to-const buffer to be cast into a pointer to non-const. + * And the result is written through. I threw a "const" in, hoping + * this will cause a diagnostic. + */ +CHAR64LONG16* block = (const CHAR64LONG16*)buffer; +#endif + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + /* 4 rounds of 20 operations each. Loop unrolled. 
*/ + R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); + R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); + R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); + R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); + R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); + R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); + R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); + R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); + R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); + R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); + R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); + R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); + R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); + R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); + R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); + R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); + R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); + R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); + R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); + R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + /* Wipe variables */ + a = b = c = d = e = 0; +#ifdef SHA1HANDSOFF + memset(block, '\0', sizeof(block)); +#endif +} + + +/* SHA1Init - Initialize new context */ + +void SHA1Init(SHA1_CTX* context) +{ + /* SHA1 initialization constants */ + context->state[0] = 0x67452301; + context->state[1] = 0xEFCDAB89; + context->state[2] = 0x98BADCFE; + context->state[3] = 0x10325476; 
+ context->state[4] = 0xC3D2E1F0; + context->count[0] = context->count[1] = 0; +} + + +/* Run your data through this. */ + +void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len) +{ +u_int32_t i; +u_int32_t j; + + j = context->count[0]; + if ((context->count[0] += len << 3) < j) + context->count[1]++; + context->count[1] += (len>>29); + j = (j >> 3) & 63; + if ((j + len) > 63) { + memcpy(&context->buffer[j], data, (i = 64-j)); + SHA1Transform(context->state, context->buffer); + for ( ; i + 63 < len; i += 64) { + SHA1Transform(context->state, &data[i]); + } + j = 0; + } + else i = 0; + memcpy(&context->buffer[j], &data[i], len - i); +} + + +/* Add padding and return the message digest. */ + +void SHA1Final(unsigned char digest[20], SHA1_CTX* context) +{ +unsigned i; +unsigned char finalcount[8]; +unsigned char c; + +#if 0 /* untested "improvement" by DHR */ + /* Convert context->count to a sequence of bytes + * in finalcount. Second element first, but + * big-endian order within element. + * But we do it all backwards. + */ + unsigned char *fcp = &finalcount[8]; + + for (i = 0; i < 2; i++) + { + u_int32_t t = context->count[i]; + int j; + + for (j = 0; j < 4; t >>= 8, j++) + *--fcp = (unsigned char) t + } +#else + for (i = 0; i < 8; i++) { + finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] + >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ + } +#endif + c = 0200; + SHA1Update(context, &c, 1); + while ((context->count[0] & 504) != 448) { + c = 0000; + SHA1Update(context, &c, 1); + } + SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ + for (i = 0; i < 20; i++) { + digest[i] = (unsigned char) + ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); + } + /* Wipe variables */ + memset(context, '\0', sizeof(*context)); + memset(&finalcount, '\0', sizeof(finalcount)); +} +/* ================ end of sha1.c ================ */ + +#if 0 +#define BUFSIZE 4096 + +int +main(int argc, char **argv) +{ + SHA1_CTX ctx; + unsigned char hash[20], buf[BUFSIZE]; + int i; + + for(i=0;i +100% Public Domain +*/ + +typedef struct { + u_int32_t state[5]; + u_int32_t count[2]; + unsigned char buffer[64]; +} SHA1_CTX; + +void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]); +void SHA1Init(SHA1_CTX* context); +void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len); +void SHA1Final(unsigned char digest[20], SHA1_CTX* context); diff --git a/src/solarisfixes.h b/src/solarisfixes.h new file mode 100644 index 000000000..ce8e7b6fd --- /dev/null +++ b/src/solarisfixes.h @@ -0,0 +1,21 @@ +/* Solaris specific fixes */ + +#if defined(__GNUC__) +#undef isnan +#define isnan(x) \ + __extension__({ __typeof (x) __x_a = (x); \ + __builtin_expect(__x_a != __x_a, 0); }) + +#undef isfinite +#define isfinite(x) \ + __extension__ ({ __typeof (x) __x_f = (x); \ + __builtin_expect(!isnan(__x_f - __x_f), 1); }) + +#undef isinf +#define isinf(x) \ + __extension__ ({ __typeof (x) __x_i = (x); \ + __builtin_expect(!isnan(__x_i) && !isfinite(__x_i), 0); }) + +#define u_int uint +#define u_int32_t uint32_t +#endif /* __GNUC__ */ diff --git a/src/sort.c b/src/sort.c new file mode 100644 index 000000000..0bc86b474 --- /dev/null +++ b/src/sort.c @@ -0,0 +1,383 @@ +#include "redis.h" +#include "pqsort.h" /* Partial qsort for 
SORT+LIMIT */ + +redisSortOperation *createSortOperation(int type, robj *pattern) { + redisSortOperation *so = zmalloc(sizeof(*so)); + so->type = type; + so->pattern = pattern; + return so; +} + +/* Return the value associated to the key with a name obtained + * substituting the first occurence of '*' in 'pattern' with 'subst'. + * The returned object will always have its refcount increased by 1 + * when it is non-NULL. */ +robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) { + char *p, *f; + sds spat, ssub; + robj keyobj, fieldobj, *o; + int prefixlen, sublen, postfixlen, fieldlen; + /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */ + struct { + int len; + int free; + char buf[REDIS_SORTKEY_MAX+1]; + } keyname, fieldname; + + /* If the pattern is "#" return the substitution object itself in order + * to implement the "SORT ... GET #" feature. */ + spat = pattern->ptr; + if (spat[0] == '#' && spat[1] == '\0') { + incrRefCount(subst); + return subst; + } + + /* The substitution object may be specially encoded. If so we create + * a decoded object on the fly. Otherwise getDecodedObject will just + * increment the ref count, that we'll decrement later. */ + subst = getDecodedObject(subst); + + ssub = subst->ptr; + if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL; + p = strchr(spat,'*'); + if (!p) { + decrRefCount(subst); + return NULL; + } + + /* Find out if we're dealing with a hash dereference. 
*/ + if ((f = strstr(p+1, "->")) != NULL) { + fieldlen = sdslen(spat)-(f-spat); + /* this also copies \0 character */ + memcpy(fieldname.buf,f+2,fieldlen-1); + fieldname.len = fieldlen-2; + } else { + fieldlen = 0; + } + + prefixlen = p-spat; + sublen = sdslen(ssub); + postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen; + memcpy(keyname.buf,spat,prefixlen); + memcpy(keyname.buf+prefixlen,ssub,sublen); + memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen); + keyname.buf[prefixlen+sublen+postfixlen] = '\0'; + keyname.len = prefixlen+sublen+postfixlen; + decrRefCount(subst); + + /* Lookup substituted key */ + initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(struct sdshdr))); + o = lookupKeyRead(db,&keyobj); + if (o == NULL) return NULL; + + if (fieldlen > 0) { + if (o->type != REDIS_HASH || fieldname.len < 1) return NULL; + + /* Retrieve value from hash by the field name. This operation + * already increases the refcount of the returned object. */ + initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(struct sdshdr))); + o = hashTypeGet(o, &fieldobj); + } else { + if (o->type != REDIS_STRING) return NULL; + + /* Every object that this function returns needs to have its refcount + * increased. sortCommand decreases it again. */ + incrRefCount(o); + } + + return o; +} + +/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with + * the additional parameter is not standard but a BSD-specific we have to + * pass sorting parameters via the global 'server' structure */ +int sortCompare(const void *s1, const void *s2) { + const redisSortObject *so1 = s1, *so2 = s2; + int cmp; + + if (!server.sort_alpha) { + /* Numeric sorting. 
Here it's trivial as we precomputed scores */ + if (so1->u.score > so2->u.score) { + cmp = 1; + } else if (so1->u.score < so2->u.score) { + cmp = -1; + } else { + cmp = 0; + } + } else { + /* Alphanumeric sorting */ + if (server.sort_bypattern) { + if (!so1->u.cmpobj || !so2->u.cmpobj) { + /* At least one compare object is NULL */ + if (so1->u.cmpobj == so2->u.cmpobj) + cmp = 0; + else if (so1->u.cmpobj == NULL) + cmp = -1; + else + cmp = 1; + } else { + /* We have both the objects, use strcoll */ + cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr); + } + } else { + /* Compare elements directly. */ + cmp = compareStringObjects(so1->obj,so2->obj); + } + } + return server.sort_desc ? -cmp : cmp; +} + +/* The SORT command is the most complex command in Redis. Warning: this code + * is optimized for speed and a bit less for readability */ +void sortCommand(redisClient *c) { + list *operations; + unsigned int outputlen = 0; + int desc = 0, alpha = 0; + int limit_start = 0, limit_count = -1, start, end; + int j, dontsort = 0, vectorlen; + int getop = 0; /* GET operation counter */ + robj *sortval, *sortby = NULL, *storekey = NULL; + redisSortObject *vector; /* Resulting vector to sort */ + + /* Lookup the key to sort. It must be of the right types */ + sortval = lookupKeyRead(c->db,c->argv[1]); + if (sortval == NULL) { + addReply(c,shared.emptymultibulk); + return; + } + if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST && + sortval->type != REDIS_ZSET) + { + addReply(c,shared.wrongtypeerr); + return; + } + + /* Create a list of operations to perform for every sorted element. 
+ * Operations can be GET/DEL/INCR/DECR */ + operations = listCreate(); + listSetFreeMethod(operations,zfree); + j = 2; + + /* Now we need to protect sortval incrementing its count, in the future + * SORT may have options able to overwrite/delete keys during the sorting + * and the sorted key itself may get destroied */ + incrRefCount(sortval); + + /* The SORT command has an SQL-alike syntax, parse it */ + while(j < c->argc) { + int leftargs = c->argc-j-1; + if (!strcasecmp(c->argv[j]->ptr,"asc")) { + desc = 0; + } else if (!strcasecmp(c->argv[j]->ptr,"desc")) { + desc = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) { + alpha = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) { + limit_start = atoi(c->argv[j+1]->ptr); + limit_count = atoi(c->argv[j+2]->ptr); + j+=2; + } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) { + storekey = c->argv[j+1]; + j++; + } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) { + sortby = c->argv[j+1]; + /* If the BY pattern does not contain '*', i.e. it is constant, + * we don't need to sort nor to lookup the weight keys. 
*/ + if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1; + j++; + } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) { + listAddNodeTail(operations,createSortOperation( + REDIS_SORT_GET,c->argv[j+1])); + getop++; + j++; + } else { + decrRefCount(sortval); + listRelease(operations); + addReply(c,shared.syntaxerr); + return; + } + j++; + } + + /* Load the sorting vector with all the objects to sort */ + switch(sortval->type) { + case REDIS_LIST: vectorlen = listTypeLength(sortval); break; + case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break; + case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break; + default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */ + } + vector = zmalloc(sizeof(redisSortObject)*vectorlen); + j = 0; + + if (sortval->type == REDIS_LIST) { + listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL); + listTypeEntry entry; + while(listTypeNext(li,&entry)) { + vector[j].obj = listTypeGet(&entry); + vector[j].u.score = 0; + vector[j].u.cmpobj = NULL; + j++; + } + listTypeReleaseIterator(li); + } else { + dict *set; + dictIterator *di; + dictEntry *setele; + + if (sortval->type == REDIS_SET) { + set = sortval->ptr; + } else { + zset *zs = sortval->ptr; + set = zs->dict; + } + + di = dictGetIterator(set); + while((setele = dictNext(di)) != NULL) { + vector[j].obj = dictGetEntryKey(setele); + vector[j].u.score = 0; + vector[j].u.cmpobj = NULL; + j++; + } + dictReleaseIterator(di); + } + redisAssert(j == vectorlen); + + /* Now it's time to load the right scores in the sorting vector */ + if (dontsort == 0) { + for (j = 0; j < vectorlen; j++) { + robj *byval; + if (sortby) { + /* lookup value to sort by */ + byval = lookupKeyByPattern(c->db,sortby,vector[j].obj); + if (!byval) continue; + } else { + /* use object itself to sort by */ + byval = vector[j].obj; + } + + if (alpha) { + if (sortby) vector[j].u.cmpobj = getDecodedObject(byval); + } else { + if (byval->encoding == 
REDIS_ENCODING_RAW) { + vector[j].u.score = strtod(byval->ptr,NULL); + } else if (byval->encoding == REDIS_ENCODING_INT) { + /* Don't need to decode the object if it's + * integer-encoded (the only encoding supported) so + * far. We can just cast it */ + vector[j].u.score = (long)byval->ptr; + } else { + redisAssert(1 != 1); + } + } + + /* when the object was retrieved using lookupKeyByPattern, + * its refcount needs to be decreased. */ + if (sortby) { + decrRefCount(byval); + } + } + } + + /* We are ready to sort the vector... perform a bit of sanity check + * on the LIMIT option too. We'll use a partial version of quicksort. */ + start = (limit_start < 0) ? 0 : limit_start; + end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1; + if (start >= vectorlen) { + start = vectorlen-1; + end = vectorlen-2; + } + if (end >= vectorlen) end = vectorlen-1; + + if (dontsort == 0) { + server.sort_desc = desc; + server.sort_alpha = alpha; + server.sort_bypattern = sortby ? 1 : 0; + if (sortby && (start != 0 || end != vectorlen-1)) + pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end); + else + qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare); + } + + /* Send command output to the output buffer, performing the specified + * GET/DEL/INCR/DECR operations if any. */ + outputlen = getop ? 
getop*(end-start+1) : end-start+1; + if (storekey == NULL) { + /* STORE option not specified, sent the sorting result to client */ + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen)); + for (j = start; j <= end; j++) { + listNode *ln; + listIter li; + + if (!getop) addReplyBulk(c,vector[j].obj); + listRewind(operations,&li); + while((ln = listNext(&li))) { + redisSortOperation *sop = ln->value; + robj *val = lookupKeyByPattern(c->db,sop->pattern, + vector[j].obj); + + if (sop->type == REDIS_SORT_GET) { + if (!val) { + addReply(c,shared.nullbulk); + } else { + addReplyBulk(c,val); + decrRefCount(val); + } + } else { + redisAssert(sop->type == REDIS_SORT_GET); /* always fails */ + } + } + } + } else { + robj *sobj = createZiplistObject(); + + /* STORE option specified, set the sorting result as a List object */ + for (j = start; j <= end; j++) { + listNode *ln; + listIter li; + + if (!getop) { + listTypePush(sobj,vector[j].obj,REDIS_TAIL); + } else { + listRewind(operations,&li); + while((ln = listNext(&li))) { + redisSortOperation *sop = ln->value; + robj *val = lookupKeyByPattern(c->db,sop->pattern, + vector[j].obj); + + if (sop->type == REDIS_SORT_GET) { + if (!val) val = createStringObject("",0); + + /* listTypePush does an incrRefCount, so we should take care + * care of the incremented refcount caused by either + * lookupKeyByPattern or createStringObject("",0) */ + listTypePush(sobj,val,REDIS_TAIL); + decrRefCount(val); + } else { + /* always fails */ + redisAssert(sop->type == REDIS_SORT_GET); + } + } + } + } + dbReplace(c->db,storekey,sobj); + /* Note: we add 1 because the DB is dirty anyway since even if the + * SORT result is empty a new key is set and maybe the old content + * replaced. 
*/
+        server.dirty += 1+outputlen;
+        addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
+    }
+
+    /* Cleanup */
+    if (sortval->type == REDIS_LIST)
+        for (j = 0; j < vectorlen; j++)
+            decrRefCount(vector[j].obj);
+    decrRefCount(sortval);
+    listRelease(operations);
+    for (j = 0; j < vectorlen; j++) {
+        if (alpha && vector[j].u.cmpobj)
+            decrRefCount(vector[j].u.cmpobj);
+    }
+    zfree(vector);
+}
+
+
diff --git a/src/t_hash.c b/src/t_hash.c
new file mode 100644
index 000000000..3f5fd6e16
--- /dev/null
+++ b/src/t_hash.c
@@ -0,0 +1,397 @@
+#include "redis.h"
+
+#include
+
+/*-----------------------------------------------------------------------------
+ * Hash type API
+ *----------------------------------------------------------------------------*/
+
+/* Walk argv[start..end] (both bounds inclusive) and convert a zipmap encoded
+ * hash to the dict encoding as soon as one raw-encoded argument is longer
+ * than server.hash_max_zipmap_value. Arguments with any other encoding are
+ * not measured, and a subject that is not zipmap encoded is left alone. */
+void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
+    int idx = start;
+
+    if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
+
+    while (idx <= end) {
+        robj *arg = argv[idx++];
+
+        if (arg->encoding != REDIS_ENCODING_RAW) continue;
+        if (sdslen(arg->ptr) <= server.hash_max_zipmap_value) continue;
+
+        /* One oversized value is enough: convert and stop scanning. */
+        convertToRealHash(subject);
+        break;
+    }
+}
+
+/* When the hash is dict encoded, replace *o1 and *o2 with the result of
+ * tryObjectEncoding(). Either pointer may be NULL and is then skipped.
+ * Nothing happens for any other hash encoding. */
+void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
+    if (subject->encoding != REDIS_ENCODING_HT) return;
+
+    if (o1 != NULL) *o1 = tryObjectEncoding(*o1);
+    if (o2 != NULL) *o2 = tryObjectEncoding(*o2);
+}
+
+/* Get the value from a hash identified by key. Returns either a string
+ * object or NULL if the value cannot be found. The refcount of the object
+ * is always increased by 1 when the value was found.
*/
+robj *hashTypeGet(robj *o, robj *key) {
+    robj *found = NULL;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        /* Dict-backed hash: return the stored object itself, with one
+         * extra reference taken for the caller. */
+        dictEntry *entry = dictFind(o->ptr,key);
+
+        if (entry != NULL) {
+            found = dictGetEntryVal(entry);
+            incrRefCount(found);
+        }
+        return found;
+    } else {
+        /* Zipmap-backed hash: look up with the decoded key and wrap a hit
+         * in a newly created string object. */
+        unsigned char *bytes;
+        unsigned int byteslen;
+        robj *rawkey = getDecodedObject(key);
+
+        if (zipmapGet(o->ptr,rawkey->ptr,sdslen(rawkey->ptr),&bytes,&byteslen))
+            found = createStringObject((char*)bytes,byteslen);
+        decrRefCount(rawkey);
+        return found;
+    }
+}
+
+/* Tell whether key is a field of the given hash: 1 when it is present,
+ * 0 when it is not. */
+int hashTypeExists(robj *o, robj *key) {
+    int present;
+
+    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        robj *rawkey = getDecodedObject(key);
+
+        present = zipmapExists(o->ptr,rawkey->ptr,sdslen(rawkey->ptr)) ? 1 : 0;
+        decrRefCount(rawkey);
+    } else {
+        present = (dictFind(o->ptr,key) != NULL);
+    }
+    return present;
+}
+
+/* Store value under key, replacing any previous value for that key.
+ * Returns 0 when a new field was inserted and 1 when an existing field
+ * was updated. */
+int hashTypeSet(robj *o, robj *key, robj *value) {
+    int updated = 0;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        /* A true result from dictReplace() is treated as an insert: only
+         * then does the dict keep a new reference to the key. */
+        if (dictReplace(o->ptr,key,value))
+            incrRefCount(key);
+        else
+            updated = 1;
+        incrRefCount(value);
+        return updated;
+    } else {
+        robj *rawkey = getDecodedObject(key);
+        robj *rawval = getDecodedObject(value);
+
+        o->ptr = zipmapSet(o->ptr,
+            rawkey->ptr,sdslen(rawkey->ptr),
+            rawval->ptr,sdslen(rawval->ptr), &updated);
+        decrRefCount(rawkey);
+        decrRefCount(rawval);
+
+        /* Upgrade to a real hash table once the zipmap has too many entries */
+        if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
+            convertToRealHash(o);
+        return updated;
+    }
+}
+
+/* Delete an element from a hash.
+ * Return 1 on deleted and 0 on not found.
*/
+int hashTypeDelete(robj *o, robj *key) {
+    int removed = 0;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        removed = dictDelete((dict*)o->ptr,key) == DICT_OK;
+        /* After every successful delete, ask whether the dict needs a
+         * resize. */
+        if (removed && htNeedsResize(o->ptr)) dictResize(o->ptr);
+        return removed;
+    }
+
+    key = getDecodedObject(key);
+    o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &removed);
+    decrRefCount(key);
+    return removed;
+}
+
+/* Number of fields currently stored in the hash, for either encoding. */
+unsigned long hashTypeLength(robj *o) {
+    if (o->encoding == REDIS_ENCODING_ZIPMAP)
+        return zipmapLen((unsigned char*)o->ptr);
+    return dictSize((dict*)o->ptr);
+}
+
+/* Allocate an iterator over the hash. The encoding is recorded at creation
+ * time and selects which cursor (zipmap or dict) is initialized. Release it
+ * with hashTypeReleaseIterator(). */
+hashTypeIterator *hashTypeInitIterator(robj *subject) {
+    hashTypeIterator *iter = zmalloc(sizeof(*iter));
+
+    iter->encoding = subject->encoding;
+    switch (iter->encoding) {
+    case REDIS_ENCODING_ZIPMAP:
+        iter->zi = zipmapRewind(subject->ptr);
+        break;
+    case REDIS_ENCODING_HT:
+        iter->di = dictGetIterator(subject->ptr);
+        break;
+    default:
+        redisAssert(NULL);
+        break;
+    }
+    return iter;
+}
+
+/* Free an iterator; the dict cursor, when there is one, is released first. */
+void hashTypeReleaseIterator(hashTypeIterator *hi) {
+    if (hi->encoding == REDIS_ENCODING_HT)
+        dictReleaseIterator(hi->di);
+    zfree(hi);
+}
+
+/* Advance the iterator by one entry. Returns REDIS_OK when an entry is
+ * available and REDIS_ERR once the end of the hash has been reached. */
+int hashTypeNext(hashTypeIterator *hi) {
+    if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
+        hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
+            &hi->zv, &hi->zvlen);
+        return (hi->zi == NULL) ? REDIS_ERR : REDIS_OK;
+    }
+
+    hi->de = dictNext(hi->di);
+    return (hi->de == NULL) ? REDIS_ERR : REDIS_OK;
+}
+
+/* Get key or value object at current iteration position.
+ * This increases the refcount of the field object by 1.
*/
+robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
+    int wantkey = what & REDIS_HASH_KEY;
+    robj *cur;
+
+    if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
+        /* Zipmap entries are plain bytes: wrap them in a new object. */
+        if (wantkey)
+            return createStringObject((char*)hi->zk,hi->zklen);
+        return createStringObject((char*)hi->zv,hi->zvlen);
+    }
+
+    /* Dict entries already are objects: share them with an extra ref. */
+    if (wantkey)
+        cur = dictGetEntryKey(hi->de);
+    else
+        cur = dictGetEntryVal(hi->de);
+    incrRefCount(cur);
+    return cur;
+}
+
+/* Look up key for writing and return the hash stored there, creating and
+ * registering an empty hash when the key is missing. When the key holds a
+ * value of another type the wrong-type error is sent to the client and
+ * NULL is returned. */
+robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
+    robj *hash = lookupKeyWrite(c->db,key);
+
+    if (hash != NULL) {
+        if (hash->type == REDIS_HASH) return hash;
+        addReply(c,shared.wrongtypeerr);
+        return NULL;
+    }
+
+    hash = createHashObject();
+    dbAdd(c->db,key,hash);
+    return hash;
+}
+
+/* Rebuild a hash that is not yet dict encoded as a dict: every zipmap
+ * field/value pair becomes a pair of string objects (passed through
+ * tryObjectEncoding) added to a new dict, then the zipmap is freed. */
+void convertToRealHash(robj *o) {
+    unsigned char *zm = o->ptr, *cursor, *kbuf, *vbuf;
+    unsigned int kbuflen, vbuflen;
+    dict *ht = dictCreate(&hashDictType,NULL);
+
+    redisAssert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
+    cursor = zipmapRewind(zm);
+    for (;;) {
+        robj *field, *value;
+
+        cursor = zipmapNext(cursor,&kbuf,&kbuflen,&vbuf,&vbuflen);
+        if (cursor == NULL) break;
+
+        field = createStringObject((char*)kbuf,kbuflen);
+        value = createStringObject((char*)vbuf,vbuflen);
+        field = tryObjectEncoding(field);
+        value = tryObjectEncoding(value);
+        dictAdd(ht,field,value);
+    }
+    o->encoding = REDIS_ENCODING_HT;
+    o->ptr = ht;
+    zfree(zm);
+}
+
+/*-----------------------------------------------------------------------------
+ * Hash type commands
+ *----------------------------------------------------------------------------*/
+
+void hsetCommand(redisClient *c) {
+    int update;
+    robj *o;
+
+    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,3);
+    hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
+    update = hashTypeSet(o,c->argv[2],c->argv[3]);
+    addReply(c, update ?
shared.czero : shared.cone);
+    server.dirty++;
+}
+
+/* HSETNX: set argv[2] to argv[3] in the hash at argv[1] only when the field
+ * is not there yet. Replies shared.cone after a set and shared.czero when
+ * the field already existed. */
+void hsetnxCommand(redisClient *c) {
+    robj *hash = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+
+    if (hash == NULL) return;
+    hashTypeTryConversion(hash,c->argv,2,3);
+
+    if (hashTypeExists(hash, c->argv[2])) {
+        addReply(c, shared.czero);
+        return;
+    }
+
+    hashTypeTryObjectEncoding(hash,&c->argv[2], &c->argv[3]);
+    hashTypeSet(hash,c->argv[2],c->argv[3]);
+    addReply(c, shared.cone);
+    server.dirty++;
+}
+
+/* HMSET: set every field/value pair found in argv[2..argc-1] on the hash at
+ * argv[1]. An odd argument count is rejected with an error reply. */
+void hmsetCommand(redisClient *c) {
+    int j;
+    robj *hash;
+
+    if ((c->argc % 2) == 1) {
+        addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
+        return;
+    }
+
+    hash = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+    if (hash == NULL) return;
+
+    hashTypeTryConversion(hash,c->argv,2,c->argc-1);
+    for (j = 2; j < c->argc; j += 2) {
+        robj **field = &c->argv[j];
+        robj **value = &c->argv[j+1];
+
+        hashTypeTryObjectEncoding(hash,field,value);
+        hashTypeSet(hash,*field,*value);
+    }
+    addReply(c, shared.ok);
+    server.dirty++;
+}
+
+/* HINCRBY: add the integer in argv[3] to field argv[2] of the hash at
+ * argv[1], treating a missing field as 0, and reply with the new value.
+ * A current value that cannot be read as an integer produces an error
+ * reply and leaves the hash unchanged. */
+void hincrbyCommand(redisClient *c) {
+    long long incr, total = 0;
+    robj *hash, *old, *fresh;
+
+    if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
+    hash = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+    if (hash == NULL) return;
+
+    old = hashTypeGet(hash,c->argv[2]);
+    if (old != NULL) {
+        int parsed = getLongLongFromObjectOrReply(c,old,&total,
+            "hash value is not an integer");
+
+        /* hashTypeGet() handed us a reference: drop it on both outcomes. */
+        decrRefCount(old);
+        if (parsed != REDIS_OK) return;
+    }
+
+    total += incr;
+    fresh = createStringObjectFromLongLong(total);
+    hashTypeTryObjectEncoding(hash,&c->argv[2],NULL);
+    hashTypeSet(hash,c->argv[2],fresh);
+    decrRefCount(fresh);
+    addReplyLongLong(c,total);
+    server.dirty++;
+}
+
+/* HGET: reply with the value of field argv[2] in the hash at argv[1], or
+ * with shared.nullbulk when the key or the field does not exist. */
+void hgetCommand(redisClient *c) {
+    robj *hash, *found;
+
+    hash = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    if (hash == NULL || checkType(c,hash,REDIS_HASH)) return;
+
+    found = hashTypeGet(hash,c->argv[2]);
+    if (found == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    addReplyBulk(c,found);
+    decrRefCount(found);
+}
+
+void hmgetCommand(redisClient
*c) {
+    /* HMGET: reply with one bulk per requested field (argv[2..argc-1]) of
+     * the hash at argv[1]; fields that are not found yield shared.nullbulk. */
+    int i;
+    robj *o, *value;
+    o = lookupKeyRead(c->db,c->argv[1]);
+    if (o != NULL && o->type != REDIS_HASH) {
+        /* The wrong-type error is the complete reply. Returning here keeps
+         * us from also sending a multi-bulk and from passing a non-hash
+         * object to hashTypeGet() below. */
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    /* Note the check for o != NULL happens inside the loop. This is
+     * done because objects that cannot be found are considered to be
+     * an empty hash. The reply should then be a series of NULLs. */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
+    for (i = 2; i < c->argc; i++) {
+        if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
+            addReplyBulk(c,value);
+            decrRefCount(value);
+        } else {
+            addReply(c,shared.nullbulk);
+        }
+    }
+}
+
+/* HDEL: remove field argv[2] from the hash at argv[1]. Replies shared.cone
+ * when a field was removed and shared.czero otherwise (including a missing
+ * key). The key itself is deleted once its hash becomes empty. */
+void hdelCommand(redisClient *c) {
+    robj *o;
+    if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    if (hashTypeDelete(o,c->argv[2])) {
+        if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
+        addReply(c,shared.cone);
+        server.dirty++;
+    } else {
+        addReply(c,shared.czero);
+    }
+}
+
+/* HLEN: reply with the number of fields in the hash at argv[1], or with
+ * shared.czero when the key does not exist. */
+void hlenCommand(redisClient *c) {
+    robj *o;
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    addReplyUlong(c,hashTypeLength(o));
+}
+
+/* Shared implementation of HKEYS, HVALS and HGETALL. flags is a mask of
+ * REDIS_HASH_KEY and/or REDIS_HASH_VALUE selecting what is emitted for each
+ * entry. A missing key gets shared.emptymultibulk. */
+void genericHgetallCommand(redisClient *c, int flags) {
+    robj *o, *lenobj, *obj;
+    unsigned long count = 0;
+    hashTypeIterator *hi;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+        || checkType(c,o,REDIS_HASH)) return;
+
+    /* The multi-bulk header is queued before the element count is known:
+     * an empty string object is added to the reply now and its ptr is
+     * filled in after the loop.
+     * NOTE(review): lenobj is written after our decrRefCount(), so this
+     * presumably relies on addReply() keeping its own reference - confirm. */
+    lenobj = createObject(REDIS_STRING,NULL);
+    addReply(c,lenobj);
+    decrRefCount(lenobj);
+
+    hi = hashTypeInitIterator(o);
+    while (hashTypeNext(hi) != REDIS_ERR) {
+        if (flags & REDIS_HASH_KEY) {
+            obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
+            addReplyBulk(c,obj);
+            decrRefCount(obj);
+            count++;
+        }
+        if (flags & REDIS_HASH_VALUE) {
+            obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
+            addReplyBulk(c,obj);
+            decrRefCount(obj);
+            count++;
+        }
+    }
+    hashTypeReleaseIterator(hi);
+
+    lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
+}
+
+void hkeysCommand(redisClient
*c) {
+    genericHgetallCommand(c,REDIS_HASH_KEY);
+}
+
+/* HVALS: emit only the values of the hash. */
+void hvalsCommand(redisClient *c) {
+    genericHgetallCommand(c,REDIS_HASH_VALUE);
+}
+
+/* HGETALL: emit every key followed by its value. */
+void hgetallCommand(redisClient *c) {
+    genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
+}
+
+/* HEXISTS: reply shared.cone when field argv[2] exists in the hash at
+ * argv[1], shared.czero when it does not or the key is missing. */
+void hexistsCommand(redisClient *c) {
+    robj *hash = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (hash == NULL) return;
+    if (checkType(c,hash,REDIS_HASH)) return;
+
+    if (hashTypeExists(hash,c->argv[2]))
+        addReply(c, shared.cone);
+    else
+        addReply(c, shared.czero);
+}
diff --git a/src/t_list.c b/src/t_list.c
new file mode 100644
index 000000000..ec8b30c3f
--- /dev/null
+++ b/src/t_list.c
@@ -0,0 +1,829 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * List API
+ *----------------------------------------------------------------------------*/
+
+/* Convert a ziplist encoded list to a linked list when value is a raw
+ * string longer than server.list_max_ziplist_value. Values with any other
+ * encoding are not measured, and lists that are not ziplist encoded are
+ * left untouched. */
+void listTypeTryConversion(robj *subject, robj *value) {
+    int oversized;
+
+    if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
+    if (value->encoding != REDIS_ENCODING_RAW) return;
+
+    oversized = sdslen(value->ptr) > server.list_max_ziplist_value;
+    if (oversized)
+        listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+}
+
+void listTypePush(robj *subject, robj *value, int where) {
+    /* Check if we need to convert the ziplist */
+    listTypeTryConversion(subject,value);
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
+        ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
+            listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        int pos = (where == REDIS_HEAD) ?
ZIPLIST_HEAD : ZIPLIST_TAIL;
+        value = getDecodedObject(value);
+        subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
+        decrRefCount(value);
+    } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        if (where == REDIS_HEAD) {
+            listAddNodeHead(subject->ptr,value);
+        } else {
+            listAddNodeTail(subject->ptr,value);
+        }
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Remove the element at the head (where == REDIS_HEAD) or at the tail of
+ * the list and return it, or return NULL when there is nothing to pop.
+ * The returned object carries a reference owned by the caller. */
+robj *listTypePop(robj *subject, int where) {
+    robj *popped = NULL;
+
+    switch (subject->encoding) {
+    case REDIS_ENCODING_ZIPLIST: {
+        unsigned char *pos, *str;
+        unsigned int slen;
+        long long num;
+
+        /* The head is looked up at index 0, the tail at index -1. */
+        pos = ziplistIndex(subject->ptr,(where == REDIS_HEAD) ? 0 : -1);
+        if (ziplistGet(pos,&str,&slen,&num)) {
+            if (str != NULL)
+                popped = createStringObject((char*)str,slen);
+            else
+                popped = createStringObjectFromLongLong(num);
+            /* We only need to delete an element when it exists */
+            subject->ptr = ziplistDelete(subject->ptr,&pos);
+        }
+        break;
+    }
+    case REDIS_ENCODING_LINKEDLIST: {
+        list *ll = subject->ptr;
+        listNode *node = (where == REDIS_HEAD) ? listFirst(ll) : listLast(ll);
+
+        if (node != NULL) {
+            /* Take our reference before the node is deleted. */
+            popped = listNodeValue(node);
+            incrRefCount(popped);
+            listDelNode(ll,node);
+        }
+        break;
+    }
+    default:
+        redisPanic("Unknown list encoding");
+    }
+    return popped;
+}
+
+/* Number of elements in the list, for either encoding. */
+unsigned long listTypeLength(robj *subject) {
+    switch (subject->encoding) {
+    case REDIS_ENCODING_ZIPLIST:
+        return ziplistLen(subject->ptr);
+    case REDIS_ENCODING_LINKEDLIST:
+        return listLength((list*)subject->ptr);
+    default:
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Initialize an iterator at the specified index.
*/
+listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
+    listTypeIterator *iter = zmalloc(sizeof(*iter));
+
+    iter->subject = subject;
+    iter->encoding = subject->encoding;
+    iter->direction = direction;
+    switch (iter->encoding) {
+    case REDIS_ENCODING_ZIPLIST:
+        iter->zi = ziplistIndex(subject->ptr,index);
+        break;
+    case REDIS_ENCODING_LINKEDLIST:
+        iter->ln = listIndex(subject->ptr,index);
+        break;
+    default:
+        redisPanic("Unknown list encoding");
+    }
+    return iter;
+}
+
+/* Free the iterator structure itself; nothing else is released. */
+void listTypeReleaseIterator(listTypeIterator *li) {
+    zfree(li);
+}
+
+/* Copy the iterator's current position into entry, then step the iterator
+ * one element further in its direction. Returns 1 when entry refers to a
+ * real element and 0 when the iterator had already run off the list. */
+int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
+    int forward = (li->direction == REDIS_TAIL);
+
+    /* Protect from converting when iterating */
+    redisAssert(li->subject->encoding == li->encoding);
+
+    entry->li = li;
+    if (li->encoding == REDIS_ENCODING_ZIPLIST) {
+        entry->zi = li->zi;
+        if (entry->zi == NULL) return 0;
+
+        if (forward)
+            li->zi = ziplistNext(li->subject->ptr,li->zi);
+        else
+            li->zi = ziplistPrev(li->subject->ptr,li->zi);
+        return 1;
+    }
+    if (li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        entry->ln = li->ln;
+        if (entry->ln == NULL) return 0;
+
+        li->ln = forward ? li->ln->next : li->ln->prev;
+        return 1;
+    }
+
+    redisPanic("Unknown list encoding");
+    return 0;
+}
+
+/* Return entry or NULL at the current position of the iterator.
*/
+robj *listTypeGet(listTypeEntry *entry) {
+    robj *result = NULL;
+
+    switch (entry->li->encoding) {
+    case REDIS_ENCODING_ZIPLIST: {
+        unsigned char *str;
+        unsigned int slen;
+        long long num;
+
+        redisAssert(entry->zi != NULL);
+        if (ziplistGet(entry->zi,&str,&slen,&num)) {
+            /* A ziplist entry is either a byte string or an integer. */
+            if (str != NULL)
+                result = createStringObject((char*)str,slen);
+            else
+                result = createStringObjectFromLongLong(num);
+        }
+        break;
+    }
+    case REDIS_ENCODING_LINKEDLIST:
+        redisAssert(entry->ln != NULL);
+        result = listNodeValue(entry->ln);
+        incrRefCount(result);
+        break;
+    default:
+        redisPanic("Unknown list encoding");
+    }
+    return result;
+}
+
+/* Insert value next to the element entry refers to: after it when where is
+ * REDIS_TAIL, before it otherwise. */
+void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
+    robj *lobj = entry->li->subject;
+    int after = (where == REDIS_TAIL);
+
+    if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *at = entry->zi;
+
+        value = getDecodedObject(value);
+        if (after) at = ziplistNext(lobj->ptr,entry->zi);
+
+        if (after && at == NULL) {
+            /* Inserting after the tail of the list: there is no next
+             * element to insert in front of, so do a push instead. */
+            lobj->ptr = ziplistPush(lobj->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL);
+        } else {
+            lobj->ptr = ziplistInsert(lobj->ptr,at,value->ptr,sdslen(value->ptr));
+        }
+        decrRefCount(value);
+    } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        listInsertNode(lobj->ptr,entry->ln,value,
+            after ? AL_START_TAIL : AL_START_HEAD);
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Compare the given object with the entry at the current position.
*/
+int listTypeEqual(listTypeEntry *entry, robj *o) {
+    switch (entry->li->encoding) {
+    case REDIS_ENCODING_ZIPLIST:
+        /* The ziplist side is compared against o's string bytes, so o has
+         * to be raw encoded here. */
+        redisAssert(o->encoding == REDIS_ENCODING_RAW);
+        return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
+    case REDIS_ENCODING_LINKEDLIST:
+        return equalStringObjects(o,listNodeValue(entry->ln));
+    default:
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Delete the element entry refers to and reposition the owning iterator so
+ * that iteration can continue in its direction. */
+void listTypeDelete(listTypeEntry *entry) {
+    listTypeIterator *iter = entry->li;
+    int forward = (iter->direction == REDIS_TAIL);
+
+    if (iter->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *cur = entry->zi;
+
+        iter->subject->ptr = ziplistDelete(iter->subject->ptr,&cur);
+
+        /* Update position of the iterator depending on the direction */
+        if (forward)
+            iter->zi = cur;
+        else
+            iter->zi = ziplistPrev(iter->subject->ptr,cur);
+    } else if (iter->encoding == REDIS_ENCODING_LINKEDLIST) {
+        /* Pick the neighbour before the node goes away. */
+        listNode *resume = forward ? entry->ln->next : entry->ln->prev;
+
+        listDelNode(iter->subject->ptr,entry->ln);
+        iter->ln = resume;
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Re-encode a list object. The only supported target is
+ * REDIS_ENCODING_LINKEDLIST; any other value of enc panics. */
+void listTypeConvert(robj *subject, int enc) {
+    listTypeIterator *iter;
+    listTypeEntry cur;
+    list *ll;
+
+    redisAssert(subject->type == REDIS_LIST);
+    if (enc != REDIS_ENCODING_LINKEDLIST) {
+        redisPanic("Unsupported list conversion");
+        return;
+    }
+
+    ll = listCreate();
+    listSetFreeMethod(ll,decrRefCount);
+
+    /* listTypeGet returns a robj with incremented refcount */
+    iter = listTypeInitIterator(subject,0,REDIS_TAIL);
+    while (listTypeNext(iter,&cur))
+        listAddNodeTail(ll,listTypeGet(&cur));
+    listTypeReleaseIterator(iter);
+
+    /* Swap the payload: free the old one, install the linked list. */
+    subject->encoding = REDIS_ENCODING_LINKEDLIST;
+    zfree(subject->ptr);
+    subject->ptr = ll;
+}
+
+/*-----------------------------------------------------------------------------
+ * List Commands
+ *----------------------------------------------------------------------------*/
+
+void pushGenericCommand(redisClient *c, int
where) {
+    /* Shared body of LPUSH and RPUSH: push argv[2] on the list at argv[1],
+     * creating the list when the key is missing. */
+    robj *target = lookupKeyWrite(c->db,c->argv[1]);
+
+    if (target != NULL && target->type != REDIS_LIST) {
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    /* A true result here ends the command with shared.cone and without
+     * touching the list. */
+    if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
+        addReply(c,shared.cone);
+        return;
+    }
+
+    if (target == NULL) {
+        target = createZiplistObject();
+        dbAdd(c->db,c->argv[1],target);
+    }
+    listTypePush(target,c->argv[2],where);
+    addReplyLongLong(c,listTypeLength(target));
+    server.dirty++;
+}
+
+/* LPUSH: push at the head of the list. */
+void lpushCommand(redisClient *c) {
+    pushGenericCommand(c,REDIS_HEAD);
+}
+
+/* RPUSH: push at the tail of the list. */
+void rpushCommand(redisClient *c) {
+    pushGenericCommand(c,REDIS_TAIL);
+}
+
+void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
+    robj *subject;
+    listTypeIterator *iter;
+    listTypeEntry entry;
+    int inserted = 0;
+
+    if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,subject,REDIS_LIST)) return;
+
+    if (refval != NULL) {
+        /* Note: we expect refval to be string-encoded because it is *not* the
+         * last argument of the multi-bulk LINSERT. */
+        redisAssert(refval->encoding == REDIS_ENCODING_RAW);
+
+        /* We're not sure if this value can be inserted yet, but we cannot
+         * convert the list inside the iterator. We don't want to loop over
+         * the list twice (once to see if the value can be inserted and once
+         * to do the actual insert), so we assume this value can be inserted
+         * and convert the ziplist to a regular list if necessary. */
+        listTypeTryConversion(subject,val);
+
+        /* Seek refval from head to tail */
+        iter = listTypeInitIterator(subject,0,REDIS_TAIL);
+        while (listTypeNext(iter,&entry)) {
+            if (listTypeEqual(&entry,refval)) {
+                listTypeInsert(&entry,val,where);
+                inserted = 1;
+                break;
+            }
+        }
+        listTypeReleaseIterator(iter);
+
+        if (inserted) {
+            /* Check if the length exceeds the ziplist length threshold.
*/
+            if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
+                ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
+                    listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+            server.dirty++;
+        } else {
+            /* Notify client of a failed insert */
+            addReply(c,shared.cnegone);
+            return;
+        }
+    } else {
+        listTypePush(subject,val,where);
+        server.dirty++;
+    }
+
+    addReplyUlong(c,listTypeLength(subject));
+}
+
+/* LPUSHX: push argv[2] at the head, with no reference value. */
+void lpushxCommand(redisClient *c) {
+    pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
+}
+
+/* RPUSHX: push argv[2] at the tail, with no reference value. */
+void rpushxCommand(redisClient *c) {
+    pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
+}
+
+/* LINSERT: argv[2] is "after" or "before" (compared ignoring case), argv[3]
+ * the reference value and argv[4] the value to insert. Any other keyword
+ * gets a syntax error reply. */
+void linsertCommand(redisClient *c) {
+    const char *keyword = c->argv[2]->ptr;
+    int where;
+
+    if (strcasecmp(keyword,"after") == 0) {
+        where = REDIS_TAIL;
+    } else if (strcasecmp(keyword,"before") == 0) {
+        where = REDIS_HEAD;
+    } else {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+    pushxGenericCommand(c,c->argv[3],c->argv[4],where);
+}
+
+/* LLEN: reply with the length of the list at argv[1], or with shared.czero
+ * when the key does not exist. */
+void llenCommand(redisClient *c) {
+    robj *lobj = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (lobj == NULL) return;
+    if (checkType(c,lobj,REDIS_LIST)) return;
+    addReplyUlong(c,listTypeLength(lobj));
+}
+
+void lindexCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    if (o == NULL || checkType(c,o,REDIS_LIST)) return;
+    int index = atoi(c->argv[2]->ptr);
+    robj *value = NULL;
+
+    if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *p;
+        unsigned char *vstr;
+        unsigned int vlen;
+        long long vlong;
+        p = ziplistIndex(o->ptr,index);
+        if (ziplistGet(p,&vstr,&vlen,&vlong)) {
+            if (vstr) {
+                value = createStringObject((char*)vstr,vlen);
+            } else {
+                value = createStringObjectFromLongLong(vlong);
+            }
+            addReplyBulk(c,value);
+            decrRefCount(value);
+        } else {
+            addReply(c,shared.nullbulk);
+        }
+    } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
+        listNode *ln = listIndex(o->ptr,index);
+        if (ln != NULL) {
+            value = listNodeValue(ln);
+            addReplyBulk(c,value);
+        } else {
+            addReply(c,shared.nullbulk);
+        }
+    } else {
+
redisPanic("Unknown list encoding");
+    }
+}
+
+/* LSET: overwrite the element at index argv[2] of the list at argv[1] with
+ * argv[3]. Replies shared.ok on success and shared.outofrangeerr when the
+ * index does not address an element. */
+void lsetCommand(redisClient *c) {
+    robj *lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+    int pos = atoi(c->argv[2]->ptr);
+    robj *newval = c->argv[3];
+
+    listTypeTryConversion(lobj,newval);
+    switch (lobj->encoding) {
+    case REDIS_ENCODING_ZIPLIST: {
+        unsigned char *slot = ziplistIndex(lobj->ptr,pos);
+
+        if (slot == NULL) {
+            addReply(c,shared.outofrangeerr);
+            break;
+        }
+        /* Replace = delete the old entry, then insert the decoded new
+         * value at slot, which ziplistDelete() received by address. */
+        lobj->ptr = ziplistDelete(lobj->ptr,&slot);
+        newval = getDecodedObject(newval);
+        lobj->ptr = ziplistInsert(lobj->ptr,slot,newval->ptr,sdslen(newval->ptr));
+        decrRefCount(newval);
+        addReply(c,shared.ok);
+        server.dirty++;
+        break;
+    }
+    case REDIS_ENCODING_LINKEDLIST: {
+        listNode *node = listIndex(lobj->ptr,pos);
+
+        if (node == NULL) {
+            addReply(c,shared.outofrangeerr);
+            break;
+        }
+        /* Swap the node's value in place: release the old object, then
+         * reference the new one. */
+        decrRefCount((robj*)listNodeValue(node));
+        listNodeValue(node) = newval;
+        incrRefCount(newval);
+        addReply(c,shared.ok);
+        server.dirty++;
+        break;
+    }
+    default:
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Shared body of LPOP and RPOP: pop one element from the given end of the
+ * list at argv[1] and send it to the client, deleting the key when the
+ * list becomes empty. shared.nullbulk is sent when nothing can be popped. */
+void popGenericCommand(redisClient *c, int where) {
+    robj *lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+
+    robj *popped = listTypePop(lobj,where);
+    if (popped == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    addReplyBulk(c,popped);
+    decrRefCount(popped);
+    if (listTypeLength(lobj) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+}
+
+/* LPOP: pop from the head. */
+void lpopCommand(redisClient *c) {
+    popGenericCommand(c,REDIS_HEAD);
+}
+
+/* RPOP: pop from the tail. */
+void rpopCommand(redisClient *c) {
+    popGenericCommand(c,REDIS_TAIL);
+}
+
+void lrangeCommand(redisClient *c) {
+    robj *o, *value;
+    int start = atoi(c->argv[2]->ptr);
+    int end = atoi(c->argv[3]->ptr);
+    int llen;
+    int rangelen, j;
+    listTypeEntry entry;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+         || checkType(c,o,REDIS_LIST)) return;
+    llen = listTypeLength(o);
+
+    /* convert negative indexes */
+    if (start < 0)
start = llen+start;
+    if (end < 0) end = llen+end;
+    if (start < 0) start = 0; /* still negative after the offset: clamp to 0 */
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || start >= llen) {
+        /* Out of range start or start > end result in empty list */
+        addReply(c,shared.emptymultibulk);
+        return;
+    }
+    if (end >= llen) end = llen-1;
+    rangelen = (end-start)+1; /* start and end are both inclusive */
+
+    /* Return the result in form of a multi-bulk reply */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
+    listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
+    for (j = 0; j < rangelen; j++) {
+        redisAssert(listTypeNext(li,&entry)); /* the iterator is advanced inside the assertion */
+        value = listTypeGet(&entry);
+        addReplyBulk(c,value);
+        decrRefCount(value); /* drop the reference listTypeGet() gave us */
+    }
+    listTypeReleaseIterator(li);
+}
+
+void ltrimCommand(redisClient *c) { /* LTRIM: keep only the elements in the
+                                     * inclusive range argv[2]..argv[3] of
+                                     * the list at argv[1]. */
+    robj *o;
+    int start = atoi(c->argv[2]->ptr);
+    int end = atoi(c->argv[3]->ptr);
+    int llen;
+    int j, ltrim, rtrim; /* ltrim/rtrim: elements to drop at head/tail */
+    list *list;
+    listNode *ln;
+
+    if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
+        checkType(c,o,REDIS_LIST)) return;
+    llen = listTypeLength(o);
+
+    /* convert negative indexes: add the list length, then clamp at 0 */
+    if (start < 0) start = llen+start;
+    if (end < 0) end = llen+end;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || start >= llen) {
+        /* Out of range start or start > end result in empty list */
+        ltrim = llen; /* drop every element from the head side */
+        rtrim = 0;
+    } else {
+        if (end >= llen) end = llen-1;
+        ltrim = start; /* elements before start */
+        rtrim = llen-end-1; /* elements after end */
+    }
+
+    /* Remove list elements to perform the trim */
+    if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+        o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
+        o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
+    } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
+        list = o->ptr;
+        for (j = 0; j < ltrim; j++) {
+            ln = listFirst(list);
+            listDelNode(list,ln);
+        }
+        for (j = 0; j < rtrim; j++) {
+            ln = listLast(list);
+            listDelNode(list,ln);
+        }
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    if (listTypeLength(o) == 0)
dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+    addReply(c,shared.ok);
+}
+
+/* LREM: remove elements equal to argv[3] from the list at argv[1]. The
+ * count in argv[2] limits how many are removed: a positive count scans from
+ * the head, a negative count scans from the tail, and 0 removes every match.
+ * The reply is the number of elements removed. */
+void lremCommand(redisClient *c) {
+    robj *lobj, *needle = c->argv[3];
+    int limit = atoi(c->argv[2]->ptr);
+    int hits = 0;
+    listTypeEntry cur;
+    listTypeIterator *iter;
+
+    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+
+    /* Make sure the needle is raw when we're dealing with a ziplist */
+    if (lobj->encoding == REDIS_ENCODING_ZIPLIST)
+        needle = getDecodedObject(needle);
+
+    if (limit >= 0) {
+        iter = listTypeInitIterator(lobj,0,REDIS_TAIL);
+    } else {
+        limit = -limit;
+        iter = listTypeInitIterator(lobj,-1,REDIS_HEAD);
+    }
+
+    while (listTypeNext(iter,&cur)) {
+        if (!listTypeEqual(&cur,needle)) continue;
+
+        listTypeDelete(&cur);
+        server.dirty++;
+        hits++;
+        if (limit && hits == limit) break;
+    }
+    listTypeReleaseIterator(iter);
+
+    /* Clean up raw encoded object */
+    if (lobj->encoding == REDIS_ENCODING_ZIPLIST)
+        decrRefCount(needle);
+
+    if (listTypeLength(lobj) == 0) dbDelete(c->db,c->argv[1]);
+    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",hits));
+}
+
+/* This is the semantic of this command:
+ *  RPOPLPUSH srclist dstlist:
+ *   IF LLEN(srclist) > 0
+ *     element = RPOP srclist
+ *     LPUSH dstlist element
+ *     RETURN element
+ *   ELSE
+ *     RETURN nil
+ *   END
+ *  END
+ *
+ * The idea is to be able to get an element from a list in a reliable way
+ * since the element is not just returned but pushed against another list
+ * as well. This command was originally proposed by Ezra Zygmuntowicz.
+ */
+void rpoplpushcommand(redisClient *c) {
+    robj *src, *dst, *moved;
+
+    src = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
+    if (src == NULL || checkType(c,src,REDIS_LIST)) return;
+
+    if (listTypeLength(src) == 0) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    /* Validate the destination before anything is popped. */
+    dst = lookupKeyWrite(c->db,c->argv[2]);
+    if (dst && checkType(c,dst,REDIS_LIST)) return;
+    moved = listTypePop(src,REDIS_TAIL);
+
+    /* Add the element to the target list (unless it's directly
+     * passed to some BLPOP-ing client */
+    if (!handleClientsWaitingListPush(c,c->argv[2],moved)) {
+        /* Create the list if the key does not exist */
+        if (dst == NULL) {
+            dst = createZiplistObject();
+            dbAdd(c->db,c->argv[2],dst);
+        }
+        listTypePush(dst,moved,REDIS_HEAD);
+    }
+
+    /* Send the element to the client as reply as well */
+    addReplyBulk(c,moved);
+
+    /* listTypePop returns an object with its refcount incremented */
+    decrRefCount(moved);
+
+    /* Delete the source list when it is empty */
+    if (listTypeLength(src) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+}
+
+/*-----------------------------------------------------------------------------
+ * Blocking POP operations
+ *----------------------------------------------------------------------------*/
+
+/* Currently Redis blocking operations support is limited to list POP ops,
+ * so the current implementation is not fully generic, but it is also not
+ * completely specific so it will not require a rewrite to support new
+ * kind of blocking operations in the future.
+ *
+ * Still it's important to note that list blocking operations can be already
+ * used as a notification mechanism in order to implement other blocking
+ * operations at application level, so there must be a very strong evidence
+ * of usefulness and generality before new blocking operations are implemented.
+ * + * This is how the current blocking POP works, using BLPOP as an example: + * - If the user calls BLPOP and the key exists and contains a non-empty list + * then LPOP is called instead. So BLPOP is semantically the same as LPOP + * if there is no need to block. + * - If instead BLPOP is called and the key does not exist or the list is + * empty we need to block. In order to do so we remove the notification for + * new data to read in the client socket (so that we'll not serve new + * requests if the blocking request is not served). Also we put the client + * in a dictionary (db->blocking_keys) mapping keys to a list of clients + * blocking for these keys. + * - If a PUSH operation against a key with blocked clients waiting is + * performed, we serve the first in the list: basically instead of pushing + * the new element inside the list we return it to the (first / oldest) + * blocking client, unblock the client, and remove it from the list. + * + * The above comment and the source code should be enough in order to understand + * the implementation and modify / fix it later. 
+ */ + +/* Set a client in blocking mode for the specified keys, with the specified + * timeout. The client is appended to the tail of every key's list of waiting clients. */ +void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) { + dictEntry *de; + list *l; + int j; + + c->blocking_keys = zmalloc(sizeof(robj*)*numkeys); + c->blocking_keys_num = numkeys; + c->blockingto = timeout; + for (j = 0; j < numkeys; j++) { + /* Add the key in the client structure, to map clients -> keys */ + c->blocking_keys[j] = keys[j]; + incrRefCount(keys[j]); /* reference held by c->blocking_keys[j] */ + + /* And in the other "side", to map keys -> clients */ + de = dictFind(c->db->blocking_keys,keys[j]); + if (de == NULL) { + int retval; + + /* For every key we take a list of clients blocked for it */ + l = listCreate(); + retval = dictAdd(c->db->blocking_keys,keys[j],l); + incrRefCount(keys[j]); /* reference held by the new dictionary entry */ + redisAssert(retval == DICT_OK); + } else { + l = dictGetEntryVal(de); + } + listAddNodeTail(l,c); + } + /* Mark the client as a blocked client */ + c->flags |= REDIS_BLOCKED; + server.blpop_blocked_clients++; +} + +/* Unblock a client that's waiting in a blocking operation such as BLPOP. + * Asserts that the client is actually blocked (c->blocking_keys != NULL). */ +void unblockClientWaitingData(redisClient *c) { + dictEntry *de; + list *l; + int j; + + redisAssert(c->blocking_keys != NULL); + /* The client may wait for multiple keys, so unblock it for every key. */ + for (j = 0; j < c->blocking_keys_num; j++) { + /* Remove this client from the list of clients waiting for this key. 
*/ + de = dictFind(c->db->blocking_keys,c->blocking_keys[j]); + redisAssert(de != NULL); + l = dictGetEntryVal(de); + listDelNode(l,listSearchKey(l,c)); + /* If the list is empty we need to remove it to avoid wasting memory */ + if (listLength(l) == 0) + dictDelete(c->db->blocking_keys,c->blocking_keys[j]); + decrRefCount(c->blocking_keys[j]); /* drop the reference taken for c->blocking_keys[j] in blockForKeys() */ + } + /* Cleanup the client structure */ + zfree(c->blocking_keys); + c->blocking_keys = NULL; + c->flags &= (~REDIS_BLOCKED); + server.blpop_blocked_clients--; + /* We want to process data if there is some command waiting + * in the input buffer. Note that this is safe even if + * unblockClientWaitingData() gets called from freeClient() because + * freeClient() will be smart enough to call this function + * *after* c->querybuf was set to NULL. */ + if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c); +} + +/* This should be called from any function PUSHing into lists. + * 'c' is the "pushing client", 'key' is the key it is pushing data against, + * 'ele' is the element pushed. + * + * If the function returns 0 there was no client waiting for a list push + * against this key. + * + * If the function returns 1 there was a client waiting for a list push + * against this key, the element was passed to this client thus it's not + * needed to actually add it to the list and the caller should return asap. 
*/ +int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) { + struct dictEntry *de; + redisClient *receiver; + list *l; + listNode *ln; + + de = dictFind(c->db->blocking_keys,key); + if (de == NULL) return 0; + l = dictGetEntryVal(de); + ln = listFirst(l); + redisAssert(ln != NULL); + receiver = ln->value; + + addReplySds(receiver,sdsnew("*2\r\n")); + addReplyBulk(receiver,key); + addReplyBulk(receiver,ele); + unblockClientWaitingData(receiver); + return 1; +} + +/* Blocking RPOP/LPOP */ +void blockingPopGenericCommand(redisClient *c, int where) { + robj *o; + time_t timeout; + int j; + + for (j = 1; j < c->argc-1; j++) { + o = lookupKeyWrite(c->db,c->argv[j]); + if (o != NULL) { + if (o->type != REDIS_LIST) { + addReply(c,shared.wrongtypeerr); + return; + } else { + if (listTypeLength(o) != 0) { + /* If the list contains elements fall back to the usual + * non-blocking POP operation */ + robj *argv[2], **orig_argv; + int orig_argc; + + /* We need to alter the command arguments before to call + * popGenericCommand() as the command takes a single key. */ + orig_argv = c->argv; + orig_argc = c->argc; + argv[1] = c->argv[j]; + c->argv = argv; + c->argc = 2; + + /* Also the return value is different, we need to output + * the multi bulk reply header and the key name. The + * "real" command will add the last element (the value) + * for us. If this souds like an hack to you it's just + * because it is... 
*/ + addReplySds(c,sdsnew("*2\r\n")); + addReplyBulk(c,argv[1]); + popGenericCommand(c,where); + + /* Fix the client structure with the original stuff */ + c->argv = orig_argv; + c->argc = orig_argc; + return; + } + } + } + } + /* If the list is empty or the key does not exists we must block */ + timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10); + if (timeout > 0) timeout += time(NULL); + blockForKeys(c,c->argv+1,c->argc-2,timeout); +} + +void blpopCommand(redisClient *c) { + blockingPopGenericCommand(c,REDIS_HEAD); +} + +void brpopCommand(redisClient *c) { + blockingPopGenericCommand(c,REDIS_TAIL); +} diff --git a/src/t_set.c b/src/t_set.c new file mode 100644 index 000000000..808ef268e --- /dev/null +++ b/src/t_set.c @@ -0,0 +1,349 @@ +#include "redis.h" + +/*----------------------------------------------------------------------------- + * Set Commands + *----------------------------------------------------------------------------*/ + +void saddCommand(redisClient *c) { + robj *set; + + set = lookupKeyWrite(c->db,c->argv[1]); + if (set == NULL) { + set = createSetObject(); + dbAdd(c->db,c->argv[1],set); + } else { + if (set->type != REDIS_SET) { + addReply(c,shared.wrongtypeerr); + return; + } + } + if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) { + incrRefCount(c->argv[2]); + server.dirty++; + addReply(c,shared.cone); + } else { + addReply(c,shared.czero); + } +} + +void sremCommand(redisClient *c) { + robj *set; + + if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,set,REDIS_SET)) return; + + if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) { + server.dirty++; + if (htNeedsResize(set->ptr)) dictResize(set->ptr); + if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); + addReply(c,shared.cone); + } else { + addReply(c,shared.czero); + } +} + +void smoveCommand(redisClient *c) { + robj *srcset, *dstset; + + srcset = lookupKeyWrite(c->db,c->argv[1]); + dstset = lookupKeyWrite(c->db,c->argv[2]); + + /* If the source 
key does not exist return 0, if it's of the wrong type + * raise an error */ + if (srcset == NULL || srcset->type != REDIS_SET) { + addReply(c, srcset ? shared.wrongtypeerr : shared.czero); + return; + } + /* Error if the destination key is not a set as well */ + if (dstset && dstset->type != REDIS_SET) { + addReply(c,shared.wrongtypeerr); + return; + } + /* Remove the element from the source set */ + if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) { + /* Key not found in the src set! return zero */ + addReply(c,shared.czero); + return; + } + if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset) + dbDelete(c->db,c->argv[1]); + server.dirty++; + /* Add the element to the destination set */ + if (!dstset) { + dstset = createSetObject(); + dbAdd(c->db,c->argv[2],dstset); + } + if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK) + incrRefCount(c->argv[3]); + addReply(c,shared.cone); +} + +void sismemberCommand(redisClient *c) { + robj *set; + + if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,set,REDIS_SET)) return; + + if (dictFind(set->ptr,c->argv[2])) + addReply(c,shared.cone); + else + addReply(c,shared.czero); +} + +void scardCommand(redisClient *c) { + robj *o; + dict *s; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,o,REDIS_SET)) return; + + s = o->ptr; + addReplyUlong(c,dictSize(s)); +} + +void spopCommand(redisClient *c) { + robj *set; + dictEntry *de; + + if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,set,REDIS_SET)) return; + + de = dictGetRandomKey(set->ptr); + if (de == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *ele = dictGetEntryKey(de); + + addReplyBulk(c,ele); + dictDelete(set->ptr,ele); + if (htNeedsResize(set->ptr)) dictResize(set->ptr); + if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); + server.dirty++; + } +} + +void srandmemberCommand(redisClient *c) { + robj *set; + dictEntry *de; + + if 
((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,set,REDIS_SET)) return; + + de = dictGetRandomKey(set->ptr); + if (de == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *ele = dictGetEntryKey(de); + + addReplyBulk(c,ele); + } +} + +int qsortCompareSetsByCardinality(const void *s1, const void *s2) { + dict **d1 = (void*) s1, **d2 = (void*) s2; + + return dictSize(*d1)-dictSize(*d2); +} + +void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) { + dict **dv = zmalloc(sizeof(dict*)*setsnum); + dictIterator *di; + dictEntry *de; + robj *lenobj = NULL, *dstset = NULL; + unsigned long j, cardinality = 0; + + for (j = 0; j < setsnum; j++) { + robj *setobj; + + setobj = dstkey ? + lookupKeyWrite(c->db,setskeys[j]) : + lookupKeyRead(c->db,setskeys[j]); + if (!setobj) { + zfree(dv); + if (dstkey) { + if (dbDelete(c->db,dstkey)) + server.dirty++; + addReply(c,shared.czero); + } else { + addReply(c,shared.emptymultibulk); + } + return; + } + if (setobj->type != REDIS_SET) { + zfree(dv); + addReply(c,shared.wrongtypeerr); + return; + } + dv[j] = setobj->ptr; + } + /* Sort sets from the smallest to largest, this will improve our + * algorithm's performace */ + qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality); + + /* The first thing we should output is the total number of elements... 
+ * since this is a multi-bulk write, but at this stage we don't know + * the intersection set size, so we use a trick, append an empty object + * to the output list and save the pointer to later modify it with the + * right length */ + if (!dstkey) { + lenobj = createObject(REDIS_STRING,NULL); + addReply(c,lenobj); + decrRefCount(lenobj); + } else { + /* If we have a target key where to store the resulting set + * create this key with an empty set inside */ + dstset = createSetObject(); + } + + /* Iterate all the elements of the first (smallest) set, and test + * the element against all the other sets, if at least one set does + * not include the element it is discarded */ + di = dictGetIterator(dv[0]); + + while((de = dictNext(di)) != NULL) { + robj *ele; + + for (j = 1; j < setsnum; j++) + if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break; + if (j != setsnum) + continue; /* at least one set does not contain the member */ + ele = dictGetEntryKey(de); + if (!dstkey) { + addReplyBulk(c,ele); + cardinality++; + } else { + dictAdd(dstset->ptr,ele,NULL); + incrRefCount(ele); + } + } + dictReleaseIterator(di); + + if (dstkey) { + /* Store the resulting set into the target, if the intersection + * is not an empty set. 
*/ + dbDelete(c->db,dstkey); + if (dictSize((dict*)dstset->ptr) > 0) { + dbAdd(c->db,dstkey,dstset); + addReplyLongLong(c,dictSize((dict*)dstset->ptr)); + } else { + decrRefCount(dstset); + addReply(c,shared.czero); + } + server.dirty++; + } else { + lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality); + } + zfree(dv); +} + +void sinterCommand(redisClient *c) { + sinterGenericCommand(c,c->argv+1,c->argc-1,NULL); +} + +void sinterstoreCommand(redisClient *c) { + sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]); +} + +void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) { + dict **dv = zmalloc(sizeof(dict*)*setsnum); + dictIterator *di; + dictEntry *de; + robj *dstset = NULL; + int j, cardinality = 0; + + for (j = 0; j < setsnum; j++) { + robj *setobj; + + setobj = dstkey ? + lookupKeyWrite(c->db,setskeys[j]) : + lookupKeyRead(c->db,setskeys[j]); + if (!setobj) { + dv[j] = NULL; + continue; + } + if (setobj->type != REDIS_SET) { + zfree(dv); + addReply(c,shared.wrongtypeerr); + return; + } + dv[j] = setobj->ptr; + } + + /* We need a temp set object to store our union. 
If the dstkey + * is not NULL (that is, we are inside an SUNIONSTORE operation) then + * this set object will be the resulting object to set into the target key*/ + dstset = createSetObject(); + + /* Iterate all the elements of all the sets, add every element a single + * time to the result set */ + for (j = 0; j < setsnum; j++) { + if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */ + if (!dv[j]) continue; /* non existing keys are like empty sets */ + + di = dictGetIterator(dv[j]); + + while((de = dictNext(di)) != NULL) { + robj *ele; + + /* dictAdd will not add the same element multiple times */ + ele = dictGetEntryKey(de); + if (op == REDIS_OP_UNION || j == 0) { + if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) { + incrRefCount(ele); + cardinality++; + } + } else if (op == REDIS_OP_DIFF) { + if (dictDelete(dstset->ptr,ele) == DICT_OK) { + cardinality--; + } + } + } + dictReleaseIterator(di); + + /* result set is empty? Exit asap. */ + if (op == REDIS_OP_DIFF && cardinality == 0) break; + } + + /* Output the content of the resulting set, if not in STORE mode */ + if (!dstkey) { + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality)); + di = dictGetIterator(dstset->ptr); + while((de = dictNext(di)) != NULL) { + robj *ele; + + ele = dictGetEntryKey(de); + addReplyBulk(c,ele); + } + dictReleaseIterator(di); + decrRefCount(dstset); + } else { + /* If we have a target key where to store the resulting set + * create this key with the result set inside */ + dbDelete(c->db,dstkey); + if (dictSize((dict*)dstset->ptr) > 0) { + dbAdd(c->db,dstkey,dstset); + addReplyLongLong(c,dictSize((dict*)dstset->ptr)); + } else { + decrRefCount(dstset); + addReply(c,shared.czero); + } + server.dirty++; + } + zfree(dv); +} + +void sunionCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION); +} + +void sunionstoreCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION); 
+} + +void sdiffCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF); +} + +void sdiffstoreCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF); +} diff --git a/src/t_string.c b/src/t_string.c new file mode 100644 index 000000000..eaaec05be --- /dev/null +++ b/src/t_string.c @@ -0,0 +1,251 @@ +#include "redis.h" + +/*----------------------------------------------------------------------------- + * String Commands + *----------------------------------------------------------------------------*/ + +void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) { + int retval; + long seconds = 0; /* initialized to avoid an harmness warning */ + + if (expire) { + if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK) + return; + if (seconds <= 0) { + addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n")); + return; + } + } + + touchWatchedKey(c->db,key); + if (nx) deleteIfVolatile(c->db,key); + retval = dbAdd(c->db,key,val); + if (retval == REDIS_ERR) { + if (!nx) { + dbReplace(c->db,key,val); + incrRefCount(val); + } else { + addReply(c,shared.czero); + return; + } + } else { + incrRefCount(val); + } + server.dirty++; + removeExpire(c->db,key); + if (expire) setExpire(c->db,key,time(NULL)+seconds); + addReply(c, nx ? 
shared.cone : shared.ok); +} + +void setCommand(redisClient *c) { + setGenericCommand(c,0,c->argv[1],c->argv[2],NULL); +} + +void setnxCommand(redisClient *c) { + setGenericCommand(c,1,c->argv[1],c->argv[2],NULL); +} + +void setexCommand(redisClient *c) { + setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]); +} + +int getGenericCommand(redisClient *c) { + robj *o; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL) + return REDIS_OK; + + if (o->type != REDIS_STRING) { + addReply(c,shared.wrongtypeerr); + return REDIS_ERR; + } else { + addReplyBulk(c,o); + return REDIS_OK; + } +} + +void getCommand(redisClient *c) { + getGenericCommand(c); +} + +void getsetCommand(redisClient *c) { + if (getGenericCommand(c) == REDIS_ERR) return; + dbReplace(c->db,c->argv[1],c->argv[2]); + incrRefCount(c->argv[2]); + server.dirty++; + removeExpire(c->db,c->argv[1]); +} + +void mgetCommand(redisClient *c) { + int j; + + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1)); + for (j = 1; j < c->argc; j++) { + robj *o = lookupKeyRead(c->db,c->argv[j]); + if (o == NULL) { + addReply(c,shared.nullbulk); + } else { + if (o->type != REDIS_STRING) { + addReply(c,shared.nullbulk); + } else { + addReplyBulk(c,o); + } + } + } +} + +void msetGenericCommand(redisClient *c, int nx) { + int j, busykeys = 0; + + if ((c->argc % 2) == 0) { + addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n")); + return; + } + /* Handle the NX flag. The MSETNX semantic is to return zero and don't + * set nothing at all if at least one already key exists. 
*/ + if (nx) { + for (j = 1; j < c->argc; j += 2) { + if (lookupKeyWrite(c->db,c->argv[j]) != NULL) { + busykeys++; + } + } + } + if (busykeys) { + addReply(c, shared.czero); + return; + } + + for (j = 1; j < c->argc; j += 2) { + c->argv[j+1] = tryObjectEncoding(c->argv[j+1]); + dbReplace(c->db,c->argv[j],c->argv[j+1]); + incrRefCount(c->argv[j+1]); + removeExpire(c->db,c->argv[j]); + } + server.dirty += (c->argc-1)/2; + addReply(c, nx ? shared.cone : shared.ok); +} + +void msetCommand(redisClient *c) { + msetGenericCommand(c,0); +} + +void msetnxCommand(redisClient *c) { + msetGenericCommand(c,1); +} + +void incrDecrCommand(redisClient *c, long long incr) { + long long value; + robj *o; + + o = lookupKeyWrite(c->db,c->argv[1]); + if (o != NULL && checkType(c,o,REDIS_STRING)) return; + if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return; + + value += incr; + o = createStringObjectFromLongLong(value); + dbReplace(c->db,c->argv[1],o); + server.dirty++; + addReply(c,shared.colon); + addReply(c,o); + addReply(c,shared.crlf); +} + +void incrCommand(redisClient *c) { + incrDecrCommand(c,1); +} + +void decrCommand(redisClient *c) { + incrDecrCommand(c,-1); +} + +void incrbyCommand(redisClient *c) { + long long incr; + + if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; + incrDecrCommand(c,incr); +} + +void decrbyCommand(redisClient *c) { + long long incr; + + if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; + incrDecrCommand(c,-incr); +} + +void appendCommand(redisClient *c) { + int retval; + size_t totlen; + robj *o; + + o = lookupKeyWrite(c->db,c->argv[1]); + if (o == NULL) { + /* Create the key */ + retval = dbAdd(c->db,c->argv[1],c->argv[2]); + incrRefCount(c->argv[2]); + totlen = stringObjectLen(c->argv[2]); + } else { + if (o->type != REDIS_STRING) { + addReply(c,shared.wrongtypeerr); + return; + } + /* If the object is specially encoded or shared we have to make + * a copy */ + if 
(o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { + robj *decoded = getDecodedObject(o); + + o = createStringObject(decoded->ptr, sdslen(decoded->ptr)); + decrRefCount(decoded); + dbReplace(c->db,c->argv[1],o); + } + /* APPEND! */ + if (c->argv[2]->encoding == REDIS_ENCODING_RAW) { + o->ptr = sdscatlen(o->ptr, + c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); + } else { + o->ptr = sdscatprintf(o->ptr, "%ld", + (unsigned long) c->argv[2]->ptr); + } + totlen = sdslen(o->ptr); + } + server.dirty++; + addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen)); +} + +void substrCommand(redisClient *c) { + robj *o; + long start = atoi(c->argv[2]->ptr); + long end = atoi(c->argv[3]->ptr); + size_t rangelen, strlen; + sds range; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,o,REDIS_STRING)) return; + + o = getDecodedObject(o); + strlen = sdslen(o->ptr); + + /* convert negative indexes */ + if (start < 0) start = strlen+start; + if (end < 0) end = strlen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + + /* indexes sanity checks */ + if (start > end || (size_t)start >= strlen) { + /* Out of range start or start > end result in null reply */ + addReply(c,shared.nullbulk); + decrRefCount(o); + return; + } + if ((size_t)end >= strlen) end = strlen-1; + rangelen = (end-start)+1; + + /* Return the result */ + addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen)); + range = sdsnewlen((char*)o->ptr+start,rangelen); + addReplySds(c,range); + addReply(c,shared.crlf); + decrRefCount(o); +} + + diff --git a/src/t_zset.c b/src/t_zset.c new file mode 100644 index 000000000..de32a8eed --- /dev/null +++ b/src/t_zset.c @@ -0,0 +1,985 @@ +#include "redis.h" + +#include + +/*----------------------------------------------------------------------------- + * Sorted set API + *----------------------------------------------------------------------------*/ + +/* ZSETs are ordered sets using two data structures to hold the 
same elements + * in order to get O(log(N)) INSERT and REMOVE operations into a sorted + * data structure. + * + * The elements are added to a hash table mapping Redis objects to scores. + * At the same time the elements are added to a skip list mapping scores + * to Redis objects (so objects are sorted by scores in this "view"). */ + +/* This skiplist implementation is almost a C translation of the original + * algorithm described by William Pugh in "Skip Lists: A Probabilistic + * Alternative to Balanced Trees", modified in three ways: + * a) this implementation allows for repeated values. + * b) the comparison is not just by key (our 'score') but by satellite data. + * c) there is a back pointer, so it's a doubly linked list with the back + * pointers being only at "level 1". This allows to traverse the list + * from tail to head, useful for ZREVRANGE. */ + +zskiplistNode *zslCreateNode(int level, double score, robj *obj) { + zskiplistNode *zn = zmalloc(sizeof(*zn)); + + zn->forward = zmalloc(sizeof(zskiplistNode*) * level); + if (level > 1) + zn->span = zmalloc(sizeof(unsigned int) * (level - 1)); /* span[i-1] pairs with forward[i]; level 0 has no span entry */ + else + zn->span = NULL; + zn->score = score; + zn->obj = obj; + return zn; +} + +zskiplist *zslCreate(void) { + int j; + zskiplist *zsl; + + zsl = zmalloc(sizeof(*zsl)); + zsl->level = 1; + zsl->length = 0; + zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL); + for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) { + zsl->header->forward[j] = NULL; + + /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */ + if (j < ZSKIPLIST_MAXLEVEL-1) + zsl->header->span[j] = 0; + } + zsl->header->backward = NULL; + zsl->tail = NULL; + return zsl; +} + +void zslFreeNode(zskiplistNode *node) { + decrRefCount(node->obj); + zfree(node->forward); + zfree(node->span); + zfree(node); +} + +void zslFree(zskiplist *zsl) { + zskiplistNode *node = zsl->header->forward[0], *next; + + zfree(zsl->header->forward); /* the header was created with a NULL obj: free it by hand, not via zslFreeNode() */ + zfree(zsl->header->span); + zfree(zsl->header); + while(node) { + next = 
node->forward[0]; + zslFreeNode(node); + node = next; + } + zfree(zsl); +} + +/* Pick a random level for a new node: start at 1 and keep adding a level while a random draw falls below ZSKIPLIST_P, capped at ZSKIPLIST_MAXLEVEL. */ +int zslRandomLevel(void) { + int level = 1; + while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF)) + level += 1; + return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL; +} + +/* Insert obj with the given score. The caller must make sure the element is not already inside (see the comment in the body). */ +void zslInsert(zskiplist *zsl, double score, robj *obj) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned int rank[ZSKIPLIST_MAXLEVEL]; + int i, level; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + /* store rank that is crossed to reach the insert position */ + rank[i] = i == (zsl->level-1) ? 0 : rank[i+1]; + + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,obj) < 0))) { + rank[i] += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + update[i] = x; + } + /* we assume the key is not already inside, since we allow duplicated + * scores, and the re-insertion of score and redis object should never + * happen since the caller of zslInsert() should test in the hash table + * if the element is already inside or not. */ + level = zslRandomLevel(); + if (level > zsl->level) { + for (i = zsl->level; i < level; i++) { + rank[i] = 0; + update[i] = zsl->header; + update[i]->span[i-1] = zsl->length; + } + zsl->level = level; + } + x = zslCreateNode(level,score,obj); + for (i = 0; i < level; i++) { + x->forward[i] = update[i]->forward[i]; + update[i]->forward[i] = x; + + /* update span covered by update[i] as x is inserted here */ + if (i > 0) { + x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]); + update[i]->span[i-1] = (rank[0] - rank[i]) + 1; + } + } + + /* increment span for untouched levels */ + for (i = level; i < zsl->level; i++) { + update[i]->span[i-1]++; + } + + x->backward = (update[0] == zsl->header) ? 
NULL : update[0]; + if (x->forward[0]) + x->forward[0]->backward = x; + else + zsl->tail = x; + zsl->length++; +} + +/* Internal function used by zslDelete, zslDeleteRangeByScore and zslDeleteRangeByRank */ +void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { + int i; + for (i = 0; i < zsl->level; i++) { + if (update[i]->forward[i] == x) { + if (i > 0) { + update[i]->span[i-1] += x->span[i-1] - 1; + } + update[i]->forward[i] = x->forward[i]; + } else { + /* invariant: i > 0, because update[0]->forward[0] + * is always equal to x */ + update[i]->span[i-1] -= 1; + } + } + if (x->forward[0]) { + x->forward[0]->backward = x->backward; + } else { + zsl->tail = x->backward; + } + while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL) + zsl->level--; + zsl->length--; +} + +/* Delete an element with matching score/object from the skiplist. */ +int zslDelete(zskiplist *zsl, double score, robj *obj) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,obj) < 0))) + x = x->forward[i]; + update[i] = x; + } + /* We may have multiple elements with the same score, what we need + * is to find the element with both the right score and object. */ + x = x->forward[0]; + if (x && score == x->score && equalStringObjects(x->obj,obj)) { + zslDeleteNode(zsl, x, update); + zslFreeNode(x); + return 1; + } else { + return 0; /* not found */ + } + return 0; /* not found */ +} + +/* Delete all the elements with score between min and max from the skiplist. + * Min and max are inclusive, so every element with min <= score <= max is deleted. + * Note that this function takes the reference to the hash table view of the + * sorted set, in order to remove the elements from the hash table too. 
*/ +unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && x->forward[i]->score < min) + x = x->forward[i]; + update[i] = x; + } + /* x is now the last node with score < min (or the header): delete its + * level-0 successors one by one for as long as their score is <= max. */ + x = x->forward[0]; + while (x && x->score <= max) { + zskiplistNode *next = x->forward[0]; + zslDeleteNode(zsl, x, update); + dictDelete(dict,x->obj); /* reads x->obj, so it has to run before x is freed */ + zslFreeNode(x); + removed++; + x = next; + } + return removed; /* number of deleted elements, 0 if none was in range */ +} + +/* Delete all the elements with rank between start and end from the skiplist. + * Start and end are inclusive. Note that start and end need to be 1-based. + * Returns the number of deleted elements. */ +unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long traversed = 0, removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) { + traversed += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + update[i] = x; + } + + traversed++; /* traversed is now the rank of x->forward[0], the first candidate */ + x = x->forward[0]; + while (x && traversed <= end) { + zskiplistNode *next = x->forward[0]; + zslDeleteNode(zsl, x, update); + dictDelete(dict,x->obj); /* reads x->obj, so it has to run before x is freed */ + zslFreeNode(x); + removed++; + traversed++; + x = next; + } + return removed; +} + +/* Find the first node having a score equal or greater than the specified one. + * Returns NULL if there is no match. 
*/ +zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) { + zskiplistNode *x; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && x->forward[i]->score < score) + x = x->forward[i]; + } + /* x is the last node with a score lower than 'score' (or the header), so + * its level-0 successor is the first node with score >= 'score', or NULL. */ + return x->forward[0]; +} + +/* Find the rank for an element by both score and key. + * Returns 0 when the element cannot be found, rank otherwise. + * Note that the rank is 1-based due to the span of zsl->header to the + * first element. */ +unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) { + zskiplistNode *x; + unsigned long rank = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,o) <= 0))) { + rank += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + + /* x might be equal to zsl->header, so test if obj is non-NULL */ + if (x->obj && equalStringObjects(x->obj,o)) { + return rank; + } + } + return 0; +} + +/* Finds an element by its rank. The rank argument needs to be 1-based: rank 0 would match the header node itself, and a rank past the last element yields NULL. */ +zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) { + zskiplistNode *x; + unsigned long traversed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank) + { + traversed += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + if (traversed == rank) { + return x; + } + } + return NULL; +} + +/*----------------------------------------------------------------------------- + * Sorted set commands + *----------------------------------------------------------------------------*/ + +/* This generic command implements both ZADD and ZINCRBY. 
+ * scoreval is the score if the operation is a ZADD (doincrement == 0) or
+ * the increment if the operation is a ZINCRBY (doincrement == 1).
+ *
+ * The reply is shared.cone / shared.czero for ZADD (element added / already
+ * present) and the resulting score for ZINCRBY. */
+void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
+    robj *zsetobj;
+    zset *zs;
+    double *score;
+    double newscore;
+
+    if (isnan(scoreval)) {
+        addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
+        return;
+    }
+
+    zsetobj = lookupKeyWrite(c->db,key);
+    if (zsetobj == NULL) {
+        zsetobj = createZsetObject();
+        dbAdd(c->db,key,zsetobj);
+    } else {
+        if (zsetobj->type != REDIS_ZSET) {
+            addReply(c,shared.wrongtypeerr);
+            return;
+        }
+    }
+    zs = zsetobj->ptr;
+
+    /* Ok now since we implement both ZADD and ZINCRBY here the code
+     * needs to handle the two different conditions. It's all about setting
+     * '*score', that is, the new score to set, to the right value. */
+    score = zmalloc(sizeof(double));
+    if (doincrement) {
+        dictEntry *de;
+
+        /* Read the old score. If the element was not present starts from 0 */
+        de = dictFind(zs->dict,ele);
+        if (de) {
+            double *oldscore = dictGetEntryVal(de);
+            *score = *oldscore + scoreval;
+        } else {
+            *score = scoreval;
+        }
+        if (isnan(*score)) {
+            addReplySds(c,
+                sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
+            zfree(score);
+            /* Note that we don't need to check if the zset may be empty and
+             * should be removed here, as we can only obtain Nan as score if
+             * there was already an element in the sorted set. */
+            return;
+        }
+    } else {
+        *score = scoreval;
+    }
+    /* Keep a copy of the final score: when the score does not change the
+     * heap cell is released below, before the reply is generated. */
+    newscore = *score;
+
+    /* What follows is a simple remove and re-insert operation that is common
+     * to both ZADD and ZINCRBY... */
+    if (dictAdd(zs->dict,ele,score) == DICT_OK) {
+        /* case 1: New element */
+        incrRefCount(ele); /* added to hash */
+        zslInsert(zs->zsl,*score,ele);
+        incrRefCount(ele); /* added to skiplist */
+        server.dirty++;
+        if (doincrement)
+            addReplyDouble(c,newscore);
+        else
+            addReply(c,shared.cone);
+    } else {
+        dictEntry *de;
+        double *oldscore;
+
+        /* case 2: Score update operation */
+        de = dictFind(zs->dict,ele);
+        redisAssert(de != NULL);
+        oldscore = dictGetEntryVal(de);
+        if (*score != *oldscore) {
+            int deleted;
+
+            /* Remove and insert the element in the skip list with new score */
+            deleted = zslDelete(zs->zsl,*oldscore,ele);
+            redisAssert(deleted != 0);
+            zslInsert(zs->zsl,*score,ele);
+            incrRefCount(ele);
+            /* Update the score in the hash table */
+            dictReplace(zs->dict,ele,score);
+            server.dirty++;
+        } else {
+            zfree(score);
+        }
+        /* 'score' may have been freed just above: reply with the copy. */
+        if (doincrement)
+            addReplyDouble(c,newscore);
+        else
+            addReply(c,shared.czero);
+    }
+}
+
+/* ZADD: argv[1] is the key, argv[2] the score, argv[3] the element. */
+void zaddCommand(redisClient *c) {
+    double scoreval;
+
+    if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
+    zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
+}
+
+/* ZINCRBY: argv[1] is the key, argv[2] the increment, argv[3] the element. */
+void zincrbyCommand(redisClient *c) {
+    double scoreval;
+
+    if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
+    zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
+}
+
+/* ZREM: remove argv[2] from the sorted set at argv[1]. Replies with
+ * shared.cone when the element was removed, shared.czero otherwise. The key
+ * is deleted from the db when the set becomes empty. */
+void zremCommand(redisClient *c) {
+    robj *zsetobj;
+    zset *zs;
+    dictEntry *de;
+    double *oldscore;
+    int deleted;
+
+    if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,zsetobj,REDIS_ZSET)) return;
+
+    zs = zsetobj->ptr;
+    de = dictFind(zs->dict,c->argv[2]);
+    if (de == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+    /* Delete from the skiplist */
+    oldscore = dictGetEntryVal(de);
+    deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
+    redisAssert(deleted != 0);
+
+    /* Delete from the hash table */
+    dictDelete(zs->dict,c->argv[2]);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+    addReply(c,shared.cone);
+}
+
+/* ZREMRANGEBYSCORE: remove the elements selected by the min (argv[2]) and
+ * max (argv[3]) scores; replies with the number of removed elements. */
+void zremrangebyscoreCommand(redisClient *c) {
+    double min;
+    double max;
+    long deleted;
+    robj *zsetobj;
+    zset *zs;
+
+    if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
+        (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
+
+    if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,zsetobj,REDIS_ZSET)) return;
+
+    zs = zsetobj->ptr;
+    deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty += deleted;
+    addReplyLongLong(c,deleted);
+}
+
+/* ZREMRANGEBYRANK: remove the elements between the 0-based ranks argv[2]
+ * and argv[3]; negative ranks count from the end of the set. Replies with
+ * the number of removed elements. */
+void zremrangebyrankCommand(redisClient *c) {
+    long start;
+    long end;
+    int llen;
+    long deleted;
+    robj *zsetobj;
+    zset *zs;
+
+    if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
+        (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
+
+    if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,zsetobj,REDIS_ZSET)) return;
+    zs = zsetobj->ptr;
+    llen = zs->zsl->length;
+
+    /* convert negative indexes */
+    if (start < 0) start = llen+start;
+    if (end < 0) end = llen+end;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || start >= llen) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (end >= llen) end = llen-1;
+
+    /* increment start and end because zsl*Rank functions
+     * use 1-based rank */
+    deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty += deleted;
+    addReplyLongLong(c, deleted);
+}
+
+/* One input of ZUNIONSTORE/ZINTERSTORE: the dictionary of the source key
+ * (NULL when the key does not exist) and the weight applied to its scores. */
+typedef struct {
+    dict *dict;
+    double weight;
+} zsetopsrc;
+
+/* qsort() callback ordering the sources by increasing cardinality; a NULL
+ * dictionary counts as empty. The sizes are compared instead of subtracted
+ * because an unsigned long difference does not fit the int return value. */
+int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
+    zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
+    unsigned long size1, size2;
+    size1 = d1->dict ? dictSize(d1->dict) : 0;
+    size2 = d2->dict ? dictSize(d2->dict) : 0;
+    return (size1 > size2) - (size1 < size2);
+}
+
+#define REDIS_AGGR_SUM 1
+#define REDIS_AGGR_MIN 2
+#define REDIS_AGGR_MAX 3
+/* Score of a source entry: entries with a NULL value count as 1.0. */
+#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
+
+/* Fold 'val' into '*target' according to the aggregate type. */
+inline static void zunionInterAggregate(double *target, double val, int aggregate) {
+    if (aggregate == REDIS_AGGR_SUM) {
+        *target = *target + val;
+    } else if (aggregate == REDIS_AGGR_MIN) {
+        *target = val < *target ? val : *target;
+    } else if (aggregate == REDIS_AGGR_MAX) {
+        *target = val > *target ? val : *target;
+    } else {
+        /* safety net */
+        redisPanic("Unknown ZUNION/INTER aggregate type");
+    }
+}
+
+/* Implements ZUNIONSTORE and ZINTERSTORE ('op' is REDIS_OP_UNION or
+ * REDIS_OP_INTER). argv[2] is the number of input keys, followed by the
+ * keys and by the optional WEIGHTS and AGGREGATE arguments. The result
+ * replaces 'dstkey'; the reply is the cardinality of the result. */
+void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
+    int i, j, setnum;
+    int aggregate = REDIS_AGGR_SUM;
+    zsetopsrc *src;
+    robj *dstobj;
+    zset *dstzset;
+    dictIterator *di;
+    dictEntry *de;
+
+    /* expect setnum input keys to be given */
+    setnum = atoi(c->argv[2]->ptr);
+    if (setnum < 1) {
+        addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
+        return;
+    }
+
+    /* test if the expected number of keys would overflow; written as a
+     * subtraction so that a huge setnum cannot overflow the addition */
+    if (setnum > c->argc-3) {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+
+    /* read keys to be used for input */
+    src = zmalloc(sizeof(zsetopsrc) * setnum);
+    for (i = 0, j = 3; i < setnum; i++, j++) {
+        robj *obj = lookupKeyWrite(c->db,c->argv[j]);
+        if (!obj) {
+            src[i].dict = NULL;
+        } else {
+            if (obj->type == REDIS_ZSET) {
+                src[i].dict = ((zset*)obj->ptr)->dict;
+            } else if (obj->type == REDIS_SET) {
+                src[i].dict = (obj->ptr);
+            } else {
+                zfree(src);
+                addReply(c,shared.wrongtypeerr);
+                return;
+            }
+        }
+
+        /* default all weights to 1 */
+        src[i].weight = 1.0;
+    }
+
+    /* parse optional extra arguments */
+    if (j < c->argc) {
+        int remaining = c->argc - j;
+
+        while (remaining) {
+            if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
+                j++; remaining--;
+                for (i = 0; i < setnum; i++, j++, remaining--) {
+                    if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK) {
+                        /* the error reply was already sent: just release
+                         * the sources array before leaving */
+                        zfree(src);
+                        return;
+                    }
+                }
+            } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
+                j++; remaining--;
+                if (!strcasecmp(c->argv[j]->ptr,"sum")) {
+                    aggregate = REDIS_AGGR_SUM;
+                } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
+                    aggregate = REDIS_AGGR_MIN;
+                } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
+                    aggregate = REDIS_AGGR_MAX;
+                } else {
+                    zfree(src);
+                    addReply(c,shared.syntaxerr);
+                    return;
+                }
+                j++; remaining--;
+            } else {
+                zfree(src);
+                addReply(c,shared.syntaxerr);
+                return;
+            }
+        }
+    }
+
+    /* sort sets from the smallest to largest, this will improve our
+     * algorithm's performance */
+    qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
+
+    dstobj = createZsetObject();
+    dstzset = dstobj->ptr;
+
+    if (op == REDIS_OP_INTER) {
+        /* skip going over all entries if the smallest zset is NULL or empty */
+        if (src[0].dict && dictSize(src[0].dict) > 0) {
+            /* precondition: as src[0].dict is non-empty and the zsets are ordered
+             * from small to large, all src[i > 0].dict are non-empty too */
+            di = dictGetIterator(src[0].dict);
+            while((de = dictNext(di)) != NULL) {
+                double *score = zmalloc(sizeof(double)), value;
+                *score = src[0].weight * zunionInterDictValue(de);
+
+                for (j = 1; j < setnum; j++) {
+                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
+                    if (other) {
+                        value = src[j].weight * zunionInterDictValue(other);
+                        zunionInterAggregate(score, value, aggregate);
+                    } else {
+                        break;
+                    }
+                }
+
+                /* skip entry when not present in every source dict */
+                if (j != setnum) {
+                    zfree(score);
+                } else {
+                    robj *o = dictGetEntryKey(de);
+                    dictAdd(dstzset->dict,o,score);
+                    incrRefCount(o); /* added to dictionary */
+                    zslInsert(dstzset->zsl,*score,o);
+                    incrRefCount(o); /* added to skiplist */
+                }
+            }
+            dictReleaseIterator(di);
+        }
+    } else if (op == REDIS_OP_UNION) {
+        for (i = 0; i < setnum; i++) {
+            if (!src[i].dict) continue;
+
+            di = dictGetIterator(src[i].dict);
+            while((de = dictNext(di)) != NULL) {
+                /* skip key when already processed */
+                if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
+
+                double *score = zmalloc(sizeof(double)), value;
+                *score = src[i].weight * zunionInterDictValue(de);
+
+                /* because the zsets are sorted by size, its only possible
+                 * for sets at larger indices to hold this entry */
+                for (j = (i+1); j < setnum; j++) {
+                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
+                    if (other) {
+                        value = src[j].weight * zunionInterDictValue(other);
+                        zunionInterAggregate(score, value, aggregate);
+                    }
+                }
+
+                robj *o = dictGetEntryKey(de);
+                dictAdd(dstzset->dict,o,score);
+                incrRefCount(o); /* added to dictionary */
+                zslInsert(dstzset->zsl,*score,o);
+                incrRefCount(o); /* added to skiplist */
+            }
+            dictReleaseIterator(di);
+        }
+    } else {
+        /* unknown operator */
+        redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
+    }
+
+    dbDelete(c->db,dstkey);
+    if (dstzset->zsl->length) {
+        dbAdd(c->db,dstkey,dstobj);
+        addReplyLongLong(c, dstzset->zsl->length);
+        server.dirty++;
+    } else {
+        decrRefCount(dstobj);
+        addReply(c, shared.czero);
+    }
+    zfree(src);
+}
+
+void zunionstoreCommand(redisClient *c) {
+    zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
+}
+
+void zinterstoreCommand(redisClient *c) {
+    zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
+}
+
+/* Implements ZRANGE (reverse == 0) and ZREVRANGE (reverse != 0): replies
+ * with the elements between the 0-based ranks argv[2] and argv[3], with
+ * their scores when the optional fifth argument is WITHSCORES. */
+void zrangeGenericCommand(redisClient *c, int reverse) {
+    robj *o;
+    long start;
+    long end;
+    int withscores = 0;
+    int llen;
+    int rangelen, j;
+    zset *zsetobj;
+    zskiplist *zsl;
+    zskiplistNode *ln;
+    robj *ele;
+
+    if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
+        (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
+
+    if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
+        withscores = 1;
+    } else if (c->argc >= 5) {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+         || checkType(c,o,REDIS_ZSET)) return;
+    zsetobj = o->ptr;
+    zsl = zsetobj->zsl;
+    llen = zsl->length;
+
+    /* convert negative indexes */
+    if (start < 0) start = llen+start;
+    if (end < 0) end = llen+end;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || start >= llen) {
+        /* Out of range start or start > end result in empty list */
+        addReply(c,shared.emptymultibulk);
+        return;
+    }
+    if (end >= llen) end = llen-1;
+    rangelen = (end-start)+1;
+
+    /* check if starting point is trivial, before searching
+     * the element in log(N) time */
+    if (reverse) {
+        ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
+    } else {
+        ln = start == 0 ?
+            zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
+    }
+
+    /* Return the result in form of a multi-bulk reply */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
+        withscores ? (rangelen*2) : rangelen));
+    for (j = 0; j < rangelen; j++) {
+        ele = ln->obj;
+        addReplyBulk(c,ele);
+        if (withscores)
+            addReplyDouble(c,ln->score);
+        ln = reverse ? ln->backward : ln->forward[0];
+    }
+}
+
+void zrangeCommand(redisClient *c) {
+    zrangeGenericCommand(c,0);
+}
+
+void zrevrangeCommand(redisClient *c) {
+    zrangeGenericCommand(c,1);
+}
+
+/* This command implements both ZRANGEBYSCORE and ZCOUNT.
+ * If justcount is non-zero, just the count is returned. */
+void genericZrangebyscoreCommand(redisClient *c, int justcount) {
+    robj *o;
+    double min, max;
+    int minex = 0, maxex = 0; /* are min or max exclusive? */
+    int offset = 0, limit = -1;
+    int withscores = 0;
+    int badsyntax = 0;
+
+    /* Parse the min-max interval. If one of the values is prefixed
+     * by the "(" character, it's considered "open". For instance
+     * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
+     * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
+    if (((char*)c->argv[2]->ptr)[0] == '(') {
+        min = strtod((char*)c->argv[2]->ptr+1,NULL);
+        minex = 1;
+    } else {
+        min = strtod(c->argv[2]->ptr,NULL);
+    }
+    if (((char*)c->argv[3]->ptr)[0] == '(') {
+        max = strtod((char*)c->argv[3]->ptr+1,NULL);
+        maxex = 1;
+    } else {
+        max = strtod(c->argv[3]->ptr,NULL);
+    }
+
+    /* Parse "WITHSCORES": note that if the command was called with
+     * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
+     * enter the following paths to parse WITHSCORES and LIMIT. */
+    if (c->argc == 5 || c->argc == 8) {
+        if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
+            withscores = 1;
+        else
+            badsyntax = 1;
+    }
+    if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
+        badsyntax = 1;
+    if (badsyntax) {
+        addReplySds(c,
+            sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
+        return;
+    }
+
+    /* Parse "LIMIT" */
+    if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
+        addReply(c,shared.syntaxerr);
+        return;
+    } else if (c->argc == (7 + withscores)) {
+        offset = atoi(c->argv[5]->ptr);
+        limit = atoi(c->argv[6]->ptr);
+        if (offset < 0) offset = 0;
+    }
+
+    /* Ok, lookup the key and get the range */
+    o = lookupKeyRead(c->db,c->argv[1]);
+    if (o == NULL) {
+        addReply(c,justcount ? shared.czero : shared.emptymultibulk);
+    } else {
+        if (o->type != REDIS_ZSET) {
+            addReply(c,shared.wrongtypeerr);
+        } else {
+            zset *zsetobj = o->ptr;
+            zskiplist *zsl = zsetobj->zsl;
+            zskiplistNode *ln;
+            robj *ele, *lenobj = NULL;
+            unsigned long rangelen = 0;
+
+            /* Get the first node with the score >= min, or with
+             * score > min if 'minex' is true. */
+            ln = zslFirstWithScore(zsl,min);
+            while (minex && ln && ln->score == min) ln = ln->forward[0];
+
+            if (ln == NULL) {
+                /* No element matching the specified interval */
+                addReply(c,justcount ? shared.czero : shared.emptymultibulk);
+                return;
+            }
+
+            /* We don't know in advance how many matching elements there
+             * are in the list, so we push this object that will represent
+             * the multi-bulk length in the output buffer, and will "fix"
+             * it later */
+            if (!justcount) {
+                lenobj = createObject(REDIS_STRING,NULL);
+                addReply(c,lenobj);
+                decrRefCount(lenobj);
+            }
+
+            while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
+                if (offset) {
+                    offset--;
+                    ln = ln->forward[0];
+                    continue;
+                }
+                if (limit == 0) break;
+                if (!justcount) {
+                    ele = ln->obj;
+                    addReplyBulk(c,ele);
+                    if (withscores)
+                        addReplyDouble(c,ln->score);
+                }
+                ln = ln->forward[0];
+                rangelen++;
+                if (limit > 0) limit--;
+            }
+            if (justcount) {
+                addReplyLongLong(c,(long)rangelen);
+            } else {
+                lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
+                     withscores ? (rangelen*2) : rangelen);
+            }
+        }
+    }
+}
+
+void zrangebyscoreCommand(redisClient *c) {
+    genericZrangebyscoreCommand(c,0);
+}
+
+void zcountCommand(redisClient *c) {
+    genericZrangebyscoreCommand(c,1);
+}
+
+/* ZCARD: reply with the number of elements of the sorted set at argv[1]. */
+void zcardCommand(redisClient *c) {
+    robj *o;
+    zset *zs;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_ZSET)) return;
+
+    zs = o->ptr;
+    addReplyUlong(c,zs->zsl->length);
+}
+
+/* ZSCORE: reply with the score of argv[2] in the sorted set at argv[1], or
+ * with a null bulk when the key or the element does not exist. */
+void zscoreCommand(redisClient *c) {
+    robj *o;
+    zset *zs;
+    dictEntry *de;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,o,REDIS_ZSET)) return;
+
+    zs = o->ptr;
+    de = dictFind(zs->dict,c->argv[2]);
+    if (!de) {
+        addReply(c,shared.nullbulk);
+    } else {
+        double *score = dictGetEntryVal(de);
+
+        addReplyDouble(c,*score);
+    }
+}
+
+/* Implements ZRANK (reverse == 0) and ZREVRANK (reverse != 0): replies with
+ * the 0-based rank of argv[2], or with a null bulk when it is not found. */
+void zrankGenericCommand(redisClient *c, int reverse) {
+    robj *o;
+    zset *zs;
+    zskiplist *zsl;
+    dictEntry *de;
+    unsigned long rank;
+    double *score;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,o,REDIS_ZSET)) return;
+
+    zs = o->ptr;
+    zsl = zs->zsl;
+    de = dictFind(zs->dict,c->argv[2]);
+    if (!de) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    score = dictGetEntryVal(de);
+    rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
+    if (rank) {
+        if (reverse) {
+            addReplyLongLong(c, zsl->length - rank);
+        } else {
+            addReplyLongLong(c, rank-1);
+        }
+    } else {
+        addReply(c,shared.nullbulk);
+    }
+}
+
+void zrankCommand(redisClient *c) {
+    zrankGenericCommand(c, 0);
+}
+
+void zrevrankCommand(redisClient *c) {
+    zrankGenericCommand(c, 1);
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 000000000..cc2794f6a
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,223 @@
+#include "redis.h"
+#include <ctype.h>
+#include <limits.h>
+
+/* Glob-style pattern matching.
+ *
+ * Returns 1 when the first 'stringLen' bytes of 'string' match the first
+ * 'patternLen' bytes of 'pattern', 0 otherwise. Supported syntax: '*', '?',
+ * '[...]' classes (with '^' negation and 'a-z' ranges) and '\' escapes.
+ * If 'nocase' is non-zero the comparison ignores case. */
+int stringmatchlen(const char *pattern, int patternLen,
+        const char *string, int stringLen, int nocase)
+{
+    while(patternLen) {
+        switch(pattern[0]) {
+        case '*':
+            while (pattern[1] == '*') {
+                pattern++;
+                patternLen--;
+            }
+            if (patternLen == 1)
+                return 1; /* match */
+            while(stringLen) {
+                if (stringmatchlen(pattern+1, patternLen-1,
+                            string, stringLen, nocase))
+                    return 1; /* match */
+                string++;
+                stringLen--;
+            }
+            return 0; /* no match */
+            break;
+        case '?':
+            if (stringLen == 0)
+                return 0; /* no match */
+            string++;
+            stringLen--;
+            break;
+        case '[':
+        {
+            int not, match;
+
+            pattern++;
+            patternLen--;
+            not = pattern[0] == '^';
+            if (not) {
+                pattern++;
+                patternLen--;
+            }
+            match = 0;
+            while(1) {
+                if (pattern[0] == '\\') {
+                    pattern++;
+                    patternLen--;
+                    if (pattern[0] == string[0])
+                        match = 1;
+                } else if (pattern[0] == ']') {
+                    break;
+                } else if (patternLen == 0) {
+                    pattern--;
+                    patternLen++;
+                    break;
+                } else if (pattern[1] == '-' && patternLen >= 3) {
+                    int start = pattern[0];
+                    int end = pattern[2];
+                    int c = string[0];
+                    if (start > end) {
+                        int t = start;
+                        start = end;
+                        end = t;
+                    }
+                    if (nocase) {
+                        start = tolower(start);
+                        end = tolower(end);
+                        c = tolower(c);
+                    }
+                    pattern += 2;
+                    patternLen -= 2;
+                    if (c >= start && c <= end)
+                        match = 1;
+                } else {
+                    if (!nocase) {
+                        if (pattern[0] == string[0])
+                            match = 1;
+                    } else {
+                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
+                            match = 1;
+                    }
+                }
+                pattern++;
+                patternLen--;
+            }
+            if (not)
+                match = !match;
+            if (!match)
+                return 0; /* no match */
+            string++;
+            stringLen--;
+            break;
+        }
+        case '\\':
+            if (patternLen >= 2) {
+                pattern++;
+                patternLen--;
+            }
+            /* fall through */
+        default:
+            if (!nocase) {
+                if (pattern[0] != string[0])
+                    return 0; /* no match */
+            } else {
+                if (tolower((int)pattern[0]) != tolower((int)string[0]))
+                    return 0; /* no match */
+            }
+            string++;
+            stringLen--;
+            break;
+        }
+        pattern++;
+        patternLen--;
+        if (stringLen == 0) {
+            while(*pattern == '*') {
+                pattern++;
+                patternLen--;
+            }
+            break;
+        }
+    }
+    if (patternLen == 0 && stringLen == 0)
+        return 1;
+    return 0;
+}
+
+/* Same as stringmatchlen() for nul-terminated pattern and string. */
+int stringmatch(const char *pattern, const char *string, int nocase) {
+    return stringmatchlen(pattern,strlen(pattern),string,strlen(string),nocase);
+}
+
+/* Convert a string representing an amount of memory into the number of
+ * bytes. The accepted units are b, k, kb, m, mb, g and gb (case
+ * insensitive): "k", "m" and "g" are powers of 1000 while "kb", "mb" and
+ * "gb" are powers of 1024, so for instance memtoll("1gb") will return
+ * 1073741824 that is (1024*1024*1024).
+ *
+ * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
+ * otherwise. */
+long long memtoll(const char *p, int *err) {
+    const char *u;
+    char buf[128];
+    long mul; /* unit multiplier */
+    long long val;
+    unsigned int digits;
+
+    if (err) *err = 0;
+    /* Search the first non digit character. The cast is needed because
+     * isdigit() is only defined for unsigned char values and EOF. */
+    u = p;
+    if (*u == '-') u++;
+    while(*u && isdigit((unsigned char)*u)) u++;
+    if (*u == '\0' || !strcasecmp(u,"b")) {
+        mul = 1;
+    } else if (!strcasecmp(u,"k")) {
+        mul = 1000;
+    } else if (!strcasecmp(u,"kb")) {
+        mul = 1024;
+    } else if (!strcasecmp(u,"m")) {
+        mul = 1000*1000;
+    } else if (!strcasecmp(u,"mb")) {
+        mul = 1024*1024;
+    } else if (!strcasecmp(u,"g")) {
+        mul = 1000L*1000*1000;
+    } else if (!strcasecmp(u,"gb")) {
+        mul = 1024L*1024*1024;
+    } else {
+        if (err) *err = 1;
+        mul = 1;
+    }
+    digits = u-p;
+    if (digits >= sizeof(buf)) {
+        if (err) *err = 1;
+        return LLONG_MAX;
+    }
+    memcpy(buf,p,digits);
+    buf[digits] = '\0';
+    val = strtoll(buf,NULL,10);
+    return val*mul;
+}
+
+/* Convert a long long into a string. Returns the number of
+ * characters needed to represent the number, that can be shorter if passed
+ * buffer length is not enough to store the whole number. */
+int ll2string(char *s, size_t len, long long value) {
+    char buf[32], *p;
+    unsigned long long v;
+    size_t l;
+
+    if (len == 0) return 0;
+    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
+    v = (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
+    p = buf+31; /* point to the last character */
+    do {
+        *p-- = '0'+(v%10);
+        v /= 10;
+    } while(v);
+    if (value < 0) *p-- = '-';
+    p++;
+    l = 32-(p-buf);
+    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
+    memcpy(s,p,l);
+    s[l] = '\0';
+    return l;
+}
+
+/* Check if the nul-terminated string 's' can be represented by a long
+ * (that is, is a number that fits into long without any other space or
+ * character before or after the digits).
+ *
+ * If so, the function returns REDIS_OK and *longval is set to the value
+ * of the number.
Otherwise REDIS_ERR is returned */
+int isStringRepresentableAsLong(sds s, long *longval) {
+    char digits[32];
+    char *end;
+    long parsed = strtol(s, &end, 10);
+    int n;
+
+    /* Anything left after the number makes the string not representable. */
+    if (*end != '\0') return REDIS_ERR;
+
+    /* Print the number back: unless the text is exactly the same (length
+     * and bytes) the string cannot be encoded as an integer. */
+    n = ll2string(digits,32,parsed);
+    if ((unsigned)n != sdslen(s)) return REDIS_ERR;
+    if (memcmp(digits,s,n) != 0) return REDIS_ERR;
+
+    if (longval != NULL) *longval = parsed;
+    return REDIS_OK;
+}
diff --git a/src/version.h b/src/version.h
new file mode 100644
index 000000000..86d422474
--- /dev/null
+++ b/src/version.h
@@ -0,0 +1 @@
+#define REDIS_VERSION "2.1.1"
diff --git a/src/vm.c b/src/vm.c
new file mode 100644
index 000000000..1aaa57eb5
--- /dev/null
+++ b/src/vm.c
@@ -0,0 +1,1126 @@
+#include "redis.h"
+
+#include
+#include
+#include
+#include
+
+/* NOTE(review): the targets of the four #include lines above appear to have
+ * been lost when this patch was extracted; restore them from the original
+ * commit before building.
+ *
+ * Virtual Memory is composed mainly of two subsystems:
+ * - Blocking Virtual Memory
+ * - Threaded Virtual Memory I/O
+ * The two parts are not fully decoupled, but functions are split among two
+ * different sections of the source code (delimited by comments) in order to
+ * make more clear what functionality is about the blocking VM and what about
+ * the threaded (not blocking) VM.
+ *
+ * Redis VM design:
+ *
+ * Redis VM is a blocking VM (one that blocks reading swapped values from
+ * disk into memory when a value swapped out is needed in memory) that is made
+ * unblocking by trying to examine the command argument vector in order to
+ * load in background values that will likely be needed in order to exec
+ * the command. The command is executed only once all the relevant keys
+ * are loaded into memory.
+ *
+ * This basically is almost as simple as a blocking VM, but almost as parallel
+ * as a fully non-blocking VM.
+ */
+
+/* =================== Virtual Memory - Blocking Side ====================== */
+
+/* Create a VM pointer object.
This kind of objects are used in place of
+ * values in the key -> value hash table, for swapped out objects.
+ *
+ * The new pointer has type REDIS_VMPOINTER, storage REDIS_VM_SWAPPED and
+ * remembers 'vtype'; the page and usedpages fields are not set here. */
+vmpointer *createVmPointer(int vtype) {
+    vmpointer *vp = zmalloc(sizeof(vmpointer));
+
+    vp->type = REDIS_VMPOINTER;
+    vp->storage = REDIS_VM_SWAPPED;
+    vp->vtype = vtype;
+    return vp;
+}
+
+void vmInit(void) { /* open, lock and size the swap file, allocate the page bitmap, set up the threaded I/O state; exits the process on failure */
+    off_t totsize;
+    int pipefds[2];
+    size_t stacksize;
+    struct flock fl;
+
+    if (server.vm_max_threads != 0)
+        zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
+
+    redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
+    /* Try to open the old swap file, otherwise create it */
+    if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
+        server.vm_fp = fopen(server.vm_swap_file,"w+b");
+    }
+    if (server.vm_fp == NULL) {
+        redisLog(REDIS_WARNING,
+            "Can't open the swap file: %s. Exiting.",
+            strerror(errno));
+        exit(1);
+    }
+    server.vm_fd = fileno(server.vm_fp);
+    /* Lock the swap file for writing, this is useful in order to avoid
+     * another instance to use the same swap file for a config error. */
+    fl.l_type = F_WRLCK;
+    fl.l_whence = SEEK_SET;
+    fl.l_start = fl.l_len = 0;
+    if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
+        redisLog(REDIS_WARNING,
+            "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
+        exit(1);
+    }
+    /* Initialize */
+    server.vm_next_page = 0;
+    server.vm_near_pages = 0;
+    server.vm_stats_used_pages = 0;
+    server.vm_stats_swapped_objects = 0;
+    server.vm_stats_swapouts = 0;
+    server.vm_stats_swapins = 0;
+    totsize = server.vm_pages*server.vm_page_size;
+    redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize); /* NOTE(review): totsize is an off_t passed to %lld without a cast */
+    if (ftruncate(server.vm_fd,totsize) == -1) {
+        redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
+            strerror(errno));
+        exit(1);
+    } else {
+        redisLog(REDIS_NOTICE,"Swap file allocated with success");
+    }
+    server.vm_bitmap = zmalloc((server.vm_pages+7)/8); /* one bit per page, rounded up to whole bytes */
+    redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
+        (long long) (server.vm_pages+7)/8, server.vm_pages);
+    memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
+
+    /* Initialize threaded I/O (used by Virtual Memory) */
+    server.io_newjobs = listCreate();
+    server.io_processing = listCreate();
+    server.io_processed = listCreate();
+    server.io_ready_clients = listCreate();
+    pthread_mutex_init(&server.io_mutex,NULL);
+    pthread_mutex_init(&server.obj_freelist_mutex,NULL);
+    pthread_mutex_init(&server.io_swapfile_mutex,NULL);
+    server.io_active_threads = 0;
+    if (pipe(pipefds) == -1) {
+        redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
+            ,strerror(errno));
+        exit(1);
+    }
+    server.io_ready_pipe_read = pipefds[0];
+    server.io_ready_pipe_write = pipefds[1];
+    redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+    /* LZF requires a lot of stack */
+    pthread_attr_init(&server.io_threads_attr);
+    pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
+    while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2; /* NOTE(review): never terminates if the reported default stack size is 0 */
+    pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
+    /* Listen for events in the threaded I/O pipe */
+    if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
+        vmThreadedIOCompletedJob, NULL) == AE_ERR)
+        oom("creating file event");
+}
+
+/* Mark the page as used */
+void vmMarkPageUsed(off_t page) {
+    off_t byte = page/8;
+    int bit = page&7;
+    redisAssert(vmFreePage(page) == 1);
+    server.vm_bitmap[byte] |= 1<= server.vm_pages) { /* NOTE(review): text appears to be missing on this line: the end of vmMarkPageUsed() and the beginning of the page search loop below were fused together by the extraction and must be restored from the original commit */
+            this -= server.vm_pages;
+            if (this == 0) {
+                /* Just overflowed, what we found on tail is no longer
+                 * interesting, as it's no longer contiguous. */
+                numfree = 0;
+            }
+        }
+        if (vmFreePage(this)) {
+            /* This is a free page */
+            numfree++;
+            /* Already got N free pages? Return to the caller, with success */
+            if (numfree == n) {
+                *first = this-(n-1);
+                server.vm_next_page = this+1;
+                redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
+                return REDIS_OK;
+            }
+        } else {
+            /* The current one is not a free page */
+            numfree = 0;
+        }
+
+        /* Fast-forward if the current page is not free and we already
+         * searched enough near this place. */
+        since_jump++;
+        if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
+            offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
+            since_jump = 0;
+            /* Note that even if we rewind after the jump, we don't need
+             * to make sure numfree is set to zero as we only jump *if* it
+             * is set to zero. */
+        } else {
+            /* Otherwise just check the next page */
+            offset++;
+        }
+    }
+    return REDIS_ERR;
+}
+
+/* Write the specified object at the specified page of the swap file.
+ *
+ * The swap file mutex is taken only when server.vm_enabled is set. Returns
+ * REDIS_ERR if the seek fails, REDIS_OK otherwise. */
+int vmWriteObjectOnSwap(robj *o, off_t page) {
+    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+        if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+        redisLog(REDIS_WARNING,
+            "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    rdbSaveObject(server.vm_fp,o); /* NOTE(review): the results of rdbSaveObject() and fflush() are not checked */
+    fflush(server.vm_fp);
+    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+    return REDIS_OK;
+}
+
+/* Transfers the 'val' object to disk. Store all the information
+ * a 'vmpointer' object containing all the information needed to load the
+ * object back later is returned.
+ *
+ * If we can't find enough contiguous empty pages to swap the object on disk
+ * NULL is returned.
*/
+vmpointer *vmSwapObjectBlocking(robj *val) {
+    off_t pages = rdbSavedObjectPages(val,NULL);
+    off_t page;
+    vmpointer *vp;
+
+    /* Only in-memory, unshared objects can be swapped out. */
+    redisAssert(val->storage == REDIS_VM_MEMORY);
+    redisAssert(val->refcount == 1);
+    if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
+    if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
+
+    vp = createVmPointer(val->type);
+    vp->page = page;
+    vp->usedpages = pages;
+    decrRefCount(val); /* Deallocate the object from memory. */
+    vmMarkPagesUsed(page,pages);
+    /* 'val' is only printed as an address below, it is not dereferenced.
+     * The page numbers are cast to long long to match the %lld specifiers. */
+    redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
+        (void*) val,
+        (long long) page, (long long) pages);
+    server.vm_stats_swapped_objects++;
+    server.vm_stats_swapouts++;
+    return vp;
+}
+
+/* Read back from the swap file the object of the given type stored starting
+ * at 'page' and return it.
+ *
+ * The swap file mutex is taken only when server.vm_enabled is set. A failed
+ * seek or load is not recoverable: the error is logged and the process
+ * terminates with _exit(1). */
+robj *vmReadObjectFromSwap(off_t page, int type) {
+    robj *o;
+
+    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+        redisLog(REDIS_WARNING,
+            "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
+            strerror(errno));
+        _exit(1);
+    }
+    o = rdbLoadObject(type,server.vm_fp);
+    if (o == NULL) {
+        redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
+        _exit(1);
+    }
+    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+    return o;
+}
+
+/* Load the specified object from swap to memory.
+ * The newly allocated object is returned.
+ *
+ * If preview is true the unserialized object is returned to the caller but
+ * the pages are not marked as freed, nor the vp object is freed.
*/
+robj *vmGenericLoadObject(vmpointer *vp, int preview) {
+    robj *val;
+
+    redisAssert(vp->type == REDIS_VMPOINTER &&
+        (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
+    val = vmReadObjectFromSwap(vp->page,vp->vtype);
+    if (!preview) {
+        redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
+        vmMarkPagesFree(vp->page,vp->usedpages);
+        zfree(vp);
+        server.vm_stats_swapped_objects--;
+    } else {
+        redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
+    }
+    server.vm_stats_swapins++;
+    return val;
+}
+
+/* Plain object loading, from swap to memory.
+ *
+ * 'o' is actually a redisVmPointer structure that will be freed by the call.
+ * The return value is the loaded object. */
+robj *vmLoadObject(robj *o) {
+    /* If we are loading the object in background, stop it, we
+     * need to load this object synchronously ASAP. */
+    if (o->storage == REDIS_VM_LOADING)
+        vmCancelThreadedIOJob(o);
+    return vmGenericLoadObject((vmpointer*)o,0);
+}
+
+/* Just load the value on disk, without modifying the key.
+ * This is useful when we want to perform some operation on the value
+ * without really bringing it from swap to memory, like while saving the
+ * dataset or rewriting the append only log. */
+robj *vmPreviewObject(robj *o) {
+    return vmGenericLoadObject((vmpointer*)o,1);
+}
+
+/* How good a candidate is this object for swapping?
+ * The better candidate it is, the greater the returned value.
+ *
+ * Currently we try to perform a fast estimation of the object size in
+ * memory, and combine it with aging information.
+ *
+ * Basically swappability = idle-time * log(estimated size)
+ *
+ * Bigger objects are preferred over smaller objects, but not
+ * proportionally, this is why we use the logarithm. This algorithm is
+ * just a first try and will probably be tuned later.
+ *
+ * For aggregate types the size of a single sampled element is multiplied
+ * by the number of elements; 0 is returned when the object has no age. */
+double computeObjectSwappability(robj *o) {
+    /* actual age can be >= minage, but not < minage. As we use wrapping
+     * 21 bit clocks with minutes resolution for the LRU. */
+    time_t minage = abs(server.lruclock - o->lru);
+    long asize = 0, elesize;
+    robj *ele;
+    list *l;
+    listNode *ln;
+    dict *d;
+    struct dictEntry *de;
+    int z;
+
+    if (minage <= 0) return 0;
+    switch(o->type) {
+    case REDIS_STRING:
+        if (o->encoding != REDIS_ENCODING_RAW) {
+            asize = sizeof(*o);
+        } else {
+            asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
+        }
+        break;
+    case REDIS_LIST:
+        if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+            asize = sizeof(*o)+ziplistSize(o->ptr);
+        } else {
+            l = o->ptr;
+            ln = listFirst(l);
+            asize = sizeof(list);
+            if (ln) {
+                ele = ln->value;
+                elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                                (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                asize += (sizeof(listNode)+elesize)*listLength(l);
+            }
+        }
+        break;
+    case REDIS_SET:
+    case REDIS_ZSET:
+        z = (o->type == REDIS_ZSET);
+        d = z ? ((zset*)o->ptr)->dict : o->ptr;
+
+        asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+        if (z) asize += sizeof(zset)-sizeof(dict);
+        if (dictSize(d)) {
+            de = dictGetRandomKey(d);
+            ele = dictGetEntryKey(de);
+            elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                            (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+            asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
+            if (z) asize += sizeof(zskiplistNode)*dictSize(d);
+        }
+        break;
+    case REDIS_HASH:
+        if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+            unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
+            unsigned int len = zipmapLen((unsigned char*)o->ptr);
+            unsigned int klen, vlen;
+            unsigned char *key, *val;
+
+            if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
+                klen = 0;
+                vlen = 0;
+            }
+            asize = len*(klen+vlen+3);
+        } else if (o->encoding == REDIS_ENCODING_HT) {
+            d = o->ptr;
+            asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+            if (dictSize(d)) {
+                de = dictGetRandomKey(d);
+                ele = dictGetEntryKey(de);
+                elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                                (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                /* A hash entry holds two objects: add the size of the
+                 * value to the size of the key instead of replacing it. */
+                ele = dictGetEntryVal(de);
+                elesize += (ele->encoding == REDIS_ENCODING_RAW) ?
+                                (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
+            }
+        }
+        break;
+    }
+    return (double)minage*log(1+asize);
+}
+
+/* Try to swap an object that's a good candidate for swapping.
+ * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
+ * to swap any object at all.
+ *
+ * If 'usethreads' is true, Redis will try to swap the object in background
+ * using I/O threads. */
+int vmSwapOneObject(int usethreads) {
+    int j, i;
+    struct dictEntry *best = NULL;
+    double best_swappability = 0;
+    redisDb *best_db = NULL;
+    robj *val;
+    sds key;
+
+    for (j = 0; j < server.dbnum; j++) {
+        redisDb *db = server.db+j;
+        /* Why maxtries is set to 100?
+         * Because this way (usually) we'll find 1 object even if just 1% - 2%
+         * are swappable objects */
+        int maxtries = 100;
+
+        if (dictSize(db->dict) == 0) continue;
+        for (i = 0; i < 5; i++) {
+            dictEntry *de;
+            double swappability;
+
+            if (maxtries) maxtries--;
+            de = dictGetRandomKey(db->dict);
+            val = dictGetEntryVal(de);
+            /* Only swap objects that are currently in memory.
+             *
+             * Also don't swap shared objects: not a good idea in general and
+             * we need to ensure that the main thread does not touch the
+             * object while the I/O thread is using it, but we can't
+             * control other keys without adding additional mutex. */
+            if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
+                if (maxtries) i--; /* don't count this try */
+                continue;
+            }
+            swappability = computeObjectSwappability(val);
+            if (!best || swappability > best_swappability) {
+                best = de;
+                best_swappability = swappability;
+                best_db = db;
+            }
+        }
+    }
+    if (best == NULL) return REDIS_ERR;
+    key = dictGetEntryKey(best);
+    val = dictGetEntryVal(best);
+
+    redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
+        key, best_swappability);
+
+    /* Swap it */
+    if (usethreads) {
+        robj *keyobj = createStringObject(key,sdslen(key));
+        vmSwapObjectThreaded(keyobj,val,best_db);
+        decrRefCount(keyobj);
+        return REDIS_OK;
+    } else {
+        vmpointer *vp;
+
+        if ((vp = vmSwapObjectBlocking(val)) != NULL) {
+            dictGetEntryVal(best) = vp;
+            return REDIS_OK;
+        } else {
+            return REDIS_ERR;
+        }
+    }
+}
+
+/* Swap one object out in the calling thread. */
+int vmSwapOneObjectBlocking() {
+    return vmSwapOneObject(0);
+}
+
+/* Swap one object out using the I/O threads. */
+int vmSwapOneObjectThreaded() {
+    return vmSwapOneObject(1);
+}
+
+/* Return true if it's safe to swap out objects in a given moment.
+ * Basically we don't want to swap objects out while there is a BGSAVE
+ * or a BGREWRITEAOF running in background. */
+int vmCanSwapOut(void) {
+    return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
+}
+
+/* =================== Virtual Memory - Threaded I/O ======================= */
+
+/* Release an I/O job: drop the references it holds to the value (for the
+ * swap and load job types, when a value is set) and to the key, then free
+ * the job structure itself. */
+void freeIOJob(iojob *j) {
+    if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
+        j->type == REDIS_IOJOB_DO_SWAP ||
+        j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
+    {
+         /* we fix the storage type, otherwise decrRefCount() will try to
+          * kill the I/O thread Job (that does no longer exists). */
+        if (j->val->storage == REDIS_VM_SWAPPING)
+            j->val->storage = REDIS_VM_MEMORY;
+        decrRefCount(j->val);
+    }
+    decrRefCount(j->key);
+    zfree(j);
+}
+
+/* Every time a thread finished a Job, it writes a byte into the write side
+ * of an unix pipe in order to "awake" the main thread, and this function
+ * is called.
*/ +void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, + int mask) +{ + char buf[1]; + int retval, processed = 0, toprocess = -1, trytoswap = 1; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + REDIS_NOTUSED(privdata); + + /* For every byte we read in the read side of the pipe, there is one + * I/O job completed to process. */ + while((retval = read(fd,buf,1)) == 1) { + iojob *j; + listNode *ln; + struct dictEntry *de; + + redisLog(REDIS_DEBUG,"Processing I/O completed job"); + + /* Get the processed element (the oldest one) */ + lockThreadedIO(); + redisAssert(listLength(server.io_processed) != 0); + if (toprocess == -1) { + toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100; + if (toprocess <= 0) toprocess = 1; + } + ln = listFirst(server.io_processed); + j = ln->value; + listDelNode(server.io_processed,ln); + unlockThreadedIO(); + /* If this job is marked as canceled, just ignore it */ + if (j->canceled) { + freeIOJob(j); + continue; + } + /* Post process it in the main thread, as there are things we + * can do just here to avoid race conditions and/or invasive locks */ + redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr); + de = dictFind(j->db->dict,j->key->ptr); + redisAssert(de != NULL); + if (j->type == REDIS_IOJOB_LOAD) { + redisDb *db; + vmpointer *vp = dictGetEntryVal(de); + + /* Key loaded, bring it at home */ + vmMarkPagesFree(vp->page,vp->usedpages); + redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)", + (unsigned char*) j->key->ptr); + server.vm_stats_swapped_objects--; + server.vm_stats_swapins++; + dictGetEntryVal(de) = j->val; + incrRefCount(j->val); + db = j->db; + /* Handle clients waiting for this key to be loaded. */ + handleClientsBlockedOnSwappedKey(db,j->key); + freeIOJob(j); + zfree(vp); + } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { + /* Now we know the amount of pages required to swap this object. 
+ * Let's find some space for it, and queue this task again + * rebranded as REDIS_IOJOB_DO_SWAP. */ + if (!vmCanSwapOut() || + vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) + { + /* Ooops... no space or we can't swap as there is + * a fork()ed Redis trying to save stuff on disk. */ + j->val->storage = REDIS_VM_MEMORY; /* undo operation */ + freeIOJob(j); + } else { + /* Note that we need to mark this pages as used now, + * if the job will be canceled, we'll mark them as freed + * again. */ + vmMarkPagesUsed(j->page,j->pages); + j->type = REDIS_IOJOB_DO_SWAP; + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + } + } else if (j->type == REDIS_IOJOB_DO_SWAP) { + vmpointer *vp; + + /* Key swapped. We can finally free some memory. */ + if (j->val->storage != REDIS_VM_SWAPPING) { + vmpointer *vp = (vmpointer*) j->id; + printf("storage: %d\n",vp->storage); + printf("key->name: %s\n",(char*)j->key->ptr); + printf("val: %p\n",(void*)j->val); + printf("val->type: %d\n",j->val->type); + printf("val->ptr: %s\n",(char*)j->val->ptr); + } + redisAssert(j->val->storage == REDIS_VM_SWAPPING); + vp = createVmPointer(j->val->type); + vp->page = j->page; + vp->usedpages = j->pages; + dictGetEntryVal(de) = vp; + /* Fix the storage otherwise decrRefCount will attempt to + * remove the associated I/O job */ + j->val->storage = REDIS_VM_MEMORY; + decrRefCount(j->val); + redisLog(REDIS_DEBUG, + "VM: object %s swapped out at %lld (%lld pages) (threaded)", + (unsigned char*) j->key->ptr, + (unsigned long long) j->page, (unsigned long long) j->pages); + server.vm_stats_swapped_objects++; + server.vm_stats_swapouts++; + freeIOJob(j); + /* Put a few more swap requests in queue if we are still + * out of memory */ + if (trytoswap && vmCanSwapOut() && + zmalloc_used_memory() > server.vm_max_memory) + { + int more = 1; + while(more) { + lockThreadedIO(); + more = listLength(server.io_newjobs) < + (unsigned) server.vm_max_threads; + unlockThreadedIO(); + /* Don't waste CPU time 
if swappable objects are rare. */ + if (vmSwapOneObjectThreaded() == REDIS_ERR) { + trytoswap = 0; + break; + } + } + } + } + processed++; + if (processed == toprocess) return; + } + if (retval < 0 && errno != EAGAIN) { + redisLog(REDIS_WARNING, + "WARNING: read(2) error in vmThreadedIOCompletedJob() %s", + strerror(errno)); + } +} + +void lockThreadedIO(void) { + pthread_mutex_lock(&server.io_mutex); +} + +void unlockThreadedIO(void) { + pthread_mutex_unlock(&server.io_mutex); +} + +/* Remove the specified object from the threaded I/O queue if still not + * processed, otherwise make sure to flag it as canceled. */ +void vmCancelThreadedIOJob(robj *o) { + list *lists[3] = { + server.io_newjobs, /* 0 */ + server.io_processing, /* 1 */ + server.io_processed /* 2 */ + }; + int i; + + redisAssert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING); +again: + lockThreadedIO(); + /* Search for a matching object in one of the queues */ + for (i = 0; i < 3; i++) { + listNode *ln; + listIter li; + + listRewind(lists[i],&li); + while ((ln = listNext(&li)) != NULL) { + iojob *job = ln->value; + + if (job->canceled) continue; /* Skip this, already canceled. */ + if (job->id == o) { + redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n", + (void*)job, (char*)job->key->ptr, job->type, i); + /* Mark the pages as free since the swap didn't happened + * or happened but is now discarded. */ + if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP) + vmMarkPagesFree(job->page,job->pages); + /* Cancel the job. It depends on the list the job is + * living in. */ + switch(i) { + case 0: /* io_newjobs */ + /* If the job was yet not processed the best thing to do + * is to remove it from the queue at all */ + freeIOJob(job); + listDelNode(lists[i],ln); + break; + case 1: /* io_processing */ + /* Oh Shi- the thread is messing with the Job: + * + * Probably it's accessing the object if this is a + * PREPARE_SWAP or DO_SWAP job. 
+ * If it's a LOAD job it may be reading from disk and + * if we don't wait for the job to terminate before to + * cancel it, maybe in a few microseconds data can be + * corrupted in this pages. So the short story is: + * + * Better to wait for the job to move into the + * next queue (processed)... */ + + /* We try again and again until the job is completed. */ + unlockThreadedIO(); + /* But let's wait some time for the I/O thread + * to finish with this job. After all this condition + * should be very rare. */ + usleep(1); + goto again; + case 2: /* io_processed */ + /* The job was already processed, that's easy... + * just mark it as canceled so that we'll ignore it + * when processing completed jobs. */ + job->canceled = 1; + break; + } + /* Finally we have to adjust the storage type of the object + * in order to "UNDO" the operaiton. */ + if (o->storage == REDIS_VM_LOADING) + o->storage = REDIS_VM_SWAPPED; + else if (o->storage == REDIS_VM_SWAPPING) + o->storage = REDIS_VM_MEMORY; + unlockThreadedIO(); + redisLog(REDIS_DEBUG,"*** DONE"); + return; + } + } + } + unlockThreadedIO(); + printf("Not found: %p\n", (void*)o); + redisAssert(1 != 1); /* We should never reach this */ +} + +void *IOThreadEntryPoint(void *arg) { + iojob *j; + listNode *ln; + REDIS_NOTUSED(arg); + + pthread_detach(pthread_self()); + while(1) { + /* Get a new job to process */ + lockThreadedIO(); + if (listLength(server.io_newjobs) == 0) { + /* No new jobs in queue, exit. 
*/ + redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do", + (long) pthread_self()); + server.io_active_threads--; + unlockThreadedIO(); + return NULL; + } + ln = listFirst(server.io_newjobs); + j = ln->value; + listDelNode(server.io_newjobs,ln); + /* Add the job in the processing queue */ + j->thread = pthread_self(); + listAddNodeTail(server.io_processing,j); + ln = listLast(server.io_processing); /* We use ln later to remove it */ + unlockThreadedIO(); + redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'", + (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr); + + /* Process the Job */ + if (j->type == REDIS_IOJOB_LOAD) { + vmpointer *vp = (vmpointer*)j->id; + j->val = vmReadObjectFromSwap(j->page,vp->vtype); + } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { + FILE *fp = fopen("/dev/null","w+"); + j->pages = rdbSavedObjectPages(j->val,fp); + fclose(fp); + } else if (j->type == REDIS_IOJOB_DO_SWAP) { + if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR) + j->canceled = 1; + } + + /* Done: insert the job into the processed queue */ + redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)", + (long) pthread_self(), (void*)j, (char*)j->key->ptr); + lockThreadedIO(); + listDelNode(server.io_processing,ln); + listAddNodeTail(server.io_processed,j); + unlockThreadedIO(); + + /* Signal the main thread there is new stuff to process */ + redisAssert(write(server.io_ready_pipe_write,"x",1) == 1); + } + return NULL; /* never reached */ +} + +void spawnIOThread(void) { + pthread_t thread; + sigset_t mask, omask; + int err; + + sigemptyset(&mask); + sigaddset(&mask,SIGCHLD); + sigaddset(&mask,SIGHUP); + sigaddset(&mask,SIGPIPE); + pthread_sigmask(SIG_SETMASK, &mask, &omask); + while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) { + redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s", + strerror(err)); + usleep(1000000); + } + pthread_sigmask(SIG_SETMASK, &omask, NULL); + 
server.io_active_threads++; +} + +/* We need to wait for the last thread to exit before we are able to + * fork() in order to BGSAVE or BGREWRITEAOF. */ +void waitEmptyIOJobsQueue(void) { + while(1) { + int io_processed_len; + + lockThreadedIO(); + if (listLength(server.io_newjobs) == 0 && + listLength(server.io_processing) == 0 && + server.io_active_threads == 0) + { + unlockThreadedIO(); + return; + } + /* While waiting for empty jobs queue condition we post-process some + * finshed job, as I/O threads may be hanging trying to write against + * the io_ready_pipe_write FD but there are so much pending jobs that + * it's blocking. */ + io_processed_len = listLength(server.io_processed); + unlockThreadedIO(); + if (io_processed_len) { + vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0); + usleep(1000); /* 1 millisecond */ + } else { + usleep(10000); /* 10 milliseconds */ + } + } +} + +void vmReopenSwapFile(void) { + /* Note: we don't close the old one as we are in the child process + * and don't want to mess at all with the original file object. */ + server.vm_fp = fopen(server.vm_swap_file,"r+b"); + if (server.vm_fp == NULL) { + redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. 
Exiting.", + server.vm_swap_file); + _exit(1); + } + server.vm_fd = fileno(server.vm_fp); +} + +/* This function must be called while with threaded IO locked */ +void queueIOJob(iojob *j) { + redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n", + (void*)j, j->type, (char*)j->key->ptr); + listAddNodeTail(server.io_newjobs,j); + if (server.io_active_threads < server.vm_max_threads) + spawnIOThread(); +} + +int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) { + iojob *j; + + j = zmalloc(sizeof(*j)); + j->type = REDIS_IOJOB_PREPARE_SWAP; + j->db = db; + j->key = key; + incrRefCount(key); + j->id = j->val = val; + incrRefCount(val); + j->canceled = 0; + j->thread = (pthread_t) -1; + val->storage = REDIS_VM_SWAPPING; + + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + return REDIS_OK; +} + +/* ============ Virtual Memory - Blocking clients on missing keys =========== */ + +/* This function makes the clinet 'c' waiting for the key 'key' to be loaded. + * If there is not already a job loading the key, it is craeted. + * The key is added to the io_keys list in the client structure, and also + * in the hash table mapping swapped keys to waiting clients, that is, + * server.io_waited_keys. */ +int waitForSwappedKey(redisClient *c, robj *key) { + struct dictEntry *de; + robj *o; + list *l; + + /* If the key does not exist or is already in RAM we don't need to + * block the client at all. */ + de = dictFind(c->db->dict,key->ptr); + if (de == NULL) return 0; + o = dictGetEntryVal(de); + if (o->storage == REDIS_VM_MEMORY) { + return 0; + } else if (o->storage == REDIS_VM_SWAPPING) { + /* We were swapping the key, undo it! */ + vmCancelThreadedIOJob(o); + return 0; + } + + /* OK: the key is either swapped, or being loaded just now. */ + + /* Add the key to the list of keys this client is waiting for. + * This maps clients to keys they are waiting for. 
*/ + listAddNodeTail(c->io_keys,key); + incrRefCount(key); + + /* Add the client to the swapped keys => clients waiting map. */ + de = dictFind(c->db->io_keys,key); + if (de == NULL) { + int retval; + + /* For every key we take a list of clients blocked for it */ + l = listCreate(); + retval = dictAdd(c->db->io_keys,key,l); + incrRefCount(key); + redisAssert(retval == DICT_OK); + } else { + l = dictGetEntryVal(de); + } + listAddNodeTail(l,c); + + /* Are we already loading the key from disk? If not create a job */ + if (o->storage == REDIS_VM_SWAPPED) { + iojob *j; + vmpointer *vp = (vmpointer*)o; + + o->storage = REDIS_VM_LOADING; + j = zmalloc(sizeof(*j)); + j->type = REDIS_IOJOB_LOAD; + j->db = c->db; + j->id = (robj*)vp; + j->key = key; + incrRefCount(key); + j->page = vp->page; + j->val = NULL; + j->canceled = 0; + j->thread = (pthread_t) -1; + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + } + return 1; +} + +/* Preload keys for any command with first, last and step values for + * the command keys prototype, as defined in the command table. */ +void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int j, last; + if (cmd->vm_firstkey == 0) return; + last = cmd->vm_lastkey; + if (last < 0) last = argc+last; + for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) { + redisAssert(j < argc); + waitForSwappedKey(c,argv[j]); + } +} + +/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands. + * Note that the number of keys to preload is user-defined, so we need to + * apply a sanity check against argc. */ +void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int i, num; + REDIS_NOTUSED(cmd); + + num = atoi(argv[2]->ptr); + if (num > (argc-3)) return; + for (i = 0; i < num; i++) { + waitForSwappedKey(c,argv[3+i]); + } +} + +/* Preload keys needed to execute the entire MULTI/EXEC block. 
+ * + * This function is called by blockClientOnSwappedKeys when EXEC is issued, + * and will block the client when any command requires a swapped out value. */ +void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int i, margc; + struct redisCommand *mcmd; + robj **margv; + REDIS_NOTUSED(cmd); + REDIS_NOTUSED(argc); + REDIS_NOTUSED(argv); + + if (!(c->flags & REDIS_MULTI)) return; + for (i = 0; i < c->mstate.count; i++) { + mcmd = c->mstate.commands[i].cmd; + margc = c->mstate.commands[i].argc; + margv = c->mstate.commands[i].argv; + + if (mcmd->vm_preload_proc != NULL) { + mcmd->vm_preload_proc(c,mcmd,margc,margv); + } else { + waitForMultipleSwappedKeys(c,mcmd,margc,margv); + } + } +} + +/* Is this client attempting to run a command against swapped keys? + * If so, block it ASAP, load the keys in background, then resume it. + * + * The important idea about this function is that it can fail! If keys will + * still be swapped when the client is resumed, this key lookups will + * just block loading keys from disk. In practical terms this should only + * happen with SORT BY command or if there is a bug in this function. + * + * Return 1 if the client is marked as blocked, 0 if the client can + * continue as the keys it is going to access appear to be in memory. */ +int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) { + if (cmd->vm_preload_proc != NULL) { + cmd->vm_preload_proc(c,cmd,c->argc,c->argv); + } else { + waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv); + } + + /* If the client was blocked for at least one key, mark it as blocked. */ + if (listLength(c->io_keys)) { + c->flags |= REDIS_IO_WAIT; + aeDeleteFileEvent(server.el,c->fd,AE_READABLE); + server.vm_blocked_clients++; + return 1; + } else { + return 0; + } +} + +/* Remove the 'key' from the list of blocked keys for a given client. 
+ * + * The function returns 1 when there are no longer blocking keys after + * the current one was removed (and the client can be unblocked). */ +int dontWaitForSwappedKey(redisClient *c, robj *key) { + list *l; + listNode *ln; + listIter li; + struct dictEntry *de; + + /* Remove the key from the list of keys this client is waiting for. */ + listRewind(c->io_keys,&li); + while ((ln = listNext(&li)) != NULL) { + if (equalStringObjects(ln->value,key)) { + listDelNode(c->io_keys,ln); + break; + } + } + redisAssert(ln != NULL); + + /* Remove the client form the key => waiting clients map. */ + de = dictFind(c->db->io_keys,key); + redisAssert(de != NULL); + l = dictGetEntryVal(de); + ln = listSearchKey(l,c); + redisAssert(ln != NULL); + listDelNode(l,ln); + if (listLength(l) == 0) + dictDelete(c->db->io_keys,key); + + return listLength(c->io_keys) == 0; +} + +/* Every time we now a key was loaded back in memory, we handle clients + * waiting for this key if any. */ +void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) { + struct dictEntry *de; + list *l; + listNode *ln; + int len; + + de = dictFind(db->io_keys,key); + if (!de) return; + + l = dictGetEntryVal(de); + len = listLength(l); + /* Note: we can't use something like while(listLength(l)) as the list + * can be freed by the calling function when we remove the last element. */ + while (len--) { + ln = listFirst(l); + redisClient *c = ln->value; + + if (dontWaitForSwappedKey(c,key)) { + /* Put the client in the list of clients ready to go as we + * loaded all the keys about it. */ + listAddNodeTail(server.io_ready_clients,c); + } + } +} diff --git a/src/ziplist.c b/src/ziplist.c new file mode 100644 index 000000000..4b9d0fadc --- /dev/null +++ b/src/ziplist.c @@ -0,0 +1,959 @@ +/* Memory layout of a ziplist, containing "foo", "bar", "quux": + * "foo""bar""quux" + * + * is an unsigned integer to hold the number of bytes that + * the ziplist occupies. 
This is stored to not have to traverse the ziplist + * to know the new length when pushing. + * + * is the number of items in the ziplist. When this value is + * greater than 254, we need to traverse the entire list to know + * how many items it holds. + * + * is the number of bytes occupied by a single entry. When this + * number is greater than 253, the length will occupy 5 bytes, where + * the extra bytes contain an unsigned integer to hold the length. + */ + +#include +#include +#include +#include +#include +#include +#include "zmalloc.h" +#include "ziplist.h" + +/* Important note: the ZIP_END value is used to depict the end of the + * ziplist structure. When a pointer contains an entry, the first couple + * of bytes contain the encoded length of the previous entry. This length + * is encoded as ZIP_ENC_RAW length, so the first two bits will contain 00 + * and the byte will therefore never have a value of 255. */ +#define ZIP_END 255 +#define ZIP_BIGLEN 254 + +/* Entry encoding */ +#define ZIP_ENC_RAW 0 +#define ZIP_ENC_INT16 1 +#define ZIP_ENC_INT32 2 +#define ZIP_ENC_INT64 3 +#define ZIP_ENCODING(p) ((p)[0] >> 6) + +/* Length encoding for raw entries */ +#define ZIP_LEN_INLINE 0 +#define ZIP_LEN_UINT16 1 +#define ZIP_LEN_UINT32 2 + +/* Utility macros */ +#define ZIPLIST_BYTES(zl) (*((uint32_t*)(zl))) +#define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t)))) +#define ZIPLIST_LENGTH(zl) (*((uint16_t*)((zl)+sizeof(uint32_t)*2))) +#define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t)) +#define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE) +#define ZIPLIST_ENTRY_TAIL(zl) ((zl)+ZIPLIST_TAIL_OFFSET(zl)) +#define ZIPLIST_ENTRY_END(zl) ((zl)+ZIPLIST_BYTES(zl)-1) + +/* We know a positive increment can only be 1 because entries can only be + * pushed one at a time. 
*/ +#define ZIPLIST_INCR_LENGTH(zl,incr) { \ + if (ZIPLIST_LENGTH(zl) < UINT16_MAX) ZIPLIST_LENGTH(zl)+=incr; } + +typedef struct zlentry { + unsigned int prevrawlensize, prevrawlen; + unsigned int lensize, len; + unsigned int headersize; + unsigned char encoding; + unsigned char *p; +} zlentry; + +/* Return bytes needed to store integer encoded by 'encoding' */ +static unsigned int zipEncodingSize(unsigned char encoding) { + if (encoding == ZIP_ENC_INT16) { + return sizeof(int16_t); + } else if (encoding == ZIP_ENC_INT32) { + return sizeof(int32_t); + } else if (encoding == ZIP_ENC_INT64) { + return sizeof(int64_t); + } + assert(NULL); +} + +/* Decode the encoded length pointed by 'p'. If a pointer to 'lensize' is + * provided, it is set to the number of bytes required to encode the length. */ +static unsigned int zipDecodeLength(unsigned char *p, unsigned int *lensize) { + unsigned char encoding = ZIP_ENCODING(p), lenenc; + unsigned int len; + + if (encoding == ZIP_ENC_RAW) { + lenenc = (p[0] >> 4) & 0x3; + if (lenenc == ZIP_LEN_INLINE) { + len = p[0] & 0xf; + if (lensize) *lensize = 1; + } else if (lenenc == ZIP_LEN_UINT16) { + len = p[1] | (p[2] << 8); + if (lensize) *lensize = 3; + } else { + len = p[1] | (p[2] << 8) | (p[3] << 16) | (p[4] << 24); + if (lensize) *lensize = 5; + } + } else { + len = zipEncodingSize(encoding); + if (lensize) *lensize = 1; + } + return len; +} + +/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns + * the amount of bytes required to encode such a length. 
*/ +static unsigned int zipEncodeLength(unsigned char *p, char encoding, unsigned int rawlen) { + unsigned char len = 1, lenenc, buf[5]; + if (encoding == ZIP_ENC_RAW) { + if (rawlen <= 0xf) { + if (!p) return len; + lenenc = ZIP_LEN_INLINE; + buf[0] = rawlen; + } else if (rawlen <= 0xffff) { + len += 2; + if (!p) return len; + lenenc = ZIP_LEN_UINT16; + buf[1] = (rawlen ) & 0xff; + buf[2] = (rawlen >> 8) & 0xff; + } else { + len += 4; + if (!p) return len; + lenenc = ZIP_LEN_UINT32; + buf[1] = (rawlen ) & 0xff; + buf[2] = (rawlen >> 8) & 0xff; + buf[3] = (rawlen >> 16) & 0xff; + buf[4] = (rawlen >> 24) & 0xff; + } + buf[0] = (lenenc << 4) | (buf[0] & 0xf); + } + if (!p) return len; + + /* Apparently we need to store the length in 'p' */ + buf[0] = (encoding << 6) | (buf[0] & 0x3f); + memcpy(p,buf,len); + return len; +} + +/* Decode the length of the previous element stored at "p". */ +static unsigned int zipPrevDecodeLength(unsigned char *p, unsigned int *lensize) { + unsigned int len = *p; + if (len < ZIP_BIGLEN) { + if (lensize) *lensize = 1; + } else { + if (lensize) *lensize = 1+sizeof(len); + memcpy(&len,p+1,sizeof(len)); + } + return len; +} + +/* Encode the length of the previous entry and write it to "p". Return the + * number of bytes needed to encode this length if "p" is NULL. */ +static unsigned int zipPrevEncodeLength(unsigned char *p, unsigned int len) { + if (p == NULL) { + return (len < ZIP_BIGLEN) ? 1 : sizeof(len)+1; + } else { + if (len < ZIP_BIGLEN) { + p[0] = len; + return 1; + } else { + p[0] = ZIP_BIGLEN; + memcpy(p+1,&len,sizeof(len)); + return 1+sizeof(len); + } + } +} + +/* Return the difference in number of bytes needed to store the new length + * "len" on the entry pointed to by "p". 
*/ +static int zipPrevLenByteDiff(unsigned char *p, unsigned int len) { + unsigned int prevlensize; + zipPrevDecodeLength(p,&prevlensize); + return zipPrevEncodeLength(NULL,len)-prevlensize; +} + +/* Check if string pointed to by 'entry' can be encoded as an integer. + * Stores the integer value in 'v' and its encoding in 'encoding'. + * Warning: this function requires a NULL-terminated string! */ +static int zipTryEncoding(unsigned char *entry, long long *v, unsigned char *encoding) { + long long value; + char *eptr; + + if (entry[0] == '-' || (entry[0] >= '0' && entry[0] <= '9')) { + value = strtoll((char*)entry,&eptr,10); + if (eptr[0] != '\0') return 0; + if (value >= INT16_MIN && value <= INT16_MAX) { + *encoding = ZIP_ENC_INT16; + } else if (value >= INT32_MIN && value <= INT32_MAX) { + *encoding = ZIP_ENC_INT32; + } else { + *encoding = ZIP_ENC_INT64; + } + *v = value; + return 1; + } + return 0; +} + +/* Store integer 'value' at 'p', encoded as 'encoding' */ +static void zipSaveInteger(unsigned char *p, int64_t value, unsigned char encoding) { + int16_t i16; + int32_t i32; + int64_t i64; + if (encoding == ZIP_ENC_INT16) { + i16 = value; + memcpy(p,&i16,sizeof(i16)); + } else if (encoding == ZIP_ENC_INT32) { + i32 = value; + memcpy(p,&i32,sizeof(i32)); + } else if (encoding == ZIP_ENC_INT64) { + i64 = value; + memcpy(p,&i64,sizeof(i64)); + } else { + assert(NULL); + } +} + +/* Read integer encoded as 'encoding' from 'p' */ +static int64_t zipLoadInteger(unsigned char *p, unsigned char encoding) { + int16_t i16; + int32_t i32; + int64_t i64, ret; + if (encoding == ZIP_ENC_INT16) { + memcpy(&i16,p,sizeof(i16)); + ret = i16; + } else if (encoding == ZIP_ENC_INT32) { + memcpy(&i32,p,sizeof(i32)); + ret = i32; + } else if (encoding == ZIP_ENC_INT64) { + memcpy(&i64,p,sizeof(i64)); + ret = i64; + } else { + assert(NULL); + } + return ret; +} + +/* Return a struct with all information about an entry. 
*/ +static zlentry zipEntry(unsigned char *p) { + zlentry e; + e.prevrawlen = zipPrevDecodeLength(p,&e.prevrawlensize); + e.len = zipDecodeLength(p+e.prevrawlensize,&e.lensize); + e.headersize = e.prevrawlensize+e.lensize; + e.encoding = ZIP_ENCODING(p+e.prevrawlensize); + e.p = p; + return e; +} + +/* Return the total number of bytes used by the entry at "p". */ +static unsigned int zipRawEntryLength(unsigned char *p) { + zlentry e = zipEntry(p); + return e.headersize + e.len; +} + +/* Create a new empty ziplist. */ +unsigned char *ziplistNew(void) { + unsigned int bytes = ZIPLIST_HEADER_SIZE+1; + unsigned char *zl = zmalloc(bytes); + ZIPLIST_BYTES(zl) = bytes; + ZIPLIST_TAIL_OFFSET(zl) = ZIPLIST_HEADER_SIZE; + ZIPLIST_LENGTH(zl) = 0; + zl[bytes-1] = ZIP_END; + return zl; +} + +/* Resize the ziplist. */ +static unsigned char *ziplistResize(unsigned char *zl, unsigned int len) { + zl = zrealloc(zl,len); + ZIPLIST_BYTES(zl) = len; + zl[len-1] = ZIP_END; + return zl; +} + +/* Delete "num" entries, starting at "p". Returns pointer to the ziplist. */ +static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) { + unsigned int i, totlen, deleted = 0; + int nextdiff = 0; + zlentry first = zipEntry(p); + for (i = 0; p[0] != ZIP_END && i < num; i++) { + p += zipRawEntryLength(p); + deleted++; + } + + totlen = p-first.p; + if (totlen > 0) { + if (p[0] != ZIP_END) { + /* Tricky: storing the prevlen in this entry might reduce or + * increase the number of bytes needed, compared to the current + * prevlen. Note that we can always store this length because + * it was previously stored by an entry that is being deleted. 
*/ + nextdiff = zipPrevLenByteDiff(p,first.prevrawlen); + zipPrevEncodeLength(p-nextdiff,first.prevrawlen); + + /* Update offset for tail */ + ZIPLIST_TAIL_OFFSET(zl) -= totlen+nextdiff; + + /* Move tail to the front of the ziplist */ + memmove(first.p,p-nextdiff,ZIPLIST_BYTES(zl)-(p-zl)-1+nextdiff); + } else { + /* The entire tail was deleted. No need to move memory. */ + ZIPLIST_TAIL_OFFSET(zl) = (first.p-zl)-first.prevrawlen; + } + + /* Resize and update length */ + zl = ziplistResize(zl, ZIPLIST_BYTES(zl)-totlen+nextdiff); + ZIPLIST_INCR_LENGTH(zl,-deleted); + } + return zl; +} + +/* Insert item at "p". */ +static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { + unsigned int curlen = ZIPLIST_BYTES(zl), reqlen, prevlen = 0; + unsigned int offset, nextdiff = 0; + unsigned char *tail; + unsigned char encoding = ZIP_ENC_RAW; + long long value; + zlentry entry; + + /* Find out prevlen for the entry that is inserted. */ + if (p[0] != ZIP_END) { + entry = zipEntry(p); + prevlen = entry.prevrawlen; + } else { + tail = ZIPLIST_ENTRY_TAIL(zl); + if (tail[0] != ZIP_END) { + prevlen = zipRawEntryLength(tail); + } + } + + /* See if the entry can be encoded */ + if (zipTryEncoding(s,&value,&encoding)) { + reqlen = zipEncodingSize(encoding); + } else { + reqlen = slen; + } + + /* We need space for both the length of the previous entry and + * the length of the payload. */ + reqlen += zipPrevEncodeLength(NULL,prevlen); + reqlen += zipEncodeLength(NULL,encoding,slen); + + /* When the insert position is not equal to the tail, we need to + * make sure that the next entry can hold this entry's length in + * its prevlen field. */ + nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0; + + /* Store offset because a realloc may change the address of zl. */ + offset = p-zl; + zl = ziplistResize(zl,curlen+reqlen+nextdiff); + p = zl+offset; + + /* Apply memory move when necessary and update tail offset. 
*/ + if (p[0] != ZIP_END) { + /* Subtract one because of the ZIP_END bytes */ + memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff); + /* Encode this entry's raw length in the next entry. */ + zipPrevEncodeLength(p+reqlen,reqlen); + /* Update offset for tail */ + ZIPLIST_TAIL_OFFSET(zl) += reqlen+nextdiff; + } else { + /* This element will be the new tail. */ + ZIPLIST_TAIL_OFFSET(zl) = p-zl; + } + + /* Write the entry */ + p += zipPrevEncodeLength(p,prevlen); + p += zipEncodeLength(p,encoding,slen); + if (encoding != ZIP_ENC_RAW) { + zipSaveInteger(p,value,encoding); + } else { + memcpy(p,s,slen); + } + ZIPLIST_INCR_LENGTH(zl,1); + return zl; +} + +unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) { + unsigned char *p; + p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl); + return __ziplistInsert(zl,p,s,slen); +} + +/* Returns an offset to use for iterating with ziplistNext. When the given + * index is negative, the list is traversed back to front. When the list + * doesn't contain an element at the provided index, NULL is returned. */ +unsigned char *ziplistIndex(unsigned char *zl, int index) { + unsigned char *p; + zlentry entry; + if (index < 0) { + index = (-index)-1; + p = ZIPLIST_ENTRY_TAIL(zl); + if (p[0] != ZIP_END) { + entry = zipEntry(p); + while (entry.prevrawlen > 0 && index--) { + p -= entry.prevrawlen; + entry = zipEntry(p); + } + } + } else { + p = ZIPLIST_ENTRY_HEAD(zl); + while (p[0] != ZIP_END && index--) { + p += zipRawEntryLength(p); + } + } + return (p[0] == ZIP_END || index > 0) ? NULL : p; +} + +/* Return pointer to next entry in ziplist. */ +unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) { + ((void) zl); + + /* "p" could be equal to ZIP_END, caused by ziplistDelete, + * and we should return NULL. Otherwise, we should return NULL + * when the *next* element is ZIP_END (there is no next entry). 
*/ + if (p[0] == ZIP_END) { + return NULL; + } else { + p = p+zipRawEntryLength(p); + return (p[0] == ZIP_END) ? NULL : p; + } +} + +/* Return pointer to previous entry in ziplist. */ +unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) { + zlentry entry; + + /* Iterating backwards from ZIP_END should return the tail. When "p" is + * equal to the first element of the list, we're already at the head, + * and should return NULL. */ + if (p[0] == ZIP_END) { + p = ZIPLIST_ENTRY_TAIL(zl); + return (p[0] == ZIP_END) ? NULL : p; + } else if (p == ZIPLIST_ENTRY_HEAD(zl)) { + return NULL; + } else { + entry = zipEntry(p); + return p-entry.prevrawlen; + } +} + +/* Get the entry pointed to by 'p' and store it in '*sstr' (length in '*slen') + * or in '*sval', depending on the encoding of the entry. When 'sstr' is given, + * '*sstr' is first set to NULL, so the caller can find out whether the string + * pointer or the integer value was set. Return 0 if 'p' is NULL or points to the end of the ziplist, 1 otherwise. */ +unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *slen, long long *sval) { + zlentry entry; + if (p == NULL || p[0] == ZIP_END) return 0; + if (sstr) *sstr = NULL; + + entry = zipEntry(p); + if (entry.encoding == ZIP_ENC_RAW) { + if (sstr) { + *slen = entry.len; + *sstr = p+entry.headersize; + } + } else { + if (sval) { + *sval = zipLoadInteger(p+entry.headersize,entry.encoding); + } + } + return 1; +} + +/* Insert an entry at "p". */ +unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { + return __ziplistInsert(zl,p,s,slen); +} + +/* Delete a single entry from the ziplist, pointed to by *p. + * Also update *p in place, to be able to iterate over the + * ziplist, while deleting entries.
*/ +unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) { + unsigned int offset = *p-zl; + zl = __ziplistDelete(zl,*p,1); + + /* Store pointer to current element in p, because ziplistDelete will + * do a realloc which might result in a different "zl"-pointer. + * When the delete direction is back to front, we might delete the last + * entry and end up with "p" pointing to ZIP_END, so check this. */ + *p = zl+offset; + return zl; +} + +/* Delete a range of entries from the ziplist. */ +unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num) { + unsigned char *p = ziplistIndex(zl,index); + return (p == NULL) ? zl : __ziplistDelete(zl,p,num); +} + +/* Compare entry pointer to by 'p' with 'entry'. Return 1 if equal. */ +unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int slen) { + zlentry entry; + unsigned char sencoding; + long long zval, sval; + if (p[0] == ZIP_END) return 0; + + entry = zipEntry(p); + if (entry.encoding == ZIP_ENC_RAW) { + /* Raw compare */ + if (entry.len == slen) { + return memcmp(p+entry.headersize,sstr,slen) == 0; + } else { + return 0; + } + } else { + /* Try to compare encoded values */ + if (zipTryEncoding(sstr,&sval,&sencoding)) { + if (entry.encoding == sencoding) { + zval = zipLoadInteger(p+entry.headersize,entry.encoding); + return zval == sval; + } + } + } + return 0; +} + +/* Return length of ziplist. */ +unsigned int ziplistLen(unsigned char *zl) { + unsigned int len = 0; + if (ZIPLIST_LENGTH(zl) < UINT16_MAX) { + len = ZIPLIST_LENGTH(zl); + } else { + unsigned char *p = zl+ZIPLIST_HEADER_SIZE; + while (*p != ZIP_END) { + p += zipRawEntryLength(p); + len++; + } + + /* Re-store length if small enough */ + if (len < UINT16_MAX) ZIPLIST_LENGTH(zl) = len; + } + return len; +} + +/* Return size in bytes of ziplist. 
*/ +unsigned int ziplistSize(unsigned char *zl) { + return ZIPLIST_BYTES(zl); +} + +void ziplistRepr(unsigned char *zl) { + unsigned char *p; + zlentry entry; + + printf("{total bytes %d} {length %u}\n",ZIPLIST_BYTES(zl), ZIPLIST_LENGTH(zl)); + p = ZIPLIST_ENTRY_HEAD(zl); + while(*p != ZIP_END) { + entry = zipEntry(p); + printf("{offset %ld, header %u, payload %u} ",p-zl,entry.headersize,entry.len); + p += entry.headersize; + if (entry.encoding == ZIP_ENC_RAW) { + fwrite(p,entry.len,1,stdout); + } else { + printf("%lld", zipLoadInteger(p,entry.encoding)); + } + printf("\n"); + p += entry.len; + } + printf("{end}\n\n"); +} + +#ifdef ZIPLIST_TEST_MAIN +#include + +unsigned char *createList() { + unsigned char *zl = ziplistNew(); + zl = ziplistPush(zl, (unsigned char*)"foo", 3, ZIPLIST_TAIL); + zl = ziplistPush(zl, (unsigned char*)"quux", 4, ZIPLIST_TAIL); + zl = ziplistPush(zl, (unsigned char*)"hello", 5, ZIPLIST_HEAD); + zl = ziplistPush(zl, (unsigned char*)"1024", 4, ZIPLIST_TAIL); + return zl; +} + +unsigned char *createIntList() { + unsigned char *zl = ziplistNew(); + char buf[32]; + + sprintf(buf, "100"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "128000"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "-100"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); + sprintf(buf, "4294967296"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); + sprintf(buf, "non integer"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "much much longer non integer"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + return zl; +} + +long long usec(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; +} + +void stress(int pos, int num, int maxsize, int dnum) { + int i,j,k; + unsigned char *zl; + char posstr[2][5] = { "HEAD", "TAIL" }; + 
long long start; + for (i = 0; i < maxsize; i+=dnum) { + zl = ziplistNew(); + for (j = 0; j < i; j++) { + zl = ziplistPush(zl,(unsigned char*)"quux",4,ZIPLIST_TAIL); + } + + /* Do num times a push+pop from pos */ + start = usec(); + for (k = 0; k < num; k++) { + zl = ziplistPush(zl,(unsigned char*)"quux",4,pos); + zl = ziplistDeleteRange(zl,0,1); + } + printf("List size: %8d, bytes: %8d, %dx push+pop (%s): %6lld usec\n", + i,ZIPLIST_BYTES(zl),num,posstr[pos],usec()-start); + zfree(zl); + } +} + +void pop(unsigned char *zl, int where) { + unsigned char *p, *vstr; + unsigned int vlen; + long long vlong; + + p = ziplistIndex(zl,where == ZIPLIST_HEAD ? 0 : -1); + if (ziplistGet(p,&vstr,&vlen,&vlong)) { + if (where == ZIPLIST_HEAD) + printf("Pop head: "); + else + printf("Pop tail: "); + + if (vstr) + fwrite(vstr,vlen,1,stdout); + else + printf("%lld", vlong); + + printf("\n"); + ziplistDeleteRange(zl,-1,1); + } else { + printf("ERROR: Could not pop\n"); + exit(1); + } +} + +int main(int argc, char **argv) { + unsigned char *zl, *p; + unsigned char *entry; + unsigned int elen; + long long value; + + zl = createIntList(); + ziplistRepr(zl); + + zl = createList(); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + pop(zl,ZIPLIST_HEAD); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + printf("Get element at index 3:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 3); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index 3\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index 4 (out of range):\n"); + { + zl = createList(); + p = ziplistIndex(zl, 4); + if (p == NULL) { + printf("No entry\n"); + } else { + printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); + return 1; + } + printf("\n"); + } + + 
printf("Get element at index -1 (last element):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index -1\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index -4 (first element):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -4); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index -4\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index -5 (reverse out of range):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -5); + if (p == NULL) { + printf("No entry\n"); + } else { + printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); + return 1; + } + printf("\n"); + } + + printf("Iterate list from 0 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 0); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate list from 1 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate list from 2 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 2); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate starting out of range:\n"); + { + zl = 
createList(); + p = ziplistIndex(zl, 4); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("No entry\n"); + } else { + printf("ERROR\n"); + } + printf("\n"); + } + + printf("Iterate from back to front:\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistPrev(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate from back to front, deleting all items:\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + zl = ziplistDelete(zl,&p); + p = ziplistPrev(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Delete inclusive range 0,0:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 0, 1); + ziplistRepr(zl); + } + + printf("Delete inclusive range 0,1:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 0, 2); + ziplistRepr(zl); + } + + printf("Delete inclusive range 1,2:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 1, 2); + ziplistRepr(zl); + } + + printf("Delete with start index out of range:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 5, 1); + ziplistRepr(zl); + } + + printf("Delete with num overflow:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 1, 5); + ziplistRepr(zl); + } + + printf("Delete foo while iterating:\n"); + { + zl = createList(); + p = ziplistIndex(zl,0); + while (ziplistGet(p,&entry,&elen,&value)) { + if (entry && strncmp("foo",(char*)entry,elen) == 0) { + printf("Delete foo\n"); + zl = ziplistDelete(zl,&p); + } else { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld",value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + } + printf("\n"); + ziplistRepr(zl); + } + + printf("Create long list 
and check indices:\n"); + { + zl = ziplistNew(); + char buf[32]; + int i,len; + for (i = 0; i < 1000; i++) { + len = sprintf(buf,"%d",i); + zl = ziplistPush(zl,(unsigned char*)buf,len,ZIPLIST_TAIL); + } + for (i = 0; i < 1000; i++) { + p = ziplistIndex(zl,i); + assert(ziplistGet(p,NULL,NULL,&value)); + assert(i == value); + + p = ziplistIndex(zl,-i-1); + assert(ziplistGet(p,NULL,NULL,&value)); + assert(999-i == value); + } + printf("SUCCESS\n\n"); + } + + printf("Compare strings with ziplist entries:\n"); + { + zl = createList(); + p = ziplistIndex(zl,0); + if (!ziplistCompare(p,(unsigned char*)"hello",5)) { + printf("ERROR: not \"hello\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"hella",5)) { + printf("ERROR: \"hella\"\n"); + return 1; + } + + p = ziplistIndex(zl,3); + if (!ziplistCompare(p,(unsigned char*)"1024",4)) { + printf("ERROR: not \"1024\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"1025",4)) { + printf("ERROR: \"1025\"\n"); + return 1; + } + printf("SUCCESS\n"); + } + + printf("Stress with variable ziplist size:\n"); + { + stress(ZIPLIST_HEAD,100000,16384,256); + stress(ZIPLIST_TAIL,100000,16384,256); + } + + return 0; +} + +#endif diff --git a/src/ziplist.h b/src/ziplist.h new file mode 100644 index 000000000..311257256 --- /dev/null +++ b/src/ziplist.h @@ -0,0 +1,15 @@ +#define ZIPLIST_HEAD 0 +#define ZIPLIST_TAIL 1 + +unsigned char *ziplistNew(void); +unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where); +unsigned char *ziplistIndex(unsigned char *zl, int index); +unsigned char *ziplistNext(unsigned char *zl, unsigned char *p); +unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p); +unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval); +unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen); +unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p); +unsigned 
char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num); +unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen); +unsigned int ziplistLen(unsigned char *zl); +unsigned int ziplistSize(unsigned char *zl); diff --git a/src/zipmap.c b/src/zipmap.c new file mode 100644 index 000000000..35faeabef --- /dev/null +++ b/src/zipmap.c @@ -0,0 +1,455 @@ +/* String -> String Map data structure optimized for size. + * This file implements a data structure mapping strings to other strings + * implementing an O(n) lookup data structure designed to be very memory + * efficient. + * + * The Redis Hash type uses this data structure for hashes composed of a small + * number of elements, to switch to an hash table once a given number of + * elements is reached. + * + * Given that many times Redis Hashes are used to represent objects composed + * of few fields, this is a very big win in terms of used memory. + * + * -------------------------------------------------------------------------- + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Memory layout of a zipmap, for the map "foo" => "bar", "hello" => "world": + * + * <zmlen><len>"foo"<len><free>"bar"<len>"hello"<len><free>"world" + * + * <zmlen> is 1 byte length that holds the current size of the zipmap. + * When the zipmap length is greater than or equal to 254, this value + * is not used and the zipmap needs to be traversed to find out the length. + * + * <len> is the length of the following string (key or value). + * <len> lengths are encoded in a single value or in a 5 bytes value. + * If the first byte value (as an unsigned 8 bit value) is between 0 and + * 253, it's a single-byte length. If it is 254 then a four bytes unsigned + * integer follows (in the host byte ordering). A value of 255 is used to + * signal the end of the hash. No length value is reserved to mark empty + * space: spare bytes are accounted for by the <free> field described below. + * + * <free> is the number of free unused bytes + * after the string, resulting from modification of values associated to a + * key (for instance if "foo" is set to "bar", and later "foo" will be set to + * "hi", I'll have a free byte to use if the value will enlarge again later, + * or even in order to add a key/value pair if it fits.
+ * + * is always an unsigned 8 bit number, because if after an + * update operation there are more than a few free bytes, the zipmap will be + * reallocated to make sure it is as small as possible. + * + * The most compact representation of the above two elements hash is actually: + * + * "\x02\x03foo\x03\x00bar\x05hello\x05\x00world\xff" + * + * Note that because keys and values are prefixed length "objects", + * the lookup will take O(N) where N is the number of elements + * in the zipmap and *not* the number of bytes needed to represent the zipmap. + * This lowers the constant times considerably. + */ + +#include +#include +#include +#include "zmalloc.h" + +#define ZIPMAP_BIGLEN 254 +#define ZIPMAP_END 255 + +/* The following defines the max value for the field described in the + * comments above, that is, the max number of trailing bytes in a value. */ +#define ZIPMAP_VALUE_MAX_FREE 4 + +/* The following macro returns the number of bytes needed to encode the length + * for the integer value _l, that is, 1 byte for lengths < ZIPMAP_BIGLEN and + * 5 bytes for all the other lengths. */ +#define ZIPMAP_LEN_BYTES(_l) (((_l) < ZIPMAP_BIGLEN) ? 1 : sizeof(unsigned int)+1) + +/* Create a new empty zipmap. */ +unsigned char *zipmapNew(void) { + unsigned char *zm = zmalloc(2); + + zm[0] = 0; /* Length */ + zm[1] = ZIPMAP_END; + return zm; +} + +/* Decode the encoded length pointed by 'p' */ +static unsigned int zipmapDecodeLength(unsigned char *p) { + unsigned int len = *p; + + if (len < ZIPMAP_BIGLEN) return len; + memcpy(&len,p+1,sizeof(unsigned int)); + return len; +} + +/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns + * the amount of bytes required to encode such a length. 
*/ +static unsigned int zipmapEncodeLength(unsigned char *p, unsigned int len) { + if (p == NULL) { + return ZIPMAP_LEN_BYTES(len); + } else { + if (len < ZIPMAP_BIGLEN) { + p[0] = len; + return 1; + } else { + p[0] = ZIPMAP_BIGLEN; + memcpy(p+1,&len,sizeof(len)); + return 1+sizeof(len); + } + } +} + +/* Search for a matching key, returning a pointer to the entry inside the + * zipmap. Returns NULL if the key is not found. + * + * If NULL is returned, and totlen is not NULL, it is set to the entire + * size of the zimap, so that the calling function will be able to + * reallocate the original zipmap to make room for more entries. */ +static unsigned char *zipmapLookupRaw(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned int *totlen) { + unsigned char *p = zm+1, *k = NULL; + unsigned int l,llen; + + while(*p != ZIPMAP_END) { + unsigned char free; + + /* Match or skip the key */ + l = zipmapDecodeLength(p); + llen = zipmapEncodeLength(NULL,l); + if (k == NULL && l == klen && !memcmp(p+llen,key,l)) { + /* Only return when the user doesn't care + * for the total length of the zipmap. 
*/ + if (totlen != NULL) { + k = p; + } else { + return p; + } + } + p += llen+l; + /* Skip the value as well */ + l = zipmapDecodeLength(p); + p += zipmapEncodeLength(NULL,l); + free = p[0]; + p += l+1+free; /* +1 to skip the free byte */ + } + if (totlen != NULL) *totlen = (unsigned int)(p-zm)+1; + return k; +} + +static unsigned long zipmapRequiredLength(unsigned int klen, unsigned int vlen) { + unsigned int l; + + l = klen+vlen+3; + if (klen >= ZIPMAP_BIGLEN) l += 4; + if (vlen >= ZIPMAP_BIGLEN) l += 4; + return l; +} + +/* Return the total amount used by a key (encoded length + payload) */ +static unsigned int zipmapRawKeyLength(unsigned char *p) { + unsigned int l = zipmapDecodeLength(p); + return zipmapEncodeLength(NULL,l) + l; +} + +/* Return the total amount used by a value + * (encoded length + single byte free count + payload) */ +static unsigned int zipmapRawValueLength(unsigned char *p) { + unsigned int l = zipmapDecodeLength(p); + unsigned int used; + + used = zipmapEncodeLength(NULL,l); + used += p[used] + 1 + l; + return used; +} + +/* If 'p' points to a key, this function returns the total amount of + * bytes used to store this entry (entry = key + associated value + trailing + * free space if any). */ +static unsigned int zipmapRawEntryLength(unsigned char *p) { + unsigned int l = zipmapRawKeyLength(p); + return l + zipmapRawValueLength(p+l); +} + +static inline unsigned char *zipmapResize(unsigned char *zm, unsigned int len) { + zm = zrealloc(zm, len); + zm[len-1] = ZIPMAP_END; + return zm; +} + +/* Set key to value, creating the key if it does not already exist. + * If 'update' is not NULL, *update is set to 1 if the key was + * already preset, otherwise to 0. 
*/ +unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update) { + unsigned int zmlen, offset; + unsigned int freelen, reqlen = zipmapRequiredLength(klen,vlen); + unsigned int empty, vempty; + unsigned char *p; + + freelen = reqlen; + if (update) *update = 0; + p = zipmapLookupRaw(zm,key,klen,&zmlen); + if (p == NULL) { + /* Key not found: enlarge */ + zm = zipmapResize(zm, zmlen+reqlen); + p = zm+zmlen-1; + zmlen = zmlen+reqlen; + + /* Increase zipmap length (this is an insert) */ + if (zm[0] < ZIPMAP_BIGLEN) zm[0]++; + } else { + /* Key found. Is there enough space for the new value? */ + /* Compute the total length: */ + if (update) *update = 1; + freelen = zipmapRawEntryLength(p); + if (freelen < reqlen) { + /* Store the offset of this key within the current zipmap, so + * it can be resized. Then, move the tail backwards so this + * pair fits at the current position. */ + offset = p-zm; + zm = zipmapResize(zm, zmlen-freelen+reqlen); + p = zm+offset; + + /* The +1 in the number of bytes to be moved is caused by the + * end-of-zipmap byte. Note: the *original* zmlen is used. */ + memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1)); + zmlen = zmlen-freelen+reqlen; + freelen = reqlen; + } + } + + /* We now have a suitable block where the key/value entry can + * be written. If there is too much free space, move the tail + * of the zipmap a few bytes to the front and shrink the zipmap, + * as we want zipmaps to be very space efficient. */ + empty = freelen-reqlen; + if (empty >= ZIPMAP_VALUE_MAX_FREE) { + /* First, move the tail bytes to the front, then resize + * the zipmap to be bytes smaller. */ + offset = p-zm; + memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1)); + zmlen -= empty; + zm = zipmapResize(zm, zmlen); + p = zm+offset; + vempty = 0; + } else { + vempty = empty; + } + + /* Just write the key + value and we are done. 
*/ + /* Key: */ + p += zipmapEncodeLength(p,klen); + memcpy(p,key,klen); + p += klen; + /* Value: */ + p += zipmapEncodeLength(p,vlen); + *p++ = vempty; + memcpy(p,val,vlen); + return zm; +} + +/* Remove the specified key. If 'deleted' is not NULL the pointed integer is + * set to 0 if the key was not found, to 1 if it was found and deleted. */ +unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted) { + unsigned int zmlen, freelen; + unsigned char *p = zipmapLookupRaw(zm,key,klen,&zmlen); + if (p) { + freelen = zipmapRawEntryLength(p); + memmove(p, p+freelen, zmlen-((p-zm)+freelen+1)); + zm = zipmapResize(zm, zmlen-freelen); + + /* Decrease zipmap length */ + if (zm[0] < ZIPMAP_BIGLEN) zm[0]--; + + if (deleted) *deleted = 1; + } else { + if (deleted) *deleted = 0; + } + return zm; +} + +/* Call it before iterating through the elements with zipmapNext(). */ +unsigned char *zipmapRewind(unsigned char *zm) { + return zm+1; +} + +/* This function is used to iterate through all the zipmap elements. + * In the first call the first argument is the pointer to the zipmap + 1. + * In the next calls what zipmapNext returns is used as first argument. NULL is returned once the end of the zipmap is reached. + * 'klen' must not be NULL when 'key' is given, nor 'vlen' when 'value' is given. Example: + * + * unsigned char *i = zipmapRewind(my_zipmap); + * while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) { + * printf("%d bytes key at %p\n", klen, key); + * printf("%d bytes value at %p\n", vlen, value); + * } + */ +unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen) { + if (zm[0] == ZIPMAP_END) return NULL; + if (key) { + *key = zm; + *klen = zipmapDecodeLength(zm); + *key += ZIPMAP_LEN_BYTES(*klen); + } + zm += zipmapRawKeyLength(zm); + if (value) { + *value = zm+1; + *vlen = zipmapDecodeLength(zm); + *value += ZIPMAP_LEN_BYTES(*vlen); + } + zm += zipmapRawValueLength(zm); + return zm; +} + +/* Search a key and retrieve the pointer and len of the associated value.
+ * If the key is found the function returns 1, otherwise 0. */ +int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen) { + unsigned char *p; + + if ((p = zipmapLookupRaw(zm,key,klen,NULL)) == NULL) return 0; + p += zipmapRawKeyLength(p); + *vlen = zipmapDecodeLength(p); + *value = p + ZIPMAP_LEN_BYTES(*vlen) + 1; + return 1; +} + +/* Return 1 if the key exists, otherwise 0 is returned. */ +int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen) { + return zipmapLookupRaw(zm,key,klen,NULL) != NULL; +} + +/* Return the number of entries inside a zipmap */ +unsigned int zipmapLen(unsigned char *zm) { + unsigned int len = 0; + if (zm[0] < ZIPMAP_BIGLEN) { + len = zm[0]; + } else { + unsigned char *p = zipmapRewind(zm); + while((p = zipmapNext(p,NULL,NULL,NULL,NULL)) != NULL) len++; + + /* Re-store length if small enough */ + if (len < ZIPMAP_BIGLEN) zm[0] = len; + } + return len; +} + +void zipmapRepr(unsigned char *p) { + unsigned int l; + + printf("{status %u}",*p++); + while(1) { + if (p[0] == ZIPMAP_END) { + printf("{end}"); + break; + } else { + unsigned char e; + + l = zipmapDecodeLength(p); + printf("{key %u}",l); + p += zipmapEncodeLength(NULL,l); + fwrite(p,l,1,stdout); + p += l; + + l = zipmapDecodeLength(p); + printf("{value %u}",l); + p += zipmapEncodeLength(NULL,l); + e = *p++; + fwrite(p,l,1,stdout); + p += l+e; + if (e) { + printf("["); + while(e--) printf("."); + printf("]"); + } + } + } + printf("\n"); +} + +#ifdef ZIPMAP_TEST_MAIN +int main(void) { + unsigned char *zm; + + zm = zipmapNew(); + + zm = zipmapSet(zm,(unsigned char*) "name",4, (unsigned char*) "foo",3,NULL); + zm = zipmapSet(zm,(unsigned char*) "surname",7, (unsigned char*) "foo",3,NULL); + zm = zipmapSet(zm,(unsigned char*) "age",3, (unsigned char*) "foo",3,NULL); + zipmapRepr(zm); + + zm = zipmapSet(zm,(unsigned char*) "hello",5, (unsigned char*) "world!",6,NULL); + zm = zipmapSet(zm,(unsigned char*) "foo",3, 
(unsigned char*) "bar",3,NULL); + zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "!",1,NULL); + zipmapRepr(zm); + zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "12345",5,NULL); + zipmapRepr(zm); + zm = zipmapSet(zm,(unsigned char*) "new",3, (unsigned char*) "xx",2,NULL); + zm = zipmapSet(zm,(unsigned char*) "noval",5, (unsigned char*) "",0,NULL); + zipmapRepr(zm); + zm = zipmapDel(zm,(unsigned char*) "new",3,NULL); + zipmapRepr(zm); + + printf("\nLook up large key:\n"); + { + unsigned char buf[512]; + unsigned char *value; + unsigned int vlen, i; + for (i = 0; i < 512; i++) buf[i] = 'a'; + + zm = zipmapSet(zm,buf,512,(unsigned char*) "long",4,NULL); + if (zipmapGet(zm,buf,512,&value,&vlen)) { + printf(" is associated to the %d bytes value: %.*s\n", + vlen, vlen, value); + } + } + + printf("\nPerform a direct lookup:\n"); + { + unsigned char *value; + unsigned int vlen; + + if (zipmapGet(zm,(unsigned char*) "foo",3,&value,&vlen)) { + printf(" foo is associated to the %d bytes value: %.*s\n", + vlen, vlen, value); + } + } + printf("\nIterate trought elements:\n"); + { + unsigned char *i = zipmapRewind(zm); + unsigned char *key, *value; + unsigned int klen, vlen; + + while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) { + printf(" %d:%.*s => %d:%.*s\n", klen, klen, key, vlen, vlen, value); + } + } + return 0; +} +#endif diff --git a/src/zipmap.h b/src/zipmap.h new file mode 100644 index 000000000..e5f6c9f28 --- /dev/null +++ b/src/zipmap.h @@ -0,0 +1,48 @@ +/* String -> String Map data structure optimized for size. + * + * See zipmap.c for more info. + * + * -------------------------------------------------------------------------- + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _ZIPMAP_H +#define _ZIPMAP_H + +unsigned char *zipmapNew(void); +unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update); +unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted); +unsigned char *zipmapRewind(unsigned char *zm); +unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen); +int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen); +int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen); +unsigned int zipmapLen(unsigned char *zm); +void zipmapRepr(unsigned char *p); + +#endif diff --git a/src/zmalloc.c b/src/zmalloc.c new file mode 100644 index 000000000..8658376a3 --- /dev/null +++ b/src/zmalloc.c @@ -0,0 +1,158 @@ +/* zmalloc - total amount of allocated memory aware version of malloc() + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include "config.h" + +#if defined(__sun) +#define PREFIX_SIZE sizeof(long long) +#else +#define PREFIX_SIZE sizeof(size_t) +#endif + +#define increment_used_memory(__n) do { \ + size_t _n = (__n); \ + if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \ + if (zmalloc_thread_safe) { \ + pthread_mutex_lock(&used_memory_mutex); \ + used_memory += _n; \ + pthread_mutex_unlock(&used_memory_mutex); \ + } else { \ + used_memory += _n; \ + } \ +} while(0) + +#define decrement_used_memory(__n) do { \ + size_t _n = (__n); \ + if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \ + if (zmalloc_thread_safe) { \ + pthread_mutex_lock(&used_memory_mutex); \ + used_memory -= _n; \ + pthread_mutex_unlock(&used_memory_mutex); \ + } else { \ + used_memory -= _n; \ + } \ +} while(0) + +static size_t used_memory = 0; +static int zmalloc_thread_safe = 0; +pthread_mutex_t used_memory_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void zmalloc_oom(size_t size) { + fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", + size); + fflush(stderr); + abort(); +} + +void *zmalloc(size_t size) { + void *ptr = 
malloc(size+PREFIX_SIZE); + + if (!ptr) zmalloc_oom(size); +#ifdef HAVE_MALLOC_SIZE + increment_used_memory(redis_malloc_size(ptr)); + return ptr; +#else + *((size_t*)ptr) = size; + increment_used_memory(size+PREFIX_SIZE); + return (char*)ptr+PREFIX_SIZE; +#endif +} + +void *zrealloc(void *ptr, size_t size) { +#ifndef HAVE_MALLOC_SIZE + void *realptr; +#endif + size_t oldsize; + void *newptr; + + if (ptr == NULL) return zmalloc(size); +#ifdef HAVE_MALLOC_SIZE + oldsize = redis_malloc_size(ptr); + newptr = realloc(ptr,size); + if (!newptr) zmalloc_oom(size); + + decrement_used_memory(oldsize); + increment_used_memory(redis_malloc_size(newptr)); + return newptr; +#else + realptr = (char*)ptr-PREFIX_SIZE; + oldsize = *((size_t*)realptr); + newptr = realloc(realptr,size+PREFIX_SIZE); + if (!newptr) zmalloc_oom(size); + + *((size_t*)newptr) = size; + decrement_used_memory(oldsize); + increment_used_memory(size); + return (char*)newptr+PREFIX_SIZE; +#endif +} + +void zfree(void *ptr) { +#ifndef HAVE_MALLOC_SIZE + void *realptr; + size_t oldsize; +#endif + + if (ptr == NULL) return; +#ifdef HAVE_MALLOC_SIZE + decrement_used_memory(redis_malloc_size(ptr)); + free(ptr); +#else + realptr = (char*)ptr-PREFIX_SIZE; + oldsize = *((size_t*)realptr); + decrement_used_memory(oldsize+PREFIX_SIZE); + free(realptr); +#endif +} + +char *zstrdup(const char *s) { + size_t l = strlen(s)+1; + char *p = zmalloc(l); + + memcpy(p,s,l); + return p; +} + +size_t zmalloc_used_memory(void) { + size_t um; + + if (zmalloc_thread_safe) pthread_mutex_lock(&used_memory_mutex); + um = used_memory; + if (zmalloc_thread_safe) pthread_mutex_unlock(&used_memory_mutex); + return um; +} + +void zmalloc_enable_thread_safeness(void) { + zmalloc_thread_safe = 1; +} diff --git a/src/zmalloc.h b/src/zmalloc.h new file mode 100644 index 000000000..193e7eda5 --- /dev/null +++ b/src/zmalloc.h @@ -0,0 +1,41 @@ +/* zmalloc - total amount of allocated memory aware version of malloc() + * + * Copyright (c) 2009-2010, 
Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _ZMALLOC_H +#define _ZMALLOC_H + +void *zmalloc(size_t size); +void *zrealloc(void *ptr, size_t size); +void zfree(void *ptr); +char *zstrdup(const char *s); +size_t zmalloc_used_memory(void); +void zmalloc_enable_thread_safeness(void); + +#endif /* _ZMALLOC_H */ diff --git a/staticsymbols.h b/staticsymbols.h deleted file mode 100644 index 0bf3723e6..000000000 --- a/staticsymbols.h +++ /dev/null @@ -1,374 +0,0 @@ -static struct redisFunctionSym symsTable[] = { -{"IOThreadEntryPoint",(unsigned long)IOThreadEntryPoint}, -{"_redisAssert",(unsigned long)_redisAssert}, -{"_redisPanic",(unsigned long)_redisPanic}, -{"acceptHandler",(unsigned long)acceptHandler}, -{"addReply",(unsigned long)addReply}, -{"addReplyBulk",(unsigned long)addReplyBulk}, -{"addReplyBulkCString",(unsigned long)addReplyBulkCString}, -{"addReplyBulkLen",(unsigned long)addReplyBulkLen}, -{"addReplyBulkSds",(unsigned long)addReplyBulkSds}, -{"addReplyDouble",(unsigned long)addReplyDouble}, -{"addReplyLongLong",(unsigned long)addReplyLongLong}, -{"addReplySds",(unsigned long)addReplySds}, -{"addReplyUlong",(unsigned long)addReplyUlong}, -{"aofRemoveTempFile",(unsigned long)aofRemoveTempFile}, -{"appendCommand",(unsigned long)appendCommand}, -{"appendServerSaveParams",(unsigned long)appendServerSaveParams}, -{"authCommand",(unsigned long)authCommand}, -{"beforeSleep",(unsigned long)beforeSleep}, -{"bgrewriteaofCommand",(unsigned long)bgrewriteaofCommand}, -{"bgsaveCommand",(unsigned long)bgsaveCommand}, -{"blockClientOnSwappedKeys",(unsigned long)blockClientOnSwappedKeys}, -{"blockForKeys",(unsigned long)blockForKeys}, -{"blockingPopGenericCommand",(unsigned long)blockingPopGenericCommand}, -{"blpopCommand",(unsigned long)blpopCommand}, -{"brpopCommand",(unsigned long)brpopCommand}, -{"bytesToHuman",(unsigned long)bytesToHuman}, -{"call",(unsigned long)call}, -{"catAppendOnlyExpireAtCommand",(unsigned long)catAppendOnlyExpireAtCommand}, -{"catAppendOnlyGenericCommand",(unsigned 
long)catAppendOnlyGenericCommand}, -{"checkType",(unsigned long)checkType}, -{"closeTimedoutClients",(unsigned long)closeTimedoutClients}, -{"compareStringObjects",(unsigned long)compareStringObjects}, -{"computeDatasetDigest",(unsigned long)computeDatasetDigest}, -{"computeObjectSwappability",(unsigned long)computeObjectSwappability}, -{"configCommand",(unsigned long)configCommand}, -{"configGetCommand",(unsigned long)configGetCommand}, -{"configSetCommand",(unsigned long)configSetCommand}, -{"convertToRealHash",(unsigned long)convertToRealHash}, -{"createClient",(unsigned long)createClient}, -{"createHashObject",(unsigned long)createHashObject}, -{"createListObject",(unsigned long)createListObject}, -{"createObject",(unsigned long)createObject}, -{"createSetObject",(unsigned long)createSetObject}, -{"createSharedObjects",(unsigned long)createSharedObjects}, -{"createSortOperation",(unsigned long)createSortOperation}, -{"createStringObject",(unsigned long)createStringObject}, -{"createStringObjectFromLongLong",(unsigned long)createStringObjectFromLongLong}, -{"createVmPointer",(unsigned long)createVmPointer}, -{"createZsetObject",(unsigned long)createZsetObject}, -{"daemonize",(unsigned long)daemonize}, -{"dbAdd",(unsigned long)dbAdd}, -{"dbDelete",(unsigned long)dbDelete}, -{"dbExists",(unsigned long)dbExists}, -{"dbRandomKey",(unsigned long)dbRandomKey}, -{"dbReplace",(unsigned long)dbReplace}, -{"dbsizeCommand",(unsigned long)dbsizeCommand}, -{"debugCommand",(unsigned long)debugCommand}, -{"decrCommand",(unsigned long)decrCommand}, -{"decrRefCount",(unsigned long)decrRefCount}, -{"decrbyCommand",(unsigned long)decrbyCommand}, -{"delCommand",(unsigned long)delCommand}, -{"deleteIfVolatile",(unsigned long)deleteIfVolatile}, -{"dictEncObjKeyCompare",(unsigned long)dictEncObjKeyCompare}, -{"dictListDestructor",(unsigned long)dictListDestructor}, -{"dictObjKeyCompare",(unsigned long)dictObjKeyCompare}, -{"dictRedisObjectDestructor",(unsigned 
long)dictRedisObjectDestructor}, -{"dictSdsDestructor",(unsigned long)dictSdsDestructor}, -{"dictSdsKeyCompare",(unsigned long)dictSdsKeyCompare}, -{"dictVanillaFree",(unsigned long)dictVanillaFree}, -{"discardCommand",(unsigned long)discardCommand}, -{"dontWaitForSwappedKey",(unsigned long)dontWaitForSwappedKey}, -{"dupClientReplyValue",(unsigned long)dupClientReplyValue}, -{"dupStringObject",(unsigned long)dupStringObject}, -{"echoCommand",(unsigned long)echoCommand}, -{"equalStringObjects",(unsigned long)equalStringObjects}, -{"execBlockClientOnSwappedKeys",(unsigned long)execBlockClientOnSwappedKeys}, -{"execCommand",(unsigned long)execCommand}, -{"execCommandReplicateMulti",(unsigned long)execCommandReplicateMulti}, -{"existsCommand",(unsigned long)existsCommand}, -{"expireCommand",(unsigned long)expireCommand}, -{"expireGenericCommand",(unsigned long)expireGenericCommand}, -{"expireIfNeeded",(unsigned long)expireIfNeeded}, -{"expireatCommand",(unsigned long)expireatCommand}, -{"feedAppendOnlyFile",(unsigned long)feedAppendOnlyFile}, -{"findFuncName",(unsigned long)findFuncName}, -{"flushAppendOnlyFile",(unsigned long)flushAppendOnlyFile}, -{"flushallCommand",(unsigned long)flushallCommand}, -{"flushdbCommand",(unsigned long)flushdbCommand}, -{"freeClient",(unsigned long)freeClient}, -{"freeClientArgv",(unsigned long)freeClientArgv}, -{"freeClientMultiState",(unsigned long)freeClientMultiState}, -{"freeFakeClient",(unsigned long)freeFakeClient}, -{"freeHashObject",(unsigned long)freeHashObject}, -{"freeIOJob",(unsigned long)freeIOJob}, -{"freeListObject",(unsigned long)freeListObject}, -{"freeMemoryIfNeeded",(unsigned long)freeMemoryIfNeeded}, -{"freePubsubPattern",(unsigned long)freePubsubPattern}, -{"freeSetObject",(unsigned long)freeSetObject}, -{"freeStringObject",(unsigned long)freeStringObject}, -{"freeZsetObject",(unsigned long)freeZsetObject}, -{"fwriteBulkDouble",(unsigned long)fwriteBulkDouble}, -{"fwriteBulkLongLong",(unsigned 
long)fwriteBulkLongLong}, -{"fwriteBulkObject",(unsigned long)fwriteBulkObject}, -{"fwriteBulkString",(unsigned long)fwriteBulkString}, -{"genRedisInfoString",(unsigned long)genRedisInfoString}, -{"genericHgetallCommand",(unsigned long)genericHgetallCommand}, -{"genericZrangebyscoreCommand",(unsigned long)genericZrangebyscoreCommand}, -{"getCommand",(unsigned long)getCommand}, -{"getDecodedObject",(unsigned long)getDecodedObject}, -{"getDoubleFromObject",(unsigned long)getDoubleFromObject}, -{"getDoubleFromObjectOrReply",(unsigned long)getDoubleFromObjectOrReply}, -{"getExpire",(unsigned long)getExpire}, -{"getGenericCommand",(unsigned long)getGenericCommand}, -{"getLongFromObjectOrReply",(unsigned long)getLongFromObjectOrReply}, -{"getLongLongFromObject",(unsigned long)getLongLongFromObject}, -{"getLongLongFromObjectOrReply",(unsigned long)getLongLongFromObjectOrReply}, -{"getMcontextEip",(unsigned long)getMcontextEip}, -{"getsetCommand",(unsigned long)getsetCommand}, -{"glueReplyBuffersIfNeeded",(unsigned long)glueReplyBuffersIfNeeded}, -{"handleClientsBlockedOnSwappedKey",(unsigned long)handleClientsBlockedOnSwappedKey}, -{"handleClientsWaitingListPush",(unsigned long)handleClientsWaitingListPush}, -{"hashTypeCurrent",(unsigned long)hashTypeCurrent}, -{"hashTypeDelete",(unsigned long)hashTypeDelete}, -{"hashTypeExists",(unsigned long)hashTypeExists}, -{"hashTypeGet",(unsigned long)hashTypeGet}, -{"hashTypeInitIterator",(unsigned long)hashTypeInitIterator}, -{"hashTypeLookupWriteOrCreate",(unsigned long)hashTypeLookupWriteOrCreate}, -{"hashTypeNext",(unsigned long)hashTypeNext}, -{"hashTypeReleaseIterator",(unsigned long)hashTypeReleaseIterator}, -{"hashTypeSet",(unsigned long)hashTypeSet}, -{"hashTypeTryConversion",(unsigned long)hashTypeTryConversion}, -{"hashTypeTryObjectEncoding",(unsigned long)hashTypeTryObjectEncoding}, -{"hdelCommand",(unsigned long)hdelCommand}, -{"hexistsCommand",(unsigned long)hexistsCommand}, -{"hgetCommand",(unsigned 
long)hgetCommand}, -{"hgetallCommand",(unsigned long)hgetallCommand}, -{"hincrbyCommand",(unsigned long)hincrbyCommand}, -{"hkeysCommand",(unsigned long)hkeysCommand}, -{"hlenCommand",(unsigned long)hlenCommand}, -{"hmgetCommand",(unsigned long)hmgetCommand}, -{"hmsetCommand",(unsigned long)hmsetCommand}, -{"hsetCommand",(unsigned long)hsetCommand}, -{"hsetnxCommand",(unsigned long)hsetnxCommand}, -{"htNeedsResize",(unsigned long)htNeedsResize}, -{"hvalsCommand",(unsigned long)hvalsCommand}, -{"incrCommand",(unsigned long)incrCommand}, -{"incrDecrCommand",(unsigned long)incrDecrCommand}, -{"incrRefCount",(unsigned long)incrRefCount}, -{"incrbyCommand",(unsigned long)incrbyCommand}, -{"incrementallyRehash",(unsigned long)incrementallyRehash}, -{"infoCommand",(unsigned long)infoCommand}, -{"initClientMultiState",(unsigned long)initClientMultiState}, -{"initServer",(unsigned long)initServer}, -{"initServerConfig",(unsigned long)initServerConfig}, -{"isStringRepresentableAsLong",(unsigned long)isStringRepresentableAsLong}, -{"keysCommand",(unsigned long)keysCommand}, -{"lastsaveCommand",(unsigned long)lastsaveCommand}, -{"lindexCommand",(unsigned long)lindexCommand}, -{"listMatchObjects",(unsigned long)listMatchObjects}, -{"listMatchPubsubPattern",(unsigned long)listMatchPubsubPattern}, -{"ll2string",(unsigned long)ll2string}, -{"llenCommand",(unsigned long)llenCommand}, -{"loadServerConfig",(unsigned long)loadServerConfig}, -{"lockThreadedIO",(unsigned long)lockThreadedIO}, -{"lookupKey",(unsigned long)lookupKey}, -{"lookupKeyByPattern",(unsigned long)lookupKeyByPattern}, -{"lookupKeyRead",(unsigned long)lookupKeyRead}, -{"lookupKeyReadOrReply",(unsigned long)lookupKeyReadOrReply}, -{"lookupKeyWrite",(unsigned long)lookupKeyWrite}, -{"lookupKeyWriteOrReply",(unsigned long)lookupKeyWriteOrReply}, -{"lpopCommand",(unsigned long)lpopCommand}, -{"lpushCommand",(unsigned long)lpushCommand}, -{"lrangeCommand",(unsigned long)lrangeCommand}, -{"lremCommand",(unsigned 
long)lremCommand}, -{"lsetCommand",(unsigned long)lsetCommand}, -{"ltrimCommand",(unsigned long)ltrimCommand}, -{"mgetCommand",(unsigned long)mgetCommand}, -{"mixDigest",(unsigned long)mixDigest}, -{"mixObjectDigest",(unsigned long)mixObjectDigest}, -{"monitorCommand",(unsigned long)monitorCommand}, -{"moveCommand",(unsigned long)moveCommand}, -{"msetCommand",(unsigned long)msetCommand}, -{"msetGenericCommand",(unsigned long)msetGenericCommand}, -{"msetnxCommand",(unsigned long)msetnxCommand}, -{"multiCommand",(unsigned long)multiCommand}, -{"oom",(unsigned long)oom}, -{"pingCommand",(unsigned long)pingCommand}, -{"popGenericCommand",(unsigned long)popGenericCommand}, -{"prepareForShutdown",(unsigned long)prepareForShutdown}, -{"processCommand",(unsigned long)processCommand}, -{"processInputBuffer",(unsigned long)processInputBuffer}, -{"psubscribeCommand",(unsigned long)psubscribeCommand}, -{"publishCommand",(unsigned long)publishCommand}, -{"pubsubPublishMessage",(unsigned long)pubsubPublishMessage}, -{"pubsubSubscribeChannel",(unsigned long)pubsubSubscribeChannel}, -{"pubsubSubscribePattern",(unsigned long)pubsubSubscribePattern}, -{"pubsubUnsubscribeAllChannels",(unsigned long)pubsubUnsubscribeAllChannels}, -{"pubsubUnsubscribeAllPatterns",(unsigned long)pubsubUnsubscribeAllPatterns}, -{"pubsubUnsubscribeChannel",(unsigned long)pubsubUnsubscribeChannel}, -{"pubsubUnsubscribePattern",(unsigned long)pubsubUnsubscribePattern}, -{"punsubscribeCommand",(unsigned long)punsubscribeCommand}, -{"pushGenericCommand",(unsigned long)pushGenericCommand}, -{"qsortCompareSetsByCardinality",(unsigned long)qsortCompareSetsByCardinality}, -{"qsortCompareZsetopsrcByCardinality",(unsigned long)qsortCompareZsetopsrcByCardinality}, -{"qsortRedisCommands",(unsigned long)qsortRedisCommands}, -{"queueIOJob",(unsigned long)queueIOJob}, -{"queueMultiCommand",(unsigned long)queueMultiCommand}, -{"randomkeyCommand",(unsigned long)randomkeyCommand}, -{"rdbEncodeInteger",(unsigned 
long)rdbEncodeInteger}, -{"rdbGenericLoadStringObject",(unsigned long)rdbGenericLoadStringObject}, -{"rdbLoad",(unsigned long)rdbLoad}, -{"rdbLoadDoubleValue",(unsigned long)rdbLoadDoubleValue}, -{"rdbLoadEncodedStringObject",(unsigned long)rdbLoadEncodedStringObject}, -{"rdbLoadIntegerObject",(unsigned long)rdbLoadIntegerObject}, -{"rdbLoadLen",(unsigned long)rdbLoadLen}, -{"rdbLoadLzfStringObject",(unsigned long)rdbLoadLzfStringObject}, -{"rdbLoadObject",(unsigned long)rdbLoadObject}, -{"rdbLoadStringObject",(unsigned long)rdbLoadStringObject}, -{"rdbLoadTime",(unsigned long)rdbLoadTime}, -{"rdbLoadType",(unsigned long)rdbLoadType}, -{"rdbRemoveTempFile",(unsigned long)rdbRemoveTempFile}, -{"rdbSave",(unsigned long)rdbSave}, -{"rdbSaveBackground",(unsigned long)rdbSaveBackground}, -{"rdbSaveDoubleValue",(unsigned long)rdbSaveDoubleValue}, -{"rdbSaveLen",(unsigned long)rdbSaveLen}, -{"rdbSaveLzfStringObject",(unsigned long)rdbSaveLzfStringObject}, -{"rdbSaveObject",(unsigned long)rdbSaveObject}, -{"rdbSaveRawString",(unsigned long)rdbSaveRawString}, -{"rdbSaveStringObject",(unsigned long)rdbSaveStringObject}, -{"rdbSaveTime",(unsigned long)rdbSaveTime}, -{"rdbSaveType",(unsigned long)rdbSaveType}, -{"rdbSavedObjectLen",(unsigned long)rdbSavedObjectLen}, -{"rdbSavedObjectPages",(unsigned long)rdbSavedObjectPages}, -{"rdbTryIntegerEncoding",(unsigned long)rdbTryIntegerEncoding}, -{"readQueryFromClient",(unsigned long)readQueryFromClient}, -{"redisLog",(unsigned long)redisLog}, -{"removeExpire",(unsigned long)removeExpire}, -{"renameCommand",(unsigned long)renameCommand}, -{"renameGenericCommand",(unsigned long)renameGenericCommand}, -{"renamenxCommand",(unsigned long)renamenxCommand}, -{"replicationFeedMonitors",(unsigned long)replicationFeedMonitors}, -{"replicationFeedSlaves",(unsigned long)replicationFeedSlaves}, -{"resetClient",(unsigned long)resetClient}, -{"resetServerSaveParams",(unsigned long)resetServerSaveParams}, -{"rewriteAppendOnlyFile",(unsigned 
long)rewriteAppendOnlyFile}, -{"rewriteAppendOnlyFileBackground",(unsigned long)rewriteAppendOnlyFileBackground}, -{"rpopCommand",(unsigned long)rpopCommand}, -{"rpoplpushcommand",(unsigned long)rpoplpushcommand}, -{"rpushCommand",(unsigned long)rpushCommand}, -{"saddCommand",(unsigned long)saddCommand}, -{"saveCommand",(unsigned long)saveCommand}, -{"scardCommand",(unsigned long)scardCommand}, -{"sdiffCommand",(unsigned long)sdiffCommand}, -{"sdiffstoreCommand",(unsigned long)sdiffstoreCommand}, -{"sdscatrepr",(unsigned long)sdscatrepr}, -{"segvHandler",(unsigned long)segvHandler}, -{"selectCommand",(unsigned long)selectCommand}, -{"selectDb",(unsigned long)selectDb}, -{"sendBulkToSlave",(unsigned long)sendBulkToSlave}, -{"sendReplyToClient",(unsigned long)sendReplyToClient}, -{"sendReplyToClientWritev",(unsigned long)sendReplyToClientWritev}, -{"serverCron",(unsigned long)serverCron}, -{"setCommand",(unsigned long)setCommand}, -{"setExpire",(unsigned long)setExpire}, -{"setGenericCommand",(unsigned long)setGenericCommand}, -{"setexCommand",(unsigned long)setexCommand}, -{"setnxCommand",(unsigned long)setnxCommand}, -{"setupSigSegvAction",(unsigned long)setupSigSegvAction}, -{"shutdownCommand",(unsigned long)shutdownCommand}, -{"sigtermHandler",(unsigned long)sigtermHandler}, -{"sinterCommand",(unsigned long)sinterCommand}, -{"sinterGenericCommand",(unsigned long)sinterGenericCommand}, -{"sinterstoreCommand",(unsigned long)sinterstoreCommand}, -{"sismemberCommand",(unsigned long)sismemberCommand}, -{"slaveofCommand",(unsigned long)slaveofCommand}, -{"smoveCommand",(unsigned long)smoveCommand}, -{"sortCommand",(unsigned long)sortCommand}, -{"sortCommandTable",(unsigned long)sortCommandTable}, -{"sortCompare",(unsigned long)sortCompare}, -{"spawnIOThread",(unsigned long)spawnIOThread}, -{"spopCommand",(unsigned long)spopCommand}, -{"srandmemberCommand",(unsigned long)srandmemberCommand}, -{"sremCommand",(unsigned long)sremCommand}, -{"startAppendOnly",(unsigned 
long)startAppendOnly}, -{"stopAppendOnly",(unsigned long)stopAppendOnly}, -{"stringObjectLen",(unsigned long)stringObjectLen}, -{"stringmatch",(unsigned long)stringmatch}, -{"stringmatchlen",(unsigned long)stringmatchlen}, -{"subscribeCommand",(unsigned long)subscribeCommand}, -{"substrCommand",(unsigned long)substrCommand}, -{"sunionCommand",(unsigned long)sunionCommand}, -{"sunionDiffGenericCommand",(unsigned long)sunionDiffGenericCommand}, -{"sunionstoreCommand",(unsigned long)sunionstoreCommand}, -{"syncCommand",(unsigned long)syncCommand}, -{"syncRead",(unsigned long)syncRead}, -{"syncReadLine",(unsigned long)syncReadLine}, -{"syncWithMaster",(unsigned long)syncWithMaster}, -{"syncWrite",(unsigned long)syncWrite}, -{"touchWatchedKey",(unsigned long)touchWatchedKey}, -{"touchWatchedKeysOnFlush",(unsigned long)touchWatchedKeysOnFlush}, -{"tryFreeOneObjectFromFreelist",(unsigned long)tryFreeOneObjectFromFreelist}, -{"tryObjectEncoding",(unsigned long)tryObjectEncoding}, -{"tryResizeHashTables",(unsigned long)tryResizeHashTables}, -{"ttlCommand",(unsigned long)ttlCommand}, -{"typeCommand",(unsigned long)typeCommand}, -{"unblockClientWaitingData",(unsigned long)unblockClientWaitingData}, -{"unlockThreadedIO",(unsigned long)unlockThreadedIO}, -{"unsubscribeCommand",(unsigned long)unsubscribeCommand}, -{"unwatchAllKeys",(unsigned long)unwatchAllKeys}, -{"unwatchCommand",(unsigned long)unwatchCommand}, -{"updateDictResizePolicy",(unsigned long)updateDictResizePolicy}, -{"updateSlavesWaitingBgsave",(unsigned long)updateSlavesWaitingBgsave}, -{"usage",(unsigned long)usage}, -{"version",(unsigned long)version}, -{"vmCanSwapOut",(unsigned long)vmCanSwapOut}, -{"vmCancelThreadedIOJob",(unsigned long)vmCancelThreadedIOJob}, -{"vmFindContiguousPages",(unsigned long)vmFindContiguousPages}, -{"vmFreePage",(unsigned long)vmFreePage}, -{"vmGenericLoadObject",(unsigned long)vmGenericLoadObject}, -{"vmInit",(unsigned long)vmInit}, -{"vmLoadObject",(unsigned long)vmLoadObject}, 
-{"vmMarkPageFree",(unsigned long)vmMarkPageFree}, -{"vmMarkPageUsed",(unsigned long)vmMarkPageUsed}, -{"vmMarkPagesFree",(unsigned long)vmMarkPagesFree}, -{"vmMarkPagesUsed",(unsigned long)vmMarkPagesUsed}, -{"vmPreviewObject",(unsigned long)vmPreviewObject}, -{"vmReadObjectFromSwap",(unsigned long)vmReadObjectFromSwap}, -{"vmReopenSwapFile",(unsigned long)vmReopenSwapFile}, -{"vmSwapObjectBlocking",(unsigned long)vmSwapObjectBlocking}, -{"vmSwapObjectThreaded",(unsigned long)vmSwapObjectThreaded}, -{"vmSwapOneObject",(unsigned long)vmSwapOneObject}, -{"vmSwapOneObjectBlocking",(unsigned long)vmSwapOneObjectBlocking}, -{"vmSwapOneObjectThreaded",(unsigned long)vmSwapOneObjectThreaded}, -{"vmThreadedIOCompletedJob",(unsigned long)vmThreadedIOCompletedJob}, -{"vmWriteObjectOnSwap",(unsigned long)vmWriteObjectOnSwap}, -{"waitEmptyIOJobsQueue",(unsigned long)waitEmptyIOJobsQueue}, -{"waitForMultipleSwappedKeys",(unsigned long)waitForMultipleSwappedKeys}, -{"waitForSwappedKey",(unsigned long)waitForSwappedKey}, -{"watchCommand",(unsigned long)watchCommand}, -{"watchForKey",(unsigned long)watchForKey}, -{"xorDigest",(unsigned long)xorDigest}, -{"xorObjectDigest",(unsigned long)xorObjectDigest}, -{"yesnotoi",(unsigned long)yesnotoi}, -{"zaddCommand",(unsigned long)zaddCommand}, -{"zaddGenericCommand",(unsigned long)zaddGenericCommand}, -{"zcardCommand",(unsigned long)zcardCommand}, -{"zcountCommand",(unsigned long)zcountCommand}, -{"zincrbyCommand",(unsigned long)zincrbyCommand}, -{"zinterstoreCommand",(unsigned long)zinterstoreCommand}, -{"zrangeCommand",(unsigned long)zrangeCommand}, -{"zrangeGenericCommand",(unsigned long)zrangeGenericCommand}, -{"zrangebyscoreCommand",(unsigned long)zrangebyscoreCommand}, -{"zrankCommand",(unsigned long)zrankCommand}, -{"zrankGenericCommand",(unsigned long)zrankGenericCommand}, -{"zremCommand",(unsigned long)zremCommand}, -{"zremrangebyrankCommand",(unsigned long)zremrangebyrankCommand}, -{"zremrangebyscoreCommand",(unsigned 
long)zremrangebyscoreCommand}, -{"zrevrangeCommand",(unsigned long)zrevrangeCommand}, -{"zrevrankCommand",(unsigned long)zrevrankCommand}, -{"zscoreCommand",(unsigned long)zscoreCommand}, -{"zslCreate",(unsigned long)zslCreate}, -{"zslCreateNode",(unsigned long)zslCreateNode}, -{"zslDelete",(unsigned long)zslDelete}, -{"zslFirstWithScore",(unsigned long)zslFirstWithScore}, -{"zslFree",(unsigned long)zslFree}, -{"zslFreeNode",(unsigned long)zslFreeNode}, -{"zslInsert",(unsigned long)zslInsert}, -{"zslRandomLevel",(unsigned long)zslRandomLevel}, -{"zunionInterBlockClientOnSwappedKeys",(unsigned long)zunionInterBlockClientOnSwappedKeys}, -{"zunionInterGenericCommand",(unsigned long)zunionInterGenericCommand}, -{"zunionstoreCommand",(unsigned long)zunionstoreCommand}, -{NULL,0} -}; diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index abcebe130..4cbe6eaae 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -59,13 +59,13 @@ tags {"aof"} { ## Test that redis-check-aof indeed sees this AOF is not valid test {Short read: Utility should confirm the AOF is not valid} { catch { - exec ./redis-check-aof $aof_path + exec src/redis-check-aof $aof_path } str set _ $str } {*not valid*} test {Short read: Utility should be able to fix the AOF} { - exec echo y | ./redis-check-aof --fix $aof_path + exec echo y | src/redis-check-aof --fix $aof_path } {*Successfully truncated AOF*} ## Test that the server can be started using the truncated AOF diff --git a/tests/support/server.tcl b/tests/support/server.tcl index eb5fe6214..8e226a7dd 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -169,10 +169,10 @@ proc start_server {options {code undefined}} { set stderr [format "%s/%s" [dict get $config "dir"] "stderr"] if {$::valgrind} { - exec valgrind ./redis-server $config_file > $stdout 2> $stderr & + exec valgrind src/redis-server $config_file > $stdout 2> $stderr & after 2000 } else { - exec ./redis-server $config_file > $stdout 
2> $stderr & + exec src/redis-server $config_file > $stdout 2> $stderr & after 500 } diff --git a/ziplist.c b/ziplist.c deleted file mode 100644 index 4b9d0fadc..000000000 --- a/ziplist.c +++ /dev/null @@ -1,959 +0,0 @@ -/* Memory layout of a ziplist, containing "foo", "bar", "quux": - * "foo""bar""quux" - * - * is an unsigned integer to hold the number of bytes that - * the ziplist occupies. This is stored to not have to traverse the ziplist - * to know the new length when pushing. - * - * is the number of items in the ziplist. When this value is - * greater than 254, we need to traverse the entire list to know - * how many items it holds. - * - * is the number of bytes occupied by a single entry. When this - * number is greater than 253, the length will occupy 5 bytes, where - * the extra bytes contain an unsigned integer to hold the length. - */ - -#include -#include -#include -#include -#include -#include -#include "zmalloc.h" -#include "ziplist.h" - -/* Important note: the ZIP_END value is used to depict the end of the - * ziplist structure. When a pointer contains an entry, the first couple - * of bytes contain the encoded length of the previous entry. This length - * is encoded as ZIP_ENC_RAW length, so the first two bits will contain 00 - * and the byte will therefore never have a value of 255. 
*/ -#define ZIP_END 255 -#define ZIP_BIGLEN 254 - -/* Entry encoding */ -#define ZIP_ENC_RAW 0 -#define ZIP_ENC_INT16 1 -#define ZIP_ENC_INT32 2 -#define ZIP_ENC_INT64 3 -#define ZIP_ENCODING(p) ((p)[0] >> 6) - -/* Length encoding for raw entries */ -#define ZIP_LEN_INLINE 0 -#define ZIP_LEN_UINT16 1 -#define ZIP_LEN_UINT32 2 - -/* Utility macros */ -#define ZIPLIST_BYTES(zl) (*((uint32_t*)(zl))) -#define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t)))) -#define ZIPLIST_LENGTH(zl) (*((uint16_t*)((zl)+sizeof(uint32_t)*2))) -#define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t)) -#define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE) -#define ZIPLIST_ENTRY_TAIL(zl) ((zl)+ZIPLIST_TAIL_OFFSET(zl)) -#define ZIPLIST_ENTRY_END(zl) ((zl)+ZIPLIST_BYTES(zl)-1) - -/* We know a positive increment can only be 1 because entries can only be - * pushed one at a time. */ -#define ZIPLIST_INCR_LENGTH(zl,incr) { \ - if (ZIPLIST_LENGTH(zl) < UINT16_MAX) ZIPLIST_LENGTH(zl)+=incr; } - -typedef struct zlentry { - unsigned int prevrawlensize, prevrawlen; - unsigned int lensize, len; - unsigned int headersize; - unsigned char encoding; - unsigned char *p; -} zlentry; - -/* Return bytes needed to store integer encoded by 'encoding' */ -static unsigned int zipEncodingSize(unsigned char encoding) { - if (encoding == ZIP_ENC_INT16) { - return sizeof(int16_t); - } else if (encoding == ZIP_ENC_INT32) { - return sizeof(int32_t); - } else if (encoding == ZIP_ENC_INT64) { - return sizeof(int64_t); - } - assert(NULL); -} - -/* Decode the encoded length pointed by 'p'. If a pointer to 'lensize' is - * provided, it is set to the number of bytes required to encode the length. 
*/ -static unsigned int zipDecodeLength(unsigned char *p, unsigned int *lensize) { - unsigned char encoding = ZIP_ENCODING(p), lenenc; - unsigned int len; - - if (encoding == ZIP_ENC_RAW) { - lenenc = (p[0] >> 4) & 0x3; - if (lenenc == ZIP_LEN_INLINE) { - len = p[0] & 0xf; - if (lensize) *lensize = 1; - } else if (lenenc == ZIP_LEN_UINT16) { - len = p[1] | (p[2] << 8); - if (lensize) *lensize = 3; - } else { - len = p[1] | (p[2] << 8) | (p[3] << 16) | (p[4] << 24); - if (lensize) *lensize = 5; - } - } else { - len = zipEncodingSize(encoding); - if (lensize) *lensize = 1; - } - return len; -} - -/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns - * the amount of bytes required to encode such a length. */ -static unsigned int zipEncodeLength(unsigned char *p, char encoding, unsigned int rawlen) { - unsigned char len = 1, lenenc, buf[5]; - if (encoding == ZIP_ENC_RAW) { - if (rawlen <= 0xf) { - if (!p) return len; - lenenc = ZIP_LEN_INLINE; - buf[0] = rawlen; - } else if (rawlen <= 0xffff) { - len += 2; - if (!p) return len; - lenenc = ZIP_LEN_UINT16; - buf[1] = (rawlen ) & 0xff; - buf[2] = (rawlen >> 8) & 0xff; - } else { - len += 4; - if (!p) return len; - lenenc = ZIP_LEN_UINT32; - buf[1] = (rawlen ) & 0xff; - buf[2] = (rawlen >> 8) & 0xff; - buf[3] = (rawlen >> 16) & 0xff; - buf[4] = (rawlen >> 24) & 0xff; - } - buf[0] = (lenenc << 4) | (buf[0] & 0xf); - } - if (!p) return len; - - /* Apparently we need to store the length in 'p' */ - buf[0] = (encoding << 6) | (buf[0] & 0x3f); - memcpy(p,buf,len); - return len; -} - -/* Decode the length of the previous element stored at "p". */ -static unsigned int zipPrevDecodeLength(unsigned char *p, unsigned int *lensize) { - unsigned int len = *p; - if (len < ZIP_BIGLEN) { - if (lensize) *lensize = 1; - } else { - if (lensize) *lensize = 1+sizeof(len); - memcpy(&len,p+1,sizeof(len)); - } - return len; -} - -/* Encode the length of the previous entry and write it to "p". 
Return the - * number of bytes needed to encode this length if "p" is NULL. */ -static unsigned int zipPrevEncodeLength(unsigned char *p, unsigned int len) { - if (p == NULL) { - return (len < ZIP_BIGLEN) ? 1 : sizeof(len)+1; - } else { - if (len < ZIP_BIGLEN) { - p[0] = len; - return 1; - } else { - p[0] = ZIP_BIGLEN; - memcpy(p+1,&len,sizeof(len)); - return 1+sizeof(len); - } - } -} - -/* Return the difference in number of bytes needed to store the new length - * "len" on the entry pointed to by "p". */ -static int zipPrevLenByteDiff(unsigned char *p, unsigned int len) { - unsigned int prevlensize; - zipPrevDecodeLength(p,&prevlensize); - return zipPrevEncodeLength(NULL,len)-prevlensize; -} - -/* Check if string pointed to by 'entry' can be encoded as an integer. - * Stores the integer value in 'v' and its encoding in 'encoding'. - * Warning: this function requires a NULL-terminated string! */ -static int zipTryEncoding(unsigned char *entry, long long *v, unsigned char *encoding) { - long long value; - char *eptr; - - if (entry[0] == '-' || (entry[0] >= '0' && entry[0] <= '9')) { - value = strtoll((char*)entry,&eptr,10); - if (eptr[0] != '\0') return 0; - if (value >= INT16_MIN && value <= INT16_MAX) { - *encoding = ZIP_ENC_INT16; - } else if (value >= INT32_MIN && value <= INT32_MAX) { - *encoding = ZIP_ENC_INT32; - } else { - *encoding = ZIP_ENC_INT64; - } - *v = value; - return 1; - } - return 0; -} - -/* Store integer 'value' at 'p', encoded as 'encoding' */ -static void zipSaveInteger(unsigned char *p, int64_t value, unsigned char encoding) { - int16_t i16; - int32_t i32; - int64_t i64; - if (encoding == ZIP_ENC_INT16) { - i16 = value; - memcpy(p,&i16,sizeof(i16)); - } else if (encoding == ZIP_ENC_INT32) { - i32 = value; - memcpy(p,&i32,sizeof(i32)); - } else if (encoding == ZIP_ENC_INT64) { - i64 = value; - memcpy(p,&i64,sizeof(i64)); - } else { - assert(NULL); - } -} - -/* Read integer encoded as 'encoding' from 'p' */ -static int64_t 
zipLoadInteger(unsigned char *p, unsigned char encoding) { - int16_t i16; - int32_t i32; - int64_t i64, ret; - if (encoding == ZIP_ENC_INT16) { - memcpy(&i16,p,sizeof(i16)); - ret = i16; - } else if (encoding == ZIP_ENC_INT32) { - memcpy(&i32,p,sizeof(i32)); - ret = i32; - } else if (encoding == ZIP_ENC_INT64) { - memcpy(&i64,p,sizeof(i64)); - ret = i64; - } else { - assert(NULL); - } - return ret; -} - -/* Return a struct with all information about an entry. */ -static zlentry zipEntry(unsigned char *p) { - zlentry e; - e.prevrawlen = zipPrevDecodeLength(p,&e.prevrawlensize); - e.len = zipDecodeLength(p+e.prevrawlensize,&e.lensize); - e.headersize = e.prevrawlensize+e.lensize; - e.encoding = ZIP_ENCODING(p+e.prevrawlensize); - e.p = p; - return e; -} - -/* Return the total number of bytes used by the entry at "p". */ -static unsigned int zipRawEntryLength(unsigned char *p) { - zlentry e = zipEntry(p); - return e.headersize + e.len; -} - -/* Create a new empty ziplist. */ -unsigned char *ziplistNew(void) { - unsigned int bytes = ZIPLIST_HEADER_SIZE+1; - unsigned char *zl = zmalloc(bytes); - ZIPLIST_BYTES(zl) = bytes; - ZIPLIST_TAIL_OFFSET(zl) = ZIPLIST_HEADER_SIZE; - ZIPLIST_LENGTH(zl) = 0; - zl[bytes-1] = ZIP_END; - return zl; -} - -/* Resize the ziplist. */ -static unsigned char *ziplistResize(unsigned char *zl, unsigned int len) { - zl = zrealloc(zl,len); - ZIPLIST_BYTES(zl) = len; - zl[len-1] = ZIP_END; - return zl; -} - -/* Delete "num" entries, starting at "p". Returns pointer to the ziplist. 
*/ -static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) { - unsigned int i, totlen, deleted = 0; - int nextdiff = 0; - zlentry first = zipEntry(p); - for (i = 0; p[0] != ZIP_END && i < num; i++) { - p += zipRawEntryLength(p); - deleted++; - } - - totlen = p-first.p; - if (totlen > 0) { - if (p[0] != ZIP_END) { - /* Tricky: storing the prevlen in this entry might reduce or - * increase the number of bytes needed, compared to the current - * prevlen. Note that we can always store this length because - * it was previously stored by an entry that is being deleted. */ - nextdiff = zipPrevLenByteDiff(p,first.prevrawlen); - zipPrevEncodeLength(p-nextdiff,first.prevrawlen); - - /* Update offset for tail */ - ZIPLIST_TAIL_OFFSET(zl) -= totlen+nextdiff; - - /* Move tail to the front of the ziplist */ - memmove(first.p,p-nextdiff,ZIPLIST_BYTES(zl)-(p-zl)-1+nextdiff); - } else { - /* The entire tail was deleted. No need to move memory. */ - ZIPLIST_TAIL_OFFSET(zl) = (first.p-zl)-first.prevrawlen; - } - - /* Resize and update length */ - zl = ziplistResize(zl, ZIPLIST_BYTES(zl)-totlen+nextdiff); - ZIPLIST_INCR_LENGTH(zl,-deleted); - } - return zl; -} - -/* Insert item at "p". */ -static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { - unsigned int curlen = ZIPLIST_BYTES(zl), reqlen, prevlen = 0; - unsigned int offset, nextdiff = 0; - unsigned char *tail; - unsigned char encoding = ZIP_ENC_RAW; - long long value; - zlentry entry; - - /* Find out prevlen for the entry that is inserted. 
*/ - if (p[0] != ZIP_END) { - entry = zipEntry(p); - prevlen = entry.prevrawlen; - } else { - tail = ZIPLIST_ENTRY_TAIL(zl); - if (tail[0] != ZIP_END) { - prevlen = zipRawEntryLength(tail); - } - } - - /* See if the entry can be encoded */ - if (zipTryEncoding(s,&value,&encoding)) { - reqlen = zipEncodingSize(encoding); - } else { - reqlen = slen; - } - - /* We need space for both the length of the previous entry and - * the length of the payload. */ - reqlen += zipPrevEncodeLength(NULL,prevlen); - reqlen += zipEncodeLength(NULL,encoding,slen); - - /* When the insert position is not equal to the tail, we need to - * make sure that the next entry can hold this entry's length in - * its prevlen field. */ - nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0; - - /* Store offset because a realloc may change the address of zl. */ - offset = p-zl; - zl = ziplistResize(zl,curlen+reqlen+nextdiff); - p = zl+offset; - - /* Apply memory move when necessary and update tail offset. */ - if (p[0] != ZIP_END) { - /* Subtract one because of the ZIP_END bytes */ - memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff); - /* Encode this entry's raw length in the next entry. */ - zipPrevEncodeLength(p+reqlen,reqlen); - /* Update offset for tail */ - ZIPLIST_TAIL_OFFSET(zl) += reqlen+nextdiff; - } else { - /* This element will be the new tail. */ - ZIPLIST_TAIL_OFFSET(zl) = p-zl; - } - - /* Write the entry */ - p += zipPrevEncodeLength(p,prevlen); - p += zipEncodeLength(p,encoding,slen); - if (encoding != ZIP_ENC_RAW) { - zipSaveInteger(p,value,encoding); - } else { - memcpy(p,s,slen); - } - ZIPLIST_INCR_LENGTH(zl,1); - return zl; -} - -unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) { - unsigned char *p; - p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl); - return __ziplistInsert(zl,p,s,slen); -} - -/* Returns an offset to use for iterating with ziplistNext. 
When the given - * index is negative, the list is traversed back to front. When the list - * doesn't contain an element at the provided index, NULL is returned. */ -unsigned char *ziplistIndex(unsigned char *zl, int index) { - unsigned char *p; - zlentry entry; - if (index < 0) { - index = (-index)-1; - p = ZIPLIST_ENTRY_TAIL(zl); - if (p[0] != ZIP_END) { - entry = zipEntry(p); - while (entry.prevrawlen > 0 && index--) { - p -= entry.prevrawlen; - entry = zipEntry(p); - } - } - } else { - p = ZIPLIST_ENTRY_HEAD(zl); - while (p[0] != ZIP_END && index--) { - p += zipRawEntryLength(p); - } - } - return (p[0] == ZIP_END || index > 0) ? NULL : p; -} - -/* Return pointer to next entry in ziplist. */ -unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) { - ((void) zl); - - /* "p" could be equal to ZIP_END, caused by ziplistDelete, - * and we should return NULL. Otherwise, we should return NULL - * when the *next* element is ZIP_END (there is no next entry). */ - if (p[0] == ZIP_END) { - return NULL; - } else { - p = p+zipRawEntryLength(p); - return (p[0] == ZIP_END) ? NULL : p; - } -} - -/* Return pointer to previous entry in ziplist. */ -unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) { - zlentry entry; - - /* Iterating backwards from ZIP_END should return the tail. When "p" is - * equal to the first element of the list, we're already at the head, - * and should return NULL. */ - if (p[0] == ZIP_END) { - p = ZIPLIST_ENTRY_TAIL(zl); - return (p[0] == ZIP_END) ? NULL : p; - } else if (p == ZIPLIST_ENTRY_HEAD(zl)) { - return NULL; - } else { - entry = zipEntry(p); - return p-entry.prevrawlen; - } -} - -/* Get entry pointer to by 'p' and store in either 'e' or 'v' depending - * on the encoding of the entry. 'e' is always set to NULL to be able - * to find out whether the string pointer or the integer value was set. - * Return 0 if 'p' points to the end of the zipmap, 1 otherwise. 
*/ -unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *slen, long long *sval) { - zlentry entry; - if (p == NULL || p[0] == ZIP_END) return 0; - if (sstr) *sstr = NULL; - - entry = zipEntry(p); - if (entry.encoding == ZIP_ENC_RAW) { - if (sstr) { - *slen = entry.len; - *sstr = p+entry.headersize; - } - } else { - if (sval) { - *sval = zipLoadInteger(p+entry.headersize,entry.encoding); - } - } - return 1; -} - -/* Insert an entry at "p". */ -unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { - return __ziplistInsert(zl,p,s,slen); -} - -/* Delete a single entry from the ziplist, pointed to by *p. - * Also update *p in place, to be able to iterate over the - * ziplist, while deleting entries. */ -unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) { - unsigned int offset = *p-zl; - zl = __ziplistDelete(zl,*p,1); - - /* Store pointer to current element in p, because ziplistDelete will - * do a realloc which might result in a different "zl"-pointer. - * When the delete direction is back to front, we might delete the last - * entry and end up with "p" pointing to ZIP_END, so check this. */ - *p = zl+offset; - return zl; -} - -/* Delete a range of entries from the ziplist. */ -unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num) { - unsigned char *p = ziplistIndex(zl,index); - return (p == NULL) ? zl : __ziplistDelete(zl,p,num); -} - -/* Compare entry pointer to by 'p' with 'entry'. Return 1 if equal. 
*/ -unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int slen) { - zlentry entry; - unsigned char sencoding; - long long zval, sval; - if (p[0] == ZIP_END) return 0; - - entry = zipEntry(p); - if (entry.encoding == ZIP_ENC_RAW) { - /* Raw compare */ - if (entry.len == slen) { - return memcmp(p+entry.headersize,sstr,slen) == 0; - } else { - return 0; - } - } else { - /* Try to compare encoded values */ - if (zipTryEncoding(sstr,&sval,&sencoding)) { - if (entry.encoding == sencoding) { - zval = zipLoadInteger(p+entry.headersize,entry.encoding); - return zval == sval; - } - } - } - return 0; -} - -/* Return length of ziplist. */ -unsigned int ziplistLen(unsigned char *zl) { - unsigned int len = 0; - if (ZIPLIST_LENGTH(zl) < UINT16_MAX) { - len = ZIPLIST_LENGTH(zl); - } else { - unsigned char *p = zl+ZIPLIST_HEADER_SIZE; - while (*p != ZIP_END) { - p += zipRawEntryLength(p); - len++; - } - - /* Re-store length if small enough */ - if (len < UINT16_MAX) ZIPLIST_LENGTH(zl) = len; - } - return len; -} - -/* Return size in bytes of ziplist. 
*/ -unsigned int ziplistSize(unsigned char *zl) { - return ZIPLIST_BYTES(zl); -} - -void ziplistRepr(unsigned char *zl) { - unsigned char *p; - zlentry entry; - - printf("{total bytes %d} {length %u}\n",ZIPLIST_BYTES(zl), ZIPLIST_LENGTH(zl)); - p = ZIPLIST_ENTRY_HEAD(zl); - while(*p != ZIP_END) { - entry = zipEntry(p); - printf("{offset %ld, header %u, payload %u} ",p-zl,entry.headersize,entry.len); - p += entry.headersize; - if (entry.encoding == ZIP_ENC_RAW) { - fwrite(p,entry.len,1,stdout); - } else { - printf("%lld", zipLoadInteger(p,entry.encoding)); - } - printf("\n"); - p += entry.len; - } - printf("{end}\n\n"); -} - -#ifdef ZIPLIST_TEST_MAIN -#include - -unsigned char *createList() { - unsigned char *zl = ziplistNew(); - zl = ziplistPush(zl, (unsigned char*)"foo", 3, ZIPLIST_TAIL); - zl = ziplistPush(zl, (unsigned char*)"quux", 4, ZIPLIST_TAIL); - zl = ziplistPush(zl, (unsigned char*)"hello", 5, ZIPLIST_HEAD); - zl = ziplistPush(zl, (unsigned char*)"1024", 4, ZIPLIST_TAIL); - return zl; -} - -unsigned char *createIntList() { - unsigned char *zl = ziplistNew(); - char buf[32]; - - sprintf(buf, "100"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); - sprintf(buf, "128000"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); - sprintf(buf, "-100"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); - sprintf(buf, "4294967296"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); - sprintf(buf, "non integer"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); - sprintf(buf, "much much longer non integer"); - zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); - return zl; -} - -long long usec(void) { - struct timeval tv; - gettimeofday(&tv,NULL); - return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; -} - -void stress(int pos, int num, int maxsize, int dnum) { - int i,j,k; - unsigned char *zl; - char posstr[2][5] = { "HEAD", "TAIL" }; - 
long long start; - for (i = 0; i < maxsize; i+=dnum) { - zl = ziplistNew(); - for (j = 0; j < i; j++) { - zl = ziplistPush(zl,(unsigned char*)"quux",4,ZIPLIST_TAIL); - } - - /* Do num times a push+pop from pos */ - start = usec(); - for (k = 0; k < num; k++) { - zl = ziplistPush(zl,(unsigned char*)"quux",4,pos); - zl = ziplistDeleteRange(zl,0,1); - } - printf("List size: %8d, bytes: %8d, %dx push+pop (%s): %6lld usec\n", - i,ZIPLIST_BYTES(zl),num,posstr[pos],usec()-start); - zfree(zl); - } -} - -void pop(unsigned char *zl, int where) { - unsigned char *p, *vstr; - unsigned int vlen; - long long vlong; - - p = ziplistIndex(zl,where == ZIPLIST_HEAD ? 0 : -1); - if (ziplistGet(p,&vstr,&vlen,&vlong)) { - if (where == ZIPLIST_HEAD) - printf("Pop head: "); - else - printf("Pop tail: "); - - if (vstr) - fwrite(vstr,vlen,1,stdout); - else - printf("%lld", vlong); - - printf("\n"); - ziplistDeleteRange(zl,-1,1); - } else { - printf("ERROR: Could not pop\n"); - exit(1); - } -} - -int main(int argc, char **argv) { - unsigned char *zl, *p; - unsigned char *entry; - unsigned int elen; - long long value; - - zl = createIntList(); - ziplistRepr(zl); - - zl = createList(); - ziplistRepr(zl); - - pop(zl,ZIPLIST_TAIL); - ziplistRepr(zl); - - pop(zl,ZIPLIST_HEAD); - ziplistRepr(zl); - - pop(zl,ZIPLIST_TAIL); - ziplistRepr(zl); - - pop(zl,ZIPLIST_TAIL); - ziplistRepr(zl); - - printf("Get element at index 3:\n"); - { - zl = createList(); - p = ziplistIndex(zl, 3); - if (!ziplistGet(p, &entry, &elen, &value)) { - printf("ERROR: Could not access index 3\n"); - return 1; - } - if (entry) { - fwrite(entry,elen,1,stdout); - printf("\n"); - } else { - printf("%lld\n", value); - } - printf("\n"); - } - - printf("Get element at index 4 (out of range):\n"); - { - zl = createList(); - p = ziplistIndex(zl, 4); - if (p == NULL) { - printf("No entry\n"); - } else { - printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); - return 1; - } - printf("\n"); - } - - 
printf("Get element at index -1 (last element):\n"); - { - zl = createList(); - p = ziplistIndex(zl, -1); - if (!ziplistGet(p, &entry, &elen, &value)) { - printf("ERROR: Could not access index -1\n"); - return 1; - } - if (entry) { - fwrite(entry,elen,1,stdout); - printf("\n"); - } else { - printf("%lld\n", value); - } - printf("\n"); - } - - printf("Get element at index -4 (first element):\n"); - { - zl = createList(); - p = ziplistIndex(zl, -4); - if (!ziplistGet(p, &entry, &elen, &value)) { - printf("ERROR: Could not access index -4\n"); - return 1; - } - if (entry) { - fwrite(entry,elen,1,stdout); - printf("\n"); - } else { - printf("%lld\n", value); - } - printf("\n"); - } - - printf("Get element at index -5 (reverse out of range):\n"); - { - zl = createList(); - p = ziplistIndex(zl, -5); - if (p == NULL) { - printf("No entry\n"); - } else { - printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); - return 1; - } - printf("\n"); - } - - printf("Iterate list from 0 to end:\n"); - { - zl = createList(); - p = ziplistIndex(zl, 0); - while (ziplistGet(p, &entry, &elen, &value)) { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld", value); - } - p = ziplistNext(zl,p); - printf("\n"); - } - printf("\n"); - } - - printf("Iterate list from 1 to end:\n"); - { - zl = createList(); - p = ziplistIndex(zl, 1); - while (ziplistGet(p, &entry, &elen, &value)) { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld", value); - } - p = ziplistNext(zl,p); - printf("\n"); - } - printf("\n"); - } - - printf("Iterate list from 2 to end:\n"); - { - zl = createList(); - p = ziplistIndex(zl, 2); - while (ziplistGet(p, &entry, &elen, &value)) { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld", value); - } - p = ziplistNext(zl,p); - printf("\n"); - } - printf("\n"); - } - - printf("Iterate starting out of range:\n"); - { - zl = 
createList(); - p = ziplistIndex(zl, 4); - if (!ziplistGet(p, &entry, &elen, &value)) { - printf("No entry\n"); - } else { - printf("ERROR\n"); - } - printf("\n"); - } - - printf("Iterate from back to front:\n"); - { - zl = createList(); - p = ziplistIndex(zl, -1); - while (ziplistGet(p, &entry, &elen, &value)) { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld", value); - } - p = ziplistPrev(zl,p); - printf("\n"); - } - printf("\n"); - } - - printf("Iterate from back to front, deleting all items:\n"); - { - zl = createList(); - p = ziplistIndex(zl, -1); - while (ziplistGet(p, &entry, &elen, &value)) { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld", value); - } - zl = ziplistDelete(zl,&p); - p = ziplistPrev(zl,p); - printf("\n"); - } - printf("\n"); - } - - printf("Delete inclusive range 0,0:\n"); - { - zl = createList(); - zl = ziplistDeleteRange(zl, 0, 1); - ziplistRepr(zl); - } - - printf("Delete inclusive range 0,1:\n"); - { - zl = createList(); - zl = ziplistDeleteRange(zl, 0, 2); - ziplistRepr(zl); - } - - printf("Delete inclusive range 1,2:\n"); - { - zl = createList(); - zl = ziplistDeleteRange(zl, 1, 2); - ziplistRepr(zl); - } - - printf("Delete with start index out of range:\n"); - { - zl = createList(); - zl = ziplistDeleteRange(zl, 5, 1); - ziplistRepr(zl); - } - - printf("Delete with num overflow:\n"); - { - zl = createList(); - zl = ziplistDeleteRange(zl, 1, 5); - ziplistRepr(zl); - } - - printf("Delete foo while iterating:\n"); - { - zl = createList(); - p = ziplistIndex(zl,0); - while (ziplistGet(p,&entry,&elen,&value)) { - if (entry && strncmp("foo",(char*)entry,elen) == 0) { - printf("Delete foo\n"); - zl = ziplistDelete(zl,&p); - } else { - printf("Entry: "); - if (entry) { - fwrite(entry,elen,1,stdout); - } else { - printf("%lld",value); - } - p = ziplistNext(zl,p); - printf("\n"); - } - } - printf("\n"); - ziplistRepr(zl); - } - - printf("Create long list 
and check indices:\n"); - { - zl = ziplistNew(); - char buf[32]; - int i,len; - for (i = 0; i < 1000; i++) { - len = sprintf(buf,"%d",i); - zl = ziplistPush(zl,(unsigned char*)buf,len,ZIPLIST_TAIL); - } - for (i = 0; i < 1000; i++) { - p = ziplistIndex(zl,i); - assert(ziplistGet(p,NULL,NULL,&value)); - assert(i == value); - - p = ziplistIndex(zl,-i-1); - assert(ziplistGet(p,NULL,NULL,&value)); - assert(999-i == value); - } - printf("SUCCESS\n\n"); - } - - printf("Compare strings with ziplist entries:\n"); - { - zl = createList(); - p = ziplistIndex(zl,0); - if (!ziplistCompare(p,(unsigned char*)"hello",5)) { - printf("ERROR: not \"hello\"\n"); - return 1; - } - if (ziplistCompare(p,(unsigned char*)"hella",5)) { - printf("ERROR: \"hella\"\n"); - return 1; - } - - p = ziplistIndex(zl,3); - if (!ziplistCompare(p,(unsigned char*)"1024",4)) { - printf("ERROR: not \"1024\"\n"); - return 1; - } - if (ziplistCompare(p,(unsigned char*)"1025",4)) { - printf("ERROR: \"1025\"\n"); - return 1; - } - printf("SUCCESS\n"); - } - - printf("Stress with variable ziplist size:\n"); - { - stress(ZIPLIST_HEAD,100000,16384,256); - stress(ZIPLIST_TAIL,100000,16384,256); - } - - return 0; -} - -#endif diff --git a/ziplist.h b/ziplist.h deleted file mode 100644 index 311257256..000000000 --- a/ziplist.h +++ /dev/null @@ -1,15 +0,0 @@ -#define ZIPLIST_HEAD 0 -#define ZIPLIST_TAIL 1 - -unsigned char *ziplistNew(void); -unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where); -unsigned char *ziplistIndex(unsigned char *zl, int index); -unsigned char *ziplistNext(unsigned char *zl, unsigned char *p); -unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p); -unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval); -unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen); -unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p); -unsigned char 
*ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num); -unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen); -unsigned int ziplistLen(unsigned char *zl); -unsigned int ziplistSize(unsigned char *zl); diff --git a/zipmap.c b/zipmap.c deleted file mode 100644 index 35faeabef..000000000 --- a/zipmap.c +++ /dev/null @@ -1,455 +0,0 @@ -/* String -> String Map data structure optimized for size. - * This file implements a data structure mapping strings to other strings - * implementing an O(n) lookup data structure designed to be very memory - * efficient. - * - * The Redis Hash type uses this data structure for hashes composed of a small - * number of elements, to switch to an hash table once a given number of - * elements is reached. - * - * Given that many times Redis Hashes are used to represent objects composed - * of few fields, this is a very big win in terms of used memory. - * - * -------------------------------------------------------------------------- - * - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* Memory layout of a zipmap, for the map "foo" => "bar", "hello" => "world": - * - * "foo""bar""hello""world" - * - * is 1 byte length that holds the current size of the zipmap. - * When the zipmap length is greater than or equal to 254, this value - * is not used and the zipmap needs to be traversed to find out the length. - * - * is the length of the following string (key or value). - * lengths are encoded in a single value or in a 5 bytes value. - * If the first byte value (as an unsigned 8 bit value) is between 0 and - * 252, it's a single-byte length. If it is 253 then a four bytes unsigned - * integer follows (in the host byte ordering). A value fo 255 is used to - * signal the end of the hash. The special value 254 is used to mark - * empty space that can be used to add new key/value pairs. - * - * is the number of free unused bytes - * after the string, resulting from modification of values associated to a - * key (for instance if "foo" is set to "bar', and later "foo" will be se to - * "hi", I'll have a free byte to use if the value will enlarge again later, - * or even in order to add a key/value pair if it fits. 
- * - * is always an unsigned 8 bit number, because if after an - * update operation there are more than a few free bytes, the zipmap will be - * reallocated to make sure it is as small as possible. - * - * The most compact representation of the above two elements hash is actually: - * - * "\x02\x03foo\x03\x00bar\x05hello\x05\x00world\xff" - * - * Note that because keys and values are prefixed length "objects", - * the lookup will take O(N) where N is the number of elements - * in the zipmap and *not* the number of bytes needed to represent the zipmap. - * This lowers the constant times considerably. - */ - -#include -#include -#include -#include "zmalloc.h" - -#define ZIPMAP_BIGLEN 254 -#define ZIPMAP_END 255 - -/* The following defines the max value for the field described in the - * comments above, that is, the max number of trailing bytes in a value. */ -#define ZIPMAP_VALUE_MAX_FREE 4 - -/* The following macro returns the number of bytes needed to encode the length - * for the integer value _l, that is, 1 byte for lengths < ZIPMAP_BIGLEN and - * 5 bytes for all the other lengths. */ -#define ZIPMAP_LEN_BYTES(_l) (((_l) < ZIPMAP_BIGLEN) ? 1 : sizeof(unsigned int)+1) - -/* Create a new empty zipmap. */ -unsigned char *zipmapNew(void) { - unsigned char *zm = zmalloc(2); - - zm[0] = 0; /* Length */ - zm[1] = ZIPMAP_END; - return zm; -} - -/* Decode the encoded length pointed by 'p' */ -static unsigned int zipmapDecodeLength(unsigned char *p) { - unsigned int len = *p; - - if (len < ZIPMAP_BIGLEN) return len; - memcpy(&len,p+1,sizeof(unsigned int)); - return len; -} - -/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns - * the amount of bytes required to encode such a length. 
*/ -static unsigned int zipmapEncodeLength(unsigned char *p, unsigned int len) { - if (p == NULL) { - return ZIPMAP_LEN_BYTES(len); - } else { - if (len < ZIPMAP_BIGLEN) { - p[0] = len; - return 1; - } else { - p[0] = ZIPMAP_BIGLEN; - memcpy(p+1,&len,sizeof(len)); - return 1+sizeof(len); - } - } -} - -/* Search for a matching key, returning a pointer to the entry inside the - * zipmap. Returns NULL if the key is not found. - * - * If NULL is returned, and totlen is not NULL, it is set to the entire - * size of the zimap, so that the calling function will be able to - * reallocate the original zipmap to make room for more entries. */ -static unsigned char *zipmapLookupRaw(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned int *totlen) { - unsigned char *p = zm+1, *k = NULL; - unsigned int l,llen; - - while(*p != ZIPMAP_END) { - unsigned char free; - - /* Match or skip the key */ - l = zipmapDecodeLength(p); - llen = zipmapEncodeLength(NULL,l); - if (k == NULL && l == klen && !memcmp(p+llen,key,l)) { - /* Only return when the user doesn't care - * for the total length of the zipmap. 
*/ - if (totlen != NULL) { - k = p; - } else { - return p; - } - } - p += llen+l; - /* Skip the value as well */ - l = zipmapDecodeLength(p); - p += zipmapEncodeLength(NULL,l); - free = p[0]; - p += l+1+free; /* +1 to skip the free byte */ - } - if (totlen != NULL) *totlen = (unsigned int)(p-zm)+1; - return k; -} - -static unsigned long zipmapRequiredLength(unsigned int klen, unsigned int vlen) { - unsigned int l; - - l = klen+vlen+3; - if (klen >= ZIPMAP_BIGLEN) l += 4; - if (vlen >= ZIPMAP_BIGLEN) l += 4; - return l; -} - -/* Return the total amount used by a key (encoded length + payload) */ -static unsigned int zipmapRawKeyLength(unsigned char *p) { - unsigned int l = zipmapDecodeLength(p); - return zipmapEncodeLength(NULL,l) + l; -} - -/* Return the total amount used by a value - * (encoded length + single byte free count + payload) */ -static unsigned int zipmapRawValueLength(unsigned char *p) { - unsigned int l = zipmapDecodeLength(p); - unsigned int used; - - used = zipmapEncodeLength(NULL,l); - used += p[used] + 1 + l; - return used; -} - -/* If 'p' points to a key, this function returns the total amount of - * bytes used to store this entry (entry = key + associated value + trailing - * free space if any). */ -static unsigned int zipmapRawEntryLength(unsigned char *p) { - unsigned int l = zipmapRawKeyLength(p); - return l + zipmapRawValueLength(p+l); -} - -static inline unsigned char *zipmapResize(unsigned char *zm, unsigned int len) { - zm = zrealloc(zm, len); - zm[len-1] = ZIPMAP_END; - return zm; -} - -/* Set key to value, creating the key if it does not already exist. - * If 'update' is not NULL, *update is set to 1 if the key was - * already preset, otherwise to 0. 
*/ -unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update) { - unsigned int zmlen, offset; - unsigned int freelen, reqlen = zipmapRequiredLength(klen,vlen); - unsigned int empty, vempty; - unsigned char *p; - - freelen = reqlen; - if (update) *update = 0; - p = zipmapLookupRaw(zm,key,klen,&zmlen); - if (p == NULL) { - /* Key not found: enlarge */ - zm = zipmapResize(zm, zmlen+reqlen); - p = zm+zmlen-1; - zmlen = zmlen+reqlen; - - /* Increase zipmap length (this is an insert) */ - if (zm[0] < ZIPMAP_BIGLEN) zm[0]++; - } else { - /* Key found. Is there enough space for the new value? */ - /* Compute the total length: */ - if (update) *update = 1; - freelen = zipmapRawEntryLength(p); - if (freelen < reqlen) { - /* Store the offset of this key within the current zipmap, so - * it can be resized. Then, move the tail backwards so this - * pair fits at the current position. */ - offset = p-zm; - zm = zipmapResize(zm, zmlen-freelen+reqlen); - p = zm+offset; - - /* The +1 in the number of bytes to be moved is caused by the - * end-of-zipmap byte. Note: the *original* zmlen is used. */ - memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1)); - zmlen = zmlen-freelen+reqlen; - freelen = reqlen; - } - } - - /* We now have a suitable block where the key/value entry can - * be written. If there is too much free space, move the tail - * of the zipmap a few bytes to the front and shrink the zipmap, - * as we want zipmaps to be very space efficient. */ - empty = freelen-reqlen; - if (empty >= ZIPMAP_VALUE_MAX_FREE) { - /* First, move the tail bytes to the front, then resize - * the zipmap to be bytes smaller. */ - offset = p-zm; - memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1)); - zmlen -= empty; - zm = zipmapResize(zm, zmlen); - p = zm+offset; - vempty = 0; - } else { - vempty = empty; - } - - /* Just write the key + value and we are done. 
*/ - /* Key: */ - p += zipmapEncodeLength(p,klen); - memcpy(p,key,klen); - p += klen; - /* Value: */ - p += zipmapEncodeLength(p,vlen); - *p++ = vempty; - memcpy(p,val,vlen); - return zm; -} - -/* Remove the specified key. If 'deleted' is not NULL the pointed integer is - * set to 0 if the key was not found, to 1 if it was found and deleted. */ -unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted) { - unsigned int zmlen, freelen; - unsigned char *p = zipmapLookupRaw(zm,key,klen,&zmlen); - if (p) { - freelen = zipmapRawEntryLength(p); - memmove(p, p+freelen, zmlen-((p-zm)+freelen+1)); - zm = zipmapResize(zm, zmlen-freelen); - - /* Decrease zipmap length */ - if (zm[0] < ZIPMAP_BIGLEN) zm[0]--; - - if (deleted) *deleted = 1; - } else { - if (deleted) *deleted = 0; - } - return zm; -} - -/* Call it before to iterate trought elements via zipmapNext() */ -unsigned char *zipmapRewind(unsigned char *zm) { - return zm+1; -} - -/* This function is used to iterate through all the zipmap elements. - * In the first call the first argument is the pointer to the zipmap + 1. - * In the next calls what zipmapNext returns is used as first argument. - * Example: - * - * unsigned char *i = zipmapRewind(my_zipmap); - * while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) { - * printf("%d bytes key at $p\n", klen, key); - * printf("%d bytes value at $p\n", vlen, value); - * } - */ -unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen) { - if (zm[0] == ZIPMAP_END) return NULL; - if (key) { - *key = zm; - *klen = zipmapDecodeLength(zm); - *key += ZIPMAP_LEN_BYTES(*klen); - } - zm += zipmapRawKeyLength(zm); - if (value) { - *value = zm+1; - *vlen = zipmapDecodeLength(zm); - *value += ZIPMAP_LEN_BYTES(*vlen); - } - zm += zipmapRawValueLength(zm); - return zm; -} - -/* Search a key and retrieve the pointer and len of the associated value. 
- * If the key is found the function returns 1, otherwise 0. */ -int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen) { - unsigned char *p; - - if ((p = zipmapLookupRaw(zm,key,klen,NULL)) == NULL) return 0; - p += zipmapRawKeyLength(p); - *vlen = zipmapDecodeLength(p); - *value = p + ZIPMAP_LEN_BYTES(*vlen) + 1; - return 1; -} - -/* Return 1 if the key exists, otherwise 0 is returned. */ -int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen) { - return zipmapLookupRaw(zm,key,klen,NULL) != NULL; -} - -/* Return the number of entries inside a zipmap */ -unsigned int zipmapLen(unsigned char *zm) { - unsigned int len = 0; - if (zm[0] < ZIPMAP_BIGLEN) { - len = zm[0]; - } else { - unsigned char *p = zipmapRewind(zm); - while((p = zipmapNext(p,NULL,NULL,NULL,NULL)) != NULL) len++; - - /* Re-store length if small enough */ - if (len < ZIPMAP_BIGLEN) zm[0] = len; - } - return len; -} - -void zipmapRepr(unsigned char *p) { - unsigned int l; - - printf("{status %u}",*p++); - while(1) { - if (p[0] == ZIPMAP_END) { - printf("{end}"); - break; - } else { - unsigned char e; - - l = zipmapDecodeLength(p); - printf("{key %u}",l); - p += zipmapEncodeLength(NULL,l); - fwrite(p,l,1,stdout); - p += l; - - l = zipmapDecodeLength(p); - printf("{value %u}",l); - p += zipmapEncodeLength(NULL,l); - e = *p++; - fwrite(p,l,1,stdout); - p += l+e; - if (e) { - printf("["); - while(e--) printf("."); - printf("]"); - } - } - } - printf("\n"); -} - -#ifdef ZIPMAP_TEST_MAIN -int main(void) { - unsigned char *zm; - - zm = zipmapNew(); - - zm = zipmapSet(zm,(unsigned char*) "name",4, (unsigned char*) "foo",3,NULL); - zm = zipmapSet(zm,(unsigned char*) "surname",7, (unsigned char*) "foo",3,NULL); - zm = zipmapSet(zm,(unsigned char*) "age",3, (unsigned char*) "foo",3,NULL); - zipmapRepr(zm); - - zm = zipmapSet(zm,(unsigned char*) "hello",5, (unsigned char*) "world!",6,NULL); - zm = zipmapSet(zm,(unsigned char*) "foo",3, 
(unsigned char*) "bar",3,NULL); - zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "!",1,NULL); - zipmapRepr(zm); - zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "12345",5,NULL); - zipmapRepr(zm); - zm = zipmapSet(zm,(unsigned char*) "new",3, (unsigned char*) "xx",2,NULL); - zm = zipmapSet(zm,(unsigned char*) "noval",5, (unsigned char*) "",0,NULL); - zipmapRepr(zm); - zm = zipmapDel(zm,(unsigned char*) "new",3,NULL); - zipmapRepr(zm); - - printf("\nLook up large key:\n"); - { - unsigned char buf[512]; - unsigned char *value; - unsigned int vlen, i; - for (i = 0; i < 512; i++) buf[i] = 'a'; - - zm = zipmapSet(zm,buf,512,(unsigned char*) "long",4,NULL); - if (zipmapGet(zm,buf,512,&value,&vlen)) { - printf(" is associated to the %d bytes value: %.*s\n", - vlen, vlen, value); - } - } - - printf("\nPerform a direct lookup:\n"); - { - unsigned char *value; - unsigned int vlen; - - if (zipmapGet(zm,(unsigned char*) "foo",3,&value,&vlen)) { - printf(" foo is associated to the %d bytes value: %.*s\n", - vlen, vlen, value); - } - } - printf("\nIterate trought elements:\n"); - { - unsigned char *i = zipmapRewind(zm); - unsigned char *key, *value; - unsigned int klen, vlen; - - while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) { - printf(" %d:%.*s => %d:%.*s\n", klen, klen, key, vlen, vlen, value); - } - } - return 0; -} -#endif diff --git a/zipmap.h b/zipmap.h deleted file mode 100644 index e5f6c9f28..000000000 --- a/zipmap.h +++ /dev/null @@ -1,48 +0,0 @@ -/* String -> String Map data structure optimized for size. - * - * See zipmap.c for more info. - * - * -------------------------------------------------------------------------- - * - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _ZIMMAP_H -#define _ZIPMAP_H - -unsigned char *zipmapNew(void); -unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update); -unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted); -unsigned char *zipmapRewind(unsigned char *zm); -unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen); -int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen); -int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen); -unsigned int zipmapLen(unsigned char *zm); -void zipmapRepr(unsigned char *p); - -#endif diff --git a/zmalloc.c b/zmalloc.c deleted file mode 100644 index 8658376a3..000000000 --- a/zmalloc.c +++ /dev/null @@ -1,158 +0,0 @@ -/* zmalloc - total amount of allocated memory aware version of malloc() - * - * Copyright (c) 2009-2010, Salvatore Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include "config.h" - -#if defined(__sun) -#define PREFIX_SIZE sizeof(long long) -#else -#define PREFIX_SIZE sizeof(size_t) -#endif - -#define increment_used_memory(__n) do { \ - size_t _n = (__n); \ - if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \ - if (zmalloc_thread_safe) { \ - pthread_mutex_lock(&used_memory_mutex); \ - used_memory += _n; \ - pthread_mutex_unlock(&used_memory_mutex); \ - } else { \ - used_memory += _n; \ - } \ -} while(0) - -#define decrement_used_memory(__n) do { \ - size_t _n = (__n); \ - if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \ - if (zmalloc_thread_safe) { \ - pthread_mutex_lock(&used_memory_mutex); \ - used_memory -= _n; \ - pthread_mutex_unlock(&used_memory_mutex); \ - } else { \ - used_memory -= _n; \ - } \ -} while(0) - -static size_t used_memory = 0; -static int zmalloc_thread_safe = 0; -pthread_mutex_t used_memory_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void zmalloc_oom(size_t size) { - fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n", - size); - fflush(stderr); - abort(); -} - -void *zmalloc(size_t size) { - void *ptr = 
malloc(size+PREFIX_SIZE); - - if (!ptr) zmalloc_oom(size); -#ifdef HAVE_MALLOC_SIZE - increment_used_memory(redis_malloc_size(ptr)); - return ptr; -#else - *((size_t*)ptr) = size; - increment_used_memory(size+PREFIX_SIZE); - return (char*)ptr+PREFIX_SIZE; -#endif -} - -void *zrealloc(void *ptr, size_t size) { -#ifndef HAVE_MALLOC_SIZE - void *realptr; -#endif - size_t oldsize; - void *newptr; - - if (ptr == NULL) return zmalloc(size); -#ifdef HAVE_MALLOC_SIZE - oldsize = redis_malloc_size(ptr); - newptr = realloc(ptr,size); - if (!newptr) zmalloc_oom(size); - - decrement_used_memory(oldsize); - increment_used_memory(redis_malloc_size(newptr)); - return newptr; -#else - realptr = (char*)ptr-PREFIX_SIZE; - oldsize = *((size_t*)realptr); - newptr = realloc(realptr,size+PREFIX_SIZE); - if (!newptr) zmalloc_oom(size); - - *((size_t*)newptr) = size; - decrement_used_memory(oldsize); - increment_used_memory(size); - return (char*)newptr+PREFIX_SIZE; -#endif -} - -void zfree(void *ptr) { -#ifndef HAVE_MALLOC_SIZE - void *realptr; - size_t oldsize; -#endif - - if (ptr == NULL) return; -#ifdef HAVE_MALLOC_SIZE - decrement_used_memory(redis_malloc_size(ptr)); - free(ptr); -#else - realptr = (char*)ptr-PREFIX_SIZE; - oldsize = *((size_t*)realptr); - decrement_used_memory(oldsize+PREFIX_SIZE); - free(realptr); -#endif -} - -char *zstrdup(const char *s) { - size_t l = strlen(s)+1; - char *p = zmalloc(l); - - memcpy(p,s,l); - return p; -} - -size_t zmalloc_used_memory(void) { - size_t um; - - if (zmalloc_thread_safe) pthread_mutex_lock(&used_memory_mutex); - um = used_memory; - if (zmalloc_thread_safe) pthread_mutex_unlock(&used_memory_mutex); - return um; -} - -void zmalloc_enable_thread_safeness(void) { - zmalloc_thread_safe = 1; -} diff --git a/zmalloc.h b/zmalloc.h deleted file mode 100644 index 193e7eda5..000000000 --- a/zmalloc.h +++ /dev/null @@ -1,41 +0,0 @@ -/* zmalloc - total amount of allocated memory aware version of malloc() - * - * Copyright (c) 2009-2010, Salvatore 
Sanfilippo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _ZMALLOC_H -#define _ZMALLOC_H - -void *zmalloc(size_t size); -void *zrealloc(void *ptr, size_t size); -void zfree(void *ptr); -char *zstrdup(const char *s); -size_t zmalloc_used_memory(void); -void zmalloc_enable_thread_safeness(void); - -#endif /* _ZMALLOC_H */ -- cgit v1.2.1