From e2641e09cc0daf44f63f654230f72d22acf3a9af Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 22 Jun 2010 00:07:48 +0200 Subject: redis.c split into many different C files. networking related stuff moved into networking.c moved more code more work on layout of source code SDS instantaneuos memory saving. By Pieter and Salvatore at VMware ;) cleanly compiling again after the first split, now splitting it in more C files moving more things around... work in progress split replication code splitting more Sets split Hash split replication split even more splitting more splitting minor change --- src/Makefile | 111 ++++ src/adlist.c | 325 +++++++++++ src/adlist.h | 92 +++ src/ae.c | 390 +++++++++++++ src/ae.h | 117 ++++ src/ae_epoll.c | 91 +++ src/ae_kqueue.c | 93 +++ src/ae_select.c | 72 +++ src/anet.c | 270 +++++++++ src/anet.h | 49 ++ src/aof.c | 694 ++++++++++++++++++++++ src/config.c | 438 ++++++++++++++ src/config.h | 45 ++ src/db.c | 508 ++++++++++++++++ src/debug.c | 309 ++++++++++ src/dict.c | 727 +++++++++++++++++++++++ src/dict.h | 151 +++++ src/fmacros.h | 15 + src/linenoise.c | 433 ++++++++++++++ src/linenoise.h | 41 ++ src/lzf.h | 100 ++++ src/lzfP.h | 159 +++++ src/lzf_c.c | 295 ++++++++++ src/lzf_d.c | 150 +++++ src/mkreleasehdr.sh | 9 + src/multi.c | 266 +++++++++ src/networking.c | 589 +++++++++++++++++++ src/object.c | 405 +++++++++++++ src/pqsort.c | 197 +++++++ src/pqsort.h | 15 + src/pubsub.c | 259 +++++++++ src/rdb.c | 886 ++++++++++++++++++++++++++++ src/redis-benchmark.c | 665 +++++++++++++++++++++ src/redis-check-aof.c | 185 ++++++ src/redis-check-dump.c | 671 +++++++++++++++++++++ src/redis-cli.c | 493 ++++++++++++++++ src/redis.c | 1516 ++++++++++++++++++++++++++++++++++++++++++++++++ src/redis.h | 885 ++++++++++++++++++++++++++++ src/release.c | 13 + src/replication.c | 475 +++++++++++++++ src/sds.c | 384 ++++++++++++ src/sds.h | 74 +++ src/sha1.c | 276 +++++++++ src/sha1.h | 17 + src/solarisfixes.h | 21 + src/sort.c | 383 ++++++++++++ 
src/t_hash.c | 397 +++++++++++++ src/t_list.c | 829 ++++++++++++++++++++++++++ src/t_set.c | 349 +++++++++++ src/t_string.c | 251 ++++++++ src/t_zset.c | 985 +++++++++++++++++++++++++++++++ src/util.c | 223 +++++++ src/version.h | 1 + src/vm.c | 1126 +++++++++++++++++++++++++++++++++++ src/ziplist.c | 959 ++++++++++++++++++++++++++++++ src/ziplist.h | 15 + src/zipmap.c | 455 +++++++++++++++ src/zipmap.h | 48 ++ src/zmalloc.c | 158 +++++ src/zmalloc.h | 41 ++ 60 files changed, 20196 insertions(+) create mode 100644 src/Makefile create mode 100644 src/adlist.c create mode 100644 src/adlist.h create mode 100644 src/ae.c create mode 100644 src/ae.h create mode 100644 src/ae_epoll.c create mode 100644 src/ae_kqueue.c create mode 100644 src/ae_select.c create mode 100644 src/anet.c create mode 100644 src/anet.h create mode 100644 src/aof.c create mode 100644 src/config.c create mode 100644 src/config.h create mode 100644 src/db.c create mode 100644 src/debug.c create mode 100644 src/dict.c create mode 100644 src/dict.h create mode 100644 src/fmacros.h create mode 100644 src/linenoise.c create mode 100644 src/linenoise.h create mode 100644 src/lzf.h create mode 100644 src/lzfP.h create mode 100644 src/lzf_c.c create mode 100644 src/lzf_d.c create mode 100755 src/mkreleasehdr.sh create mode 100644 src/multi.c create mode 100644 src/networking.c create mode 100644 src/object.c create mode 100644 src/pqsort.c create mode 100644 src/pqsort.h create mode 100644 src/pubsub.c create mode 100644 src/rdb.c create mode 100644 src/redis-benchmark.c create mode 100644 src/redis-check-aof.c create mode 100644 src/redis-check-dump.c create mode 100644 src/redis-cli.c create mode 100644 src/redis.c create mode 100644 src/redis.h create mode 100644 src/release.c create mode 100644 src/replication.c create mode 100644 src/sds.c create mode 100644 src/sds.h create mode 100644 src/sha1.c create mode 100644 src/sha1.h create mode 100644 src/solarisfixes.h create mode 100644 src/sort.c create 
mode 100644 src/t_hash.c create mode 100644 src/t_list.c create mode 100644 src/t_set.c create mode 100644 src/t_string.c create mode 100644 src/t_zset.c create mode 100644 src/util.c create mode 100644 src/version.h create mode 100644 src/vm.c create mode 100644 src/ziplist.c create mode 100644 src/ziplist.h create mode 100644 src/zipmap.c create mode 100644 src/zipmap.h create mode 100644 src/zmalloc.c create mode 100644 src/zmalloc.h (limited to 'src') diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 000000000..3cba3c069 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,111 @@ +# Redis Makefile +# Copyright (C) 2009 Salvatore Sanfilippo +# This file is released under the BSD license, see the COPYING file + +release_hdr := $(shell sh -c './mkreleasehdr.sh') +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') +OPTIMIZATION?=-O2 +ifeq ($(uname_S),SunOS) + CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W -D__EXTENSIONS__ -D_XPG6 + CCLINK?= -ldl -lnsl -lsocket -lm -lpthread +else + CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W $(ARCH) $(PROF) + CCLINK?= -lm -pthread +endif +CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF) +DEBUG?= -g -rdynamic -ggdb + +OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o vm.o pubsub.o multi.o debug.o sort.o +BENCHOBJ = ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o +CLIOBJ = anet.o sds.o adlist.o redis-cli.o zmalloc.o linenoise.o +CHECKDUMPOBJ = redis-check-dump.o lzf_c.o lzf_d.o +CHECKAOFOBJ = redis-check-aof.o + +PRGNAME = redis-server +BENCHPRGNAME = redis-benchmark +CLIPRGNAME = redis-cli +CHECKDUMPPRGNAME = redis-check-dump +CHECKAOFPRGNAME = redis-check-aof + +all: redis-server redis-benchmark redis-cli redis-check-dump redis-check-aof + +# Deps (use make dep to generate this) +adlist.o: adlist.c adlist.h zmalloc.h 
+ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c +ae_epoll.o: ae_epoll.c +ae_kqueue.o: ae_kqueue.c +ae_select.o: ae_select.c +anet.o: anet.c fmacros.h anet.h +dict.o: dict.c fmacros.h dict.h zmalloc.h +linenoise.o: linenoise.c fmacros.h +lzf_c.o: lzf_c.c lzfP.h +lzf_d.o: lzf_d.c lzfP.h +pqsort.o: pqsort.c +redis-benchmark.o: redis-benchmark.c fmacros.h ae.h anet.h sds.h adlist.h \ + zmalloc.h +redis-check-aof.o: redis-check-aof.c fmacros.h config.h +redis-check-dump.o: redis-check-dump.c lzf.h +redis-cli.o: redis-cli.c fmacros.h anet.h sds.h adlist.h zmalloc.h \ + linenoise.h +redis.o: redis.c fmacros.h config.h redis.h ae.h sds.h anet.h dict.h \ + adlist.h zmalloc.h lzf.h pqsort.h zipmap.h ziplist.h sha1.h +release.o: release.c release.h +sds.o: sds.c sds.h zmalloc.h +sha1.o: sha1.c sha1.h +ziplist.o: ziplist.c zmalloc.h ziplist.h +zipmap.o: zipmap.c zmalloc.h +zmalloc.o: zmalloc.c config.h + +redis-server: $(OBJ) + $(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ) + @echo "" + @echo "Hint: To run 'make test' is a good idea ;)" + @echo "" + +redis-benchmark: $(BENCHOBJ) + $(CC) -o $(BENCHPRGNAME) $(CCOPT) $(DEBUG) $(BENCHOBJ) + +redis-cli: $(CLIOBJ) + $(CC) -o $(CLIPRGNAME) $(CCOPT) $(DEBUG) $(CLIOBJ) + +redis-check-dump: $(CHECKDUMPOBJ) + $(CC) -o $(CHECKDUMPPRGNAME) $(CCOPT) $(DEBUG) $(CHECKDUMPOBJ) + +redis-check-aof: $(CHECKAOFOBJ) + $(CC) -o $(CHECKAOFPRGNAME) $(CCOPT) $(DEBUG) $(CHECKAOFOBJ) + +.c.o: + $(CC) -c $(CFLAGS) $(DEBUG) $(COMPILE_TIME) $< + +clean: + rm -rf $(PRGNAME) $(BENCHPRGNAME) $(CLIPRGNAME) $(CHECKDUMPPRGNAME) $(CHECKAOFPRGNAME) *.o *.gcda *.gcno *.gcov + +dep: + $(CC) -MM *.c + +test: + (cd ..; tclsh8.5 tests/test_helper.tcl --tags "${TAGS}") + +bench: + ./redis-benchmark + +log: + git log '--pretty=format:%ad %s (%cn)' --date=short > Changelog + +32bit: + @echo "" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "" + make ARCH="-m32" + +gprof: + make PROF="-pg" + +gcov: + make PROF="-fprofile-arcs 
-ftest-coverage" + +noopt: + make OPTIMIZATION="" + +32bitgprof: + make PROF="-pg" ARCH="-arch i386" diff --git a/src/adlist.c b/src/adlist.c new file mode 100644 index 000000000..015012f5c --- /dev/null +++ b/src/adlist.c @@ -0,0 +1,325 @@ +/* adlist.c - A generic doubly linked list implementation + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include "adlist.h" +#include "zmalloc.h" + +/* Create a new list. 
The created list can be freed with + * AlFreeList(), but private value of every node need to be freed + * by the user before to call AlFreeList(). + * + * On error, NULL is returned. Otherwise the pointer to the new list. */ +list *listCreate(void) +{ + struct list *list; + + if ((list = zmalloc(sizeof(*list))) == NULL) + return NULL; + list->head = list->tail = NULL; + list->len = 0; + list->dup = NULL; + list->free = NULL; + list->match = NULL; + return list; +} + +/* Free the whole list. + * + * This function can't fail. */ +void listRelease(list *list) +{ + unsigned int len; + listNode *current, *next; + + current = list->head; + len = list->len; + while(len--) { + next = current->next; + if (list->free) list->free(current->value); + zfree(current); + current = next; + } + zfree(list); +} + +/* Add a new node to the list, to head, contaning the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. */ +list *listAddNodeHead(list *list, void *value) +{ + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + if (list->len == 0) { + list->head = list->tail = node; + node->prev = node->next = NULL; + } else { + node->prev = NULL; + node->next = list->head; + list->head->prev = node; + list->head = node; + } + list->len++; + return list; +} + +/* Add a new node to the list, to tail, contaning the specified 'value' + * pointer as value. + * + * On error, NULL is returned and no operation is performed (i.e. the + * list remains unaltered). + * On success the 'list' pointer you pass to the function is returned. 
*/ +list *listAddNodeTail(list *list, void *value) +{ + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + if (list->len == 0) { + list->head = list->tail = node; + node->prev = node->next = NULL; + } else { + node->prev = list->tail; + node->next = NULL; + list->tail->next = node; + list->tail = node; + } + list->len++; + return list; +} + +list *listInsertNode(list *list, listNode *old_node, void *value, int after) { + listNode *node; + + if ((node = zmalloc(sizeof(*node))) == NULL) + return NULL; + node->value = value; + if (after) { + node->prev = old_node; + node->next = old_node->next; + if (list->tail == old_node) { + list->tail = node; + } + } else { + node->next = old_node; + node->prev = old_node->prev; + if (list->head == old_node) { + list->head = node; + } + } + if (node->prev != NULL) { + node->prev->next = node; + } + if (node->next != NULL) { + node->next->prev = node; + } + list->len++; + return list; +} + +/* Remove the specified node from the specified list. + * It's up to the caller to free the private value of the node. + * + * This function can't fail. */ +void listDelNode(list *list, listNode *node) +{ + if (node->prev) + node->prev->next = node->next; + else + list->head = node->next; + if (node->next) + node->next->prev = node->prev; + else + list->tail = node->prev; + if (list->free) list->free(node->value); + zfree(node); + list->len--; +} + +/* Returns a list iterator 'iter'. After the initialization every + * call to listNext() will return the next element of the list. + * + * This function can't fail. 
*/ +listIter *listGetIterator(list *list, int direction) +{ + listIter *iter; + + if ((iter = zmalloc(sizeof(*iter))) == NULL) return NULL; + if (direction == AL_START_HEAD) + iter->next = list->head; + else + iter->next = list->tail; + iter->direction = direction; + return iter; +} + +/* Release the iterator memory */ +void listReleaseIterator(listIter *iter) { + zfree(iter); +} + +/* Create an iterator in the list private iterator structure */ +void listRewind(list *list, listIter *li) { + li->next = list->head; + li->direction = AL_START_HEAD; +} + +void listRewindTail(list *list, listIter *li) { + li->next = list->tail; + li->direction = AL_START_TAIL; +} + +/* Return the next element of an iterator. + * It's valid to remove the currently returned element using + * listDelNode(), but not to remove other elements. + * + * The function returns a pointer to the next element of the list, + * or NULL if there are no more elements, so the classical usage patter + * is: + * + * iter = listGetIterator(list,); + * while ((node = listNext(iter)) != NULL) { + * doSomethingWith(listNodeValue(node)); + * } + * + * */ +listNode *listNext(listIter *iter) +{ + listNode *current = iter->next; + + if (current != NULL) { + if (iter->direction == AL_START_HEAD) + iter->next = current->next; + else + iter->next = current->prev; + } + return current; +} + +/* Duplicate the whole list. On out of memory NULL is returned. + * On success a copy of the original list is returned. + * + * The 'Dup' method set with listSetDupMethod() function is used + * to copy the node value. Otherwise the same pointer value of + * the original node is used as value of the copied node. + * + * The original list both on success or error is never modified. 
*/ +list *listDup(list *orig) +{ + list *copy; + listIter *iter; + listNode *node; + + if ((copy = listCreate()) == NULL) + return NULL; + copy->dup = orig->dup; + copy->free = orig->free; + copy->match = orig->match; + iter = listGetIterator(orig, AL_START_HEAD); + while((node = listNext(iter)) != NULL) { + void *value; + + if (copy->dup) { + value = copy->dup(node->value); + if (value == NULL) { + listRelease(copy); + listReleaseIterator(iter); + return NULL; + } + } else + value = node->value; + if (listAddNodeTail(copy, value) == NULL) { + listRelease(copy); + listReleaseIterator(iter); + return NULL; + } + } + listReleaseIterator(iter); + return copy; +} + +/* Search the list for a node matching a given key. + * The match is performed using the 'match' method + * set with listSetMatchMethod(). If no 'match' method + * is set, the 'value' pointer of every node is directly + * compared with the 'key' pointer. + * + * On success the first matching node pointer is returned + * (search starts from head). If no matching node exists + * NULL is returned. */ +listNode *listSearchKey(list *list, void *key) +{ + listIter *iter; + listNode *node; + + iter = listGetIterator(list, AL_START_HEAD); + while((node = listNext(iter)) != NULL) { + if (list->match) { + if (list->match(node->value, key)) { + listReleaseIterator(iter); + return node; + } + } else { + if (key == node->value) { + listReleaseIterator(iter); + return node; + } + } + } + listReleaseIterator(iter); + return NULL; +} + +/* Return the element at the specified zero-based index + * where 0 is the head, 1 is the element next to head + * and so on. Negative integers are used in order to count + * from the tail, -1 is the last element, -2 the penultimante + * and so on. If the index is out of range NULL is returned. 
*/
listNode *listIndex(list *list, int index) {
    listNode *n;

    if (index < 0) {
        /* Number of steps back from the tail: -1 -> 0, -2 -> 1, ...
         * Written as -(index+1) because (-index)-1 overflows (undefined
         * behaviour) when index is INT_MIN. */
        index = -(index+1);
        n = list->tail;
        while(index-- && n) n = n->prev;
    } else {
        n = list->head;
        while(index-- && n) n = n->next;
    }
    /* n is NULL here when the walk ran off either end of the list. */
    return n;
}
diff --git a/src/adlist.h b/src/adlist.h new file mode 100644 index 000000000..a1209f62f --- /dev/null +++ b/src/adlist.h @@ -0,0 +1,92 @@ +/* adlist.h - A generic doubly linked list implementation + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ADLIST_H__ +#define __ADLIST_H__ + +/* Node, List, and Iterator are the only data structures used currently. */ + +typedef struct listNode { + struct listNode *prev; + struct listNode *next; + void *value; +} listNode; + +typedef struct listIter { + listNode *next; + int direction; +} listIter; + +typedef struct list { + listNode *head; + listNode *tail; + void *(*dup)(void *ptr); + void (*free)(void *ptr); + int (*match)(void *ptr, void *key); + unsigned int len; +} list; + +/* Functions implemented as macros */ +#define listLength(l) ((l)->len) +#define listFirst(l) ((l)->head) +#define listLast(l) ((l)->tail) +#define listPrevNode(n) ((n)->prev) +#define listNextNode(n) ((n)->next) +#define listNodeValue(n) ((n)->value) + +#define listSetDupMethod(l,m) ((l)->dup = (m)) +#define listSetFreeMethod(l,m) ((l)->free = (m)) +#define listSetMatchMethod(l,m) ((l)->match = (m)) + +#define listGetDupMethod(l) ((l)->dup) +#define listGetFree(l) ((l)->free) +#define listGetMatchMethod(l) ((l)->match) + +/* Prototypes */ +list *listCreate(void); +void listRelease(list *list); +list *listAddNodeHead(list *list, void *value); +list *listAddNodeTail(list *list, void *value); +list *listInsertNode(list *list, listNode *old_node, void *value, int after); +void listDelNode(list *list, listNode *node); +listIter *listGetIterator(list *list, int direction); +listNode *listNext(listIter *iter); +void listReleaseIterator(listIter 
*iter); +list *listDup(list *orig); +listNode *listSearchKey(list *list, void *key); +listNode *listIndex(list *list, int index); +void listRewind(list *list, listIter *li); +void listRewindTail(list *list, listIter *li); + +/* Directions for iterators */ +#define AL_START_HEAD 0 +#define AL_START_TAIL 1 + +#endif /* __ADLIST_H__ */ diff --git a/src/ae.c b/src/ae.c new file mode 100644 index 000000000..c7918ee1d --- /dev/null +++ b/src/ae.c @@ -0,0 +1,390 @@ +/* A simple event-driven programming library. Originally I wrote this code + * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated + * it in form of a library for easy reuse. + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include "ae.h" +#include "zmalloc.h" +#include "config.h" + +/* Include the best multiplexing layer supported by this system. + * The following should be ordered by performances, descending. */ +#ifdef HAVE_EPOLL +#include "ae_epoll.c" +#else + #ifdef HAVE_KQUEUE + #include "ae_kqueue.c" + #else + #include "ae_select.c" + #endif +#endif + +aeEventLoop *aeCreateEventLoop(void) { + aeEventLoop *eventLoop; + int i; + + eventLoop = zmalloc(sizeof(*eventLoop)); + if (!eventLoop) return NULL; + eventLoop->timeEventHead = NULL; + eventLoop->timeEventNextId = 0; + eventLoop->stop = 0; + eventLoop->maxfd = -1; + eventLoop->beforesleep = NULL; + if (aeApiCreate(eventLoop) == -1) { + zfree(eventLoop); + return NULL; + } + /* Events with mask == AE_NONE are not set. So let's initialize the + * vector with it. 
*/ + for (i = 0; i < AE_SETSIZE; i++) + eventLoop->events[i].mask = AE_NONE; + return eventLoop; +} + +void aeDeleteEventLoop(aeEventLoop *eventLoop) { + aeApiFree(eventLoop); + zfree(eventLoop); +} + +void aeStop(aeEventLoop *eventLoop) { + eventLoop->stop = 1; +} + +int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData) +{ + if (fd >= AE_SETSIZE) return AE_ERR; + aeFileEvent *fe = &eventLoop->events[fd]; + + if (aeApiAddEvent(eventLoop, fd, mask) == -1) + return AE_ERR; + fe->mask |= mask; + if (mask & AE_READABLE) fe->rfileProc = proc; + if (mask & AE_WRITABLE) fe->wfileProc = proc; + fe->clientData = clientData; + if (fd > eventLoop->maxfd) + eventLoop->maxfd = fd; + return AE_OK; +} + +void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask) +{ + if (fd >= AE_SETSIZE) return; + aeFileEvent *fe = &eventLoop->events[fd]; + + if (fe->mask == AE_NONE) return; + fe->mask = fe->mask & (~mask); + if (fd == eventLoop->maxfd && fe->mask == AE_NONE) { + /* Update the max fd */ + int j; + + for (j = eventLoop->maxfd-1; j >= 0; j--) + if (eventLoop->events[j].mask != AE_NONE) break; + eventLoop->maxfd = j; + } + aeApiDelEvent(eventLoop, fd, mask); +} + +static void aeGetTime(long *seconds, long *milliseconds) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + *seconds = tv.tv_sec; + *milliseconds = tv.tv_usec/1000; +} + +static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms) { + long cur_sec, cur_ms, when_sec, when_ms; + + aeGetTime(&cur_sec, &cur_ms); + when_sec = cur_sec + milliseconds/1000; + when_ms = cur_ms + milliseconds%1000; + if (when_ms >= 1000) { + when_sec ++; + when_ms -= 1000; + } + *sec = when_sec; + *ms = when_ms; +} + +long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, + aeTimeProc *proc, void *clientData, + aeEventFinalizerProc *finalizerProc) +{ + long long id = eventLoop->timeEventNextId++; + aeTimeEvent *te; + + te = zmalloc(sizeof(*te)); + if 
(te == NULL) return AE_ERR; + te->id = id; + aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms); + te->timeProc = proc; + te->finalizerProc = finalizerProc; + te->clientData = clientData; + te->next = eventLoop->timeEventHead; + eventLoop->timeEventHead = te; + return id; +} + +int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id) +{ + aeTimeEvent *te, *prev = NULL; + + te = eventLoop->timeEventHead; + while(te) { + if (te->id == id) { + if (prev == NULL) + eventLoop->timeEventHead = te->next; + else + prev->next = te->next; + if (te->finalizerProc) + te->finalizerProc(eventLoop, te->clientData); + zfree(te); + return AE_OK; + } + prev = te; + te = te->next; + } + return AE_ERR; /* NO event with the specified ID found */ +} + +/* Search the first timer to fire. + * This operation is useful to know how many time the select can be + * put in sleep without to delay any event. + * If there are no timers NULL is returned. + * + * Note that's O(N) since time events are unsorted. + * Possible optimizations (not needed by Redis so far, but...): + * 1) Insert the event in order, so that the nearest is just the head. + * Much better but still insertion or deletion of timers is O(N). + * 2) Use a skiplist to have this operation as O(1) and insertion as O(log(N)). 
+ */ +static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop) +{ + aeTimeEvent *te = eventLoop->timeEventHead; + aeTimeEvent *nearest = NULL; + + while(te) { + if (!nearest || te->when_sec < nearest->when_sec || + (te->when_sec == nearest->when_sec && + te->when_ms < nearest->when_ms)) + nearest = te; + te = te->next; + } + return nearest; +} + +/* Process time events */ +static int processTimeEvents(aeEventLoop *eventLoop) { + int processed = 0; + aeTimeEvent *te; + long long maxId; + + te = eventLoop->timeEventHead; + maxId = eventLoop->timeEventNextId-1; + while(te) { + long now_sec, now_ms; + long long id; + + if (te->id > maxId) { + te = te->next; + continue; + } + aeGetTime(&now_sec, &now_ms); + if (now_sec > te->when_sec || + (now_sec == te->when_sec && now_ms >= te->when_ms)) + { + int retval; + + id = te->id; + retval = te->timeProc(eventLoop, id, te->clientData); + processed++; + /* After an event is processed our time event list may + * no longer be the same, so we restart from head. + * Still we make sure to don't process events registered + * by event handlers itself in order to don't loop forever. + * To do so we saved the max ID we want to handle. + * + * FUTURE OPTIMIZATIONS: + * Note that this is NOT great algorithmically. Redis uses + * a single time event so it's not a problem but the right + * way to do this is to add the new elements on head, and + * to flag deleted elements in a special way for later + * deletion (putting references to the nodes to delete into + * another linked list). */ + if (retval != AE_NOMORE) { + aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms); + } else { + aeDeleteTimeEvent(eventLoop, id); + } + te = eventLoop->timeEventHead; + } else { + te = te->next; + } + } + return processed; +} + +/* Process every pending time event, then every pending file event + * (that may be registered by time event callbacks just processed). 
+ * Without special flags the function sleeps until some file event + * fires, or when the next time event occurrs (if any). + * + * If flags is 0, the function does nothing and returns. + * if flags has AE_ALL_EVENTS set, all the kind of events are processed. + * if flags has AE_FILE_EVENTS set, file events are processed. + * if flags has AE_TIME_EVENTS set, time events are processed. + * if flags has AE_DONT_WAIT set the function returns ASAP until all + * the events that's possible to process without to wait are processed. + * + * The function returns the number of events processed. */ +int aeProcessEvents(aeEventLoop *eventLoop, int flags) +{ + int processed = 0, numevents; + + /* Nothing to do? return ASAP */ + if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0; + + /* Note that we want call select() even if there are no + * file events to process as long as we want to process time + * events, in order to sleep until the next time event is ready + * to fire. */ + if (eventLoop->maxfd != -1 || + ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) { + int j; + aeTimeEvent *shortest = NULL; + struct timeval tv, *tvp; + + if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT)) + shortest = aeSearchNearestTimer(eventLoop); + if (shortest) { + long now_sec, now_ms; + + /* Calculate the time missing for the nearest + * timer to fire. 
*/ + aeGetTime(&now_sec, &now_ms); + tvp = &tv; + tvp->tv_sec = shortest->when_sec - now_sec; + if (shortest->when_ms < now_ms) { + tvp->tv_usec = ((shortest->when_ms+1000) - now_ms)*1000; + tvp->tv_sec --; + } else { + tvp->tv_usec = (shortest->when_ms - now_ms)*1000; + } + if (tvp->tv_sec < 0) tvp->tv_sec = 0; + if (tvp->tv_usec < 0) tvp->tv_usec = 0; + } else { + /* If we have to check for events but need to return + * ASAP because of AE_DONT_WAIT we need to se the timeout + * to zero */ + if (flags & AE_DONT_WAIT) { + tv.tv_sec = tv.tv_usec = 0; + tvp = &tv; + } else { + /* Otherwise we can block */ + tvp = NULL; /* wait forever */ + } + } + + numevents = aeApiPoll(eventLoop, tvp); + for (j = 0; j < numevents; j++) { + aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd]; + int mask = eventLoop->fired[j].mask; + int fd = eventLoop->fired[j].fd; + int rfired = 0; + + /* note the fe->mask & mask & ... code: maybe an already processed + * event removed an element that fired and we still didn't + * processed, so we check if the event is still valid. 
*/ + if (fe->mask & mask & AE_READABLE) { + rfired = 1; + fe->rfileProc(eventLoop,fd,fe->clientData,mask); + } + if (fe->mask & mask & AE_WRITABLE) { + if (!rfired || fe->wfileProc != fe->rfileProc) + fe->wfileProc(eventLoop,fd,fe->clientData,mask); + } + processed++; + } + } + /* Check time events */ + if (flags & AE_TIME_EVENTS) + processed += processTimeEvents(eventLoop); + + return processed; /* return the number of processed file/time events */ +} + +/* Wait for millseconds until the given file descriptor becomes + * writable/readable/exception */ +int aeWait(int fd, int mask, long long milliseconds) { + struct timeval tv; + fd_set rfds, wfds, efds; + int retmask = 0, retval; + + tv.tv_sec = milliseconds/1000; + tv.tv_usec = (milliseconds%1000)*1000; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + FD_ZERO(&efds); + + if (mask & AE_READABLE) FD_SET(fd,&rfds); + if (mask & AE_WRITABLE) FD_SET(fd,&wfds); + if ((retval = select(fd+1, &rfds, &wfds, &efds, &tv)) > 0) { + if (FD_ISSET(fd,&rfds)) retmask |= AE_READABLE; + if (FD_ISSET(fd,&wfds)) retmask |= AE_WRITABLE; + return retmask; + } else { + return retval; + } +} + +void aeMain(aeEventLoop *eventLoop) { + eventLoop->stop = 0; + while (!eventLoop->stop) { + if (eventLoop->beforesleep != NULL) + eventLoop->beforesleep(eventLoop); + aeProcessEvents(eventLoop, AE_ALL_EVENTS); + } +} + +char *aeGetApiName(void) { + return aeApiName(); +} + +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep) { + eventLoop->beforesleep = beforesleep; +} diff --git a/src/ae.h b/src/ae.h new file mode 100644 index 000000000..a9db18ed9 --- /dev/null +++ b/src/ae.h @@ -0,0 +1,117 @@ +/* A simple event-driven programming library. Originally I wrote this code + * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated + * it in form of a library for easy reuse. + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __AE_H__ +#define __AE_H__ + +#define AE_SETSIZE (1024*10) /* Max number of fd supported */ + +#define AE_OK 0 +#define AE_ERR -1 + +#define AE_NONE 0 +#define AE_READABLE 1 +#define AE_WRITABLE 2 + +#define AE_FILE_EVENTS 1 +#define AE_TIME_EVENTS 2 +#define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS) +#define AE_DONT_WAIT 4 + +#define AE_NOMORE -1 + +/* Macros */ +#define AE_NOTUSED(V) ((void) V) + +struct aeEventLoop; + +/* Types and data structures */ +typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); +typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); +typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); +typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); + +/* File event structure */ +typedef struct aeFileEvent { + int mask; /* one of AE_(READABLE|WRITABLE) */ + aeFileProc *rfileProc; + aeFileProc *wfileProc; + void *clientData; +} aeFileEvent; + +/* Time event structure */ +typedef struct aeTimeEvent { + long long id; /* time event identifier. 
*/ + long when_sec; /* seconds */ + long when_ms; /* milliseconds */ + aeTimeProc *timeProc; + aeEventFinalizerProc *finalizerProc; + void *clientData; + struct aeTimeEvent *next; +} aeTimeEvent; + +/* A fired event */ +typedef struct aeFiredEvent { + int fd; + int mask; +} aeFiredEvent; + +/* State of an event based program */ +typedef struct aeEventLoop { + int maxfd; + long long timeEventNextId; + aeFileEvent events[AE_SETSIZE]; /* Registered events */ + aeFiredEvent fired[AE_SETSIZE]; /* Fired events */ + aeTimeEvent *timeEventHead; + int stop; + void *apidata; /* This is used for polling API specific data */ + aeBeforeSleepProc *beforesleep; +} aeEventLoop; + +/* Prototypes */ +aeEventLoop *aeCreateEventLoop(void); +void aeDeleteEventLoop(aeEventLoop *eventLoop); +void aeStop(aeEventLoop *eventLoop); +int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask, + aeFileProc *proc, void *clientData); +void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask); +long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, + aeTimeProc *proc, void *clientData, + aeEventFinalizerProc *finalizerProc); +int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id); +int aeProcessEvents(aeEventLoop *eventLoop, int flags); +int aeWait(int fd, int mask, long long milliseconds); +void aeMain(aeEventLoop *eventLoop); +char *aeGetApiName(void); +void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); + +#endif diff --git a/src/ae_epoll.c b/src/ae_epoll.c new file mode 100644 index 000000000..d48977b65 --- /dev/null +++ b/src/ae_epoll.c @@ -0,0 +1,91 @@ +/* Linux epoll(2) based ae.c module + * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com + * Released under the BSD license. See the COPYING file for more info. 
*/
+
+#include <sys/epoll.h>
+
+/* Private state of the epoll backend, stored in eventLoop->apidata. */
+typedef struct aeApiState {
+    int epfd; /* epoll instance descriptor */
+    struct epoll_event events[AE_SETSIZE]; /* buffer handed to epoll_wait() */
+} aeApiState;
+
+/* Allocate the backend state and create the epoll instance.
+ *
+ * Returns 0 on success and -1 on failure. On failure nothing is left
+ * allocated and eventLoop->apidata is not modified. */
+static int aeApiCreate(aeEventLoop *eventLoop) {
+    aeApiState *state = zmalloc(sizeof(aeApiState));
+
+    if (!state) return -1;
+    state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+    if (state->epfd == -1) {
+        /* Don't leak the state if the epoll instance can't be created. */
+        zfree(state);
+        return -1;
+    }
+    eventLoop->apidata = state;
+    return 0;
+}
+
+/* Close the epoll descriptor and release the backend state. */
+static void aeApiFree(aeEventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+
+    close(state->epfd);
+    zfree(state);
+}
+
+/* Start monitoring 'fd' for the events in 'mask', merged with the events
+ * already registered for it. Returns 0 on success, -1 if epoll_ctl() fails. */
+static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee;
+    /* If the fd was already monitored for some event, we need a MOD
+     * operation. Otherwise we need an ADD operation. */
+    int op = eventLoop->events[fd].mask == AE_NONE ?
+            EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+
+    ee.events = 0;
+    mask |= eventLoop->events[fd].mask; /* Merge old events */
+    if (mask & AE_READABLE) ee.events |= EPOLLIN;
+    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    if (epoll_ctl(state->epfd,op,fd,&ee) == -1) return -1;
+    return 0;
+}
+
+/* Stop monitoring 'fd' for the events in 'delmask'. The fd is modified when
+ * some event is still registered for it, and deleted from the epoll set
+ * otherwise. Errors reported by epoll_ctl() are ignored. */
+static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
+    aeApiState *state = eventLoop->apidata;
+    struct epoll_event ee;
+    int mask = eventLoop->events[fd].mask & (~delmask);
+
+    ee.events = 0;
+    if (mask & AE_READABLE) ee.events |= EPOLLIN;
+    if (mask & AE_WRITABLE) ee.events |= EPOLLOUT;
+    ee.data.u64 = 0; /* avoid valgrind warning */
+    ee.data.fd = fd;
+    if (mask != AE_NONE) {
+        epoll_ctl(state->epfd,EPOLL_CTL_MOD,fd,&ee);
+    } else {
+        /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+         * EPOLL_CTL_DEL.
*/ + epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee); + } +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, numevents = 0; + + retval = epoll_wait(state->epfd,state->events,AE_SETSIZE, + tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + if (retval > 0) { + int j; + + numevents = retval; + for (j = 0; j < numevents; j++) { + int mask = 0; + struct epoll_event *e = state->events+j; + + if (e->events & EPOLLIN) mask |= AE_READABLE; + if (e->events & EPOLLOUT) mask |= AE_WRITABLE; + eventLoop->fired[j].fd = e->data.fd; + eventLoop->fired[j].mask = mask; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "epoll"; +} diff --git a/src/ae_kqueue.c b/src/ae_kqueue.c new file mode 100644 index 000000000..04c3536ba --- /dev/null +++ b/src/ae_kqueue.c @@ -0,0 +1,93 @@ +/* Kqueue(2)-based ae.c module + * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com + * Released under the BSD license. See the COPYING file for more info. 
*/
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+/* Private state of the kqueue backend, stored in eventLoop->apidata. */
+typedef struct aeApiState {
+    int kqfd; /* kqueue descriptor */
+    struct kevent events[AE_SETSIZE]; /* buffer handed to kevent() when polling */
+} aeApiState;
+
+/* Allocate the backend state and create the kqueue.
+ *
+ * Returns 0 on success and -1 on failure. On failure nothing is left
+ * allocated and eventLoop->apidata is not modified. */
+static int aeApiCreate(aeEventLoop *eventLoop) {
+    aeApiState *state = zmalloc(sizeof(aeApiState));
+
+    if (!state) return -1;
+    state->kqfd = kqueue();
+    if (state->kqfd == -1) {
+        /* Don't leak the state if the kqueue can't be created. */
+        zfree(state);
+        return -1;
+    }
+    eventLoop->apidata = state;
+
+    return 0;
+}
+
+/* Close the kqueue descriptor and release the backend state. */
+static void aeApiFree(aeEventLoop *eventLoop) {
+    aeApiState *state = eventLoop->apidata;
+
+    close(state->kqfd);
+    zfree(state);
+}
+
+/* Register a read and/or write filter for 'fd' according to 'mask'.
+ * Returns 0 on success, -1 as soon as a kevent() call fails. */
+static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke;
+
+    if (mask & AE_READABLE) {
+        EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    if (mask & AE_WRITABLE) {
+        EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
+    }
+    return 0;
+}
+
+/* Remove the read and/or write filter of 'fd' according to 'mask'.
+ * Errors reported by kevent() are ignored. */
+static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
+    aeApiState *state = eventLoop->apidata;
+    struct kevent ke;
+
+    if (mask & AE_READABLE) {
+        EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+        kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
+    }
+    if (mask & AE_WRITABLE) {
+        EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+        kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
+    }
+}
+
+static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+    aeApiState *state = eventLoop->apidata;
+    int retval, numevents = 0;
+
+    if (tvp != NULL) {
+        struct timespec timeout;
+        timeout.tv_sec = tvp->tv_sec;
+        timeout.tv_nsec = tvp->tv_usec * 1000;
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout);
+    } else {
+        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL);
+    }
+
+    if (retval > 0) {
+        int j;
+
+        numevents = retval;
+        for(j = 0; j < numevents; j++) {
+            int mask = 0;
+            struct kevent *e = state->events+j;
+
+            if (e->filter == EVFILT_READ) mask |= AE_READABLE;
+            
if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE; + eventLoop->fired[j].fd = e->ident; + eventLoop->fired[j].mask = mask; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "kqueue"; +} diff --git a/src/ae_select.c b/src/ae_select.c new file mode 100644 index 000000000..43f5867f3 --- /dev/null +++ b/src/ae_select.c @@ -0,0 +1,72 @@ +/* Select()-based ae.c module + * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com + * Released under the BSD license. See the COPYING file for more info. */ + +#include + +typedef struct aeApiState { + fd_set rfds, wfds; + /* We need to have a copy of the fd sets as it's not safe to reuse + * FD sets after select(). */ + fd_set _rfds, _wfds; +} aeApiState; + +static int aeApiCreate(aeEventLoop *eventLoop) { + aeApiState *state = zmalloc(sizeof(aeApiState)); + + if (!state) return -1; + FD_ZERO(&state->rfds); + FD_ZERO(&state->wfds); + eventLoop->apidata = state; + return 0; +} + +static void aeApiFree(aeEventLoop *eventLoop) { + zfree(eventLoop->apidata); +} + +static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_SET(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds); + return 0; +} + +static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { + aeApiState *state = eventLoop->apidata; + + if (mask & AE_READABLE) FD_CLR(fd,&state->rfds); + if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds); +} + +static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { + aeApiState *state = eventLoop->apidata; + int retval, j, numevents = 0; + + memcpy(&state->_rfds,&state->rfds,sizeof(fd_set)); + memcpy(&state->_wfds,&state->wfds,sizeof(fd_set)); + + retval = select(eventLoop->maxfd+1, + &state->_rfds,&state->_wfds,NULL,tvp); + if (retval > 0) { + for (j = 0; j <= eventLoop->maxfd; j++) { + int mask = 0; + aeFileEvent *fe = &eventLoop->events[j]; + + if (fe->mask == AE_NONE) continue; + if 
(fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds)) + mask |= AE_READABLE; + if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds)) + mask |= AE_WRITABLE; + eventLoop->fired[numevents].fd = j; + eventLoop->fired[numevents].mask = mask; + numevents++; + } + } + return numevents; +} + +static char *aeApiName(void) { + return "select"; +} diff --git a/src/anet.c b/src/anet.c new file mode 100644 index 000000000..4fe811a11 --- /dev/null +++ b/src/anet.c @@ -0,0 +1,270 @@ +/* anet.c -- Basic TCP socket stuff made a bit less boring + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "anet.h" + +static void anetSetError(char *err, const char *fmt, ...) +{ + va_list ap; + + if (!err) return; + va_start(ap, fmt); + vsnprintf(err, ANET_ERR_LEN, fmt, ap); + va_end(ap); +} + +int anetNonBlock(char *err, int fd) +{ + int flags; + + /* Set the socket nonblocking. + * Note that fcntl(2) for F_GETFL and F_SETFL can't be + * interrupted by a signal. 
*/
+    if ((flags = fcntl(fd, F_GETFL)) == -1) {
+        anetSetError(err, "fcntl(F_GETFL): %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+        anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    return ANET_OK;
+}
+
+/* Enable the TCP_NODELAY option on 'fd'.
+ * Returns ANET_OK, or ANET_ERR after storing a message via anetSetError(). */
+int anetTcpNoDelay(char *err, int fd)
+{
+    int yes = 1;
+    if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes)) == -1)
+    {
+        anetSetError(err, "setsockopt TCP_NODELAY: %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    return ANET_OK;
+}
+
+/* Set the SO_SNDBUF option of 'fd' to 'buffsize'.
+ * Returns ANET_OK, or ANET_ERR after storing a message via anetSetError(). */
+int anetSetSendBuffer(char *err, int fd, int buffsize)
+{
+    if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buffsize, sizeof(buffsize)) == -1)
+    {
+        anetSetError(err, "setsockopt SO_SNDBUF: %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    return ANET_OK;
+}
+
+/* Enable the SO_KEEPALIVE option on 'fd'.
+ * Returns ANET_OK, or ANET_ERR after storing a message via anetSetError(). */
+int anetTcpKeepAlive(char *err, int fd)
+{
+    int yes = 1;
+    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes)) == -1) {
+        anetSetError(err, "setsockopt SO_KEEPALIVE: %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    return ANET_OK;
+}
+
+/* Store in 'ipbuf' the textual IPv4 address of 'host'. 'host' is first
+ * tried as a numeric address with inet_aton(), then resolved with
+ * gethostbyname(). The result is copied with strcpy(), so the caller must
+ * provide a buffer large enough to hold it.
+ * Returns ANET_OK, or ANET_ERR if the name can't be resolved. */
+int anetResolve(char *err, char *host, char *ipbuf)
+{
+    struct sockaddr_in sa;
+
+    sa.sin_family = AF_INET;
+    if (inet_aton(host, &sa.sin_addr) == 0) {
+        struct hostent *he;
+
+        he = gethostbyname(host);
+        if (he == NULL) {
+            anetSetError(err, "can't resolve: %s\n", host);
+            return ANET_ERR;
+        }
+        memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr));
+    }
+    strcpy(ipbuf,inet_ntoa(sa.sin_addr));
+    return ANET_OK;
+}
+
+#define ANET_CONNECT_NONE 0
+#define ANET_CONNECT_NONBLOCK 1
+/* Create a TCP socket and connect it to 'addr':'port'. 'addr' may be a
+ * numeric IPv4 address or a host name. With ANET_CONNECT_NONBLOCK in
+ * 'flags' the socket is made non blocking before connect(2) is called, and
+ * a connection still in progress (EINPROGRESS) is reported as success.
+ *
+ * Returns the socket descriptor, or ANET_ERR after storing a message via
+ * anetSetError(). The socket is closed on every error path. */
+static int anetTcpGenericConnect(char *err, char *addr, int port, int flags)
+{
+    int s, on = 1;
+    struct sockaddr_in sa;
+
+    if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
+        anetSetError(err, "creating socket: %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    /* Make sure connection-intensive things like the redis benckmark
+     * will be able to close/open sockets a zillion of times */
+    setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+    /* Start from a zeroed address, as anetTcpServer() does, so that no
+     * field of the structure is passed to connect(2) uninitialized. */
+    memset(&sa,0,sizeof(sa));
+    sa.sin_family = AF_INET;
+    sa.sin_port = htons(port);
+    if (inet_aton(addr, &sa.sin_addr) == 0) {
+        struct hostent *he;
+
+        he = gethostbyname(addr);
+        if (he == NULL) {
+            anetSetError(err, "can't resolve: %s\n", addr);
+            close(s);
+            return ANET_ERR;
+        }
+        memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr));
+    }
+    if (flags & ANET_CONNECT_NONBLOCK) {
+        if (anetNonBlock(err,s) != ANET_OK) {
+            /* The socket is ours: don't leak it on failure. */
+            close(s);
+            return ANET_ERR;
+        }
+    }
+    if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) {
+        if (errno == EINPROGRESS &&
+            flags & ANET_CONNECT_NONBLOCK)
+            return s;
+
+        anetSetError(err, "connect: %s\n", strerror(errno));
+        close(s);
+        return ANET_ERR;
+    }
+    return s;
+}
+
+/* Blocking connect to 'addr':'port'. See anetTcpGenericConnect(). */
+int anetTcpConnect(char *err, char *addr, int port)
+{
+    return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONE);
+}
+
+/* Non blocking connect to 'addr':'port'. See anetTcpGenericConnect(). */
+int anetTcpNonBlockConnect(char *err, char *addr, int port)
+{
+    return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONBLOCK);
+}
+
+/* Like read(2) but make sure 'count' is read before to return
+ * (unless error or EOF condition is encountered) */
+int anetRead(int fd, char *buf, int count)
+{
+    int nread, totlen = 0;
+    while(totlen != count) {
+        nread = read(fd,buf,count-totlen);
+        if (nread == 0) return totlen;
+        if (nread == -1) return -1;
+        totlen += nread;
+        buf += nread;
+    }
+    return totlen;
+}
+
+/* Like write(2) but make sure 'count' is written before to return
+ * (unless error is encountered) */
+int anetWrite(int fd, char *buf, int count)
+{
+    int nwritten, totlen = 0;
+    while(totlen != count) {
+        nwritten = write(fd,buf,count-totlen);
+        if (nwritten == 0) return totlen;
+        if (nwritten == -1) return -1;
+        totlen += nwritten;
+        buf += nwritten;
+    }
+    return totlen;
+}
+
+int anetTcpServer(char *err, int port, char *bindaddr)
+{
+    int s, on = 1;
+    struct sockaddr_in sa;
+
+    if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
+        anetSetError(err, "socket: %s\n", strerror(errno));
+        return ANET_ERR;
+    }
+    if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, 
sizeof(on)) == -1) { + anetSetError(err, "setsockopt SO_REUSEADDR: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + memset(&sa,0,sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_port = htons(port); + sa.sin_addr.s_addr = htonl(INADDR_ANY); + if (bindaddr) { + if (inet_aton(bindaddr, &sa.sin_addr) == 0) { + anetSetError(err, "Invalid bind address\n"); + close(s); + return ANET_ERR; + } + } + if (bind(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) { + anetSetError(err, "bind: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + if (listen(s, 511) == -1) { /* the magic 511 constant is from nginx */ + anetSetError(err, "listen: %s\n", strerror(errno)); + close(s); + return ANET_ERR; + } + return s; +} + +int anetAccept(char *err, int serversock, char *ip, int *port) +{ + int fd; + struct sockaddr_in sa; + unsigned int saLen; + + while(1) { + saLen = sizeof(sa); + fd = accept(serversock, (struct sockaddr*)&sa, &saLen); + if (fd == -1) { + if (errno == EINTR) + continue; + else { + anetSetError(err, "accept: %s\n", strerror(errno)); + return ANET_ERR; + } + } + break; + } + if (ip) strcpy(ip,inet_ntoa(sa.sin_addr)); + if (port) *port = ntohs(sa.sin_port); + return fd; +} diff --git a/src/anet.h b/src/anet.h new file mode 100644 index 000000000..ce0f47787 --- /dev/null +++ b/src/anet.h @@ -0,0 +1,49 @@ +/* anet.c -- Basic TCP socket stuff made a bit less boring + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ANET_H +#define ANET_H + +#define ANET_OK 0 +#define ANET_ERR -1 +#define ANET_ERR_LEN 256 + +int anetTcpConnect(char *err, char *addr, int port); +int anetTcpNonBlockConnect(char *err, char *addr, int port); +int anetRead(int fd, char *buf, int count); +int anetResolve(char *err, char *host, char *ipbuf); +int anetTcpServer(char *err, int port, char *bindaddr); +int anetAccept(char *err, int serversock, char *ip, int *port); +int anetWrite(int fd, char *buf, int count); +int anetNonBlock(char *err, int fd); +int anetTcpNoDelay(char *err, int fd); +int anetTcpKeepAlive(char *err, int fd); + +#endif diff --git a/src/aof.c b/src/aof.c new file mode 100644 index 000000000..51054b296 --- /dev/null +++ b/src/aof.c @@ -0,0 +1,694 @@ +#include "redis.h" + +#include +#include +#include + +/* Called when the user switches from "appendonly yes" to "appendonly no" + * at runtime using the CONFIG command. 
*/
+void stopAppendOnly(void) {
+    /* Write what is still buffered and sync it before closing the file. */
+    flushAppendOnlyFile();
+    aof_fsync(server.appendfd);
+    close(server.appendfd);
+
+    server.appendfd = -1;
+    server.appendseldb = -1;
+    server.appendonly = 0;
+    /* rewrite operation in progress? kill it, wait child exit.
+     * The child to look for is the AOF rewrite one (bgrewritechildpid, the
+     * same pid feedAppendOnlyFile() tests before filling bgrewritebuf), not
+     * the background save child, which must be left running. */
+    if (server.bgrewritechildpid != -1) {
+        int statloc;
+
+        if (kill(server.bgrewritechildpid,SIGKILL) != -1)
+            wait3(&statloc,0,NULL);
+        /* reset the buffer accumulating changes while the child saves */
+        sdsfree(server.bgrewritebuf);
+        server.bgrewritebuf = sdsempty();
+        server.bgrewritechildpid = -1;
+    }
+}
+
+/* Called when the user switches from "appendonly no" to "appendonly yes"
+ * at runtime using the CONFIG command.
+ *
+ * Opens the append only file and triggers a background rewrite. Returns
+ * REDIS_OK on success. On failure REDIS_ERR is returned with the AOF state
+ * rolled back: appendonly is 0 and no descriptor is left in appendfd. */
+int startAppendOnly(void) {
+    server.appendonly = 1;
+    server.lastfsync = time(NULL);
+    server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
+    if (server.appendfd == -1) {
+        redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
+        /* Don't leave AOF flagged as enabled without a file to write to. */
+        server.appendonly = 0;
+        return REDIS_ERR;
+    }
+    if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
+        server.appendonly = 0;
+        close(server.appendfd);
+        server.appendfd = -1; /* the descriptor is no longer valid */
+        redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
+        return REDIS_ERR;
+    }
+    return REDIS_OK;
+}
+
+/* Write the append only file buffer on disk.
+ *
+ * Since we are required to write the AOF before replying to the client,
+ * and the only way the client socket can get a write is entering when the
+ * the event loop, we accumulate all the AOF writes in a memory
+ * buffer and write it on disk using this function just before entering
+ * the event loop again. */
+void flushAppendOnlyFile(void) {
+    time_t now;
+    ssize_t nwritten;
+
+    if (sdslen(server.aofbuf) == 0) return;
+
+    /* We want to perform a single write. This should be guaranteed atomic
+     * at least if the filesystem we are writing is a real physical one.
+ * While this will save us against the server being killed I don't think + * there is much to do about the whole server stopping for power problems + * or alike */ + nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf)); + if (nwritten != (signed)sdslen(server.aofbuf)) { + /* Ooops, we are in troubles. The best thing to do for now is + * aborting instead of giving the illusion that everything is + * working as expected. */ + if (nwritten == -1) { + redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); + } else { + redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno)); + } + exit(1); + } + sdsfree(server.aofbuf); + server.aofbuf = sdsempty(); + + /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have + * childs performing heavy I/O on disk. */ + if (server.no_appendfsync_on_rewrite && + (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1)) + return; + /* Fsync if needed */ + now = time(NULL); + if (server.appendfsync == APPENDFSYNC_ALWAYS || + (server.appendfsync == APPENDFSYNC_EVERYSEC && + now-server.lastfsync > 1)) + { + /* aof_fsync is defined as fdatasync() for Linux in order to avoid + * flushing metadata. 
*/ + aof_fsync(server.appendfd); /* Let's try to get this data on the disk */ + server.lastfsync = now; + } +} + +sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) { + int j; + buf = sdscatprintf(buf,"*%d\r\n",argc); + for (j = 0; j < argc; j++) { + robj *o = getDecodedObject(argv[j]); + buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr)); + buf = sdscatlen(buf,o->ptr,sdslen(o->ptr)); + buf = sdscatlen(buf,"\r\n",2); + decrRefCount(o); + } + return buf; +} + +sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) { + int argc = 3; + long when; + robj *argv[3]; + + /* Make sure we can use strtol */ + seconds = getDecodedObject(seconds); + when = time(NULL)+strtol(seconds->ptr,NULL,10); + decrRefCount(seconds); + + argv[0] = createStringObject("EXPIREAT",8); + argv[1] = key; + argv[2] = createObject(REDIS_STRING, + sdscatprintf(sdsempty(),"%ld",when)); + buf = catAppendOnlyGenericCommand(buf, argc, argv); + decrRefCount(argv[0]); + decrRefCount(argv[2]); + return buf; +} + +void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) { + sds buf = sdsempty(); + robj *tmpargv[3]; + + /* The DB this command was targetting is not the same as the last command + * we appendend. To issue a SELECT command is needed. 
*/ + if (dictid != server.appendseldb) { + char seldb[64]; + + snprintf(seldb,sizeof(seldb),"%d",dictid); + buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n", + (unsigned long)strlen(seldb),seldb); + server.appendseldb = dictid; + } + + if (cmd->proc == expireCommand) { + /* Translate EXPIRE into EXPIREAT */ + buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); + } else if (cmd->proc == setexCommand) { + /* Translate SETEX to SET and EXPIREAT */ + tmpargv[0] = createStringObject("SET",3); + tmpargv[1] = argv[1]; + tmpargv[2] = argv[3]; + buf = catAppendOnlyGenericCommand(buf,3,tmpargv); + decrRefCount(tmpargv[0]); + buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]); + } else { + buf = catAppendOnlyGenericCommand(buf,argc,argv); + } + + /* Append to the AOF buffer. This will be flushed on disk just before + * of re-entering the event loop, so before the client will get a + * positive reply about the operation performed. */ + server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf)); + + /* If a background append only file rewriting is in progress we want to + * accumulate the differences between the child DB and the current one + * in a buffer, so that when the child process will do its work we + * can append the differences to the new append only file. */ + if (server.bgrewritechildpid != -1) + server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf)); + + sdsfree(buf); +} + +/* In Redis commands are always executed in the context of a client, so in + * order to load the append only file we need to create a fake client. */ +struct redisClient *createFakeClient(void) { + struct redisClient *c = zmalloc(sizeof(*c)); + + selectDb(c,0); + c->fd = -1; + c->querybuf = sdsempty(); + c->argc = 0; + c->argv = NULL; + c->flags = 0; + /* We set the fake client as a slave waiting for the synchronization + * so that Redis will not try to send replies to this client. 
*/
+    c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
+    c->reply = listCreate();
+    listSetFreeMethod(c->reply,decrRefCount);
+    listSetDupMethod(c->reply,dupClientReplyValue);
+    initClientMultiState(c);
+    return c;
+}
+
+/* Release a fake client: query buffer, reply list, MULTI state and the
+ * client structure itself. */
+void freeFakeClient(struct redisClient *c) {
+    sdsfree(c->querybuf);
+    listRelease(c->reply);
+    freeClientMultiState(c);
+    zfree(c);
+}
+
+/* Replay the append log file. On success REDIS_OK is returned. On non fatal
+ * error (the append only file is zero-length) REDIS_ERR is returned. On
+ * fatal error an error message is logged and the program exits. */
+int loadAppendOnlyFile(char *filename) {
+    struct redisClient *fakeClient;
+    FILE *fp = fopen(filename,"r");
+    struct redis_stat sb;
+    int appendonly = server.appendonly;
+
+    /* Check the stream before using it: fileno() must not be called on a
+     * NULL stream, and errno still refers to the failed fopen() here. */
+    if (fp == NULL) {
+        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
+        exit(1);
+    }
+
+    if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
+        fclose(fp); /* don't leak the stream on the non fatal path */
+        return REDIS_ERR;
+    }
+
+    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
+     * to the same file we're about to read. */
+    server.appendonly = 0;
+
+    fakeClient = createFakeClient();
+    while(1) {
+        int argc, j;
+        unsigned long len;
+        robj **argv;
+        char buf[128];
+        sds argsds;
+        struct redisCommand *cmd;
+        int force_swapout;
+
+        if (fgets(buf,sizeof(buf),fp) == NULL) {
+            if (feof(fp))
+                break;
+            else
+                goto readerr;
+        }
+        if (buf[0] != '*') goto fmterr;
+        argc = atoi(buf+1);
+        /* A command has at least its name: argv[0] is read below. */
+        if (argc < 1) goto fmterr;
+        argv = zmalloc(sizeof(robj*)*argc);
+        for (j = 0; j < argc; j++) {
+            if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
+            if (buf[0] != '$') goto fmterr;
+            len = strtol(buf+1,NULL,10);
+            argsds = sdsnewlen(NULL,len);
+            if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
+            argv[j] = createObject(REDIS_STRING,argsds);
+            if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
+        }
+
+        /* Command lookup */
+        cmd = lookupCommand(argv[0]->ptr);
+        if (!cmd) {
+            redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
+            exit(1);
+        }
+        /* Try object encoding */
+        if (cmd->flags & REDIS_CMD_BULK)
+            argv[argc-1] = tryObjectEncoding(argv[argc-1]);
+        /* Run the command in the context of a fake client */
+        fakeClient->argc = argc;
+        fakeClient->argv = argv;
+        cmd->proc(fakeClient);
+        /* Discard the reply objects list from the fake client */
+        while(listLength(fakeClient->reply))
+            listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
+        /* Clean up, ready for the next command */
+        for (j = 0; j < argc; j++) decrRefCount(argv[j]);
+        zfree(argv);
+        /* Handle swapping while loading big datasets when VM is on */
+        force_swapout = 0;
+        if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
+            force_swapout = 1;
+
+        if (server.vm_enabled && force_swapout) {
+            while (zmalloc_used_memory() > server.vm_max_memory) {
+                if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+            }
+        }
+    }
+
+    /* This point can only be reached when EOF is reached without errors.
+     * If the client is in the middle of a MULTI/EXEC, log error and quit. */
+    if (fakeClient->flags & REDIS_MULTI) goto readerr;
+
+    fclose(fp);
+    freeFakeClient(fakeClient);
+    server.appendonly = appendonly;
+    return REDIS_OK;
+
+readerr:
+    if (feof(fp)) {
+        redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
+    } else {
+        redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
+    }
+    exit(1);
+fmterr:
+    redisLog(REDIS_WARNING,"Bad file format reading the append only file");
+    exit(1);
+}
+
+/* Write binary-safe string into a file in the bulkformat
+ * $<count>\r\n<payload>\r\n
+ * Returns 1 on success, 0 if a write fails. */
+int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
+    char cbuf[128];
+    int clen;
+    cbuf[0] = '$';
+    clen = 1+ll2string(cbuf+1,sizeof(cbuf)-1,len);
+    cbuf[clen++] = '\r';
+    cbuf[clen++] = '\n';
+    if (fwrite(cbuf,clen,1,fp) == 0) return 0;
+    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
+    if (fwrite("\r\n",2,1,fp) == 0) return 0;
+    return 1;
+}
+
+/* Write a double value in bulk format $<count>\r\n<payload>\r\n
+ * Returns 1 on success, 0 if a write fails. */
+int fwriteBulkDouble(FILE *fp, double d) {
+    char buf[128], dbuf[128];
+
+    snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
+    /* dbuf already ends with CRLF, which is not part of the payload length */
+    snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
+    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
+    if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
+    return 1;
+}
+
+/* Write a long value in bulk format $<count>\r\n<payload>\r\n
+ * Returns 1 on success, 0 if the write fails. */
+int fwriteBulkLongLong(FILE *fp, long long l) {
+    char bbuf[128], lbuf[128];
+    unsigned int blen, llen;
+    llen = ll2string(lbuf,32,l);
+    blen = snprintf(bbuf,sizeof(bbuf),"$%u\r\n%s\r\n",llen,lbuf);
+    if (fwrite(bbuf,blen,1,fp) == 0) return 0;
+    return 1;
+}
+
+/* Delegate writing an object to writing a bulk string or bulk long long. */
+int fwriteBulkObject(FILE *fp, robj *obj) {
+    /* Avoid using getDecodedObject to help copy-on-write (we are often
+     * in a child process when this function is called).
*/ + if (obj->encoding == REDIS_ENCODING_INT) { + return fwriteBulkLongLong(fp,(long)obj->ptr); + } else if (obj->encoding == REDIS_ENCODING_RAW) { + return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr)); + } else { + redisPanic("Unknown string encoding"); + } +} + +/* Write a sequence of commands able to fully rebuild the dataset into + * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */ +int rewriteAppendOnlyFile(char *filename) { + dictIterator *di = NULL; + dictEntry *de; + FILE *fp; + char tmpfile[256]; + int j; + time_t now = time(NULL); + + /* Note that we have to use a different temp name here compared to the + * one used by rewriteAppendOnlyFileBackground() function. */ + snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid()); + fp = fopen(tmpfile,"w"); + if (!fp) { + redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno)); + return REDIS_ERR; + } + for (j = 0; j < server.dbnum; j++) { + char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n"; + redisDb *db = server.db+j; + dict *d = db->dict; + if (dictSize(d) == 0) continue; + di = dictGetIterator(d); + if (!di) { + fclose(fp); + return REDIS_ERR; + } + + /* SELECT the new DB */ + if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkLongLong(fp,j) == 0) goto werr; + + /* Iterate this DB writing every entry */ + while((de = dictNext(di)) != NULL) { + sds keystr = dictGetEntryKey(de); + robj key, *o; + time_t expiretime; + int swapped; + + keystr = dictGetEntryKey(de); + o = dictGetEntryVal(de); + initStaticStringObject(key,keystr); + /* If the value for this key is swapped, load a preview in memory. 
+ * We use a "swapped" flag to remember if we need to free the + * value object instead to just increment the ref count anyway + * in order to avoid copy-on-write of pages if we are forked() */ + if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || + o->storage == REDIS_VM_SWAPPING) { + swapped = 0; + } else { + o = vmPreviewObject(o); + swapped = 1; + } + expiretime = getExpire(db,&key); + + /* Save the key and associated value */ + if (o->type == REDIS_STRING) { + /* Emit a SET command */ + char cmd[]="*3\r\n$3\r\nSET\r\n"; + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + /* Key and value */ + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,o) == 0) goto werr; + } else if (o->type == REDIS_LIST) { + /* Emit the RPUSHes needed to rebuild the list */ + char cmd[]="*3\r\n$5\r\nRPUSH\r\n"; + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *zl = o->ptr; + unsigned char *p = ziplistIndex(zl,0); + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + while(ziplistGet(p,&vstr,&vlen,&vlong)) { + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (vstr) { + if (fwriteBulkString(fp,(char*)vstr,vlen) == 0) + goto werr; + } else { + if (fwriteBulkLongLong(fp,vlong) == 0) + goto werr; + } + p = ziplistNext(zl,p); + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + list *list = o->ptr; + listNode *ln; + listIter li; + + listRewind(list,&li); + while((ln = listNext(&li))) { + robj *eleobj = listNodeValue(ln); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + } else { + redisPanic("Unknown list encoding"); + } + } else if (o->type == REDIS_SET) { + /* Emit the SADDs needed to rebuild the set */ + dict *set = o->ptr; + dictIterator *di = dictGetIterator(set); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + char 
cmd[]="*3\r\n$4\r\nSADD\r\n"; + robj *eleobj = dictGetEntryKey(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_ZSET) { + /* Emit the ZADDs needed to rebuild the sorted set */ + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + char cmd[]="*4\r\n$4\r\nZADD\r\n"; + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkDouble(fp,*score) == 0) goto werr; + if (fwriteBulkObject(fp,eleobj) == 0) goto werr; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_HASH) { + char cmd[]="*4\r\n$4\r\nHSET\r\n"; + + /* Emit the HSETs needed to rebuild the hash */ + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char *p = zipmapRewind(o->ptr); + unsigned char *field, *val; + unsigned int flen, vlen; + + while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) { + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkString(fp,(char*)field,flen) == -1) + return -1; + if (fwriteBulkString(fp,(char*)val,vlen) == -1) + return -1; + } + } else { + dictIterator *di = dictGetIterator(o->ptr); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + robj *field = dictGetEntryKey(de); + robj *val = dictGetEntryVal(de); + + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkObject(fp,field) == -1) return -1; + if (fwriteBulkObject(fp,val) == -1) return -1; + } + dictReleaseIterator(di); + } + } else { + redisPanic("Unknown object type"); + } + /* Save the expire time */ + if (expiretime != -1) { + char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n"; + /* If this key is 
already expired skip it */ + if (expiretime < now) continue; + if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr; + if (fwriteBulkObject(fp,&key) == 0) goto werr; + if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr; + } + if (swapped) decrRefCount(o); + } + dictReleaseIterator(di); + } + + /* Make sure data will not remain on the OS's output buffers */ + fflush(fp); + aof_fsync(fileno(fp)); + fclose(fp); + + /* Use RENAME to make sure the DB file is changed atomically only + * if the generate DB file is ok. */ + if (rename(tmpfile,filename) == -1) { + redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno)); + unlink(tmpfile); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed"); + return REDIS_OK; + +werr: + fclose(fp); + unlink(tmpfile); + redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno)); + if (di) dictReleaseIterator(di); + return REDIS_ERR; +} + +/* This is how rewriting of the append only file in background works: + * + * 1) The user calls BGREWRITEAOF + * 2) Redis calls this function, that forks(): + * 2a) the child rewrite the append only file in a temp file. + * 2b) the parent accumulates differences in server.bgrewritebuf. + * 3) When the child finished '2a' exists. + * 4) The parent will trap the exit code, if it's OK, will append the + * data accumulated into server.bgrewritebuf into the temp file, and + * finally will rename(2) the temp file in the actual file name. + * The the new file is reopened as the new append only file. Profit! 
+ */ +int rewriteAppendOnlyFileBackground(void) { + pid_t childpid; + + if (server.bgrewritechildpid != -1) return REDIS_ERR; + if (server.vm_enabled) waitEmptyIOJobsQueue(); + if ((childpid = fork()) == 0) { + /* Child */ + char tmpfile[256]; + + if (server.vm_enabled) vmReopenSwapFile(); + close(server.fd); + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); + if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) { + _exit(0); + } else { + _exit(1); + } + } else { + /* Parent */ + if (childpid == -1) { + redisLog(REDIS_WARNING, + "Can't rewrite append only file in background: fork: %s", + strerror(errno)); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE, + "Background append only file rewriting started by pid %d",childpid); + server.bgrewritechildpid = childpid; + updateDictResizePolicy(); + /* We set appendseldb to -1 in order to force the next call to the + * feedAppendOnlyFile() to issue a SELECT command, so the differences + * accumulated by the parent into server.bgrewritebuf will start + * with a SELECT statement and it will be safe to merge. */ + server.appendseldb = -1; + return REDIS_OK; + } + return REDIS_OK; /* unreached */ +} + +void bgrewriteaofCommand(redisClient *c) { + if (server.bgrewritechildpid != -1) { + addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n")); + return; + } + if (rewriteAppendOnlyFileBackground() == REDIS_OK) { + char *status = "+Background append only file rewriting started\r\n"; + addReplySds(c,sdsnew(status)); + } else { + addReply(c,shared.err); + } +} + +void aofRemoveTempFile(pid_t childpid) { + char tmpfile[256]; + + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) childpid); + unlink(tmpfile); +} + +/* A background append only file rewriting (BGREWRITEAOF) terminated its work. + * Handle this. 
*/ +void backgroundRewriteDoneHandler(int statloc) { + int exitcode = WEXITSTATUS(statloc); + int bysignal = WIFSIGNALED(statloc); + + if (!bysignal && exitcode == 0) { + int fd; + char tmpfile[256]; + + redisLog(REDIS_NOTICE, + "Background append only file rewriting terminated with success"); + /* Now it's time to flush the differences accumulated by the parent */ + snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid); + fd = open(tmpfile,O_WRONLY|O_APPEND); + if (fd == -1) { + redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno)); + goto cleanup; + } + /* Flush our data... */ + if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) != + (signed) sdslen(server.bgrewritebuf)) { + redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno)); + close(fd); + goto cleanup; + } + redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf)); + /* Now our work is to rename the temp file into the stable file. And + * switch the file descriptor used by the server for append only. */ + if (rename(tmpfile,server.appendfilename) == -1) { + redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno)); + close(fd); + goto cleanup; + } + /* Mission completed... almost */ + redisLog(REDIS_NOTICE,"Append only file successfully rewritten."); + if (server.appendfd != -1) { + /* If append only is actually enabled... */ + close(server.appendfd); + server.appendfd = fd; + if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd); + server.appendseldb = -1; /* Make sure it will issue SELECT */ + redisLog(REDIS_NOTICE,"The new append only file was selected for future appends."); + } else { + /* If append only is disabled we just generate a dump in this + * format. Why not? 
*/ + close(fd); + } + } else if (!bysignal && exitcode != 0) { + redisLog(REDIS_WARNING, "Background append only file rewriting error"); + } else { + redisLog(REDIS_WARNING, + "Background append only file rewriting terminated by signal %d", + WTERMSIG(statloc)); + } +cleanup: + sdsfree(server.bgrewritebuf); + server.bgrewritebuf = sdsempty(); + aofRemoveTempFile(server.bgrewritechildpid); + server.bgrewritechildpid = -1; +} diff --git a/src/config.c b/src/config.c new file mode 100644 index 000000000..6d946ee0c --- /dev/null +++ b/src/config.c @@ -0,0 +1,438 @@ +#include "redis.h" + +/*----------------------------------------------------------------------------- + * Config file parsing + *----------------------------------------------------------------------------*/ + +int yesnotoi(char *s) { + if (!strcasecmp(s,"yes")) return 1; + else if (!strcasecmp(s,"no")) return 0; + else return -1; +} + +void appendServerSaveParams(time_t seconds, int changes) { + server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1)); + server.saveparams[server.saveparamslen].seconds = seconds; + server.saveparams[server.saveparamslen].changes = changes; + server.saveparamslen++; +} + +void resetServerSaveParams() { + zfree(server.saveparams); + server.saveparams = NULL; + server.saveparamslen = 0; +} + +/* I agree, this is a very rudimental way to load a configuration... 
+ will improve later if the config gets more complex */ +void loadServerConfig(char *filename) { + FILE *fp; + char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL; + int linenum = 0; + sds line = NULL; + + if (filename[0] == '-' && filename[1] == '\0') + fp = stdin; + else { + if ((fp = fopen(filename,"r")) == NULL) { + redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename); + exit(1); + } + } + + while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) { + sds *argv; + int argc, j; + + linenum++; + line = sdsnew(buf); + line = sdstrim(line," \t\r\n"); + + /* Skip comments and blank lines*/ + if (line[0] == '#' || line[0] == '\0') { + sdsfree(line); + continue; + } + + /* Split into arguments */ + argv = sdssplitlen(line,sdslen(line)," ",1,&argc); + sdstolower(argv[0]); + + /* Execute config directives */ + if (!strcasecmp(argv[0],"timeout") && argc == 2) { + server.maxidletime = atoi(argv[1]); + if (server.maxidletime < 0) { + err = "Invalid timeout value"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"port") && argc == 2) { + server.port = atoi(argv[1]); + if (server.port < 1 || server.port > 65535) { + err = "Invalid port"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"bind") && argc == 2) { + server.bindaddr = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"save") && argc == 3) { + int seconds = atoi(argv[1]); + int changes = atoi(argv[2]); + if (seconds < 1 || changes < 0) { + err = "Invalid save parameters"; goto loaderr; + } + appendServerSaveParams(seconds,changes); + } else if (!strcasecmp(argv[0],"dir") && argc == 2) { + if (chdir(argv[1]) == -1) { + redisLog(REDIS_WARNING,"Can't chdir to '%s': %s", + argv[1], strerror(errno)); + exit(1); + } + } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) { + if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG; + else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE; + else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE; + else if 
(!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING; + else { + err = "Invalid log level. Must be one of debug, notice, warning"; + goto loaderr; + } + } else if (!strcasecmp(argv[0],"logfile") && argc == 2) { + FILE *logfp; + + server.logfile = zstrdup(argv[1]); + if (!strcasecmp(server.logfile,"stdout")) { + zfree(server.logfile); + server.logfile = NULL; + } + if (server.logfile) { + /* Test if we are able to open the file. The server will not + * be able to abort just for this problem later... */ + logfp = fopen(server.logfile,"a"); + if (logfp == NULL) { + err = sdscatprintf(sdsempty(), + "Can't open the log file: %s", strerror(errno)); + goto loaderr; + } + fclose(logfp); + } + } else if (!strcasecmp(argv[0],"databases") && argc == 2) { + server.dbnum = atoi(argv[1]); + if (server.dbnum < 1) { + err = "Invalid number of databases"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"include") && argc == 2) { + loadServerConfig(argv[1]); + } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) { + server.maxclients = atoi(argv[1]); + } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) { + server.maxmemory = memtoll(argv[1],NULL); + } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) { + server.masterhost = sdsnew(argv[1]); + server.masterport = atoi(argv[2]); + server.replstate = REDIS_REPL_CONNECT; + } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) { + server.masterauth = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) { + if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) { + if ((server.rdbcompression = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) { + if ((server.activerehashing = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; 
goto loaderr; + } + } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) { + if ((server.daemonize = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) { + if ((server.appendonly = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) { + zfree(server.appendfilename); + server.appendfilename = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite") + && argc == 2) { + if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) { + if (!strcasecmp(argv[1],"no")) { + server.appendfsync = APPENDFSYNC_NO; + } else if (!strcasecmp(argv[1],"always")) { + server.appendfsync = APPENDFSYNC_ALWAYS; + } else if (!strcasecmp(argv[1],"everysec")) { + server.appendfsync = APPENDFSYNC_EVERYSEC; + } else { + err = "argument must be 'no', 'always' or 'everysec'"; + goto loaderr; + } + } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) { + server.requirepass = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) { + zfree(server.pidfile); + server.pidfile = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) { + zfree(server.dbfilename); + server.dbfilename = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) { + if ((server.vm_enabled = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } + } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) { + zfree(server.vm_swap_file); + server.vm_swap_file = zstrdup(argv[1]); + } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) { + server.vm_max_memory = memtoll(argv[1],NULL); + } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) { + 
server.vm_page_size = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) { + server.vm_pages = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) { + server.vm_max_threads = strtoll(argv[1], NULL, 10); + } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){ + server.hash_max_zipmap_entries = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){ + server.hash_max_zipmap_value = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){ + server.list_max_ziplist_entries = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){ + server.list_max_ziplist_value = memtoll(argv[1], NULL); + } else { + err = "Bad directive or wrong number of arguments"; goto loaderr; + } + for (j = 0; j < argc; j++) + sdsfree(argv[j]); + zfree(argv); + sdsfree(line); + } + if (fp != stdin) fclose(fp); + return; + +loaderr: + fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n"); + fprintf(stderr, "Reading the configuration file, at line %d\n", linenum); + fprintf(stderr, ">>> '%s'\n", line); + fprintf(stderr, "%s\n", err); + exit(1); +} + +/*----------------------------------------------------------------------------- + * CONFIG command for remote configuration + *----------------------------------------------------------------------------*/ + +void configSetCommand(redisClient *c) { + robj *o = getDecodedObject(c->argv[3]); + long long ll; + + if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) { + zfree(server.dbfilename); + server.dbfilename = zstrdup(o->ptr); + } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) { + zfree(server.requirepass); + server.requirepass = zstrdup(o->ptr); + } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) { + zfree(server.masterauth); + server.masterauth = zstrdup(o->ptr); + } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) { + if 
(getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0) goto badfmt; + server.maxmemory = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) { + if (getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0 || ll > LONG_MAX) goto badfmt; + server.maxidletime = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) { + if (!strcasecmp(o->ptr,"no")) { + server.appendfsync = APPENDFSYNC_NO; + } else if (!strcasecmp(o->ptr,"everysec")) { + server.appendfsync = APPENDFSYNC_EVERYSEC; + } else if (!strcasecmp(o->ptr,"always")) { + server.appendfsync = APPENDFSYNC_ALWAYS; + } else { + goto badfmt; + } + } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) { + int yn = yesnotoi(o->ptr); + + if (yn == -1) goto badfmt; + server.no_appendfsync_on_rewrite = yn; + } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) { + int old = server.appendonly; + int new = yesnotoi(o->ptr); + + if (new == -1) goto badfmt; + if (old != new) { + if (new == 0) { + stopAppendOnly(); + } else { + if (startAppendOnly() == REDIS_ERR) { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR Unable to turn on AOF. 
Check server logs.\r\n")); + decrRefCount(o); + return; + } + } + } + } else if (!strcasecmp(c->argv[2]->ptr,"save")) { + int vlen, j; + sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen); + + /* Perform sanity check before setting the new config: + * - Even number of args + * - Seconds >= 1, changes >= 0 */ + if (vlen & 1) { + sdsfreesplitres(v,vlen); + goto badfmt; + } + for (j = 0; j < vlen; j++) { + char *eptr; + long val; + + val = strtoll(v[j], &eptr, 10); + if (eptr[0] != '\0' || + ((j & 1) == 0 && val < 1) || + ((j & 1) == 1 && val < 0)) { + sdsfreesplitres(v,vlen); + goto badfmt; + } + } + /* Finally set the new config */ + resetServerSaveParams(); + for (j = 0; j < vlen; j += 2) { + time_t seconds; + int changes; + + seconds = strtoll(v[j],NULL,10); + changes = strtoll(v[j+1],NULL,10); + appendServerSaveParams(seconds, changes); + } + sdsfreesplitres(v,vlen); + } else { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR not supported CONFIG parameter %s\r\n", + (char*)c->argv[2]->ptr)); + decrRefCount(o); + return; + } + decrRefCount(o); + addReply(c,shared.ok); + return; + +badfmt: /* Bad format errors */ + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n", + (char*)o->ptr, + (char*)c->argv[2]->ptr)); + decrRefCount(o); +} + +void configGetCommand(redisClient *c) { + robj *o = getDecodedObject(c->argv[2]); + robj *lenobj = createObject(REDIS_STRING,NULL); + char *pattern = o->ptr; + int matches = 0; + + addReply(c,lenobj); + decrRefCount(lenobj); + + if (stringmatch(pattern,"dbfilename",0)) { + addReplyBulkCString(c,"dbfilename"); + addReplyBulkCString(c,server.dbfilename); + matches++; + } + if (stringmatch(pattern,"requirepass",0)) { + addReplyBulkCString(c,"requirepass"); + addReplyBulkCString(c,server.requirepass); + matches++; + } + if (stringmatch(pattern,"masterauth",0)) { + addReplyBulkCString(c,"masterauth"); + addReplyBulkCString(c,server.masterauth); + matches++; + } + if 
(stringmatch(pattern,"maxmemory",0)) { + char buf[128]; + + ll2string(buf,128,server.maxmemory); + addReplyBulkCString(c,"maxmemory"); + addReplyBulkCString(c,buf); + matches++; + } + if (stringmatch(pattern,"timeout",0)) { + char buf[128]; + + ll2string(buf,128,server.maxidletime); + addReplyBulkCString(c,"timeout"); + addReplyBulkCString(c,buf); + matches++; + } + if (stringmatch(pattern,"appendonly",0)) { + addReplyBulkCString(c,"appendonly"); + addReplyBulkCString(c,server.appendonly ? "yes" : "no"); + matches++; + } + if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) { + addReplyBulkCString(c,"no-appendfsync-on-rewrite"); + addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no"); + matches++; + } + if (stringmatch(pattern,"appendfsync",0)) { + char *policy; + + switch(server.appendfsync) { + case APPENDFSYNC_NO: policy = "no"; break; + case APPENDFSYNC_EVERYSEC: policy = "everysec"; break; + case APPENDFSYNC_ALWAYS: policy = "always"; break; + default: policy = "unknown"; break; /* too harmless to panic */ + } + addReplyBulkCString(c,"appendfsync"); + addReplyBulkCString(c,policy); + matches++; + } + if (stringmatch(pattern,"save",0)) { + sds buf = sdsempty(); + int j; + + for (j = 0; j < server.saveparamslen; j++) { + buf = sdscatprintf(buf,"%ld %d", + server.saveparams[j].seconds, + server.saveparams[j].changes); + if (j != server.saveparamslen-1) + buf = sdscatlen(buf," ",1); + } + addReplyBulkCString(c,"save"); + addReplyBulkCString(c,buf); + sdsfree(buf); + matches++; + } + decrRefCount(o); + lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2); +} + +void configCommand(redisClient *c) { + if (!strcasecmp(c->argv[1]->ptr,"set")) { + if (c->argc != 4) goto badarity; + configSetCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"get")) { + if (c->argc != 3) goto badarity; + configGetCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) { + if (c->argc != 2) goto badarity; + server.stat_numcommands = 0; + 
server.stat_numconnections = 0; + server.stat_expiredkeys = 0; + server.stat_starttime = time(NULL); + addReply(c,shared.ok); + } else { + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n")); + } + return; + +badarity: + addReplySds(c,sdscatprintf(sdsempty(), + "-ERR Wrong number of arguments for CONFIG %s\r\n", + (char*) c->argv[1]->ptr)); +} diff --git a/src/config.h b/src/config.h new file mode 100644 index 000000000..6e98fbb2c --- /dev/null +++ b/src/config.h @@ -0,0 +1,45 @@ +#ifndef __CONFIG_H +#define __CONFIG_H + +#ifdef __APPLE__ +#include +#endif + +/* test for malloc_size() */ +#ifdef __APPLE__ +#include +#define HAVE_MALLOC_SIZE 1 +#define redis_malloc_size(p) malloc_size(p) +#endif + +/* define redis_fstat to fstat or fstat64() */ +#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) +#define redis_fstat fstat64 +#define redis_stat stat64 +#else +#define redis_fstat fstat +#define redis_stat stat +#endif + +/* test for backtrace() */ +#if defined(__APPLE__) || defined(__linux__) +#define HAVE_BACKTRACE 1 +#endif + +/* test for polling API */ +#ifdef __linux__ +#define HAVE_EPOLL 1 +#endif + +#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) +#define HAVE_KQUEUE 1 +#endif + +/* define aof_fsync to fdatasync() in Linux and fsync() for all the rest */ +#ifdef __linux__ +#define aof_fsync fdatasync +#else +#define aof_fsync fsync +#endif + +#endif diff --git a/src/db.c b/src/db.c new file mode 100644 index 000000000..e1e82cb22 --- /dev/null +++ b/src/db.c @@ -0,0 +1,508 @@ +#include "redis.h" + +#include + +/*----------------------------------------------------------------------------- + * C-level DB API + *----------------------------------------------------------------------------*/ + +robj *lookupKey(redisDb *db, robj *key) { + dictEntry *de = dictFind(db->dict,key->ptr); + if (de) { + robj *val = 
dictGetEntryVal(de); + + if (server.vm_enabled) { + if (val->storage == REDIS_VM_MEMORY || + val->storage == REDIS_VM_SWAPPING) + { + /* If we were swapping the object out, cancel the operation */ + if (val->storage == REDIS_VM_SWAPPING) + vmCancelThreadedIOJob(val); + /* Update the access time for the aging algorithm. */ + val->lru = server.lruclock; + } else { + int notify = (val->storage == REDIS_VM_LOADING); + + /* Our value was swapped on disk. Bring it at home. */ + redisAssert(val->type == REDIS_VMPOINTER); + val = vmLoadObject(val); + dictGetEntryVal(de) = val; + + /* Clients blocked by the VM subsystem may be waiting for + * this key... */ + if (notify) handleClientsBlockedOnSwappedKey(db,key); + } + } + return val; + } else { + return NULL; + } +} + +robj *lookupKeyRead(redisDb *db, robj *key) { + expireIfNeeded(db,key); + return lookupKey(db,key); +} + +robj *lookupKeyWrite(redisDb *db, robj *key) { + deleteIfVolatile(db,key); + touchWatchedKey(db,key); + return lookupKey(db,key); +} + +robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) { + robj *o = lookupKeyRead(c->db, key); + if (!o) addReply(c,reply); + return o; +} + +robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) { + robj *o = lookupKeyWrite(c->db, key); + if (!o) addReply(c,reply); + return o; +} + +/* Add the key to the DB. If the key already exists REDIS_ERR is returned, + * otherwise REDIS_OK is returned, and the caller should increment the + * refcount of 'val'. */ +int dbAdd(redisDb *db, robj *key, robj *val) { + /* Perform a lookup before adding the key, as we need to copy the + * key value. */ + if (dictFind(db->dict, key->ptr) != NULL) { + return REDIS_ERR; + } else { + sds copy = sdsdup(key->ptr); + dictAdd(db->dict, copy, val); + return REDIS_OK; + } +} + +/* If the key does not exist, this is just like dbAdd(). Otherwise + * the value associated to the key is replaced with the new one. + * + * On update (key already existed) 0 is returned. 
Otherwise 1. */ +int dbReplace(redisDb *db, robj *key, robj *val) { + if (dictFind(db->dict,key->ptr) == NULL) { + sds copy = sdsdup(key->ptr); + dictAdd(db->dict, copy, val); + return 1; + } else { + dictReplace(db->dict, key->ptr, val); + return 0; + } +} + +int dbExists(redisDb *db, robj *key) { + return dictFind(db->dict,key->ptr) != NULL; +} + +/* Return a random key, in form of a Redis object. + * If there are no keys, NULL is returned. + * + * The function makes sure to return keys not already expired. */ +robj *dbRandomKey(redisDb *db) { + struct dictEntry *de; + + while(1) { + sds key; + robj *keyobj; + + de = dictGetRandomKey(db->dict); + if (de == NULL) return NULL; + + key = dictGetEntryKey(de); + keyobj = createStringObject(key,sdslen(key)); + if (dictFind(db->expires,key)) { + if (expireIfNeeded(db,keyobj)) { + decrRefCount(keyobj); + continue; /* search for another key. This expired. */ + } + } + return keyobj; + } +} + +/* Delete a key, value, and associated expiration entry if any, from the DB */ +int dbDelete(redisDb *db, robj *key) { + /* Deleting an entry from the expires dict will not free the sds of + * the key, because it is shared with the main dictionary. 
*/ + if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr); + return dictDelete(db->dict,key->ptr) == DICT_OK; +} + +/* Empty the whole database */ +long long emptyDb() { + int j; + long long removed = 0; + + for (j = 0; j < server.dbnum; j++) { + removed += dictSize(server.db[j].dict); + dictEmpty(server.db[j].dict); + dictEmpty(server.db[j].expires); + } + return removed; +} + +int selectDb(redisClient *c, int id) { + if (id < 0 || id >= server.dbnum) + return REDIS_ERR; + c->db = &server.db[id]; + return REDIS_OK; +} + +/*----------------------------------------------------------------------------- + * Type agnostic commands operating on the key space + *----------------------------------------------------------------------------*/ + +void flushdbCommand(redisClient *c) { + server.dirty += dictSize(c->db->dict); + touchWatchedKeysOnFlush(c->db->id); + dictEmpty(c->db->dict); + dictEmpty(c->db->expires); + addReply(c,shared.ok); +} + +void flushallCommand(redisClient *c) { + touchWatchedKeysOnFlush(-1); + server.dirty += emptyDb(); + addReply(c,shared.ok); + if (server.bgsavechildpid != -1) { + kill(server.bgsavechildpid,SIGKILL); + rdbRemoveTempFile(server.bgsavechildpid); + } + rdbSave(server.dbfilename); + server.dirty++; +} + +void delCommand(redisClient *c) { + int deleted = 0, j; + + for (j = 1; j < c->argc; j++) { + if (dbDelete(c->db,c->argv[j])) { + touchWatchedKey(c->db,c->argv[j]); + server.dirty++; + deleted++; + } + } + addReplyLongLong(c,deleted); +} + +void existsCommand(redisClient *c) { + expireIfNeeded(c->db,c->argv[1]); + if (dbExists(c->db,c->argv[1])) { + addReply(c, shared.cone); + } else { + addReply(c, shared.czero); + } +} + +void selectCommand(redisClient *c) { + int id = atoi(c->argv[1]->ptr); + + if (selectDb(c,id) == REDIS_ERR) { + addReplySds(c,sdsnew("-ERR invalid DB index\r\n")); + } else { + addReply(c,shared.ok); + } +} + +void randomkeyCommand(redisClient *c) { + robj *key; + + if ((key = dbRandomKey(c->db)) == NULL) { 
+ addReply(c,shared.nullbulk); + return; + } + + addReplyBulk(c,key); + decrRefCount(key); +} + +void keysCommand(redisClient *c) { + dictIterator *di; + dictEntry *de; + sds pattern = c->argv[1]->ptr; + int plen = sdslen(pattern); + unsigned long numkeys = 0; + robj *lenobj = createObject(REDIS_STRING,NULL); + + di = dictGetIterator(c->db->dict); + addReply(c,lenobj); + decrRefCount(lenobj); + while((de = dictNext(di)) != NULL) { + sds key = dictGetEntryKey(de); + robj *keyobj; + + if ((pattern[0] == '*' && pattern[1] == '\0') || + stringmatchlen(pattern,plen,key,sdslen(key),0)) { + keyobj = createStringObject(key,sdslen(key)); + if (expireIfNeeded(c->db,keyobj) == 0) { + addReplyBulk(c,keyobj); + numkeys++; + } + decrRefCount(keyobj); + } + } + dictReleaseIterator(di); + lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys); +} + +void dbsizeCommand(redisClient *c) { + addReplySds(c, + sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict))); +} + +void lastsaveCommand(redisClient *c) { + addReplySds(c, + sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave)); +} + +void typeCommand(redisClient *c) { + robj *o; + char *type; + + o = lookupKeyRead(c->db,c->argv[1]); + if (o == NULL) { + type = "+none"; + } else { + switch(o->type) { + case REDIS_STRING: type = "+string"; break; + case REDIS_LIST: type = "+list"; break; + case REDIS_SET: type = "+set"; break; + case REDIS_ZSET: type = "+zset"; break; + case REDIS_HASH: type = "+hash"; break; + default: type = "+unknown"; break; + } + } + addReplySds(c,sdsnew(type)); + addReply(c,shared.crlf); +} + +void saveCommand(redisClient *c) { + if (server.bgsavechildpid != -1) { + addReplySds(c,sdsnew("-ERR background save in progress\r\n")); + return; + } + if (rdbSave(server.dbfilename) == REDIS_OK) { + addReply(c,shared.ok); + } else { + addReply(c,shared.err); + } +} + +void bgsaveCommand(redisClient *c) { + if (server.bgsavechildpid != -1) { + addReplySds(c,sdsnew("-ERR background save already in progress\r\n")); + 
return; + } + if (rdbSaveBackground(server.dbfilename) == REDIS_OK) { + char *status = "+Background saving started\r\n"; + addReplySds(c,sdsnew(status)); + } else { + addReply(c,shared.err); + } +} + +void shutdownCommand(redisClient *c) { + if (prepareForShutdown() == REDIS_OK) + exit(0); + addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n")); +} + +void renameGenericCommand(redisClient *c, int nx) { + robj *o; + + /* To use the same key as src and dst is probably an error */ + if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) { + addReply(c,shared.sameobjecterr); + return; + } + + if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL) + return; + + incrRefCount(o); + deleteIfVolatile(c->db,c->argv[2]); + if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) { + if (nx) { + decrRefCount(o); + addReply(c,shared.czero); + return; + } + dbReplace(c->db,c->argv[2],o); + } + dbDelete(c->db,c->argv[1]); + touchWatchedKey(c->db,c->argv[2]); + server.dirty++; + addReply(c,nx ? shared.cone : shared.ok); +} + +void renameCommand(redisClient *c) { + renameGenericCommand(c,0); +} + +void renamenxCommand(redisClient *c) { + renameGenericCommand(c,1); +} + +void moveCommand(redisClient *c) { + robj *o; + redisDb *src, *dst; + int srcid; + + /* Obtain source and target DB pointers */ + src = c->db; + srcid = c->db->id; + if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) { + addReply(c,shared.outofrangeerr); + return; + } + dst = c->db; + selectDb(c,srcid); /* Back to the source DB */ + + /* If the user is moving using as target the same + * DB as the source DB it is probably an error. 
*/ + if (src == dst) { + addReply(c,shared.sameobjecterr); + return; + } + + /* Check if the element exists and get a reference */ + o = lookupKeyWrite(c->db,c->argv[1]); + if (!o) { + addReply(c,shared.czero); + return; + } + + /* Try to add the element to the target DB */ + deleteIfVolatile(dst,c->argv[1]); + if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) { + addReply(c,shared.czero); + return; + } + incrRefCount(o); + + /* OK! key moved, free the entry in the source DB */ + dbDelete(src,c->argv[1]); + server.dirty++; + addReply(c,shared.cone); +} + +/*----------------------------------------------------------------------------- + * Expires API + *----------------------------------------------------------------------------*/ + +int removeExpire(redisDb *db, robj *key) { + /* An expire may only be removed if there is a corresponding entry in the + * main dict. Otherwise, the key will never be freed. */ + redisAssert(dictFind(db->dict,key->ptr) != NULL); + if (dictDelete(db->expires,key->ptr) == DICT_OK) { + return 1; + } else { + return 0; + } +} + +int setExpire(redisDb *db, robj *key, time_t when) { + dictEntry *de; + + /* Reuse the sds from the main dict in the expire dict */ + redisAssert((de = dictFind(db->dict,key->ptr)) != NULL); + if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) { + return 0; + } else { + return 1; + } +} + +/* Return the expire time of the specified key, or -1 if no expire + * is associated with this key (i.e. the key is non volatile) */ +time_t getExpire(redisDb *db, robj *key) { + dictEntry *de; + + /* No expire? return ASAP */ + if (dictSize(db->expires) == 0 || + (de = dictFind(db->expires,key->ptr)) == NULL) return -1; + + /* The entry was found in the expire dict, this means it should also + * be present in the main dict (safety check). 
*/ + redisAssert(dictFind(db->dict,key->ptr) != NULL); + return (time_t) dictGetEntryVal(de); +} + +int expireIfNeeded(redisDb *db, robj *key) { + time_t when = getExpire(db,key); + if (when < 0) return 0; + + /* Return when this key has not expired */ + if (time(NULL) <= when) return 0; + + /* Delete the key */ + server.stat_expiredkeys++; + server.dirty++; + return dbDelete(db,key); +} + +int deleteIfVolatile(redisDb *db, robj *key) { + if (getExpire(db,key) < 0) return 0; + + /* Delete the key */ + server.stat_expiredkeys++; + server.dirty++; + return dbDelete(db,key); +} + +/*----------------------------------------------------------------------------- + * Expires Commands + *----------------------------------------------------------------------------*/ + +void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) { + dictEntry *de; + time_t seconds; + + if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return; + + seconds -= offset; + + de = dictFind(c->db->dict,key->ptr); + if (de == NULL) { + addReply(c,shared.czero); + return; + } + if (seconds <= 0) { + if (dbDelete(c->db,key)) server.dirty++; + addReply(c, shared.cone); + return; + } else { + time_t when = time(NULL)+seconds; + if (setExpire(c->db,key,when)) { + addReply(c,shared.cone); + server.dirty++; + } else { + addReply(c,shared.czero); + } + return; + } +} + +void expireCommand(redisClient *c) { + expireGenericCommand(c,c->argv[1],c->argv[2],0); +} + +void expireatCommand(redisClient *c) { + expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL)); +} + +void ttlCommand(redisClient *c) { + time_t expire; + int ttl = -1; + + expire = getExpire(c->db,c->argv[1]); + if (expire != -1) { + ttl = (int) (expire-time(NULL)); + if (ttl < 0) ttl = -1; + } + addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl)); +} + + diff --git a/src/debug.c b/src/debug.c new file mode 100644 index 000000000..10b620d6f --- /dev/null +++ b/src/debug.c @@ -0,0 +1,309 @@ +#include 
"redis.h" +#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */ + +/* ================================= Debugging ============================== */ + +/* Compute the sha1 of string at 's' with 'len' bytes long. + * The SHA1 is then xored againt the string pointed by digest. + * Since xor is commutative, this operation is used in order to + * "add" digests relative to unordered elements. + * + * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */ +void xorDigest(unsigned char *digest, void *ptr, size_t len) { + SHA1_CTX ctx; + unsigned char hash[20], *s = ptr; + int j; + + SHA1Init(&ctx); + SHA1Update(&ctx,s,len); + SHA1Final(hash,&ctx); + + for (j = 0; j < 20; j++) + digest[j] ^= hash[j]; +} + +void xorObjectDigest(unsigned char *digest, robj *o) { + o = getDecodedObject(o); + xorDigest(digest,o->ptr,sdslen(o->ptr)); + decrRefCount(o); +} + +/* This function instead of just computing the SHA1 and xoring it + * against diget, also perform the digest of "digest" itself and + * replace the old value with the new one. + * + * So the final digest will be: + * + * digest = SHA1(digest xor SHA1(data)) + * + * This function is used every time we want to preserve the order so + * that digest(a,b,c,d) will be different than digest(b,c,d,a) + * + * Also note that mixdigest("foo") followed by mixdigest("bar") + * will lead to a different digest compared to "fo", "obar". + */ +void mixDigest(unsigned char *digest, void *ptr, size_t len) { + SHA1_CTX ctx; + char *s = ptr; + + xorDigest(digest,s,len); + SHA1Init(&ctx); + SHA1Update(&ctx,digest,20); + SHA1Final(digest,&ctx); +} + +void mixObjectDigest(unsigned char *digest, robj *o) { + o = getDecodedObject(o); + mixDigest(digest,o->ptr,sdslen(o->ptr)); + decrRefCount(o); +} + +/* Compute the dataset digest. Since keys, sets elements, hashes elements + * are not ordered, we use a trick: every aggregate digest is the xor + * of the digests of their elements. This way the order will not change + * the result. 
For list instead we use a feedback entering the output digest + * as input in order to ensure that a different ordered list will result in + * a different digest. */ +void computeDatasetDigest(unsigned char *final) { + unsigned char digest[20]; + char buf[128]; + dictIterator *di = NULL; + dictEntry *de; + int j; + uint32_t aux; + + memset(final,0,20); /* Start with a clean result */ + + for (j = 0; j < server.dbnum; j++) { + redisDb *db = server.db+j; + + if (dictSize(db->dict) == 0) continue; + di = dictGetIterator(db->dict); + + /* hash the DB id, so the same dataset moved in a different + * DB will lead to a different digest */ + aux = htonl(j); + mixDigest(final,&aux,sizeof(aux)); + + /* Iterate this DB writing every entry */ + while((de = dictNext(di)) != NULL) { + sds key; + robj *keyobj, *o; + time_t expiretime; + + memset(digest,0,20); /* This key-val digest */ + key = dictGetEntryKey(de); + keyobj = createStringObject(key,sdslen(key)); + + mixDigest(digest,key,sdslen(key)); + + /* Make sure the key is loaded if VM is active */ + o = lookupKeyRead(db,keyobj); + + aux = htonl(o->type); + mixDigest(digest,&aux,sizeof(aux)); + expiretime = getExpire(db,keyobj); + + /* Save the key and associated value */ + if (o->type == REDIS_STRING) { + mixObjectDigest(digest,o); + } else if (o->type == REDIS_LIST) { + listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL); + listTypeEntry entry; + while(listTypeNext(li,&entry)) { + robj *eleobj = listTypeGet(&entry); + mixObjectDigest(digest,eleobj); + decrRefCount(eleobj); + } + listTypeReleaseIterator(li); + } else if (o->type == REDIS_SET) { + dict *set = o->ptr; + dictIterator *di = dictGetIterator(set); + dictEntry *de; + + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + + xorObjectDigest(digest,eleobj); + } + dictReleaseIterator(di); + } else if (o->type == REDIS_ZSET) { + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + while((de = 
dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + unsigned char eledigest[20]; + + snprintf(buf,sizeof(buf),"%.17g",*score); + memset(eledigest,0,20); + mixObjectDigest(eledigest,eleobj); + mixDigest(eledigest,buf,strlen(buf)); + xorDigest(digest,eledigest,20); + } + dictReleaseIterator(di); + } else if (o->type == REDIS_HASH) { + hashTypeIterator *hi; + robj *obj; + + hi = hashTypeInitIterator(o); + while (hashTypeNext(hi) != REDIS_ERR) { + unsigned char eledigest[20]; + + memset(eledigest,0,20); + obj = hashTypeCurrent(hi,REDIS_HASH_KEY); + mixObjectDigest(eledigest,obj); + decrRefCount(obj); + obj = hashTypeCurrent(hi,REDIS_HASH_VALUE); + mixObjectDigest(eledigest,obj); + decrRefCount(obj); + xorDigest(digest,eledigest,20); + } + hashTypeReleaseIterator(hi); + } else { + redisPanic("Unknown object type"); + } + /* If the key has an expire, add it to the mix */ + if (expiretime != -1) xorDigest(digest,"!!expire!!",10); + /* We can finally xor the key-val digest to the final digest */ + xorDigest(final,digest,20); + decrRefCount(keyobj); + } + dictReleaseIterator(di); + } +} + +void debugCommand(redisClient *c) { + if (!strcasecmp(c->argv[1]->ptr,"segfault")) { + *((char*)-1) = 'x'; + } else if (!strcasecmp(c->argv[1]->ptr,"reload")) { + if (rdbSave(server.dbfilename) != REDIS_OK) { + addReply(c,shared.err); + return; + } + emptyDb(); + if (rdbLoad(server.dbfilename) != REDIS_OK) { + addReply(c,shared.err); + return; + } + redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD"); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) { + emptyDb(); + if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) { + addReply(c,shared.err); + return; + } + redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF"); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) { + dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr); + robj *val; + + if 
(!de) { + addReply(c,shared.nokeyerr); + return; + } + val = dictGetEntryVal(de); + if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY || + val->storage == REDIS_VM_SWAPPING)) { + char *strenc; + + strenc = strEncoding(val->encoding); + addReplySds(c,sdscatprintf(sdsempty(), + "+Value at:%p refcount:%d " + "encoding:%s serializedlength:%lld\r\n", + (void*)val, val->refcount, + strenc, (long long) rdbSavedObjectLen(val,NULL))); + } else { + vmpointer *vp = (vmpointer*) val; + addReplySds(c,sdscatprintf(sdsempty(), + "+Value swapped at: page %llu " + "using %llu pages\r\n", + (unsigned long long) vp->page, + (unsigned long long) vp->usedpages)); + } + } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) { + lookupKeyRead(c->db,c->argv[2]); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) { + dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr); + robj *val; + vmpointer *vp; + + if (!server.vm_enabled) { + addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n")); + return; + } + if (!de) { + addReply(c,shared.nokeyerr); + return; + } + val = dictGetEntryVal(de); + /* Swap it */ + if (val->storage != REDIS_VM_MEMORY) { + addReplySds(c,sdsnew("-ERR This key is not in memory\r\n")); + } else if (val->refcount != 1) { + addReplySds(c,sdsnew("-ERR Object is shared\r\n")); + } else if ((vp = vmSwapObjectBlocking(val)) != NULL) { + dictGetEntryVal(de) = vp; + addReply(c,shared.ok); + } else { + addReply(c,shared.err); + } + } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) { + long keys, j; + robj *key, *val; + char buf[128]; + + if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK) + return; + for (j = 0; j < keys; j++) { + snprintf(buf,sizeof(buf),"key:%lu",j); + key = createStringObject(buf,strlen(buf)); + if (lookupKeyRead(c->db,key) != NULL) { + decrRefCount(key); + continue; + } + snprintf(buf,sizeof(buf),"value:%lu",j); + val = createStringObject(buf,strlen(buf)); 
+ dbAdd(c->db,key,val); + decrRefCount(key); + } + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) { + unsigned char digest[20]; + sds d = sdsnew("+"); + int j; + + computeDatasetDigest(digest); + for (j = 0; j < 20; j++) + d = sdscatprintf(d, "%02x",digest[j]); + + d = sdscatlen(d,"\r\n",2); + addReplySds(c,d); + } else { + addReplySds(c,sdsnew( + "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT |SWAPIN |SWAPOUT |RELOAD]\r\n")); + } +} + +void _redisAssert(char *estr, char *file, int line) { + redisLog(REDIS_WARNING,"=== ASSERTION FAILED ==="); + redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr); +#ifdef HAVE_BACKTRACE + redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); + *((char*)-1) = 'x'; +#endif +} + +void _redisPanic(char *msg, char *file, int line) { + redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue"); + redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line); +#ifdef HAVE_BACKTRACE + redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)"); + *((char*)-1) = 'x'; +#endif +} diff --git a/src/dict.c b/src/dict.c new file mode 100644 index 000000000..d5010708c --- /dev/null +++ b/src/dict.c @@ -0,0 +1,727 @@ +/* Hash Tables Implementation. + * + * This file implements in memory hash tables with insert/del/replace/find/ + * get-random-element operations. Hash tables will auto resize if needed + * tables of power of two in size are used, collisions are handled by + * chaining. See the source code for more information... :) + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "dict.h" +#include "zmalloc.h" + +/* Using dictEnableResize() / dictDisableResize() we make possible to + * enable/disable resizing of the hash table as needed. This is very important + * for Redis, as we use copy-on-write and don't want to move too much memory + * around when there is a child performing saving operations. */ +static int dict_can_resize = 1; + +/* ---------------------------- Utility funcitons --------------------------- */ + +static void _dictPanic(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "\nDICT LIBRARY PANIC: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n\n"); + va_end(ap); +} + +/* ------------------------- Heap Management Wrappers------------------------ */ + +static void *_dictAlloc(size_t size) +{ + void *p = zmalloc(size); + if (p == NULL) + _dictPanic("Out of memory"); + return p; +} + +static void _dictFree(void *ptr) { + zfree(ptr); +} + +/* -------------------------- private prototypes ---------------------------- */ + +static int _dictExpandIfNeeded(dict *ht); +static unsigned long _dictNextPower(unsigned long size); +static int _dictKeyIndex(dict *ht, const void *key); +static int _dictInit(dict *ht, dictType *type, void *privDataPtr); + +/* -------------------------- hash functions -------------------------------- */ + +/* Thomas Wang's 32 bit Mix Function */ +unsigned int dictIntHashFunction(unsigned int key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +/* Identity hash function for integer keys */ +unsigned int dictIdentityHashFunction(unsigned int key) +{ + return key; +} + +/* Generic hash function (a popular one from Bernstein). + * I tested a few and this was the best. */ +unsigned int dictGenHashFunction(const unsigned char *buf, int len) { + unsigned int hash = 5381; + + while (len--) + hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */ + return hash; +} + +/* ----------------------------- API implementation ------------------------- */ + +/* Reset an hashtable already initialized with ht_init(). + * NOTE: This function should only called by ht_destroy(). 
*/ +static void _dictReset(dictht *ht) +{ + ht->table = NULL; + ht->size = 0; + ht->sizemask = 0; + ht->used = 0; +} + +/* Create a new hash table */ +dict *dictCreate(dictType *type, + void *privDataPtr) +{ + dict *d = _dictAlloc(sizeof(*d)); + + _dictInit(d,type,privDataPtr); + return d; +} + +/* Initialize the hash table */ +int _dictInit(dict *d, dictType *type, + void *privDataPtr) +{ + _dictReset(&d->ht[0]); + _dictReset(&d->ht[1]); + d->type = type; + d->privdata = privDataPtr; + d->rehashidx = -1; + d->iterators = 0; + return DICT_OK; +} + +/* Resize the table to the minimal size that contains all the elements, + * but with the invariant of a USER/BUCKETS ration near to <= 1 */ +int dictResize(dict *d) +{ + int minimal; + + if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR; + minimal = d->ht[0].used; + if (minimal < DICT_HT_INITIAL_SIZE) + minimal = DICT_HT_INITIAL_SIZE; + return dictExpand(d, minimal); +} + +/* Expand or create the hashtable */ +int dictExpand(dict *d, unsigned long size) +{ + dictht n; /* the new hashtable */ + unsigned long realsize = _dictNextPower(size); + + /* the size is invalid if it is smaller than the number of + * elements already inside the hashtable */ + if (dictIsRehashing(d) || d->ht[0].used > size) + return DICT_ERR; + + n.size = realsize; + n.sizemask = realsize-1; + n.table = _dictAlloc(realsize*sizeof(dictEntry*)); + n.used = 0; + + /* Initialize all the pointers to NULL */ + memset(n.table, 0, realsize*sizeof(dictEntry*)); + + /* Is this the first initialization? If so it's not really a rehashing + * we just set the first hash table so that it can accept keys. */ + if (d->ht[0].table == NULL) { + d->ht[0] = n; + return DICT_OK; + } + + /* Prepare a second hash table for incremental rehashing */ + d->ht[1] = n; + d->rehashidx = 0; + return DICT_OK; +} + +/* Performs N steps of incremental rehashing. Returns 1 if there are still + * keys to move from the old to the new hash table, otherwise 0 is returned. 
+ * Note that a rehashing step consists in moving a bucket (that may have more + * thank one key as we use chaining) from the old to the new hash table. */ +int dictRehash(dict *d, int n) { + if (!dictIsRehashing(d)) return 0; + + while(n--) { + dictEntry *de, *nextde; + + /* Check if we already rehashed the whole table... */ + if (d->ht[0].used == 0) { + _dictFree(d->ht[0].table); + d->ht[0] = d->ht[1]; + _dictReset(&d->ht[1]); + d->rehashidx = -1; + return 0; + } + + /* Note that rehashidx can't overflow as we are sure there are more + * elements because ht[0].used != 0 */ + while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++; + de = d->ht[0].table[d->rehashidx]; + /* Move all the keys in this bucket from the old to the new hash HT */ + while(de) { + unsigned int h; + + nextde = de->next; + /* Get the index in the new hash table */ + h = dictHashKey(d, de->key) & d->ht[1].sizemask; + de->next = d->ht[1].table[h]; + d->ht[1].table[h] = de; + d->ht[0].used--; + d->ht[1].used++; + de = nextde; + } + d->ht[0].table[d->rehashidx] = NULL; + d->rehashidx++; + } + return 1; +} + +long long timeInMilliseconds(void) { + struct timeval tv; + + gettimeofday(&tv,NULL); + return (((long long)tv.tv_sec)*1000)+(tv.tv_usec/1000); +} + +/* Rehash for an amount of time between ms milliseconds and ms+1 milliseconds */ +int dictRehashMilliseconds(dict *d, int ms) { + long long start = timeInMilliseconds(); + int rehashes = 0; + + while(dictRehash(d,100)) { + rehashes += 100; + if (timeInMilliseconds()-start > ms) break; + } + return rehashes; +} + +/* This function performs just a step of rehashing, and only if there are + * not iterators bound to our hash table. When we have iterators in the middle + * of a rehashing we can't mess with the two hash tables otherwise some element + * can be missed or duplicated. 
+ * + * This function is called by common lookup or update operations in the + * dictionary so that the hash table automatically migrates from H1 to H2 + * while it is actively used. */ +static void _dictRehashStep(dict *d) { + if (d->iterators == 0) dictRehash(d,1); +} + +/* Add an element to the target hash table */ +int dictAdd(dict *d, void *key, void *val) +{ + int index; + dictEntry *entry; + dictht *ht; + + if (dictIsRehashing(d)) _dictRehashStep(d); + + /* Get the index of the new element, or -1 if + * the element already exists. */ + if ((index = _dictKeyIndex(d, key)) == -1) + return DICT_ERR; + + /* Allocates the memory and stores key */ + ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0]; + entry = _dictAlloc(sizeof(*entry)); + entry->next = ht->table[index]; + ht->table[index] = entry; + ht->used++; + + /* Set the hash entry fields. */ + dictSetHashKey(d, entry, key); + dictSetHashVal(d, entry, val); + return DICT_OK; +} + +/* Add an element, discarding the old if the key already exists. + * Return 1 if the key was added from scratch, 0 if there was already an + * element with such key and dictReplace() just performed a value update + * operation. */ +int dictReplace(dict *d, void *key, void *val) +{ + dictEntry *entry, auxentry; + + /* Try to add the element. If the key + * does not exists dictAdd will suceed. */ + if (dictAdd(d, key, val) == DICT_OK) + return 1; + /* It already exists, get the entry */ + entry = dictFind(d, key); + /* Free the old value and set the new one */ + /* Set the new value and free the old one. Note that it is important + * to do that in this order, as the value may just be exactly the same + * as the previous one. In this context, think to reference counting, + * you want to increment (set), and then decrement (free), and not the + * reverse. 
*/ + auxentry = *entry; + dictSetHashVal(d, entry, val); + dictFreeEntryVal(d, &auxentry); + return 0; +} + +/* Search and remove an element */ +static int dictGenericDelete(dict *d, const void *key, int nofree) +{ + unsigned int h, idx; + dictEntry *he, *prevHe; + int table; + + if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + prevHe = NULL; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) { + /* Unlink the element from the list */ + if (prevHe) + prevHe->next = he->next; + else + d->ht[table].table[idx] = he->next; + if (!nofree) { + dictFreeEntryKey(d, he); + dictFreeEntryVal(d, he); + } + _dictFree(he); + d->ht[table].used--; + return DICT_OK; + } + prevHe = he; + he = he->next; + } + if (!dictIsRehashing(d)) break; + } + return DICT_ERR; /* not found */ +} + +int dictDelete(dict *ht, const void *key) { + return dictGenericDelete(ht,key,0); +} + +int dictDeleteNoFree(dict *ht, const void *key) { + return dictGenericDelete(ht,key,1); +} + +/* Destroy an entire dictionary */ +int _dictClear(dict *d, dictht *ht) +{ + unsigned long i; + + /* Free all the elements */ + for (i = 0; i < ht->size && ht->used > 0; i++) { + dictEntry *he, *nextHe; + + if ((he = ht->table[i]) == NULL) continue; + while(he) { + nextHe = he->next; + dictFreeEntryKey(d, he); + dictFreeEntryVal(d, he); + _dictFree(he); + ht->used--; + he = nextHe; + } + } + /* Free the table and the allocated cache structure */ + _dictFree(ht->table); + /* Re-initialize the table */ + _dictReset(ht); + return DICT_OK; /* never fails */ +} + +/* Clear & Release the hash table */ +void dictRelease(dict *d) +{ + _dictClear(d,&d->ht[0]); + _dictClear(d,&d->ht[1]); + _dictFree(d); +} + +dictEntry *dictFind(dict *d, const void *key) +{ + dictEntry *he; + unsigned int h, idx, table; + + if (d->ht[0].size 
== 0) return NULL; /* We don't have a table at all */ + if (dictIsRehashing(d)) _dictRehashStep(d); + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + he = d->ht[table].table[idx]; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) + return he; + he = he->next; + } + if (!dictIsRehashing(d)) return NULL; + } + return NULL; +} + +void *dictFetchValue(dict *d, const void *key) { + dictEntry *he; + + he = dictFind(d,key); + return he ? dictGetEntryVal(he) : NULL; +} + +dictIterator *dictGetIterator(dict *d) +{ + dictIterator *iter = _dictAlloc(sizeof(*iter)); + + iter->d = d; + iter->table = 0; + iter->index = -1; + iter->entry = NULL; + iter->nextEntry = NULL; + return iter; +} + +dictEntry *dictNext(dictIterator *iter) +{ + while (1) { + if (iter->entry == NULL) { + dictht *ht = &iter->d->ht[iter->table]; + if (iter->index == -1 && iter->table == 0) iter->d->iterators++; + iter->index++; + if (iter->index >= (signed) ht->size) { + if (dictIsRehashing(iter->d) && iter->table == 0) { + iter->table++; + iter->index = 0; + ht = &iter->d->ht[1]; + } else { + break; + } + } + iter->entry = ht->table[iter->index]; + } else { + iter->entry = iter->nextEntry; + } + if (iter->entry) { + /* We need to save the 'next' here, the iterator user + * may delete the entry we are returning. */ + iter->nextEntry = iter->entry->next; + return iter->entry; + } + } + return NULL; +} + +void dictReleaseIterator(dictIterator *iter) +{ + if (!(iter->index == -1 && iter->table == 0)) iter->d->iterators--; + _dictFree(iter); +} + +/* Return a random entry from the hash table. Useful to + * implement randomized algorithms */ +dictEntry *dictGetRandomKey(dict *d) +{ + dictEntry *he, *orighe; + unsigned int h; + int listlen, listele; + + if (dictSize(d) == 0) return NULL; + if (dictIsRehashing(d)) _dictRehashStep(d); + if (dictIsRehashing(d)) { + do { + h = random() % (d->ht[0].size+d->ht[1].size); + he = (h >= d->ht[0].size) ? 
d->ht[1].table[h - d->ht[0].size] : + d->ht[0].table[h]; + } while(he == NULL); + } else { + do { + h = random() & d->ht[0].sizemask; + he = d->ht[0].table[h]; + } while(he == NULL); + } + + /* Now we found a non empty bucket, but it is a linked + * list and we need to get a random element from the list. + * The only sane way to do so is counting the elements and + * select a random index. */ + listlen = 0; + orighe = he; + while(he) { + he = he->next; + listlen++; + } + listele = random() % listlen; + he = orighe; + while(listele--) he = he->next; + return he; +} + +/* ------------------------- private functions ------------------------------ */ + +/* Expand the hash table if needed */ +static int _dictExpandIfNeeded(dict *d) +{ + /* If the hash table is empty expand it to the intial size, + * if the table is "full" dobule its size. */ + if (dictIsRehashing(d)) return DICT_OK; + if (d->ht[0].size == 0) + return dictExpand(d, DICT_HT_INITIAL_SIZE); + if (d->ht[0].used >= d->ht[0].size && dict_can_resize) + return dictExpand(d, ((d->ht[0].size > d->ht[0].used) ? + d->ht[0].size : d->ht[0].used)*2); + return DICT_OK; +} + +/* Our hash table capability is a power of two */ +static unsigned long _dictNextPower(unsigned long size) +{ + unsigned long i = DICT_HT_INITIAL_SIZE; + + if (size >= LONG_MAX) return LONG_MAX; + while(1) { + if (i >= size) + return i; + i *= 2; + } +} + +/* Returns the index of a free slot that can be populated with + * an hash entry for the given 'key'. + * If the key already exists, -1 is returned. + * + * Note that if we are in the process of rehashing the hash table, the + * index is always returned in the context of the second (new) hash table. 
*/ +static int _dictKeyIndex(dict *d, const void *key) +{ + unsigned int h, idx, table; + dictEntry *he; + + /* Expand the hashtable if needed; a failed expansion is reported as -1, + * the same value that is used for an already existing key. */ + if (_dictExpandIfNeeded(d) == DICT_ERR) + return -1; + /* Compute the key hash value */ + h = dictHashKey(d, key); + for (table = 0; table <= 1; table++) { + idx = h & d->ht[table].sizemask; + /* Search if this slot does not already contain the given key */ + he = d->ht[table].table[idx]; + while(he) { + if (dictCompareHashKeys(d, key, he->key)) + return -1; + he = he->next; + } + /* ht[1] is only searched while rehashing; idx then refers to ht[1]. */ if (!dictIsRehashing(d)) break; + } + return idx; +} + /* Run _dictClear() on both tables and reset the rehash index and the iterator count; 'd' itself is not freed here. */ +void dictEmpty(dict *d) { + _dictClear(d,&d->ht[0]); + _dictClear(d,&d->ht[1]); + d->rehashidx = -1; + d->iterators = 0; +} + /* Print chain-length statistics for one table. clvector[n] counts the buckets holding n entries; chains of DICT_STATS_VECTLEN-1 entries or more share the last slot. */ +#define DICT_STATS_VECTLEN 50 +static void _dictPrintStatsHt(dictht *ht) { + unsigned long i, slots = 0, chainlen, maxchainlen = 0; + unsigned long totchainlen = 0; + unsigned long clvector[DICT_STATS_VECTLEN]; + + if (ht->used == 0) { + printf("No stats available for empty dictionaries\n"); + return; + } + + for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0; + for (i = 0; i < ht->size; i++) { + dictEntry *he; + + if (ht->table[i] == NULL) { + clvector[0]++; + continue; + } + slots++; + /* For each hash entry on this slot... */ + chainlen = 0; + he = ht->table[i]; + while(he) { + chainlen++; + he = he->next; + } + clvector[(chainlen < DICT_STATS_VECTLEN) ?
chainlen : (DICT_STATS_VECTLEN-1)]++; + if (chainlen > maxchainlen) maxchainlen = chainlen; + totchainlen += chainlen; + } + printf("Hash table stats:\n"); + printf(" table size: %ld\n", ht->size); + printf(" number of elements: %ld\n", ht->used); + printf(" different slots: %ld\n", slots); + printf(" max chain length: %ld\n", maxchainlen); + printf(" avg chain length (counted): %.02f\n", (float)totchainlen/slots); + printf(" avg chain length (computed): %.02f\n", (float)ht->used/slots); + printf(" Chain length distribution:\n"); + /* Visit every slot of clvector, including the last one: it collects all + * chains of DICT_STATS_VECTLEN-1 entries or more and is the only slot + * printed with the ">= " prefix. */ + for (i = 0; i < DICT_STATS_VECTLEN; i++) { + if (clvector[i] == 0) continue; + printf(" %s%ld: %ld (%.02f%%)\n",(i == DICT_STATS_VECTLEN-1)?">= ":"", i, clvector[i], ((float)clvector[i]/ht->size)*100); + } +} + +/* Print the statistics of ht[0] and, while a rehash is in progress, of + * ht[1] as well. */ +void dictPrintStats(dict *d) { + _dictPrintStatsHt(&d->ht[0]); + if (dictIsRehashing(d)) { + printf("-- Rehashing into ht[1]:\n"); + _dictPrintStatsHt(&d->ht[1]); + } +} + +/* Set the dict_can_resize flag that _dictExpandIfNeeded() checks before + * growing a full table. */ +void dictEnableResize(void) { + dict_can_resize = 1; +} + +/* Clear the dict_can_resize flag. */ +void dictDisableResize(void) { + dict_can_resize = 0; +} + +/* ----------------------- StringCopy Hash Table Type ------------------------*/ + +/* Hash a NUL-terminated C string key. */ +static unsigned int _dictStringCopyHTHashFunction(const void *key) +{ + return dictGenHashFunction(key, strlen(key)); +} + +/* Duplicate a NUL-terminated key into memory obtained from _dictAlloc(). */ +static void *_dictStringCopyHTKeyDup(void *privdata, const void *key) +{ + int len = strlen(key); + char *copy = _dictAlloc(len+1); + DICT_NOTUSED(privdata); + + memcpy(copy, key, len); + copy[len] = '\0'; + return copy; +} + +/* Duplicate a NUL-terminated value into memory obtained from _dictAlloc(). */ +static void *_dictStringKeyValCopyHTValDup(void *privdata, const void *val) +{ + int len = strlen(val); + char *copy = _dictAlloc(len+1); + DICT_NOTUSED(privdata); + + memcpy(copy, val, len); + copy[len] = '\0'; + return copy; +} + +/* Return non-zero when the two keys are equal according to strcmp(). */ +static int _dictStringCopyHTKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + DICT_NOTUSED(privdata); + + return strcmp(key1, key2) == 0; +} + +static void _dictStringCopyHTKeyDestructor(void *privdata, void *key) +{ + DICT_NOTUSED(privdata); + +
_dictFree((void*)key); /* ATTENTION: const cast */ +} + +static void _dictStringKeyValCopyHTValDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + _dictFree((void*)val); /* ATTENTION: const cast */ +} + /* String keys go through the key dup and key destructor callbacks; values have no callbacks. */ +dictType dictTypeHeapStringCopyKey = { + _dictStringCopyHTHashFunction, /* hash function */ + _dictStringCopyHTKeyDup, /* key dup */ + NULL, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* This is like StringCopy but does not auto-duplicate the key. + * It's used for interpreter's shared strings. */ +dictType dictTypeHeapStrings = { + _dictStringCopyHTHashFunction, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* This is like StringCopy but also automatically handles dynamically + * allocated C strings as values. */ +dictType dictTypeHeapStringCopyKeyValue = { + _dictStringCopyHTHashFunction, /* hash function */ + _dictStringCopyHTKeyDup, /* key dup */ + _dictStringKeyValCopyHTValDup, /* val dup */ + _dictStringCopyHTKeyCompare, /* key compare */ + _dictStringCopyHTKeyDestructor, /* key destructor */ + _dictStringKeyValCopyHTValDestructor, /* val destructor */ +}; diff --git a/src/dict.h b/src/dict.h new file mode 100644 index 000000000..30ace4db7 --- /dev/null +++ b/src/dict.h @@ -0,0 +1,151 @@ +/* Hash Tables Implementation. + * + * This file implements in memory hash tables with insert/del/replace/find/ + * get-random-element operations. Hash tables will auto resize if needed + * tables of power of two in size are used, collisions are handled by + * chaining. See the source code for more information... :) + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __DICT_H +#define __DICT_H + +#define DICT_OK 0 +#define DICT_ERR 1 + +/* Unused arguments generate annoying warnings... 
*/ +#define DICT_NOTUSED(V) ((void) V) + /* One key/value pair; 'next' chains the entries that fall in the same bucket. */ +typedef struct dictEntry { + void *key; + void *val; + struct dictEntry *next; +} dictEntry; + /* Per-dictionary callbacks. hashFunction is called unconditionally; the macros below skip any of the other callbacks that is NULL. */ +typedef struct dictType { + unsigned int (*hashFunction)(const void *key); + void *(*keyDup)(void *privdata, const void *key); + void *(*valDup)(void *privdata, const void *obj); + int (*keyCompare)(void *privdata, const void *key1, const void *key2); + void (*keyDestructor)(void *privdata, void *key); + void (*valDestructor)(void *privdata, void *obj); +} dictType; + +/* This is our hash table structure. Every dictionary has two of these as we + * implement incremental rehashing, from the old to the new table. */ +typedef struct dictht { + dictEntry **table; + unsigned long size; + unsigned long sizemask; + unsigned long used; +} dictht; + +typedef struct dict { + dictType *type; + void *privdata; + dictht ht[2]; + int rehashidx; /* rehashing not in progress if rehashidx == -1 */ + int iterators; /* number of iterators currently running */ +} dict; + /* Iteration state: current table (0 or 1), bucket index (-1 before the first dictNext() call), current entry and its saved successor. */ +typedef struct dictIterator { + dict *d; + int table; + int index; + dictEntry *entry, *nextEntry; +} dictIterator; + +/* This is the initial size of every hash table */ +#define DICT_HT_INITIAL_SIZE 4 + /* NOTE(review): dictFreeEntryVal and dictFreeEntryKey below expand to a bare 'if' without a do { } while(0) wrapper, so an 'else' written after a call would bind to the macro's 'if'. dictSetHashVal and dictSetHashKey use 'entry' without parentheses. */ +/* ------------------------------- Macros ------------------------------------*/ +#define dictFreeEntryVal(d, entry) \ + if ((d)->type->valDestructor) \ + (d)->type->valDestructor((d)->privdata, (entry)->val) + +#define dictSetHashVal(d, entry, _val_) do { \ + if ((d)->type->valDup) \ + entry->val = (d)->type->valDup((d)->privdata, _val_); \ + else \ + entry->val = (_val_); \ +} while(0) + +#define dictFreeEntryKey(d, entry) \ + if ((d)->type->keyDestructor) \ + (d)->type->keyDestructor((d)->privdata, (entry)->key) + +#define dictSetHashKey(d, entry, _key_) do { \ + if ((d)->type->keyDup) \ + entry->key = (d)->type->keyDup((d)->privdata, _key_); \ + else \ + entry->key = (_key_); \ +} while(0) + +#define dictCompareHashKeys(d, key1, key2) \ + (((d)->type->keyCompare) ?
\ + (d)->type->keyCompare((d)->privdata, key1, key2) : \ + (key1) == (key2)) + /* Hash 'key' with the type's hashFunction, which is called unconditionally and so must not be NULL. */ +#define dictHashKey(d, key) (d)->type->hashFunction(key) + /* Field accessors and whole-dict counters (both tables are summed). dictIsRehashing() takes a dict *, although its parameter is named 'ht'. */ +#define dictGetEntryKey(he) ((he)->key) +#define dictGetEntryVal(he) ((he)->val) +#define dictSlots(d) ((d)->ht[0].size+(d)->ht[1].size) +#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used) +#define dictIsRehashing(ht) ((ht)->rehashidx != -1) + +/* API */ +dict *dictCreate(dictType *type, void *privDataPtr); +int dictExpand(dict *d, unsigned long size); +int dictAdd(dict *d, void *key, void *val); +int dictReplace(dict *d, void *key, void *val); +int dictDelete(dict *d, const void *key); +int dictDeleteNoFree(dict *d, const void *key); +void dictRelease(dict *d); +dictEntry * dictFind(dict *d, const void *key); +void *dictFetchValue(dict *d, const void *key); +int dictResize(dict *d); +dictIterator *dictGetIterator(dict *d); +dictEntry *dictNext(dictIterator *iter); +void dictReleaseIterator(dictIterator *iter); +dictEntry *dictGetRandomKey(dict *d); +void dictPrintStats(dict *d); +unsigned int dictGenHashFunction(const unsigned char *buf, int len); +void dictEmpty(dict *d); +void dictEnableResize(void); +void dictDisableResize(void); +int dictRehash(dict *d, int n); +int dictRehashMilliseconds(dict *d, int ms); + +/* Hash table types */ +extern dictType dictTypeHeapStringCopyKey; +extern dictType dictTypeHeapStrings; +extern dictType dictTypeHeapStringCopyKeyValue; + +#endif /* __DICT_H */ diff --git a/src/fmacros.h b/src/fmacros.h new file mode 100644 index 000000000..38f46482a --- /dev/null +++ b/src/fmacros.h @@ -0,0 +1,15 @@ +#ifndef _REDIS_FMACRO_H +#define _REDIS_FMACRO_H + /* Feature-test macros: they only take effect when defined before the first system header is included. */ +#define _BSD_SOURCE + +#ifdef __linux__ +#define _XOPEN_SOURCE 700 +#else +#define _XOPEN_SOURCE +#endif + +#define _LARGEFILE_SOURCE +#define _FILE_OFFSET_BITS 64 + +#endif diff --git a/src/linenoise.c b/src/linenoise.c new file mode 100644 index 000000000..0c04d03fb --- /dev/null +++ b/src/linenoise.c @@ -0,0 +1,433 @@ +/* linenoise.c --
guerrilla line editing library against the idea that a + * line editing lib needs to be 20,000 lines of C code. + * + * You can find the latest source code at: + * + * http://github.com/antirez/linenoise + * + * Does a number of crazy assumptions that happen to be true in 99.9999% of + * the 2010 UNIX computers around. + * + * Copyright (c) 2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * References: + * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html + * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html + * + * Todo list: + * - Switch to gets() if $TERM is something we can't support. + * - Filter bogus Ctrl+ combinations. + * - Win32 support + * + * Bloat: + * - Completion? + * - History search like Ctrl+r in readline? + * + * List of escape sequences used by this program, we do everything just + * with three sequences. In order to be so cheap we may have some + * flickering effect with some slow terminal, but the lesser sequences + * the more compatible. + * + * CHA (Cursor Horizontal Absolute) + * Sequence: ESC [ n G + * Effect: moves cursor to column n + * + * EL (Erase Line) + * Sequence: ESC [ n K + * Effect: if n is 0 or missing, clear from cursor to end of line + * Effect: if n is 1, clear from beginning of line to cursor + * Effect: if n is 2, clear entire line + * + * CUF (CUrsor Forward) + * Sequence: ESC [ n C + * Effect: moves cursor forward of n chars + * + */ + +#include "fmacros.h" +#include <termios.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <unistd.h> + +#define LINENOISE_MAX_LINE 4096 +static char *unsupported_term[] = {"dumb","cons25",NULL}; + +static struct termios orig_termios; /* in order to restore at exit */ +static int rawmode = 0; /* for atexit() function to check if restore is needed*/ +static int atexit_registered = 0; /* register atexit just 1 time */ +static int history_max_len = 100; +static int history_len = 0; +char **history = NULL; + +static void linenoiseAtExit(void); +int linenoiseHistoryAdd(const char *line); + +/* Return 1 if $TERM names a terminal listed in unsupported_term (compared + * case-insensitively), 0 otherwise or when $TERM is not set. */ +static int isUnsupportedTerm(void) { + char *term = getenv("TERM"); + int j; + + if (term == NULL) return 0; + for (j = 0; unsupported_term[j]; j++) + if (!strcasecmp(term,unsupported_term[j])) return 1; + return 0; +} + +/* Release the first history_len strings and the history array itself. */ +static void freeHistory(void) { + if (history) { + int j; + + for (j = 0; j < history_len; j++) + free(history[j]); +
free(history); + } +} + /* Save the current terminal settings of 'fd' in orig_termios and switch it to raw mode. Returns 0 on success; every failure path returns -1 with errno set to ENOTTY. */ +static int enableRawMode(int fd) { + struct termios raw; + + /* NOTE(review): the tty test is done on STDIN_FILENO, not on 'fd'. */ if (!isatty(STDIN_FILENO)) goto fatal; + if (!atexit_registered) { + atexit(linenoiseAtExit); + atexit_registered = 1; + } + if (tcgetattr(fd,&orig_termios) == -1) goto fatal; + + raw = orig_termios; /* modify the original mode */ + /* input modes: no break, no CR to NL, no parity check, no strip char, + * no start/stop output control. */ + raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON); + /* output modes - disable post processing */ + raw.c_oflag &= ~(OPOST); + /* control modes - set 8 bit chars */ + raw.c_cflag |= (CS8); + /* local modes - echoing off, canonical off, no extended functions, + * no signal chars (^Z,^C) */ + raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG); + /* control chars - set return condition: min number of bytes and timer. + * We want read to return every single byte, without timeout. */ + raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */ + + /* put terminal in raw mode after flushing */ + if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal; + rawmode = 1; + return 0; + +fatal: + errno = ENOTTY; + return -1; +} + /* Restore the settings saved in orig_termios if raw mode is active; 'rawmode' is cleared only when the restore succeeds. */ +static void disableRawMode(int fd) { + /* Don't even check the return value as it's too late. */ + if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1) + rawmode = 0; +} + +/* At exit we'll try to fix the terminal to the initial conditions.
*/ +static void linenoiseAtExit(void) { + disableRawMode(STDIN_FILENO); + freeHistory(); +} + /* Terminal width in columns as reported by TIOCGWINSZ on descriptor 1, or 80 when the ioctl fails. */ +static int getColumns(void) { + struct winsize ws; + + if (ioctl(1, TIOCGWINSZ, &ws) == -1) return 80; + return ws.ws_col; +} + /* Redraw the prompt and the visible part of 'buf', then move the cursor to 'pos'. The first loop drops leading characters until the cursor fits inside 'cols', the second one trims the tail. NOTE(review): if the prompt alone is 'cols' wide or wider, 'pos' and 'len' (size_t) wrap below zero in the first loop - confirm this cannot happen in practice. */ +static void refreshLine(int fd, const char *prompt, char *buf, size_t len, size_t pos, size_t cols) { + char seq[64]; + size_t plen = strlen(prompt); + + while((plen+pos) >= cols) { + buf++; + len--; + pos--; + } + while (plen+len > cols) { + len--; + } + + /* Cursor to left edge */ + snprintf(seq,64,"\x1b[0G"); + if (write(fd,seq,strlen(seq)) == -1) return; + /* Write the prompt and the current buffer content */ + if (write(fd,prompt,strlen(prompt)) == -1) return; + if (write(fd,buf,len) == -1) return; + /* Erase to right */ + snprintf(seq,64,"\x1b[0K"); + if (write(fd,seq,strlen(seq)) == -1) return; + /* Move cursor to original position. */ + snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen)); + if (write(fd,seq,strlen(seq)) == -1) return; +} + /* Interactive editing loop: reads one byte at a time from 'fd' and edits 'buf' in place. Returns the length of the line, or -1 on ctrl-c (errno = EAGAIN), on ctrl-d with an empty line, or when writing to 'fd' fails. */ +static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) { + size_t plen = strlen(prompt); + size_t pos = 0; + size_t len = 0; + size_t cols = getColumns(); + int history_index = 0; + + buf[0] = '\0'; + buflen--; /* Make sure there is always space for the nulterm */ + + /* The latest history entry is always our current buffer, that + * initially is just an empty string. */ + linenoiseHistoryAdd(""); + + if (write(fd,prompt,plen) == -1) return -1; + while(1) { + char c; + int nread; + char seq[2]; + + nread = read(fd,&c,1); + if (nread <= 0) return len; + switch(c) { + case 13: /* enter */ + /* NOTE(review): the temporary entry dropped here is not freed, and linenoiseHistoryAdd() later stores into history[history_len] without releasing the pointer already there. */ history_len--; + return len; + case 4: /* ctrl-d */ + history_len--; + return (len == 0) ?
-1 : (int)len; + case 3: /* ctrl-c. NOTE(review): unlike enter and ctrl-d this path returns without history_len--, so the temporary entry stays in the history. */ + errno = EAGAIN; + return -1; + case 127: /* backspace */ + case 8: /* ctrl-h */ + if (pos > 0 && len > 0) { + memmove(buf+pos-1,buf+pos,len-pos); + pos--; + len--; + buf[len] = '\0'; + refreshLine(fd,prompt,buf,len,pos,cols); + } + break; + case 20: /* ctrl-t: swap the character before the cursor with the one under it */ + if (pos > 0 && pos < len) { + int aux = buf[pos-1]; + buf[pos-1] = buf[pos]; + buf[pos] = aux; + if (pos != len-1) pos++; + refreshLine(fd,prompt,buf,len,pos,cols); + } + break; + case 2: /* ctrl-b */ + goto left_arrow; + case 6: /* ctrl-f */ + goto right_arrow; + case 16: /* ctrl-p: handled as an up arrow (code 65) */ + seq[1] = 65; + goto up_down_arrow; + case 14: /* ctrl-n: handled as a down arrow (code 66) */ + seq[1] = 66; + goto up_down_arrow; + break; + case 27: /* escape sequence: 91 is '[', followed by 68/67/65/66 for left/right/up/down. NOTE(review): only -1 is checked, a short read leaves seq partly unset. */ + if (read(fd,seq,2) == -1) break; + if (seq[0] == 91 && seq[1] == 68) { +left_arrow: + /* left arrow */ + if (pos > 0) { + pos--; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else if (seq[0] == 91 && seq[1] == 67) { +right_arrow: + /* right arrow */ + if (pos != len) { + pos++; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else if (seq[0] == 91 && (seq[1] == 65 || seq[1] == 66)) { +up_down_arrow: + /* up and down arrow: history */ + if (history_len > 1) { + /* Update the current history entry before + * overwriting it with the next one. */ + free(history[history_len-1-history_index]); + history[history_len-1-history_index] = strdup(buf); + /* Show the new entry */ + history_index += (seq[1] == 65) ? 1 : -1; + if (history_index < 0) { + history_index = 0; + break; + } else if (history_index >= history_len) { + history_index = history_len-1; + break; + } + strncpy(buf,history[history_len-1-history_index],buflen); + /* buflen was decremented on entry, so this index is still inside the caller's buffer. */ buf[buflen] = '\0'; + len = pos = strlen(buf); + refreshLine(fd,prompt,buf,len,pos,cols); + } + } + break; + default: + if (len < buflen) { + if (len == pos) { + buf[pos] = c; + pos++; + len++; + buf[len] = '\0'; + if (plen+len < cols) { + /* Avoid a full update of the line in the + * trivial case.
*/ + if (write(fd,&c,1) == -1) return -1; + } else { + refreshLine(fd,prompt,buf,len,pos,cols); + } + } else { + /* Insert in the middle: shift the tail one position to the right first. */ memmove(buf+pos+1,buf+pos,len-pos); + buf[pos] = c; + len++; + pos++; + buf[len] = '\0'; + refreshLine(fd,prompt,buf,len,pos,cols); + } + } + break; + case 21: /* Ctrl+u, delete the whole line. */ + buf[0] = '\0'; + pos = len = 0; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 11: /* Ctrl+k, delete from current to end of line. */ + buf[pos] = '\0'; + len = pos; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 1: /* Ctrl+a, go to the start of the line */ + pos = 0; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + case 5: /* ctrl+e, go to the end of the line */ + pos = len; + refreshLine(fd,prompt,buf,len,pos,cols); + break; + } + } + return len; +} + /* Read one line into 'buf'. When stdin is not a tty the line is read with fgets() and its trailing newline removed; otherwise the terminal is put in raw mode around linenoisePrompt(). Returns the line length, or -1 on error (errno = EINVAL for a zero-sized buffer). */ +static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { + int fd = STDIN_FILENO; + int count; + + if (buflen == 0) { + errno = EINVAL; + return -1; + } + if (!isatty(STDIN_FILENO)) { + if (fgets(buf, buflen, stdin) == NULL) return -1; + count = strlen(buf); + if (count && buf[count-1] == '\n') { + count--; + buf[count] = '\0'; + } + } else { + if (enableRawMode(fd) == -1) return -1; + count = linenoisePrompt(fd, buf, buflen, prompt); + disableRawMode(fd); + printf("\n"); + } + return count; +} + /* Public entry point: show 'prompt', read a line and return it as a strdup()-allocated string that the caller must free(), or NULL when reading fails. Terminals reported by isUnsupportedTerm() get a plain fgets() read. */ +char *linenoise(const char *prompt) { + char buf[LINENOISE_MAX_LINE]; + int count; + + if (isUnsupportedTerm()) { + size_t len; + + printf("%s",prompt); + fflush(stdout); + if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL; + len = strlen(buf); + while(len && (buf[len-1] == '\n' || buf[len-1] == '\r')) { + len--; + buf[len] = '\0'; + } + return strdup(buf); + } else { + count = linenoiseRaw(buf,LINENOISE_MAX_LINE,prompt); + if (count == -1) return NULL; + return strdup(buf); + } +} + +/* Using a circular buffer is smarter, but a bit more complex to handle.
*/ +/* Append a copy of 'line' to the history. When the history is full the + * oldest entry is released to make room. Returns 1 on success, 0 when the + * history is disabled (history_max_len == 0) or an allocation fails. */ +int linenoiseHistoryAdd(const char *line) { + char *linecopy; + + if (history_max_len == 0) return 0; + if (history == 0) { + history = malloc(sizeof(char*)*history_max_len); + if (history == NULL) return 0; + memset(history,0,(sizeof(char*)*history_max_len)); + } + linecopy = strdup(line); + if (!linecopy) return 0; + if (history_len == history_max_len) { + /* The memmove below overwrites slot 0: free the string it owns first. */ + free(history[0]); + memmove(history,history+1,sizeof(char*)*(history_max_len-1)); + history_len--; + } + history[history_len] = linecopy; + history_len++; + return 1; +} + +/* Resize the history to hold at most 'len' entries, keeping the newest ones. + * Returns 1 on success, 0 if 'len' is less than 1 or the allocation fails. */ +int linenoiseHistorySetMaxLen(int len) { + char **new; + + if (len < 1) return 0; + if (history) { + int tocopy = history_len; + + new = malloc(sizeof(char*)*len); + if (new == NULL) return 0; + /* If the new array is smaller, release the oldest entries that will + * not be carried over. */ + if (len < tocopy) { + int j; + + for (j = 0; j < tocopy-len; j++) free(history[j]); + tocopy = len; + } + memset(new,0,sizeof(char*)*len); + /* The live entries are history[0..history_len-1]: copy the newest + * 'tocopy' of them. */ + memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy); + free(history); + history = new; + } + history_max_len = len; + if (history_len > history_max_len) + history_len = history_max_len; + return 1; +} diff --git a/src/linenoise.h b/src/linenoise.h new file mode 100644 index 000000000..ff45e2c47 --- /dev/null +++ b/src/linenoise.h @@ -0,0 +1,41 @@ +/* linenoise.h -- guerrilla line editing library against the idea that a + * line editing lib needs to be 20,000 lines of C code. + * + * See linenoise.c for more information. + * + * Copyright (c) 2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __LINENOISE_H +#define __LINENOISE_H + +/* Read a line after showing 'prompt'. The result is heap allocated and must + * be released with free(); NULL is returned when no line could be read. */ +char *linenoise(const char *prompt); +/* Append a copy of 'line' to the history; returns 1 on success, 0 otherwise. + * The parameter is const, matching the definition in linenoise.c. */ +int linenoiseHistoryAdd(const char *line); +/* Set the maximum number of history entries (at least 1); returns 1 on + * success, 0 otherwise. */ +int linenoiseHistorySetMaxLen(int len); + +#endif /* __LINENOISE_H */ diff --git a/src/lzf.h b/src/lzf.h new file mode 100644 index 000000000..919b6e6be --- /dev/null +++ b/src/lzf.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef LZF_H +#define LZF_H + +/*********************************************************************** +** +** lzf -- an extremely fast/free compression/decompression-method +** http://liblzf.plan9.de/ +** +** This algorithm is believed to be patent-free. +** +***********************************************************************/ + +#define LZF_VERSION 0x0105 /* 1.5, API version */ + +/* + * Compress in_len bytes stored at the memory block starting at + * in_data and write the result to out_data, up to a maximum length + * of out_len bytes. 
+ * + * If the output buffer is not large enough or any error occurs return 0, + * otherwise return the number of bytes used, which might be considerably + * more than in_len (but less than 104% of the original size), so it + * makes sense to always use out_len == in_len - 1, to ensure _some_ + * compression, and store the data uncompressed otherwise (with a flag, of + * course). + * + * lzf_compress might use different algorithms on different systems and + * even different runs, thus might result in different compressed strings + * depending on the phase of the moon or similar factors. However, all + * these strings are architecture-independent and will result in the + * original data when decompressed using lzf_decompress. + * + * The buffers must not be overlapping. + * + * If the option LZF_STATE_ARG is enabled, an extra argument must be + * supplied which is not reflected in this header file. Refer to lzfP.h + * and lzf_c.c. + * + */ +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +/* + * Decompress data compressed with some version of the lzf_compress + * function and stored at location in_data and length in_len. The result + * will be stored at out_data up to a maximum of out_len characters. + * + * If the output buffer is not large enough to hold the decompressed + * data, a 0 is returned and errno is set to E2BIG. Otherwise the number + * of decompressed bytes (i.e. the original length of the data) is + * returned. + * + * If an error in the compressed data is detected, a zero is returned and + * errno is set to EINVAL. + * + * This function is very fast, about as fast as a copying loop.
+ */ +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len); + +#endif + diff --git a/src/lzfP.h b/src/lzfP.h new file mode 100644 index 000000000..d533f1829 --- /dev/null +++ b/src/lzfP.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2000-2007 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. 
If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#ifndef LZFP_h +#define LZFP_h + +#define STANDALONE 1 /* at the moment, this is ok. */ + +#ifndef STANDALONE +# include "lzf.h" +#endif + +/* + * Size of hashtable is (1 << HLOG) * sizeof (char *) + * decompression is independent of the hash table size + * the difference between 15 and 14 is very small + * for small blocks (and 14 is usually a bit faster). + * For a low-memory/faster configuration, use HLOG == 13; + * For best compression, use 15 or 16 (or more, up to 23). + */ +#ifndef HLOG +# define HLOG 16 +#endif + +/* + * Sacrifice very little compression quality in favour of compression speed. + * This gives almost the same compression as the default code, and is + * (very roughly) 15% faster. This is the preferred mode of operation. + */ +#ifndef VERY_FAST +# define VERY_FAST 1 +#endif + +/* + * Sacrifice some more compression quality in favour of compression speed. + * (roughly 1-2% worse compression for large blocks and + * 9-10% for small, redundant, blocks and >>20% better speed in both cases) + * In short: when in need for speed, enable this for binary data, + * possibly disable this for text data. 
+ */ +#ifndef ULTRA_FAST +# define ULTRA_FAST 0 +#endif + +/* + * Unconditionally aligning does not cost very much, so do it if unsure + */ +#ifndef STRICT_ALIGN +# define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) +#endif + +/* + * You may choose to pre-set the hash table (might be faster on some + * modern cpus and large (>>64k) blocks, and also makes compression + * deterministic/repeatable when the configuration otherwise is the same). + */ +#ifndef INIT_HTAB +# define INIT_HTAB 0 +#endif + +/* + * Avoid assigning values to errno variable? for some embedding purposes + * (linux kernel for example), this is neccessary. NOTE: this breaks + * the documentation in lzf.h. + */ +#ifndef AVOID_ERRNO +# define AVOID_ERRNO 0 +#endif + +/* + * Wether to pass the LZF_STATE variable as argument, or allocate it + * on the stack. For small-stack environments, define this to 1. + * NOTE: this breaks the prototype in lzf.h. + */ +#ifndef LZF_STATE_ARG +# define LZF_STATE_ARG 0 +#endif + +/* + * Wether to add extra checks for input validity in lzf_decompress + * and return EINVAL if the input stream has been corrupted. This + * only shields against overflowing the input buffer and will not + * detect most corrupted streams. + * This check is not normally noticable on modern hardware + * (<1% slowdown), but might slow down older cpus considerably. + */ +#ifndef CHECK_INPUT +# define CHECK_INPUT 1 +#endif + +/*****************************************************************************/ +/* nothing should be changed below */ + +typedef unsigned char u8; + +typedef const u8 *LZF_STATE[1 << (HLOG)]; + +#if !STRICT_ALIGN +/* for unaligned accesses we need a 16 bit datatype. 
*/ +# include +# if USHRT_MAX == 65535 + typedef unsigned short u16; +# elif UINT_MAX == 65535 + typedef unsigned int u16; +# else +# undef STRICT_ALIGN +# define STRICT_ALIGN 1 +# endif +#endif + +#if ULTRA_FAST +# if defined(VERY_FAST) +# undef VERY_FAST +# endif +#endif + +#if INIT_HTAB +# ifdef __cplusplus +# include +# else +# include +# endif +#endif + +#endif + diff --git a/src/lzf_c.c b/src/lzf_c.c new file mode 100644 index 000000000..99dab091c --- /dev/null +++ b/src/lzf_c.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2000-2008 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#include "lzfP.h" + +#define HSIZE (1 << (HLOG)) + +/* + * don't play with this unless you benchmark! + * decompression is not dependent on the hash function + * the hashing function might seem strange, just believe me + * it works ;) + */ +#ifndef FRST +# define FRST(p) (((p[0]) << 8) | p[1]) +# define NEXT(v,p) (((v) << 8) | p[2]) +# if ULTRA_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1)) +# elif VERY_FAST +# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# else +# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1)) +# endif +#endif +/* + * IDX works because it is very similar to a multiplicative hash, e.g. + * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1)) + * the latter is also quite fast on newer CPUs, and compresses similarly. 
+ * + * the next one is also quite good, albeit slow ;) + * (int)(cos(h & 0xffffff) * 1e6) + */ + +#if 0 +/* original lzv-like hash function, much worse and thus slower */ +# define FRST(p) (p[0] << 5) ^ p[1] +# define NEXT(v,p) ((v) << 5) ^ p[2] +# define IDX(h) ((h) & (HSIZE - 1)) +#endif + +#define MAX_LIT (1 << 5) +#define MAX_OFF (1 << 13) +#define MAX_REF ((1 << 8) + (1 << 3)) + +#if __GNUC__ >= 3 +# define expect(expr,value) __builtin_expect ((expr),(value)) +# define inline inline +#else +# define expect(expr,value) (expr) +# define inline static +#endif + +#define expect_false(expr) expect ((expr) != 0, 0) +#define expect_true(expr) expect ((expr) != 0, 1) + +/* + * compressed format + * + * 000LLLLL ; literal + * LLLooooo oooooooo ; backref L + * 111ooooo LLLLLLLL oooooooo ; backref L+7 + * + */ + +unsigned int +lzf_compress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len +#if LZF_STATE_ARG + , LZF_STATE htab +#endif + ) +{ +#if !LZF_STATE_ARG + LZF_STATE htab; +#endif + const u8 **hslot; + const u8 *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + const u8 *in_end = ip + in_len; + u8 *out_end = op + out_len; + const u8 *ref; + + /* off requires a type wide enough to hold a general pointer difference. + * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only + * works for differences within a single object). We also assume that no + * no bit pattern traps. Since the only platform that is both non-POSIX + * and fails to support both assumptions is windows 64 bit, we make a + * special workaround for it. 
+ */ +#if defined (WIN32) && defined (_M_X64) + unsigned _int64 off; /* workaround for missing POSIX compliance */ +#else + unsigned long off; +#endif + unsigned int hval; + int lit; + + if (!in_len || !out_len) + return 0; + +#if INIT_HTAB + memset (htab, 0, sizeof (htab)); +# if 0 + for (hslot = htab; hslot < htab + HSIZE; hslot++) + *hslot++ = ip; +# endif +#endif + + lit = 0; op++; /* start run */ + + hval = FRST (ip); + while (ip < in_end - 2) + { + hval = NEXT (hval, ip); + hslot = htab + IDX (hval); + ref = *hslot; *hslot = ip; + + if (1 +#if INIT_HTAB + && ref < ip /* the next test will actually take care of this, but this is faster */ +#endif + && (off = ip - ref - 1) < MAX_OFF + && ip + 4 < in_end + && ref > (u8 *)in_data +#if STRICT_ALIGN + && ref[0] == ip[0] + && ref[1] == ip[1] + && ref[2] == ip[2] +#else + && *(u16 *)ref == *(u16 *)ip + && ref[2] == ip[2] +#endif + ) + { + /* match found at *ref++ */ + unsigned int len = 2; + unsigned int maxlen = in_end - ip - len; + maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; + + op [- lit - 1] = lit - 1; /* stop run */ + op -= !lit; /* undo run if length is zero */ + + if (expect_false (op + 3 + 1 >= out_end)) + return 0; + + for (;;) + { + if (expect_true (maxlen > 16)) + { + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + len++; if (ref [len] != ip [len]) break; + } + + do + len++; + while (len < maxlen && ref[len] == ip[len]); + + break; + } + + len -= 2; /* len is now #octets - 1 */ + ip++; + + if (len < 7) + { + *op++ = (off >> 8) + (len << 5); + } + else + { + *op++ = (off >> 8) + ( 7 << 5); + *op++ = len - 7; + } + + *op++ = off; + lit = 0; op++; /* start run */ + + ip += len + 1; + + if (expect_false (ip >= in_end - 2)) + break; + +#if ULTRA_FAST || VERY_FAST + --ip; +# if VERY_FAST && !ULTRA_FAST + --ip; +# endif + hval = FRST (ip); + + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + +# if VERY_FAST && !ULTRA_FAST + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; +# endif +#else + ip -= len + 1; + + do + { + hval = NEXT (hval, ip); + htab[IDX (hval)] = ip; + ip++; + } + while (len--); +#endif + } + else + { + /* one more literal byte we must copy */ + if (expect_false (op >= out_end)) + return 0; + + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + } + + if (op + 3 > out_end) /* at most 3 bytes can 
be missing here */ + return 0; + + while (ip < in_end) + { + lit++; *op++ = *ip++; + + if (expect_false (lit == MAX_LIT)) + { + op [- lit - 1] = lit - 1; /* stop run */ + lit = 0; op++; /* start run */ + } + } + + op [- lit - 1] = lit - 1; /* end run */ + op -= !lit; /* undo run if length is zero */ + + return op - (u8 *)out_data; +} + diff --git a/src/lzf_d.c b/src/lzf_d.c new file mode 100644 index 000000000..e7e48c138 --- /dev/null +++ b/src/lzf_d.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2000-2007 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License ("GPL") version 2 or any later version, + * in which case the provisions of the GPL are applicable instead of + * the above. If you wish to allow the use of your version of this file + * only under the terms of the GPL and not to allow others to use your + * version of this file under the BSD license, indicate your decision + * by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL. If you do not delete the + * provisions above, a recipient may use your version of this file under + * either the BSD or the GPL. + */ + +#include "lzfP.h" + +#if AVOID_ERRNO +# define SET_ERRNO(n) +#else +# include +# define SET_ERRNO(n) errno = (n) +#endif + +/* +#if (__i386 || __amd64) && __GNUC__ >= 3 +# define lzf_movsb(dst, src, len) \ + asm ("rep movsb" \ + : "=D" (dst), "=S" (src), "=c" (len) \ + : "0" (dst), "1" (src), "2" (len)); +#endif +*/ + +unsigned int +lzf_decompress (const void *const in_data, unsigned int in_len, + void *out_data, unsigned int out_len) +{ + u8 const *ip = (const u8 *)in_data; + u8 *op = (u8 *)out_data; + u8 const *const in_end = ip + in_len; + u8 *const out_end = op + out_len; + + do + { + unsigned int ctrl = *ip++; + + if (ctrl < (1 << 5)) /* literal run */ + { + ctrl++; + + if (op + ctrl > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + +#if CHECK_INPUT + if (ip + ctrl > in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + +#ifdef lzf_movsb + lzf_movsb (op, ip, ctrl); +#else + do + *op++ = *ip++; + while (--ctrl); +#endif + } + else /* back reference */ + { + unsigned int len = ctrl >> 5; + + u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; + +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + if (len == 7) + { + len += *ip++; +#if CHECK_INPUT + if (ip >= in_end) + { + SET_ERRNO (EINVAL); + return 0; + } +#endif + } + + ref -= *ip++; + + if 
(op + len + 2 > out_end) + { + SET_ERRNO (E2BIG); + return 0; + } + + if (ref < (u8 *)out_data) + { + SET_ERRNO (EINVAL); + return 0; + } + +#ifdef lzf_movsb + len += 2; + lzf_movsb (op, ref, len); +#else + *op++ = *ref++; + *op++ = *ref++; + + do + *op++ = *ref++; + while (--len); +#endif + } + } + while (ip < in_end); + + return op - (u8 *)out_data; +} + diff --git a/src/mkreleasehdr.sh b/src/mkreleasehdr.sh new file mode 100755 index 000000000..30984160e --- /dev/null +++ b/src/mkreleasehdr.sh @@ -0,0 +1,9 @@ +#!/bin/sh +GIT_SHA1=`(git show-ref --head --hash=8 2> /dev/null || echo 00000000) | head -n1` +GIT_DIRTY=`git diff 2> /dev/null | wc -l` +test -f release.h || touch release.h +(cat release.h | grep SHA1 | grep $GIT_SHA1) && \ +(cat release.h | grep DIRTY | grep $GIT_DIRTY) && exit 0 # Already uptodate +echo "#define REDIS_GIT_SHA1 \"$GIT_SHA1\"" > release.h +echo "#define REDIS_GIT_DIRTY \"$GIT_DIRTY\"" >> release.h +touch release.c # Force recompile of release.c diff --git a/src/multi.c b/src/multi.c new file mode 100644 index 000000000..def1dd673 --- /dev/null +++ b/src/multi.c @@ -0,0 +1,266 @@ +#include "redis.h" + +/* ================================ MULTI/EXEC ============================== */ + +/* Client state initialization for MULTI/EXEC */ +void initClientMultiState(redisClient *c) { + c->mstate.commands = NULL; + c->mstate.count = 0; +} + +/* Release all the resources associated with MULTI/EXEC state */ +void freeClientMultiState(redisClient *c) { + int j; + + for (j = 0; j < c->mstate.count; j++) { + int i; + multiCmd *mc = c->mstate.commands+j; + + for (i = 0; i < mc->argc; i++) + decrRefCount(mc->argv[i]); + zfree(mc->argv); + } + zfree(c->mstate.commands); +} + +/* Add a new command into the MULTI commands queue */ +void queueMultiCommand(redisClient *c, struct redisCommand *cmd) { + multiCmd *mc; + int j; + + c->mstate.commands = zrealloc(c->mstate.commands, + sizeof(multiCmd)*(c->mstate.count+1)); + mc = 
c->mstate.commands+c->mstate.count; + mc->cmd = cmd; + mc->argc = c->argc; + mc->argv = zmalloc(sizeof(robj*)*c->argc); + memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc); + for (j = 0; j < c->argc; j++) + incrRefCount(mc->argv[j]); + c->mstate.count++; +} + +void multiCommand(redisClient *c) { + if (c->flags & REDIS_MULTI) { + addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n")); + return; + } + c->flags |= REDIS_MULTI; + addReply(c,shared.ok); +} + +void discardCommand(redisClient *c) { + if (!(c->flags & REDIS_MULTI)) { + addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n")); + return; + } + + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= (~REDIS_MULTI); + unwatchAllKeys(c); + addReply(c,shared.ok); +} + +/* Send a MULTI command to all the slaves and AOF file. Check the execCommand + * implememntation for more information. */ +void execCommandReplicateMulti(redisClient *c) { + struct redisCommand *cmd; + robj *multistring = createStringObject("MULTI",5); + + cmd = lookupCommand("multi"); + if (server.appendonly) + feedAppendOnlyFile(cmd,c->db->id,&multistring,1); + if (listLength(server.slaves)) + replicationFeedSlaves(server.slaves,c->db->id,&multistring,1); + decrRefCount(multistring); +} + +void execCommand(redisClient *c) { + int j; + robj **orig_argv; + int orig_argc; + + if (!(c->flags & REDIS_MULTI)) { + addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n")); + return; + } + + /* Check if we need to abort the EXEC if some WATCHed key was touched. + * A failed EXEC will return a multi bulk nil object. */ + if (c->flags & REDIS_DIRTY_CAS) { + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); + unwatchAllKeys(c); + addReply(c,shared.nullmultibulk); + return; + } + + /* Replicate a MULTI request now that we are sure the block is executed. 
+ * This way we'll deliver the MULTI/..../EXEC block as a whole and + * both the AOF and the replication link will have the same consistency + * and atomicity guarantees. */ + execCommandReplicateMulti(c); + + /* Exec all the queued commands */ + unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */ + orig_argv = c->argv; + orig_argc = c->argc; + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count)); + for (j = 0; j < c->mstate.count; j++) { + c->argc = c->mstate.commands[j].argc; + c->argv = c->mstate.commands[j].argv; + call(c,c->mstate.commands[j].cmd); + } + c->argv = orig_argv; + c->argc = orig_argc; + freeClientMultiState(c); + initClientMultiState(c); + c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS); + /* Make sure the EXEC command is always replicated / AOF, since we + * always send the MULTI command (we can't know beforehand if the + * next operations will contain at least a modification to the DB). */ + server.dirty++; +} + +/* ===================== WATCH (CAS alike for MULTI/EXEC) =================== + * + * The implementation uses a per-DB hash table mapping keys to list of clients + * WATCHing those keys, so that given a key that is going to be modified + * we can mark all the associated clients as dirty. + * + * Also every client contains a list of WATCHed keys so that's possible to + * un-watch such keys when the client is freed or when UNWATCH is called. 
*/ + +/* In the client->watched_keys list we need to use watchedKey structures + * as in order to identify a key in Redis we need both the key name and the + * DB */ +typedef struct watchedKey { + robj *key; + redisDb *db; +} watchedKey; + +/* Watch for the specified key */ +void watchForKey(redisClient *c, robj *key) { + list *clients = NULL; + listIter li; + listNode *ln; + watchedKey *wk; + + /* Check if we are already watching for this key */ + listRewind(c->watched_keys,&li); + while((ln = listNext(&li))) { + wk = listNodeValue(ln); + if (wk->db == c->db && equalStringObjects(key,wk->key)) + return; /* Key already watched */ + } + /* This key is not already watched in this DB. Let's add it */ + clients = dictFetchValue(c->db->watched_keys,key); + if (!clients) { + clients = listCreate(); + dictAdd(c->db->watched_keys,key,clients); + incrRefCount(key); + } + listAddNodeTail(clients,c); + /* Add the new key to the lits of keys watched by this client */ + wk = zmalloc(sizeof(*wk)); + wk->key = key; + wk->db = c->db; + incrRefCount(key); + listAddNodeTail(c->watched_keys,wk); +} + +/* Unwatch all the keys watched by this client. To clean the EXEC dirty + * flag is up to the caller. 
*/ +void unwatchAllKeys(redisClient *c) { + listIter li; + listNode *ln; + + if (listLength(c->watched_keys) == 0) return; + listRewind(c->watched_keys,&li); + while((ln = listNext(&li))) { + list *clients; + watchedKey *wk; + + /* Lookup the watched key -> clients list and remove the client + * from the list */ + wk = listNodeValue(ln); + clients = dictFetchValue(wk->db->watched_keys, wk->key); + redisAssert(clients != NULL); + listDelNode(clients,listSearchKey(clients,c)); + /* Kill the entry at all if this was the only client */ + if (listLength(clients) == 0) + dictDelete(wk->db->watched_keys, wk->key); + /* Remove this watched key from the client->watched list */ + listDelNode(c->watched_keys,ln); + decrRefCount(wk->key); + zfree(wk); + } +} + +/* "Touch" a key, so that if this key is being WATCHed by some client the + * next EXEC will fail. */ +void touchWatchedKey(redisDb *db, robj *key) { + list *clients; + listIter li; + listNode *ln; + + if (dictSize(db->watched_keys) == 0) return; + clients = dictFetchValue(db->watched_keys, key); + if (!clients) return; + + /* Mark all the clients watching this key as REDIS_DIRTY_CAS */ + /* Check if we are already watching for this key */ + listRewind(clients,&li); + while((ln = listNext(&li))) { + redisClient *c = listNodeValue(ln); + + c->flags |= REDIS_DIRTY_CAS; + } +} + +/* On FLUSHDB or FLUSHALL all the watched keys that are present before the + * flush but will be deleted as effect of the flushing operation should + * be touched. "dbid" is the DB that's getting the flush. -1 if it is + * a FLUSHALL operation (all the DBs flushed). 
*/ +void touchWatchedKeysOnFlush(int dbid) { + listIter li1, li2; + listNode *ln; + + /* For every client, check all the waited keys */ + listRewind(server.clients,&li1); + while((ln = listNext(&li1))) { + redisClient *c = listNodeValue(ln); + listRewind(c->watched_keys,&li2); + while((ln = listNext(&li2))) { + watchedKey *wk = listNodeValue(ln); + + /* For every watched key matching the specified DB, if the + * key exists, mark the client as dirty, as the key will be + * removed. */ + if (dbid == -1 || wk->db->id == dbid) { + if (dictFind(wk->db->dict, wk->key->ptr) != NULL) + c->flags |= REDIS_DIRTY_CAS; + } + } + } +} + +void watchCommand(redisClient *c) { + int j; + + if (c->flags & REDIS_MULTI) { + addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n")); + return; + } + for (j = 1; j < c->argc; j++) + watchForKey(c,c->argv[j]); + addReply(c,shared.ok); +} + +void unwatchCommand(redisClient *c) { + unwatchAllKeys(c); + c->flags &= (~REDIS_DIRTY_CAS); + addReply(c,shared.ok); +} diff --git a/src/networking.c b/src/networking.c new file mode 100644 index 000000000..31844a09f --- /dev/null +++ b/src/networking.c @@ -0,0 +1,589 @@ +#include "redis.h" + +#include + +void *dupClientReplyValue(void *o) { + incrRefCount((robj*)o); + return o; +} + +int listMatchObjects(void *a, void *b) { + return equalStringObjects(a,b); +} + +redisClient *createClient(int fd) { + redisClient *c = zmalloc(sizeof(*c)); + + anetNonBlock(NULL,fd); + anetTcpNoDelay(NULL,fd); + if (!c) return NULL; + selectDb(c,0); + c->fd = fd; + c->querybuf = sdsempty(); + c->argc = 0; + c->argv = NULL; + c->bulklen = -1; + c->multibulk = 0; + c->mbargc = 0; + c->mbargv = NULL; + c->sentlen = 0; + c->flags = 0; + c->lastinteraction = time(NULL); + c->authenticated = 0; + c->replstate = REDIS_REPL_NONE; + c->reply = listCreate(); + listSetFreeMethod(c->reply,decrRefCount); + listSetDupMethod(c->reply,dupClientReplyValue); + c->blocking_keys = NULL; + c->blocking_keys_num = 0; + c->io_keys = 
listCreate(); + c->watched_keys = listCreate(); + listSetFreeMethod(c->io_keys,decrRefCount); + c->pubsub_channels = dictCreate(&setDictType,NULL); + c->pubsub_patterns = listCreate(); + listSetFreeMethod(c->pubsub_patterns,decrRefCount); + listSetMatchMethod(c->pubsub_patterns,listMatchObjects); + if (aeCreateFileEvent(server.el, c->fd, AE_READABLE, + readQueryFromClient, c) == AE_ERR) { + freeClient(c); + return NULL; + } + listAddNodeTail(server.clients,c); + initClientMultiState(c); + return c; +} + +void addReply(redisClient *c, robj *obj) { + if (listLength(c->reply) == 0 && + (c->replstate == REDIS_REPL_NONE || + c->replstate == REDIS_REPL_ONLINE) && + aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, + sendReplyToClient, c) == AE_ERR) return; + + if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) { + obj = dupStringObject(obj); + obj->refcount = 0; /* getDecodedObject() will increment the refcount */ + } + listAddNodeTail(c->reply,getDecodedObject(obj)); +} + +void addReplySds(redisClient *c, sds s) { + robj *o = createObject(REDIS_STRING,s); + addReply(c,o); + decrRefCount(o); +} + +void addReplyDouble(redisClient *c, double d) { + char buf[128]; + + snprintf(buf,sizeof(buf),"%.17g",d); + addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n", + (unsigned long) strlen(buf),buf)); +} + +void addReplyLongLong(redisClient *c, long long ll) { + char buf[128]; + size_t len; + + if (ll == 0) { + addReply(c,shared.czero); + return; + } else if (ll == 1) { + addReply(c,shared.cone); + return; + } + buf[0] = ':'; + len = ll2string(buf+1,sizeof(buf)-1,ll); + buf[len+1] = '\r'; + buf[len+2] = '\n'; + addReplySds(c,sdsnewlen(buf,len+3)); +} + +void addReplyUlong(redisClient *c, unsigned long ul) { + char buf[128]; + size_t len; + + if (ul == 0) { + addReply(c,shared.czero); + return; + } else if (ul == 1) { + addReply(c,shared.cone); + return; + } + len = snprintf(buf,sizeof(buf),":%lu\r\n",ul); + addReplySds(c,sdsnewlen(buf,len)); +} + +void 
addReplyBulkLen(redisClient *c, robj *obj) { + size_t len, intlen; + char buf[128]; + + if (obj->encoding == REDIS_ENCODING_RAW) { + len = sdslen(obj->ptr); + } else { + long n = (long)obj->ptr; + + /* Compute how many bytes will take this integer as a radix 10 string */ + len = 1; + if (n < 0) { + len++; + n = -n; + } + while((n = n/10) != 0) { + len++; + } + } + buf[0] = '$'; + intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len); + buf[intlen+1] = '\r'; + buf[intlen+2] = '\n'; + addReplySds(c,sdsnewlen(buf,intlen+3)); +} + +void addReplyBulk(redisClient *c, robj *obj) { + addReplyBulkLen(c,obj); + addReply(c,obj); + addReply(c,shared.crlf); +} + +/* In the CONFIG command we need to add vanilla C string as bulk replies */ +void addReplyBulkCString(redisClient *c, char *s) { + if (s == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *o = createStringObject(s,strlen(s)); + addReplyBulk(c,o); + decrRefCount(o); + } +} + +void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { + int cport, cfd; + char cip[128]; + redisClient *c; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + REDIS_NOTUSED(privdata); + + cfd = anetAccept(server.neterr, fd, cip, &cport); + if (cfd == AE_ERR) { + redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr); + return; + } + redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport); + if ((c = createClient(cfd)) == NULL) { + redisLog(REDIS_WARNING,"Error allocating resoures for the client"); + close(cfd); /* May be already closed, just ingore errors */ + return; + } + /* If maxclient directive is set and this is one client more... close the + * connection. 
Note that we create the client instead to check before + * for this condition, since now the socket is already set in nonblocking + * mode and we can send an error for free using the Kernel I/O */ + if (server.maxclients && listLength(server.clients) > server.maxclients) { + char *err = "-ERR max number of clients reached\r\n"; + + /* That's a best effort error message, don't check write errors */ + if (write(c->fd,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... */ + } + freeClient(c); + return; + } + server.stat_numconnections++; +} + +static void freeClientArgv(redisClient *c) { + int j; + + for (j = 0; j < c->argc; j++) + decrRefCount(c->argv[j]); + for (j = 0; j < c->mbargc; j++) + decrRefCount(c->mbargv[j]); + c->argc = 0; + c->mbargc = 0; +} + +void freeClient(redisClient *c) { + listNode *ln; + + /* Note that if the client we are freeing is blocked into a blocking + * call, we have to set querybuf to NULL *before* to call + * unblockClientWaitingData() to avoid processInputBuffer() will get + * called. Also it is important to remove the file events after + * this, because this call adds the READABLE event. 
*/ + sdsfree(c->querybuf); + c->querybuf = NULL; + if (c->flags & REDIS_BLOCKED) + unblockClientWaitingData(c); + + /* UNWATCH all the keys */ + unwatchAllKeys(c); + listRelease(c->watched_keys); + /* Unsubscribe from all the pubsub channels */ + pubsubUnsubscribeAllChannels(c,0); + pubsubUnsubscribeAllPatterns(c,0); + dictRelease(c->pubsub_channels); + listRelease(c->pubsub_patterns); + /* Obvious cleanup */ + aeDeleteFileEvent(server.el,c->fd,AE_READABLE); + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + listRelease(c->reply); + freeClientArgv(c); + close(c->fd); + /* Remove from the list of clients */ + ln = listSearchKey(server.clients,c); + redisAssert(ln != NULL); + listDelNode(server.clients,ln); + /* Remove from the list of clients that are now ready to be restarted + * after waiting for swapped keys */ + if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) { + ln = listSearchKey(server.io_ready_clients,c); + if (ln) { + listDelNode(server.io_ready_clients,ln); + server.vm_blocked_clients--; + } + } + /* Remove from the list of clients waiting for swapped keys */ + while (server.vm_enabled && listLength(c->io_keys)) { + ln = listFirst(c->io_keys); + dontWaitForSwappedKey(c,ln->value); + } + listRelease(c->io_keys); + /* Master/slave cleanup */ + if (c->flags & REDIS_SLAVE) { + if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1) + close(c->repldbfd); + list *l = (c->flags & REDIS_MONITOR) ? 
server.monitors : server.slaves; + ln = listSearchKey(l,c); + redisAssert(ln != NULL); + listDelNode(l,ln); + } + if (c->flags & REDIS_MASTER) { + server.master = NULL; + server.replstate = REDIS_REPL_CONNECT; + } + /* Release memory */ + zfree(c->argv); + zfree(c->mbargv); + freeClientMultiState(c); + zfree(c); +} + +#define GLUEREPLY_UP_TO (1024) +static void glueReplyBuffersIfNeeded(redisClient *c) { + int copylen = 0; + char buf[GLUEREPLY_UP_TO]; + listNode *ln; + listIter li; + robj *o; + + listRewind(c->reply,&li); + while((ln = listNext(&li))) { + int objlen; + + o = ln->value; + objlen = sdslen(o->ptr); + if (copylen + objlen <= GLUEREPLY_UP_TO) { + memcpy(buf+copylen,o->ptr,objlen); + copylen += objlen; + listDelNode(c->reply,ln); + } else { + if (copylen == 0) return; + break; + } + } + /* Now the output buffer is empty, add the new single element */ + o = createObject(REDIS_STRING,sdsnewlen(buf,copylen)); + listAddNodeHead(c->reply,o); +} + +void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { + redisClient *c = privdata; + int nwritten = 0, totwritten = 0, objlen; + robj *o; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + /* Use writev() if we have enough buffers to send */ + if (!server.glueoutputbuf && + listLength(c->reply) > REDIS_WRITEV_THRESHOLD && + !(c->flags & REDIS_MASTER)) + { + sendReplyToClientWritev(el, fd, privdata, mask); + return; + } + + while(listLength(c->reply)) { + if (server.glueoutputbuf && listLength(c->reply) > 1) + glueReplyBuffersIfNeeded(c); + + o = listNodeValue(listFirst(c->reply)); + objlen = sdslen(o->ptr); + + if (objlen == 0) { + listDelNode(c->reply,listFirst(c->reply)); + continue; + } + + if (c->flags & REDIS_MASTER) { + /* Don't reply to a master */ + nwritten = objlen - c->sentlen; + } else { + nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen); + if (nwritten <= 0) break; + } + c->sentlen += nwritten; + totwritten += nwritten; + /* If we fully sent the object on head go 
to the next one */ + if (c->sentlen == objlen) { + listDelNode(c->reply,listFirst(c->reply)); + c->sentlen = 0; + } + /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT + * bytes, in a single threaded server it's a good idea to serve + * other clients as well, even if a very large request comes from + * super fast link that is always able to accept data (in real world + * scenario think about 'KEYS *' against the loopback interfae) */ + if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break; + } + if (nwritten == -1) { + if (errno == EAGAIN) { + nwritten = 0; + } else { + redisLog(REDIS_VERBOSE, + "Error writing to client: %s", strerror(errno)); + freeClient(c); + return; + } + } + if (totwritten > 0) c->lastinteraction = time(NULL); + if (listLength(c->reply) == 0) { + c->sentlen = 0; + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + } +} + +void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask) +{ + redisClient *c = privdata; + int nwritten = 0, totwritten = 0, objlen, willwrite; + robj *o; + struct iovec iov[REDIS_WRITEV_IOVEC_COUNT]; + int offset, ion = 0; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + listNode *node; + while (listLength(c->reply)) { + offset = c->sentlen; + ion = 0; + willwrite = 0; + + /* fill-in the iov[] array */ + for(node = listFirst(c->reply); node; node = listNextNode(node)) { + o = listNodeValue(node); + objlen = sdslen(o->ptr); + + if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT) + break; + + if(ion == REDIS_WRITEV_IOVEC_COUNT) + break; /* no more iovecs */ + + iov[ion].iov_base = ((char*)o->ptr) + offset; + iov[ion].iov_len = objlen - offset; + willwrite += objlen - offset; + offset = 0; /* just for the first item */ + ion++; + } + + if(willwrite == 0) + break; + + /* write all collected blocks at once */ + if((nwritten = writev(fd, iov, ion)) < 0) { + if (errno != EAGAIN) { + redisLog(REDIS_VERBOSE, + "Error writing to client: %s", strerror(errno)); + freeClient(c); + return; + } + 
break; + } + + totwritten += nwritten; + offset = c->sentlen; + + /* remove written robjs from c->reply */ + while (nwritten && listLength(c->reply)) { + o = listNodeValue(listFirst(c->reply)); + objlen = sdslen(o->ptr); + + if(nwritten >= objlen - offset) { + listDelNode(c->reply, listFirst(c->reply)); + nwritten -= objlen - offset; + c->sentlen = 0; + } else { + /* partial write */ + c->sentlen += nwritten; + break; + } + offset = 0; + } + } + + if (totwritten > 0) + c->lastinteraction = time(NULL); + + if (listLength(c->reply) == 0) { + c->sentlen = 0; + aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE); + } +} + +/* resetClient prepare the client to process the next command */ +void resetClient(redisClient *c) { + freeClientArgv(c); + c->bulklen = -1; + c->multibulk = 0; +} + +void closeTimedoutClients(void) { + redisClient *c; + listNode *ln; + time_t now = time(NULL); + listIter li; + + listRewind(server.clients,&li); + while ((ln = listNext(&li)) != NULL) { + c = listNodeValue(ln); + if (server.maxidletime && + !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */ + !(c->flags & REDIS_MASTER) && /* no timeout for masters */ + dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */ + listLength(c->pubsub_patterns) == 0 && + (now - c->lastinteraction > server.maxidletime)) + { + redisLog(REDIS_VERBOSE,"Closing idle client"); + freeClient(c); + } else if (c->flags & REDIS_BLOCKED) { + if (c->blockingto != 0 && c->blockingto < now) { + addReply(c,shared.nullmultibulk); + unblockClientWaitingData(c); + } + } + } +} + +void processInputBuffer(redisClient *c) { +again: + /* Before to process the input buffer, make sure the client is not + * waitig for a blocking operation such as BLPOP. Note that the first + * iteration the client is never blocked, otherwise the processInputBuffer + * would not be called at all, but after the execution of the first commands + * in the input buffer the client may be blocked, and the "goto again" + * will try to reiterate. 
The following line will make it return asap. */ + if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return; + if (c->bulklen == -1) { + /* Read the first line of the query */ + char *p = strchr(c->querybuf,'\n'); + size_t querylen; + + if (p) { + sds query, *argv; + int argc, j; + + query = c->querybuf; + c->querybuf = sdsempty(); + querylen = 1+(p-(query)); + if (sdslen(query) > querylen) { + /* leave data after the first line of the query in the buffer */ + c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen); + } + *p = '\0'; /* remove "\n" */ + if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */ + sdsupdatelen(query); + + /* Now we can split the query in arguments */ + argv = sdssplitlen(query,sdslen(query)," ",1,&argc); + sdsfree(query); + + if (c->argv) zfree(c->argv); + c->argv = zmalloc(sizeof(robj*)*argc); + + for (j = 0; j < argc; j++) { + if (sdslen(argv[j])) { + c->argv[c->argc] = createObject(REDIS_STRING,argv[j]); + c->argc++; + } else { + sdsfree(argv[j]); + } + } + zfree(argv); + if (c->argc) { + /* Execute the command. If the client is still valid + * after processCommand() return and there is something + * on the query buffer try to process the next command. */ + if (processCommand(c) && sdslen(c->querybuf)) goto again; + } else { + /* Nothing to process, argc == 0. Just process the query + * buffer if it's not empty or return to the caller */ + if (sdslen(c->querybuf)) goto again; + } + return; + } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) { + redisLog(REDIS_VERBOSE, "Client protocol error"); + freeClient(c); + return; + } + } else { + /* Bulk read handling. Note that if we are at this point + the client already sent a command terminated with a newline, + we are reading the bulk data that is actually the last + argument of the command. 
*/ + int qbl = sdslen(c->querybuf); + + if (c->bulklen <= qbl) { + /* Copy everything but the final CRLF as final argument */ + c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); + c->argc++; + c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); + /* Process the command. If the client is still valid after + * the processing and there is more data in the buffer + * try to parse it. */ + if (processCommand(c) && sdslen(c->querybuf)) goto again; + return; + } + } +} + +void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { + redisClient *c = (redisClient*) privdata; + char buf[REDIS_IOBUF_LEN]; + int nread; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + + nread = read(fd, buf, REDIS_IOBUF_LEN); + if (nread == -1) { + if (errno == EAGAIN) { + nread = 0; + } else { + redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno)); + freeClient(c); + return; + } + } else if (nread == 0) { + redisLog(REDIS_VERBOSE, "Client closed connection"); + freeClient(c); + return; + } + if (nread) { + c->querybuf = sdscatlen(c->querybuf, buf, nread); + c->lastinteraction = time(NULL); + } else { + return; + } + processInputBuffer(c); +} diff --git a/src/object.c b/src/object.c new file mode 100644 index 000000000..4854909e0 --- /dev/null +++ b/src/object.c @@ -0,0 +1,405 @@ +#include "redis.h" +#include + +robj *createObject(int type, void *ptr) { + robj *o; + + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist)) { + listNode *head = listFirst(server.objfreelist); + o = listNodeValue(head); + listDelNode(server.objfreelist,head); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + } else { + if (server.vm_enabled) + pthread_mutex_unlock(&server.obj_freelist_mutex); + o = zmalloc(sizeof(*o)); + } + o->type = type; + o->encoding = REDIS_ENCODING_RAW; + o->ptr = ptr; + o->refcount = 1; + if (server.vm_enabled) { + /* Note that this code may run in the context of an 
I/O thread + * and accessing server.lruclock in theory is an error + * (no locks). But in practice this is safe, and even if we read + * garbage Redis will not fail. */ + o->lru = server.lruclock; + o->storage = REDIS_VM_MEMORY; + } + return o; +} + +robj *createStringObject(char *ptr, size_t len) { + return createObject(REDIS_STRING,sdsnewlen(ptr,len)); +} + +robj *createStringObjectFromLongLong(long long value) { + robj *o; + if (value >= 0 && value < REDIS_SHARED_INTEGERS) { + incrRefCount(shared.integers[value]); + o = shared.integers[value]; + } else { + if (value >= LONG_MIN && value <= LONG_MAX) { + o = createObject(REDIS_STRING, NULL); + o->encoding = REDIS_ENCODING_INT; + o->ptr = (void*)((long)value); + } else { + o = createObject(REDIS_STRING,sdsfromlonglong(value)); + } + } + return o; +} + +robj *dupStringObject(robj *o) { + redisAssert(o->encoding == REDIS_ENCODING_RAW); + return createStringObject(o->ptr,sdslen(o->ptr)); +} + +robj *createListObject(void) { + list *l = listCreate(); + robj *o = createObject(REDIS_LIST,l); + listSetFreeMethod(l,decrRefCount); + o->encoding = REDIS_ENCODING_LINKEDLIST; + return o; +} + +robj *createZiplistObject(void) { + unsigned char *zl = ziplistNew(); + robj *o = createObject(REDIS_LIST,zl); + o->encoding = REDIS_ENCODING_ZIPLIST; + return o; +} + +robj *createSetObject(void) { + dict *d = dictCreate(&setDictType,NULL); + return createObject(REDIS_SET,d); +} + +robj *createHashObject(void) { + /* All the Hashes start as zipmaps. Will be automatically converted + * into hash tables if there are enough elements or big elements + * inside. 
*/ + unsigned char *zm = zipmapNew(); + robj *o = createObject(REDIS_HASH,zm); + o->encoding = REDIS_ENCODING_ZIPMAP; + return o; +} + +robj *createZsetObject(void) { + zset *zs = zmalloc(sizeof(*zs)); + + zs->dict = dictCreate(&zsetDictType,NULL); + zs->zsl = zslCreate(); + return createObject(REDIS_ZSET,zs); +} + +void freeStringObject(robj *o) { + if (o->encoding == REDIS_ENCODING_RAW) { + sdsfree(o->ptr); + } +} + +void freeListObject(robj *o) { + switch (o->encoding) { + case REDIS_ENCODING_LINKEDLIST: + listRelease((list*) o->ptr); + break; + case REDIS_ENCODING_ZIPLIST: + zfree(o->ptr); + break; + default: + redisPanic("Unknown list encoding type"); + } +} + +void freeSetObject(robj *o) { + dictRelease((dict*) o->ptr); +} + +void freeZsetObject(robj *o) { + zset *zs = o->ptr; + + dictRelease(zs->dict); + zslFree(zs->zsl); + zfree(zs); +} + +void freeHashObject(robj *o) { + switch (o->encoding) { + case REDIS_ENCODING_HT: + dictRelease((dict*) o->ptr); + break; + case REDIS_ENCODING_ZIPMAP: + zfree(o->ptr); + break; + default: + redisPanic("Unknown hash encoding type"); + break; + } +} + +void incrRefCount(robj *o) { + o->refcount++; +} + +void decrRefCount(void *obj) { + robj *o = obj; + + /* Object is a swapped out value, or in the process of being loaded. */ + if (server.vm_enabled && + (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING)) + { + vmpointer *vp = obj; + if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o); + vmMarkPagesFree(vp->page,vp->usedpages); + server.vm_stats_swapped_objects--; + zfree(vp); + return; + } + + if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0"); + /* Object is in memory, or in the process of being swapped out. + * + * If the object is being swapped out, abort the operation on + * decrRefCount even if the refcount does not drop to 0: the object + * is referenced at least two times, as value of the key AND as + * job->val in the iojob. 
So if we don't invalidate the iojob, when it is + * done but the relevant key was removed in the meantime, the + * complete jobs handler will not find the key about the job and the + * assert will fail. */ + if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING) + vmCancelThreadedIOJob(o); + if (--(o->refcount) == 0) { + switch(o->type) { + case REDIS_STRING: freeStringObject(o); break; + case REDIS_LIST: freeListObject(o); break; + case REDIS_SET: freeSetObject(o); break; + case REDIS_ZSET: freeZsetObject(o); break; + case REDIS_HASH: freeHashObject(o); break; + default: redisPanic("Unknown object type"); break; + } + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX || + !listAddNodeHead(server.objfreelist,o)) + zfree(o); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + } +} + +int checkType(redisClient *c, robj *o, int type) { + if (o->type != type) { + addReply(c,shared.wrongtypeerr); + return 1; + } + return 0; +} + +/* Try to encode a string object in order to save space */ +robj *tryObjectEncoding(robj *o) { + long value; + sds s = o->ptr; + + if (o->encoding != REDIS_ENCODING_RAW) + return o; /* Already encoded */ + + /* It's not safe to encode shared objects: shared objects can be shared + * everywhere in the "object space" of Redis. 
Encoded objects can only + * appear as "values" (and not, for instance, as keys) */ + if (o->refcount > 1) return o; + + /* Currently we try to encode only strings */ + redisAssert(o->type == REDIS_STRING); + + /* Check if we can represent this string as a long integer */ + if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o; + + /* Ok, this object can be encoded */ + if (value >= 0 && value < REDIS_SHARED_INTEGERS) { + decrRefCount(o); + incrRefCount(shared.integers[value]); + return shared.integers[value]; + } else { + o->encoding = REDIS_ENCODING_INT; + sdsfree(o->ptr); + o->ptr = (void*) value; + return o; + } +} + +/* Get a decoded version of an encoded object (returned as a new object). + * If the object is already raw-encoded just increment the ref count. */ +robj *getDecodedObject(robj *o) { + robj *dec; + + if (o->encoding == REDIS_ENCODING_RAW) { + incrRefCount(o); + return o; + } + if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) { + char buf[32]; + + ll2string(buf,32,(long)o->ptr); + dec = createStringObject(buf,strlen(buf)); + return dec; + } else { + redisPanic("Unknown encoding type"); + } +} + +/* Compare two string objects via strcmp() or alike. + * Note that the objects may be integer-encoded. In such a case we + * use ll2string() to get a string representation of the numbers on the stack + * and compare the strings, it's much faster than calling getDecodedObject(). + * + * Important note: if objects are not integer encoded, but binary-safe strings, + * sdscmp() from sds.c will apply memcmp() so this function ca be considered + * binary safe. 
*/ +int compareStringObjects(robj *a, robj *b) { + redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING); + char bufa[128], bufb[128], *astr, *bstr; + int bothsds = 1; + + if (a == b) return 0; + if (a->encoding != REDIS_ENCODING_RAW) { + ll2string(bufa,sizeof(bufa),(long) a->ptr); + astr = bufa; + bothsds = 0; + } else { + astr = a->ptr; + } + if (b->encoding != REDIS_ENCODING_RAW) { + ll2string(bufb,sizeof(bufb),(long) b->ptr); + bstr = bufb; + bothsds = 0; + } else { + bstr = b->ptr; + } + return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr); +} + +/* Equal string objects return 1 if the two objects are the same from the + * point of view of a string comparison, otherwise 0 is returned. Note that + * this function is faster then checking for (compareStringObject(a,b) == 0) + * because it can perform some more optimization. */ +int equalStringObjects(robj *a, robj *b) { + if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){ + return a->ptr == b->ptr; + } else { + return compareStringObjects(a,b) == 0; + } +} + +size_t stringObjectLen(robj *o) { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + return sdslen(o->ptr); + } else { + char buf[32]; + + return ll2string(buf,32,(long)o->ptr); + } +} + +int getDoubleFromObject(robj *o, double *target) { + double value; + char *eptr; + + if (o == NULL) { + value = 0; + } else { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + value = strtod(o->ptr, &eptr); + if (eptr[0] != '\0') return REDIS_ERR; + } else if (o->encoding == REDIS_ENCODING_INT) { + value = (long)o->ptr; + } else { + redisPanic("Unknown string encoding"); + } + } + + *target = value; + return REDIS_OK; +} + +int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) { + double value; + if (getDoubleFromObject(o, &value) != REDIS_OK) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } 
else { + addReplySds(c, sdsnew("-ERR value is not a double\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +int getLongLongFromObject(robj *o, long long *target) { + long long value; + char *eptr; + + if (o == NULL) { + value = 0; + } else { + redisAssert(o->type == REDIS_STRING); + if (o->encoding == REDIS_ENCODING_RAW) { + value = strtoll(o->ptr, &eptr, 10); + if (eptr[0] != '\0') return REDIS_ERR; + } else if (o->encoding == REDIS_ENCODING_INT) { + value = (long)o->ptr; + } else { + redisPanic("Unknown string encoding"); + } + } + + *target = value; + return REDIS_OK; +} + +int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) { + long long value; + if (getLongLongFromObject(o, &value) != REDIS_OK) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } else { + addReplySds(c, sdsnew("-ERR value is not an integer\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) { + long long value; + + if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR; + if (value < LONG_MIN || value > LONG_MAX) { + if (msg != NULL) { + addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg)); + } else { + addReplySds(c, sdsnew("-ERR value is out of range\r\n")); + } + return REDIS_ERR; + } + + *target = value; + return REDIS_OK; +} + +char *strEncoding(int encoding) { + switch(encoding) { + case REDIS_ENCODING_RAW: return "raw"; + case REDIS_ENCODING_INT: return "int"; + case REDIS_ENCODING_HT: return "hashtable"; + case REDIS_ENCODING_ZIPMAP: return "zipmap"; + case REDIS_ENCODING_LINKEDLIST: return "linkedlist"; + case REDIS_ENCODING_ZIPLIST: return "ziplist"; + default: return "unknown"; + } +} diff --git a/src/pqsort.c b/src/pqsort.c new file mode 100644 index 000000000..257756376 --- /dev/null +++ b/src/pqsort.c @@ -0,0 +1,197 @@ 
+/* The following is the NetBSD libc qsort implementation modified in order to + * support partial sorting of ranges for Redis. + * + * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. + * + * The original copyright notice follows. */ + + +/* $NetBSD: qsort.c,v 1.19 2009/01/30 23:38:44 lukem Exp $ */ + +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include + +static inline char *med3 (char *, char *, char *, + int (*)(const void *, const void *)); +static inline void swapfunc (char *, char *, size_t, int); + +#define min(a, b) (a) < (b) ? a : b + +/* + * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". + */ +#define swapcode(TYPE, parmi, parmj, n) { \ + size_t i = (n) / sizeof (TYPE); \ + TYPE *pi = (TYPE *)(void *)(parmi); \ + TYPE *pj = (TYPE *)(void *)(parmj); \ + do { \ + TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} + +#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ + es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1; + +static inline void +swapfunc(char *a, char *b, size_t n, int swaptype) +{ + + if (swaptype <= 1) + swapcode(long, a, b, n) + else + swapcode(char, a, b, n) +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(void *)(a); \ + *(long *)(void *)(a) = *(long *)(void *)(b); \ + *(long *)(void *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype) + +static inline char * +med3(char *a, char *b, char *c, + int (*cmp) (const void *, const void *)) +{ + + return cmp(a, b) < 0 ? + (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a )) + :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? 
a : c )); +} + +static void +_pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), void *lrange, void *rrange) +{ + char *pa, *pb, *pc, *pd, *pl, *pm, *pn; + size_t d, r; + int swaptype, swap_cnt, cmp_result; + +loop: SWAPINIT(a, es); + swap_cnt = 0; + if (n < 7) { + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + pm = (char *) a + (n / 2) * es; + if (n > 7) { + pl = (char *) a; + pn = (char *) a + (n - 1) * es; + if (n > 40) { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, cmp); + pm = med3(pm - d, pm, pm + d, cmp); + pn = med3(pn - 2 * d, pn - d, pn, cmp); + } + pm = med3(pl, pm, pn, cmp); + } + swap(a, pm); + pa = pb = (char *) a + es; + + pc = pd = (char *) a + (n - 1) * es; + for (;;) { + while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) { + if (cmp_result == 0) { + swap_cnt = 1; + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (cmp_result = cmp(pc, a)) >= 0) { + if (cmp_result == 0) { + swap_cnt = 1; + swap(pc, pd); + pd -= es; + } + pc -= es; + } + if (pb > pc) + break; + swap(pb, pc); + swap_cnt = 1; + pb += es; + pc -= es; + } + if (swap_cnt == 0) { /* Switch to insertion sort */ + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + + pn = (char *) a + n * es; + r = min(pa - (char *) a, pb - pa); + vecswap(a, pb - r, r); + r = min((size_t)(pd - pc), pn - pd - es); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > es) { + void *_l = a, *_r = ((unsigned char*)a)+r-1; + if (!((lrange < _l && rrange < _l) || + (lrange > _r && rrange > _r))) + _pqsort(a, r / es, es, cmp, lrange, rrange); + } + if ((r = pd - pc) > es) { + void *_l, *_r; + + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + + _l = a; + _r = ((unsigned char*)a)+r-1; + if 
(!((lrange < _l && rrange < _l) || + (lrange > _r && rrange > _r))) + goto loop; + } +/* qsort(pn - r, r / es, es, cmp);*/ +} + +void +pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), size_t lrange, size_t rrange) +{ + _pqsort(a,n,es,cmp,((unsigned char*)a)+(lrange*es), + ((unsigned char*)a)+((rrange+1)*es)-1); +} diff --git a/src/pqsort.h b/src/pqsort.h new file mode 100644 index 000000000..5054d5209 --- /dev/null +++ b/src/pqsort.h @@ -0,0 +1,15 @@ +/* The following is the NetBSD libc qsort implementation modified in order to + * support partial sorting of ranges for Redis. + * + * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved. + * + * See the pqsort.c file for the original copyright notice. */ + +#ifndef __PQSORT_H +#define __PQSORT_H + +void +pqsort(void *a, size_t n, size_t es, + int (*cmp) (const void *, const void *), size_t lrange, size_t rrange); + +#endif diff --git a/src/pubsub.c b/src/pubsub.c new file mode 100644 index 000000000..c9f5f310e --- /dev/null +++ b/src/pubsub.c @@ -0,0 +1,259 @@ +#include "redis.h" + +void freePubsubPattern(void *p) { + pubsubPattern *pat = p; + + decrRefCount(pat->pattern); + zfree(pat); +} + +int listMatchPubsubPattern(void *a, void *b) { + pubsubPattern *pa = a, *pb = b; + + return (pa->client == pb->client) && + (equalStringObjects(pa->pattern,pb->pattern)); +} + +/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or + * 0 if the client was already subscribed to that channel. 
*/ +int pubsubSubscribeChannel(redisClient *c, robj *channel) { + struct dictEntry *de; + list *clients = NULL; + int retval = 0; + + /* Add the channel to the client -> channels hash table */ + if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) { + retval = 1; + incrRefCount(channel); + /* Add the client to the channel -> list of clients hash table */ + de = dictFind(server.pubsub_channels,channel); + if (de == NULL) { + clients = listCreate(); + dictAdd(server.pubsub_channels,channel,clients); + incrRefCount(channel); + } else { + clients = dictGetEntryVal(de); + } + listAddNodeTail(clients,c); + } + /* Notify the client */ + addReply(c,shared.mbulk3); + addReply(c,shared.subscribebulk); + addReplyBulk(c,channel); + addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); + return retval; +} + +/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or + * 0 if the client was not subscribed to the specified channel. */ +int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) { + struct dictEntry *de; + list *clients; + listNode *ln; + int retval = 0; + + /* Remove the channel from the client -> channels hash table */ + incrRefCount(channel); /* channel may be just a pointer to the same object + we have in the hash tables. Protect it... */ + if (dictDelete(c->pubsub_channels,channel) == DICT_OK) { + retval = 1; + /* Remove the client from the channel -> clients list hash table */ + de = dictFind(server.pubsub_channels,channel); + redisAssert(de != NULL); + clients = dictGetEntryVal(de); + ln = listSearchKey(clients,c); + redisAssert(ln != NULL); + listDelNode(clients,ln); + if (listLength(clients) == 0) { + /* Free the list and associated hash entry at all if this was + * the latest client, so that it will be possible to abuse + * Redis PUBSUB creating millions of channels. 
*/ + dictDelete(server.pubsub_channels,channel); + } + } + /* Notify the client */ + if (notify) { + addReply(c,shared.mbulk3); + addReply(c,shared.unsubscribebulk); + addReplyBulk(c,channel); + addReplyLongLong(c,dictSize(c->pubsub_channels)+ + listLength(c->pubsub_patterns)); + + } + decrRefCount(channel); /* it is finally safe to release it */ + return retval; +} + +/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */ +int pubsubSubscribePattern(redisClient *c, robj *pattern) { + int retval = 0; + + if (listSearchKey(c->pubsub_patterns,pattern) == NULL) { + retval = 1; + pubsubPattern *pat; + listAddNodeTail(c->pubsub_patterns,pattern); + incrRefCount(pattern); + pat = zmalloc(sizeof(*pat)); + pat->pattern = getDecodedObject(pattern); + pat->client = c; + listAddNodeTail(server.pubsub_patterns,pat); + } + /* Notify the client */ + addReply(c,shared.mbulk3); + addReply(c,shared.psubscribebulk); + addReplyBulk(c,pattern); + addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns)); + return retval; +} + +/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or + * 0 if the client was not subscribed to the specified channel. */ +int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) { + listNode *ln; + pubsubPattern pat; + int retval = 0; + + incrRefCount(pattern); /* Protect the object. 
May be the same we remove */ + if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) { + retval = 1; + listDelNode(c->pubsub_patterns,ln); + pat.client = c; + pat.pattern = pattern; + ln = listSearchKey(server.pubsub_patterns,&pat); + listDelNode(server.pubsub_patterns,ln); + } + /* Notify the client */ + if (notify) { + addReply(c,shared.mbulk3); + addReply(c,shared.punsubscribebulk); + addReplyBulk(c,pattern); + addReplyLongLong(c,dictSize(c->pubsub_channels)+ + listLength(c->pubsub_patterns)); + } + decrRefCount(pattern); + return retval; +} + +/* Unsubscribe from all the channels. Return the number of channels the + * client was subscribed from. */ +int pubsubUnsubscribeAllChannels(redisClient *c, int notify) { + dictIterator *di = dictGetIterator(c->pubsub_channels); + dictEntry *de; + int count = 0; + + while((de = dictNext(di)) != NULL) { + robj *channel = dictGetEntryKey(de); + + count += pubsubUnsubscribeChannel(c,channel,notify); + } + dictReleaseIterator(di); + return count; +} + +/* Unsubscribe from all the patterns. Return the number of patterns the + * client was subscribed from. 
*/ +int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) { + listNode *ln; + listIter li; + int count = 0; + + listRewind(c->pubsub_patterns,&li); + while ((ln = listNext(&li)) != NULL) { + robj *pattern = ln->value; + + count += pubsubUnsubscribePattern(c,pattern,notify); + } + return count; +} + +/* Publish a message */ +int pubsubPublishMessage(robj *channel, robj *message) { + int receivers = 0; + struct dictEntry *de; + listNode *ln; + listIter li; + + /* Send to clients listening for that channel */ + de = dictFind(server.pubsub_channels,channel); + if (de) { + list *list = dictGetEntryVal(de); + listNode *ln; + listIter li; + + listRewind(list,&li); + while ((ln = listNext(&li)) != NULL) { + redisClient *c = ln->value; + + addReply(c,shared.mbulk3); + addReply(c,shared.messagebulk); + addReplyBulk(c,channel); + addReplyBulk(c,message); + receivers++; + } + } + /* Send to clients listening to matching channels */ + if (listLength(server.pubsub_patterns)) { + listRewind(server.pubsub_patterns,&li); + channel = getDecodedObject(channel); + while ((ln = listNext(&li)) != NULL) { + pubsubPattern *pat = ln->value; + + if (stringmatchlen((char*)pat->pattern->ptr, + sdslen(pat->pattern->ptr), + (char*)channel->ptr, + sdslen(channel->ptr),0)) { + addReply(pat->client,shared.mbulk4); + addReply(pat->client,shared.pmessagebulk); + addReplyBulk(pat->client,pat->pattern); + addReplyBulk(pat->client,channel); + addReplyBulk(pat->client,message); + receivers++; + } + } + decrRefCount(channel); + } + return receivers; +} + +void subscribeCommand(redisClient *c) { + int j; + + for (j = 1; j < c->argc; j++) + pubsubSubscribeChannel(c,c->argv[j]); +} + +void unsubscribeCommand(redisClient *c) { + if (c->argc == 1) { + pubsubUnsubscribeAllChannels(c,1); + return; + } else { + int j; + + for (j = 1; j < c->argc; j++) + pubsubUnsubscribeChannel(c,c->argv[j],1); + } +} + +void psubscribeCommand(redisClient *c) { + int j; + + for (j = 1; j < c->argc; j++) + 
pubsubSubscribePattern(c,c->argv[j]); +} + +void punsubscribeCommand(redisClient *c) { + if (c->argc == 1) { + pubsubUnsubscribeAllPatterns(c,1); + return; + } else { + int j; + + for (j = 1; j < c->argc; j++) + pubsubUnsubscribePattern(c,c->argv[j],1); + } +} + +void publishCommand(redisClient *c) { + int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]); + addReplyLongLong(c,receivers); +} diff --git a/src/rdb.c b/src/rdb.c new file mode 100644 index 000000000..5bda5e565 --- /dev/null +++ b/src/rdb.c @@ -0,0 +1,886 @@ +#include "redis.h" +#include "lzf.h" /* LZF compression library */ + +#include + +int rdbSaveType(FILE *fp, unsigned char type) { + if (fwrite(&type,1,1,fp) == 0) return -1; + return 0; +} + +int rdbSaveTime(FILE *fp, time_t t) { + int32_t t32 = (int32_t) t; + if (fwrite(&t32,4,1,fp) == 0) return -1; + return 0; +} + +/* check rdbLoadLen() comments for more info */ +int rdbSaveLen(FILE *fp, uint32_t len) { + unsigned char buf[2]; + + if (len < (1<<6)) { + /* Save a 6 bit len */ + buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6); + if (fwrite(buf,1,1,fp) == 0) return -1; + } else if (len < (1<<14)) { + /* Save a 14 bit len */ + buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6); + buf[1] = len&0xFF; + if (fwrite(buf,2,1,fp) == 0) return -1; + } else { + /* Save a 32 bit len */ + buf[0] = (REDIS_RDB_32BITLEN<<6); + if (fwrite(buf,1,1,fp) == 0) return -1; + len = htonl(len); + if (fwrite(&len,4,1,fp) == 0) return -1; + } + return 0; +} + +/* Encode 'value' as an integer if possible (if integer will fit the + * supported range). If the function sucessful encoded the integer + * then the (up to 5 bytes) encoded representation is written in the + * string pointed by 'enc' and the length is returned. Otherwise + * 0 is returned. 
*/ +int rdbEncodeInteger(long long value, unsigned char *enc) { + /* Finally check if it fits in our ranges */ + if (value >= -(1<<7) && value <= (1<<7)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8; + enc[1] = value&0xFF; + return 2; + } else if (value >= -(1<<15) && value <= (1<<15)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16; + enc[1] = value&0xFF; + enc[2] = (value>>8)&0xFF; + return 3; + } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) { + enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32; + enc[1] = value&0xFF; + enc[2] = (value>>8)&0xFF; + enc[3] = (value>>16)&0xFF; + enc[4] = (value>>24)&0xFF; + return 5; + } else { + return 0; + } +} + +/* String objects in the form "2391" "-100" without any space and with a + * range of values that can fit in an 8, 16 or 32 bit signed value can be + * encoded as integers to save space */ +int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) { + long long value; + char *endptr, buf[32]; + + /* Check if it's possible to encode this value as a number */ + value = strtoll(s, &endptr, 10); + if (endptr[0] != '\0') return 0; + ll2string(buf,32,value); + + /* If the number converted back into a string is not identical + * then it's not possible to encode the string as integer */ + if (strlen(buf) != len || memcmp(buf,s,len)) return 0; + + return rdbEncodeInteger(value,enc); +} + +int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) { + size_t comprlen, outlen; + unsigned char byte; + void *out; + + /* We require at least four bytes compression for this to be worth it */ + if (len <= 4) return 0; + outlen = len-4; + if ((out = zmalloc(outlen+1)) == NULL) return 0; + comprlen = lzf_compress(s, len, out, outlen); + if (comprlen == 0) { + zfree(out); + return 0; + } + /* Data compressed! 
Let's save it on disk */ + byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF; + if (fwrite(&byte,1,1,fp) == 0) goto writeerr; + if (rdbSaveLen(fp,comprlen) == -1) goto writeerr; + if (rdbSaveLen(fp,len) == -1) goto writeerr; + if (fwrite(out,comprlen,1,fp) == 0) goto writeerr; + zfree(out); + return comprlen; + +writeerr: + zfree(out); + return -1; +} + +/* Save a string objet as [len][data] on disk. If the object is a string + * representation of an integer value we try to safe it in a special form */ +int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) { + int enclen; + + /* Try integer encoding */ + if (len <= 11) { + unsigned char buf[5]; + if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) { + if (fwrite(buf,enclen,1,fp) == 0) return -1; + return 0; + } + } + + /* Try LZF compression - under 20 bytes it's unable to compress even + * aaaaaaaaaaaaaaaaaa so skip it */ + if (server.rdbcompression && len > 20) { + int retval; + + retval = rdbSaveLzfStringObject(fp,s,len); + if (retval == -1) return -1; + if (retval > 0) return 0; + /* retval == 0 means data can't be compressed, save the old way */ + } + + /* Store verbatim */ + if (rdbSaveLen(fp,len) == -1) return -1; + if (len && fwrite(s,len,1,fp) == 0) return -1; + return 0; +} + +/* Save a long long value as either an encoded string or a string. */ +int rdbSaveLongLongAsStringObject(FILE *fp, long long value) { + unsigned char buf[32]; + int enclen = rdbEncodeInteger(value,buf); + if (enclen > 0) { + if (fwrite(buf,enclen,1,fp) == 0) return -1; + } else { + /* Encode as string */ + enclen = ll2string((char*)buf,32,value); + redisAssert(enclen < 32); + if (rdbSaveLen(fp,enclen) == -1) return -1; + if (fwrite(buf,enclen,1,fp) == 0) return -1; + } + return 0; +} + +/* Like rdbSaveStringObjectRaw() but handle encoded objects */ +int rdbSaveStringObject(FILE *fp, robj *obj) { + /* Avoid to decode the object, then encode it again, if the + * object is alrady integer encoded. 
*/ + if (obj->encoding == REDIS_ENCODING_INT) { + return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr); + } else { + redisAssert(obj->encoding == REDIS_ENCODING_RAW); + return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr)); + } +} + +/* Save a double value. Doubles are saved as strings prefixed by an unsigned + * 8 bit integer specifing the length of the representation. + * This 8 bit integer has special values in order to specify the following + * conditions: + * 253: not a number + * 254: + inf + * 255: - inf + */ +int rdbSaveDoubleValue(FILE *fp, double val) { + unsigned char buf[128]; + int len; + + if (isnan(val)) { + buf[0] = 253; + len = 1; + } else if (!isfinite(val)) { + len = 1; + buf[0] = (val < 0) ? 255 : 254; + } else { +#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL) + /* Check if the float is in a safe range to be casted into a + * long long. We are assuming that long long is 64 bit here. + * Also we are assuming that there are no implementations around where + * double has precision < 52 bit. + * + * Under this assumptions we test if a double is inside an interval + * where casting to long long is safe. Then using two castings we + * make sure the decimal part is zero. If all this is true we use + * integer printing function that is much faster. */ + double min = -4503599627370495; /* (2^52)-1 */ + double max = 4503599627370496; /* -(2^52) */ + if (val > min && val < max && val == ((double)((long long)val))) + ll2string((char*)buf+1,sizeof(buf),(long long)val); + else +#endif + snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val); + buf[0] = strlen((char*)buf+1); + len = buf[0]+1; + } + if (fwrite(buf,len,1,fp) == 0) return -1; + return 0; +} + +/* Save a Redis object. 
*/ +int rdbSaveObject(FILE *fp, robj *o) { + if (o->type == REDIS_STRING) { + /* Save a string value */ + if (rdbSaveStringObject(fp,o) == -1) return -1; + } else if (o->type == REDIS_LIST) { + /* Save a list value */ + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *p; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1; + p = ziplistIndex(o->ptr,0); + while(ziplistGet(p,&vstr,&vlen,&vlong)) { + if (vstr) { + if (rdbSaveRawString(fp,vstr,vlen) == -1) + return -1; + } else { + if (rdbSaveLongLongAsStringObject(fp,vlong) == -1) + return -1; + } + p = ziplistNext(o->ptr,p); + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + list *list = o->ptr; + listIter li; + listNode *ln; + + if (rdbSaveLen(fp,listLength(list)) == -1) return -1; + listRewind(list,&li); + while((ln = listNext(&li))) { + robj *eleobj = listNodeValue(ln); + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + } + } else { + redisPanic("Unknown list encoding"); + } + } else if (o->type == REDIS_SET) { + /* Save a set value */ + dict *set = o->ptr; + dictIterator *di = dictGetIterator(set); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize(set)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_ZSET) { + /* Save a set value */ + zset *zs = o->ptr; + dictIterator *di = dictGetIterator(zs->dict); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *eleobj = dictGetEntryKey(de); + double *score = dictGetEntryVal(de); + + if (rdbSaveStringObject(fp,eleobj) == -1) return -1; + if (rdbSaveDoubleValue(fp,*score) == -1) return -1; + } + dictReleaseIterator(di); + } else if (o->type == REDIS_HASH) { + /* Save a hash value */ + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char 
*p = zipmapRewind(o->ptr); + unsigned int count = zipmapLen(o->ptr); + unsigned char *key, *val; + unsigned int klen, vlen; + + if (rdbSaveLen(fp,count) == -1) return -1; + while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) { + if (rdbSaveRawString(fp,key,klen) == -1) return -1; + if (rdbSaveRawString(fp,val,vlen) == -1) return -1; + } + } else { + dictIterator *di = dictGetIterator(o->ptr); + dictEntry *de; + + if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1; + while((de = dictNext(di)) != NULL) { + robj *key = dictGetEntryKey(de); + robj *val = dictGetEntryVal(de); + + if (rdbSaveStringObject(fp,key) == -1) return -1; + if (rdbSaveStringObject(fp,val) == -1) return -1; + } + dictReleaseIterator(di); + } + } else { + redisPanic("Unknown object type"); + } + return 0; +} + +/* Return the length the object will have on disk if saved with + * the rdbSaveObject() function. Currently we use a trick to get + * this length with very little changes to the code. In the future + * we could switch to a faster solution. */ +off_t rdbSavedObjectLen(robj *o, FILE *fp) { + if (fp == NULL) fp = server.devnull; + rewind(fp); + redisAssert(rdbSaveObject(fp,o) != 1); + return ftello(fp); +} + +/* Return the number of pages required to save this object in the swap file */ +off_t rdbSavedObjectPages(robj *o, FILE *fp) { + off_t bytes = rdbSavedObjectLen(o,fp); + + return (bytes+(server.vm_page_size-1))/server.vm_page_size; +} + +/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */ +int rdbSave(char *filename) { + dictIterator *di = NULL; + dictEntry *de; + FILE *fp; + char tmpfile[256]; + int j; + time_t now = time(NULL); + + /* Wait for I/O therads to terminate, just in case this is a + * foreground-saving, to avoid seeking the swap file descriptor at the + * same time. 
*/ + if (server.vm_enabled) + waitEmptyIOJobsQueue(); + + snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); + fp = fopen(tmpfile,"w"); + if (!fp) { + redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno)); + return REDIS_ERR; + } + if (fwrite("REDIS0001",9,1,fp) == 0) goto werr; + for (j = 0; j < server.dbnum; j++) { + redisDb *db = server.db+j; + dict *d = db->dict; + if (dictSize(d) == 0) continue; + di = dictGetIterator(d); + if (!di) { + fclose(fp); + return REDIS_ERR; + } + + /* Write the SELECT DB opcode */ + if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr; + if (rdbSaveLen(fp,j) == -1) goto werr; + + /* Iterate this DB writing every entry */ + while((de = dictNext(di)) != NULL) { + sds keystr = dictGetEntryKey(de); + robj key, *o = dictGetEntryVal(de); + time_t expiretime; + + initStaticStringObject(key,keystr); + expiretime = getExpire(db,&key); + + /* Save the expire time */ + if (expiretime != -1) { + /* If this key is already expired skip it */ + if (expiretime < now) continue; + if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr; + if (rdbSaveTime(fp,expiretime) == -1) goto werr; + } + /* Save the key and associated value. This requires special + * handling if the value is swapped out. 
*/ + if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY || + o->storage == REDIS_VM_SWAPPING) { + /* Save type, key, value */ + if (rdbSaveType(fp,o->type) == -1) goto werr; + if (rdbSaveStringObject(fp,&key) == -1) goto werr; + if (rdbSaveObject(fp,o) == -1) goto werr; + } else { + /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */ + robj *po; + /* Get a preview of the object in memory */ + po = vmPreviewObject(o); + /* Save type, key, value */ + if (rdbSaveType(fp,po->type) == -1) goto werr; + if (rdbSaveStringObject(fp,&key) == -1) goto werr; + if (rdbSaveObject(fp,po) == -1) goto werr; + /* Remove the loaded object from memory */ + decrRefCount(po); + } + } + dictReleaseIterator(di); + } + /* EOF opcode */ + if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr; + + /* Make sure data will not remain on the OS's output buffers */ + fflush(fp); + fsync(fileno(fp)); + fclose(fp); + + /* Use RENAME to make sure the DB file is changed atomically only + * if the generate DB file is ok. */ + if (rename(tmpfile,filename) == -1) { + redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno)); + unlink(tmpfile); + return REDIS_ERR; + } + redisLog(REDIS_NOTICE,"DB saved on disk"); + server.dirty = 0; + server.lastsave = time(NULL); + return REDIS_OK; + +werr: + fclose(fp); + unlink(tmpfile); + redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno)); + if (di) dictReleaseIterator(di); + return REDIS_ERR; +} + +int rdbSaveBackground(char *filename) { + pid_t childpid; + + if (server.bgsavechildpid != -1) return REDIS_ERR; + if (server.vm_enabled) waitEmptyIOJobsQueue(); + if ((childpid = fork()) == 0) { + /* Child */ + if (server.vm_enabled) vmReopenSwapFile(); + close(server.fd); + if (rdbSave(filename) == REDIS_OK) { + _exit(0); + } else { + _exit(1); + } + } else { + /* Parent */ + if (childpid == -1) { + redisLog(REDIS_WARNING,"Can't save in background: fork: %s", + strerror(errno)); + return REDIS_ERR; + } + 
redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid); + server.bgsavechildpid = childpid; + updateDictResizePolicy(); + return REDIS_OK; + } + return REDIS_OK; /* unreached */ +} + +void rdbRemoveTempFile(pid_t childpid) { + char tmpfile[256]; + + snprintf(tmpfile,256,"temp-%d.rdb", (int) childpid); + unlink(tmpfile); +} + +int rdbLoadType(FILE *fp) { + unsigned char type; + if (fread(&type,1,1,fp) == 0) return -1; + return type; +} + +time_t rdbLoadTime(FILE *fp) { + int32_t t32; + if (fread(&t32,4,1,fp) == 0) return -1; + return (time_t) t32; +} + +/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top + * of this file for a description of how this are stored on disk. + * + * isencoded is set to 1 if the readed length is not actually a length but + * an "encoding type", check the above comments for more info */ +uint32_t rdbLoadLen(FILE *fp, int *isencoded) { + unsigned char buf[2]; + uint32_t len; + int type; + + if (isencoded) *isencoded = 0; + if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR; + type = (buf[0]&0xC0)>>6; + if (type == REDIS_RDB_6BITLEN) { + /* Read a 6 bit len */ + return buf[0]&0x3F; + } else if (type == REDIS_RDB_ENCVAL) { + /* Read a 6 bit len encoding type */ + if (isencoded) *isencoded = 1; + return buf[0]&0x3F; + } else if (type == REDIS_RDB_14BITLEN) { + /* Read a 14 bit len */ + if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR; + return ((buf[0]&0x3F)<<8)|buf[1]; + } else { + /* Read a 32 bit len */ + if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR; + return ntohl(len); + } +} + +/* Load an integer-encoded object from file 'fp', with the specified + * encoding type 'enctype'. If encode is true the function may return + * an integer-encoded object as reply, otherwise the returned object + * will always be encoded as a raw string. 
*/ +robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) { + unsigned char enc[4]; + long long val; + + if (enctype == REDIS_RDB_ENC_INT8) { + if (fread(enc,1,1,fp) == 0) return NULL; + val = (signed char)enc[0]; + } else if (enctype == REDIS_RDB_ENC_INT16) { + uint16_t v; + if (fread(enc,2,1,fp) == 0) return NULL; + v = enc[0]|(enc[1]<<8); + val = (int16_t)v; + } else if (enctype == REDIS_RDB_ENC_INT32) { + uint32_t v; + if (fread(enc,4,1,fp) == 0) return NULL; + v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); + val = (int32_t)v; + } else { + val = 0; /* anti-warning */ + redisPanic("Unknown RDB integer encoding type"); + } + if (encode) + return createStringObjectFromLongLong(val); + else + return createObject(REDIS_STRING,sdsfromlonglong(val)); +} + +robj *rdbLoadLzfStringObject(FILE*fp) { + unsigned int len, clen; + unsigned char *c = NULL; + sds val = NULL; + + if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + if ((c = zmalloc(clen)) == NULL) goto err; + if ((val = sdsnewlen(NULL,len)) == NULL) goto err; + if (fread(c,clen,1,fp) == 0) goto err; + if (lzf_decompress(c,clen,val,len) == 0) goto err; + zfree(c); + return createObject(REDIS_STRING,val); +err: + zfree(c); + sdsfree(val); + return NULL; +} + +robj *rdbGenericLoadStringObject(FILE*fp, int encode) { + int isencoded; + uint32_t len; + sds val; + + len = rdbLoadLen(fp,&isencoded); + if (isencoded) { + switch(len) { + case REDIS_RDB_ENC_INT8: + case REDIS_RDB_ENC_INT16: + case REDIS_RDB_ENC_INT32: + return rdbLoadIntegerObject(fp,len,encode); + case REDIS_RDB_ENC_LZF: + return rdbLoadLzfStringObject(fp); + default: + redisPanic("Unknown RDB encoding type"); + } + } + + if (len == REDIS_RDB_LENERR) return NULL; + val = sdsnewlen(NULL,len); + if (len && fread(val,len,1,fp) == 0) { + sdsfree(val); + return NULL; + } + return createObject(REDIS_STRING,val); +} + +robj *rdbLoadStringObject(FILE *fp) { + return 
rdbGenericLoadStringObject(fp,0); +} + +robj *rdbLoadEncodedStringObject(FILE *fp) { + return rdbGenericLoadStringObject(fp,1); +} + +/* For information about double serialization check rdbSaveDoubleValue() */ +int rdbLoadDoubleValue(FILE *fp, double *val) { + char buf[128]; + unsigned char len; + + if (fread(&len,1,1,fp) == 0) return -1; + switch(len) { + case 255: *val = R_NegInf; return 0; + case 254: *val = R_PosInf; return 0; + case 253: *val = R_Nan; return 0; + default: + if (fread(buf,len,1,fp) == 0) return -1; + buf[len] = '\0'; + sscanf(buf, "%lg", val); + return 0; + } +} + +/* Load a Redis object of the specified type from the specified file. + * On success a newly allocated object is returned, otherwise NULL. */ +robj *rdbLoadObject(int type, FILE *fp) { + robj *o, *ele, *dec; + size_t len; + + redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp)); + if (type == REDIS_STRING) { + /* Read string value */ + if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + o = tryObjectEncoding(o); + } else if (type == REDIS_LIST) { + /* Read list value */ + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + + /* Use a real list when there are too many entries */ + if (len > server.list_max_ziplist_entries) { + o = createListObject(); + } else { + o = createZiplistObject(); + } + + /* Load every single element of the list */ + while(len--) { + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + + /* If we are using a ziplist and the value is too big, convert + * the object to a real list. 
*/ + if (o->encoding == REDIS_ENCODING_ZIPLIST && + ele->encoding == REDIS_ENCODING_RAW && + sdslen(ele->ptr) > server.list_max_ziplist_value) + listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); + + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + dec = getDecodedObject(ele); + o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL); + decrRefCount(dec); + decrRefCount(ele); + } else { + ele = tryObjectEncoding(ele); + listAddNodeTail(o->ptr,ele); + } + } + } else if (type == REDIS_SET) { + /* Read list/set value */ + if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createSetObject(); + /* It's faster to expand the dict to the right size asap in order + * to avoid rehashing */ + if (len > DICT_HT_INITIAL_SIZE) + dictExpand(o->ptr,len); + /* Load every single element of the list/set */ + while(len--) { + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + ele = tryObjectEncoding(ele); + dictAdd((dict*)o->ptr,ele,NULL); + } + } else if (type == REDIS_ZSET) { + /* Read list/set value */ + size_t zsetlen; + zset *zs; + + if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createZsetObject(); + zs = o->ptr; + /* Load every single element of the list/set */ + while(zsetlen--) { + robj *ele; + double *score = zmalloc(sizeof(double)); + + if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + ele = tryObjectEncoding(ele); + if (rdbLoadDoubleValue(fp,score) == -1) return NULL; + dictAdd(zs->dict,ele,score); + zslInsert(zs->zsl,*score,ele); + incrRefCount(ele); /* added to skiplist */ + } + } else if (type == REDIS_HASH) { + size_t hashlen; + + if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createHashObject(); + /* Too many entries? Use an hash table. */ + if (hashlen > server.hash_max_zipmap_entries) + convertToRealHash(o); + /* Load every key/value, then set it into the zipmap or hash + * table, as needed. 
*/ + while(hashlen--) { + robj *key, *val; + + if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL; + /* If we are using a zipmap and there are too big values + * the object is converted to real hash table encoding. */ + if (o->encoding != REDIS_ENCODING_HT && + ((key->encoding == REDIS_ENCODING_RAW && + sdslen(key->ptr) > server.hash_max_zipmap_value) || + (val->encoding == REDIS_ENCODING_RAW && + sdslen(val->ptr) > server.hash_max_zipmap_value))) + { + convertToRealHash(o); + } + + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char *zm = o->ptr; + robj *deckey, *decval; + + /* We need raw string objects to add them to the zipmap */ + deckey = getDecodedObject(key); + decval = getDecodedObject(val); + zm = zipmapSet(zm,deckey->ptr,sdslen(deckey->ptr), + decval->ptr,sdslen(decval->ptr),NULL); + o->ptr = zm; + decrRefCount(deckey); + decrRefCount(decval); + decrRefCount(key); + decrRefCount(val); + } else { + key = tryObjectEncoding(key); + val = tryObjectEncoding(val); + dictAdd((dict*)o->ptr,key,val); + } + } + } else { + redisPanic("Unknown object type"); + } + return o; +} + +int rdbLoad(char *filename) { + FILE *fp; + uint32_t dbid; + int type, retval, rdbver; + int swap_all_values = 0; + redisDb *db = server.db+0; + char buf[1024]; + time_t expiretime, now = time(NULL); + + fp = fopen(filename,"r"); + if (!fp) return REDIS_ERR; + if (fread(buf,9,1,fp) == 0) goto eoferr; + buf[9] = '\0'; + if (memcmp(buf,"REDIS",5) != 0) { + fclose(fp); + redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file"); + return REDIS_ERR; + } + rdbver = atoi(buf+5); + if (rdbver != 1) { + fclose(fp); + redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver); + return REDIS_ERR; + } + while(1) { + robj *key, *val; + int force_swapout; + + expiretime = -1; + /* Read type. 
*/ + if ((type = rdbLoadType(fp)) == -1) goto eoferr; + if (type == REDIS_EXPIRETIME) { + if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr; + /* We read the time so we need to read the object type again */ + if ((type = rdbLoadType(fp)) == -1) goto eoferr; + } + if (type == REDIS_EOF) break; + /* Handle SELECT DB opcode as a special case */ + if (type == REDIS_SELECTDB) { + if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) + goto eoferr; + if (dbid >= (unsigned)server.dbnum) { + redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum); + exit(1); + } + db = server.db+dbid; + continue; + } + /* Read key */ + if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr; + /* Read value */ + if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr; + /* Check if the key already expired */ + if (expiretime != -1 && expiretime < now) { + decrRefCount(key); + decrRefCount(val); + continue; + } + /* Add the new object in the hash table */ + retval = dbAdd(db,key,val); + if (retval == REDIS_ERR) { + redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr); + exit(1); + } + /* Set the expire time if needed */ + if (expiretime != -1) setExpire(db,key,expiretime); + + /* Handle swapping while loading big datasets when VM is on */ + + /* If we detecter we are hopeless about fitting something in memory + * we just swap every new key on disk. Directly... + * Note that's important to check for this condition before resorting + * to random sampling, otherwise we may try to swap already + * swapped keys. 
*/ + if (swap_all_values) { + dictEntry *de = dictFind(db->dict,key->ptr); + + /* de may be NULL since the key already expired */ + if (de) { + vmpointer *vp; + val = dictGetEntryVal(de); + + if (val->refcount == 1 && + (vp = vmSwapObjectBlocking(val)) != NULL) + dictGetEntryVal(de) = vp; + } + decrRefCount(key); + continue; + } + decrRefCount(key); + + /* Flush data on disk once 32 MB of additional RAM are used... */ + force_swapout = 0; + if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32) + force_swapout = 1; + + /* If we have still some hope of having some value fitting memory + * then we try random sampling. */ + if (!swap_all_values && server.vm_enabled && force_swapout) { + while (zmalloc_used_memory() > server.vm_max_memory) { + if (vmSwapOneObjectBlocking() == REDIS_ERR) break; + } + if (zmalloc_used_memory() > server.vm_max_memory) + swap_all_values = 1; /* We are already using too much mem */ + } + } + fclose(fp); + return REDIS_OK; + +eoferr: /* unexpected end of file is handled here with a fatal exit */ + redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now."); + exit(1); + return REDIS_ERR; /* Just to avoid warning */ +} + +/* A background saving child (BGSAVE) terminated its work. Handle this. 
*/ +void backgroundSaveDoneHandler(int statloc) { + int exitcode = WEXITSTATUS(statloc); + int bysignal = WIFSIGNALED(statloc); + + if (!bysignal && exitcode == 0) { + redisLog(REDIS_NOTICE, + "Background saving terminated with success"); + server.dirty = 0; + server.lastsave = time(NULL); + } else if (!bysignal && exitcode != 0) { + redisLog(REDIS_WARNING, "Background saving error"); + } else { + redisLog(REDIS_WARNING, + "Background saving terminated by signal %d", WTERMSIG(statloc)); + rdbRemoveTempFile(server.bgsavechildpid); + } + server.bgsavechildpid = -1; + /* Possibly there are slaves waiting for a BGSAVE in order to be served + * (the first stage of SYNC is a bulk transfer of dump.rdb) */ + updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR); +} diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c new file mode 100644 index 000000000..123d81180 --- /dev/null +++ b/src/redis-benchmark.c @@ -0,0 +1,665 @@ +/* Redis benchmark utility. + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fmacros.h"
+
+/* NOTE(review): the header names of the following eight includes are
+ * missing in this copy of the patch - confirm against the real source. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ae.h"
+#include "anet.h"
+#include "sds.h"
+#include "adlist.h"
+#include "zmalloc.h"
+
+/* Kind of reply a client expects, stored in client->replytype and used by
+ * readHandler() to decide how to parse the incoming data. */
+#define REPLY_INT 0
+#define REPLY_RETCODE 1
+#define REPLY_BULK 2
+#define REPLY_MBULK 3
+
+/* Values of client->state. */
+#define CLIENT_CONNECTING 0
+#define CLIENT_SENDQUERY 1
+#define CLIENT_READREPLY 2
+
+/* Highest latency, in milliseconds, with its own slot in config.latency:
+ * clientDone() clamps bigger values to this one. */
+#define MAX_LATENCY 5000
+
+/* Silence "unused parameter" warnings. */
+#define REDIS_NOTUSED(V) ((void) V)
+
+/* Global benchmark configuration and running state. */
+static struct config {
+    int debug;          /* when set clientDone() prints the bytes received */
+    int numclients;     /* number of clients createMissingClients() keeps alive */
+    int requests;       /* the event loop is stopped once donerequests reaches this */
+    int liveclients;    /* clients currently allocated */
+    int donerequests;   /* requests completed so far */
+    int keysize;
+    int datasize;       /* reported as "bytes payload" in the latency report */
+    int randomkeys;     /* when set the key in the query is randomized */
+    int randomkeys_keyspacelen; /* random keys are taken modulo this value */
+    aeEventLoop *el;    /* event loop the client handlers are registered on */
+    char *hostip;       /* address passed to anetTcpNonBlockConnect() */
+    int hostport;       /* port passed to anetTcpNonBlockConnect() */
+    int keepalive;      /* 1 = reuse the client, 0 = free it after each reply */
+    long long start;    /* mstime() at prepareForBenchmark() */
+    long long totlatency; /* mstime()-start, computed by endBenchmark() */
+    int *latency;       /* requests counted by latency in ms, MAX_LATENCY+1 slots */
+    list *clients;      /* all the allocated clients */
+    int quiet;          /* print a single summary line per test */
+    int loop;
+    int idlemode;
+} config;
+
+/* State of a single benchmark connection. */
+typedef struct _client {
+    int state;          /* one of the CLIENT_* values */
+    int fd;             /* socket */
+    sds obuf;           /* query to send */
+    sds ibuf;           /* reply data received so far */
+    int mbulk;          /* Number of elements in an mbulk reply */
+    int readlen;        /* readlen == -1 means read a single line */
+    int totreceived;    /* bytes read for the current request */
+    unsigned int written; /* bytes of 'obuf' already written */
+    int replytype;      /* one of the REPLY_* values */
+    long long start;    /* start time in milliseconds */
+} *client;
+
+/* Prototypes */
+static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask);
+static void createMissingClients(client c);
+
+/* Implementation */
+
+/* Current time in milliseconds. The seconds are widened to long long before
+ * the multiplication so that the result can't overflow where long is 32
+ * bits wide. */
+static long long mstime(void) {
+    struct timeval tv;
+    long long mst;
+
+    gettimeofday(&tv, NULL);
+    mst = ((long long)tv.tv_sec)*1000;
+    mst += tv.tv_usec/1000;
+    return mst;
+}
+
+/* Unregister the client events, release its buffers and socket, and remove
+ * it from config.clients. config.liveclients is decremented. */
+static void freeClient(client c) {
+    listNode *ln;
+
+    aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+    aeDeleteFileEvent(config.el,c->fd,AE_READABLE);
+    sdsfree(c->ibuf);
+    sdsfree(c->obuf);
+    close(c->fd);
+    zfree(c);
+    config.liveclients--;
+    /* Only the pointer value is used from here on. */
+    ln = listSearchKey(config.clients,c);
+    assert(ln != NULL);
+    listDelNode(config.clients,ln);
+}
+
+/* Free every client in config.clients. */
+static void freeAllClients(void) {
+    listNode *ln = config.clients->head, *next;
+
+    while(ln) {
+        /* freeClient() deletes the node: fetch the next one first. */
+        next = ln->next;
+        freeClient(ln->value);
+        ln = next;
+    }
+}
+
+/* Prepare a keep-alive client to send its query again: switch it back to
+ * the write handler and reset the per-request state. */
+static void resetClient(client c) {
+    aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+    aeDeleteFileEvent(config.el,c->fd,AE_READABLE);
+    aeCreateFileEvent(config.el,c->fd, AE_WRITABLE,writeHandler,c);
+    sdsfree(c->ibuf);
+    c->ibuf = sdsempty();
+    c->readlen = (c->replytype == REPLY_BULK ||
+                  c->replytype == REPLY_MBULK) ? -1 : 0;
+    c->mbulk = -1;
+    c->written = 0;
+    c->totreceived = 0;
+    c->state = CLIENT_SENDQUERY;
+    c->start = mstime();
+    createMissingClients(c);
+}
+
+/* Overwrite the characters following the first "_rand" in the client query
+ * with a random number in the range 0 .. randomkeys_keyspacelen-1. Nothing
+ * is done if the query has no "_rand" marker. A keyspace length of zero
+ * (or less) always yields key 0 instead of dividing by zero. */
+static void randomizeClientKey(client c) {
+    char *p;
+    char buf[32];
+    long r;
+
+    p = strstr(c->obuf, "_rand");
+    if (!p) return;
+    p += 5;
+    r = (config.randomkeys_keyspacelen > 0) ?
+        random() % config.randomkeys_keyspacelen : 0;
+    snprintf(buf,sizeof(buf),"%ld",r);
+    /* Copy the digits only, without the terminator: the query continues
+     * after the number. */
+    memcpy(p,buf,strlen(buf));
+}
+
+/* Set the reply type the client has to expect, together with the matching
+ * initial parsing state. */
+static void prepareClientForReply(client c, int type) {
+    if (type == REPLY_BULK) {
+        c->replytype = REPLY_BULK;
+        c->readlen = -1;
+    } else if (type == REPLY_MBULK) {
+        c->replytype = REPLY_MBULK;
+        c->readlen = -1;
+        c->mbulk = -1;
+    } else {
+        c->replytype = type;
+        c->readlen = 0;
+    }
+}
+
+/* Called when the reply to a request was fully received: account the
+ * request and its latency, stop the event loop if it was the last one,
+ * otherwise reuse the client (keep alive) or replace it with a new one. */
+static void clientDone(client c) {
+    static int last_tot_received = 1;
+
+    long long latency;
+    config.donerequests ++;
+    latency = mstime() - c->start;
+    /* The latency is used as an array index: keep it inside the histogram
+     * even if the wall clock jumped backward. */
+    if (latency < 0) latency = 0;
+    if (latency > MAX_LATENCY) latency = MAX_LATENCY;
+    config.latency[latency]++;
+
+    if (config.debug && last_tot_received != c->totreceived) {
+        printf("Tot bytes received: %d\n", c->totreceived);
+        last_tot_received = c->totreceived;
+    }
+    if (config.donerequests == config.requests) {
+        freeClient(c);
+        aeStop(config.el);
+        return;
+    }
+    if (config.keepalive) {
+        resetClient(c);
+        if (config.randomkeys) randomizeClientKey(c);
+    } else {
+        /* Pretend the client is already gone so that a replacement is
+         * created from its query, then really free it (freeClient()
+         * decrements the counter again). */
+        config.liveclients--;
+        createMissingClients(c);
+        config.liveclients++;
+        freeClient(c);
+    }
+}
+
+/* Readable event handler: append the received data to the input buffer and
+ * parse it according to the expected reply type, calling clientDone() once
+ * the reply is complete. The client is freed on read error or EOF. */
+static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask)
+{
+    char buf[1024];
+    int nread;
+    client c = privdata;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(fd);
+    REDIS_NOTUSED(mask);
+
+    nread = read(c->fd, buf, 1024);
+    if (nread == -1) {
+        fprintf(stderr, "Reading from socket: %s\n", strerror(errno));
+        freeClient(c);
+        return;
+    }
+    if (nread == 0) {
+        fprintf(stderr, "EOF from client\n");
+        freeClient(c);
+        return;
+    }
+    c->totreceived += nread;
+    c->ibuf = sdscatlen(c->ibuf,buf,nread);
+
+processdata:
+    /* Are we waiting for the first line of the command or for the
+     * count in bulk or multi bulk operations? */
+    if (c->replytype == REPLY_INT ||
+        c->replytype == REPLY_RETCODE ||
+        (c->replytype == REPLY_BULK && c->readlen == -1) ||
+        (c->replytype == REPLY_MBULK && c->readlen == -1) ||
+        (c->replytype == REPLY_MBULK && c->mbulk == -1)) {
+        char *p;
+
+        /* Check if the first line is complete. This is only true if
+         * there is a newline inside the buffer. */
+        if ((p = strchr(c->ibuf,'\n')) != NULL) {
+            if (c->replytype == REPLY_BULK ||
+                (c->replytype == REPLY_MBULK && c->mbulk != -1))
+            {
+                /* Read the count of a bulk reply (being it a single bulk or
+                 * a multi bulk reply). "$" for the protocol spec. */
+                *p = '\0';
+                /* Strip the byte before the newline too, unless the newline
+                 * is the very first byte of the buffer. */
+                if (p > c->ibuf) *(p-1) = '\0';
+                c->readlen = atoi(c->ibuf+1)+2;
+                // printf("BULK ATOI: %s\n", c->ibuf+1);
+                /* Handle null bulk reply "$-1" */
+                if (c->readlen-2 == -1) {
+                    clientDone(c);
+                    return;
+                }
+                /* Leave all the rest in the input buffer */
+                c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1);
+                /* fall through to reach the point where the code will try
+                 * to check if the bulk reply is complete. */
+            } else if (c->replytype == REPLY_MBULK && c->mbulk == -1) {
+                /* Read the count of a multi bulk reply. That is, how many
+                 * bulk replies we have to read next. "*" protocol. */
+                *p = '\0';
+                if (p > c->ibuf) *(p-1) = '\0';
+                c->mbulk = atoi(c->ibuf+1);
+                /* Handle null bulk reply "*-1" */
+                if (c->mbulk == -1) {
+                    clientDone(c);
+                    return;
+                }
+                // printf("%p) %d elements list\n", c, c->mbulk);
+                /* Leave all the rest in the input buffer */
+                c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1);
+                goto processdata;
+            } else {
+                c->ibuf = sdstrim(c->ibuf,"\r\n");
+                clientDone(c);
+                return;
+            }
+        }
+    }
+    /* bulk read, did we read everything? */
+    if (((c->replytype == REPLY_MBULK && c->mbulk != -1) ||
+         (c->replytype == REPLY_BULK)) && c->readlen != -1 &&
+          (unsigned)c->readlen <= sdslen(c->ibuf))
+    {
+        // printf("BULKSTATUS mbulk:%d readlen:%d sdslen:%d\n",
+        //    c->mbulk,c->readlen,sdslen(c->ibuf));
+        if (c->replytype == REPLY_BULK) {
+            clientDone(c);
+        } else if (c->replytype == REPLY_MBULK) {
+            // printf("%p) %d (%d)) ",c, c->mbulk, c->readlen);
+            // fwrite(c->ibuf,c->readlen,1,stdout);
+            // printf("\n");
+            if (--c->mbulk == 0) {
+                clientDone(c);
+            } else {
+                /* Drop the element just consumed and parse the next one. */
+                c->ibuf = sdsrange(c->ibuf,c->readlen,-1);
+                c->readlen = -1;
+                goto processdata;
+            }
+        }
+    }
+}
+
+/* Writable event handler: send the part of the query not yet written. Once
+ * the whole query is out, switch the client to the read handler. The
+ * client is freed on write error. */
+static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask)
+{
+    client c = privdata;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(fd);
+    REDIS_NOTUSED(mask);
+
+    if (c->state == CLIENT_CONNECTING) {
+        /* First writable event: start timing the request from here. */
+        c->state = CLIENT_SENDQUERY;
+        c->start = mstime();
+    }
+    if (sdslen(c->obuf) > c->written) {
+        void *ptr = c->obuf+c->written;
+        int len = sdslen(c->obuf) - c->written;
+        int nwritten = write(c->fd, ptr, len);
+        if (nwritten == -1) {
+            if (errno != EPIPE)
+                fprintf(stderr, "Writing to socket: %s\n", strerror(errno));
+            freeClient(c);
+            return;
+        }
+        c->written += nwritten;
+        if (sdslen(c->obuf) == c->written) {
+            aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+            aeCreateFileEvent(config.el,c->fd,AE_READABLE,readHandler,c);
+            c->state = CLIENT_READREPLY;
+        }
+    }
+}
+
+/* Allocate a client, start a non blocking connection to the configured
+ * host and port, register the write handler and add the client to
+ * config.clients. Returns NULL (after printing the error) if the
+ * connection can't be started. */
+static client createClient(void) {
+    client c = zmalloc(sizeof(struct _client));
+    char err[ANET_ERR_LEN];
+
+    c->fd = anetTcpNonBlockConnect(err,config.hostip,config.hostport);
+    if (c->fd == ANET_ERR) {
+        zfree(c);
+        fprintf(stderr,"Connect: %s\n",err);
+        return NULL;
+    }
+    anetTcpNoDelay(NULL,c->fd);
+    c->obuf = sdsempty();
+    c->ibuf = sdsempty();
+    c->mbulk = -1;
+    c->readlen = 0;
+    c->written = 0;
+    c->totreceived = 0;
+    c->state = CLIENT_CONNECTING;
+    aeCreateFileEvent(config.el, c->fd, AE_WRITABLE, writeHandler, c);
+    config.liveclients++;
+    listAddNodeTail(config.clients,c);
+    return c;
+}
+
+/* Create clients until config.numclients of them are alive. Every new
+ * client gets a copy of the query of 'c' and expects the same reply type.
+ * Note that a failed connection is simply retried. */
+static void createMissingClients(client c) {
+    while(config.liveclients < config.numclients) {
+        client new = createClient();
+        if (!new) continue;
+        sdsfree(new->obuf);
+        new->obuf = sdsdup(c->obuf);
+        if (config.randomkeys) randomizeClientKey(c);
+        prepareClientForReply(new,c->replytype);
+    }
+}
+
+/* Print the result of the test 'title': the cumulative latency distribution
+ * and the requests per second, or just the latter in quiet mode. */
+static void showLatencyReport(char *title) {
+    int j, seen = 0;
+    float perc, reqpersec;
+
+    reqpersec = (float)config.donerequests/((float)config.totlatency/1000);
+    if (!config.quiet) {
+        printf("====== %s ======\n", title);
+        printf("  %d requests completed in %.2f seconds\n", config.donerequests,
+            (float)config.totlatency/1000);
+        printf("  %d parallel clients\n", config.numclients);
+        printf("  %d bytes payload\n", config.datasize);
+        printf("  keep alive: %d\n", config.keepalive);
+        printf("\n");
+        for (j = 0; j <= MAX_LATENCY; j++) {
+            if (config.latency[j]) {
+                seen += config.latency[j];
+                perc = ((float)seen*100)/config.donerequests;
+                printf("%.2f%% <= %d milliseconds\n", perc, j);
+            }
+        }
+        printf("%.2f requests per second\n\n", reqpersec);
+    } else {
+        printf("%s: %.2f requests per second\n", title, reqpersec);
+    }
+}
+
+/* Reset the latency histogram and the request counter, and take the start
+ * time of the test. */
+static void prepareForBenchmark(void)
+{
+    memset(config.latency,0,sizeof(int)*(MAX_LATENCY+1));
+    config.start = mstime();
+    config.donerequests = 0;
+}
+
+/* Compute the total time of the test, print the report and free all the
+ * clients. */
+static void endBenchmark(char *title) {
+    config.totlatency = mstime()-config.start;
+    showLatencyReport(title);
+    freeAllClients();
+}
+
+void parseOptions(int argc, char **argv) {
+    int i;
+
+    for (i = 1; i < argc; i++) {
+        int lastarg = i==argc-1;
+
+        if (!strcmp(argv[i],"-c") && !lastarg) {
+            config.numclients = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-n") && !lastarg) {
+            config.requests = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-k") && !lastarg) {
+            config.keepalive = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-h") && !lastarg) {
+            char *ip = zmalloc(32);
+            if (anetResolve(NULL,argv[i+1],ip)
== ANET_ERR) { + /* Report the hostname operand (argv[i+1]), not the "-h" flag itself. */ + printf("Can't resolve %s\n", argv[i+1]); + exit(1); + } + config.hostip = ip; + i++; + } else if (!strcmp(argv[i],"-p") && !lastarg) { + config.hostport = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-d") && !lastarg) { + config.datasize = atoi(argv[i+1]); + i++; + /* Clamp the payload size to the 1 byte .. 1 MB range. */ + if (config.datasize < 1) config.datasize=1; + if (config.datasize > 1024*1024) config.datasize = 1024*1024; + } else if (!strcmp(argv[i],"-r") && !lastarg) { + config.randomkeys = 1; + config.randomkeys_keyspacelen = atoi(argv[i+1]); + if (config.randomkeys_keyspacelen < 0) + config.randomkeys_keyspacelen = 0; + i++; + } else if (!strcmp(argv[i],"-q")) { + config.quiet = 1; + } else if (!strcmp(argv[i],"-l")) { + config.loop = 1; + } else if (!strcmp(argv[i],"-D")) { + config.debug = 1; + } else if (!strcmp(argv[i],"-I")) { + config.idlemode = 1; + } else { + /* Unknown flag, or a flag that needs an argument was given last. */ + printf("Wrong option '%s' or option argument missing\n\n",argv[i]); + printf("Usage: redis-benchmark [-h ] [-p ] [-c ] [-n [-k ]\n\n"); + printf(" -h Server hostname (default 127.0.0.1)\n"); + printf(" -p Server port (default 6379)\n"); + printf(" -c Number of parallel connections (default 50)\n"); + printf(" -n Total number of requests (default 10000)\n"); + printf(" -d Data size of SET/GET value in bytes (default 3)\n"); + printf(" -k 1=keep alive 0=reconnect (default 1)\n"); + printf(" -r Use random keys for SET/GET/INCR, random values for SADD\n"); + printf(" Using this option the benchmark will get/set keys\n"); + printf(" in the form mykey_rand000000012456 instead of constant\n"); + printf(" keys, the argument determines the max\n"); + printf(" number of values for the random number. For instance\n"); + printf(" if set to 10 only rand000000000000 - rand000000000009\n"); + printf(" range will be allowed.\n"); + printf(" -q Quiet. Just show query/sec values\n"); + printf(" -l Loop. Run the tests forever\n"); + printf(" -I Idle mode. Just open N idle connections and wait.\n"); + printf(" -D Debug mode. 
more verbose.\n"); + exit(1); + } + } +} + +int main(int argc, char **argv) { + client c; + + signal(SIGHUP, SIG_IGN); + signal(SIGPIPE, SIG_IGN); + + config.debug = 0; + config.numclients = 50; + config.requests = 10000; + config.liveclients = 0; + config.el = aeCreateEventLoop(); + config.keepalive = 1; + config.donerequests = 0; + config.datasize = 3; + config.randomkeys = 0; + config.randomkeys_keyspacelen = 0; + config.quiet = 0; + config.loop = 0; + config.idlemode = 0; + config.latency = NULL; + config.clients = listCreate(); + config.latency = zmalloc(sizeof(int)*(MAX_LATENCY+1)); + + config.hostip = "127.0.0.1"; + config.hostport = 6379; + + parseOptions(argc,argv); + + if (config.keepalive == 0) { + printf("WARNING: keepalive disabled, you probably need 'echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse' for Linux and 'sudo sysctl -w net.inet.tcp.msl=1000' for Mac OS X in order to use a lot of clients/requests\n"); + } + + if (config.idlemode) { + printf("Creating %d idle connections and waiting forever (Ctrl+C when done)\n", config.numclients); + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdsempty(); + prepareClientForReply(c,REPLY_RETCODE); /* will never receive it */ + createMissingClients(c); + aeMain(config.el); + /* and will wait for every */ + } + + do { + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"PING\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("PING"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"*1\r\n$4\r\nPING\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("PING (multi bulk)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscatprintf(c->obuf,"SET foo_rand000000000000 %d\r\n",config.datasize); + { + char *data = zmalloc(config.datasize+2); + 
memset(data,'x',config.datasize); + data[config.datasize] = '\r'; + data[config.datasize+1] = '\n'; + c->obuf = sdscatlen(c->obuf,data,config.datasize+2); + } + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SET"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"GET foo_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("GET"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"INCR counter_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_INT); + createMissingClients(c); + aeMain(config.el); + endBenchmark("INCR"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); + prepareClientForReply(c,REPLY_INT); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPUSH"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPOP mylist\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPOP"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"SADD myset 24\r\ncounter_rand000000000000\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SADD"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"SPOP myset\r\n"); + prepareClientForReply(c,REPLY_BULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("SPOP"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LPUSH mylist 3\r\nbar\r\n"); + prepareClientForReply(c,REPLY_RETCODE); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LPUSH (again, in order to bench LRANGE)"); + + prepareForBenchmark(); 
+ c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 99\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 100 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 299\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 300 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 449\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 450 elements)"); + + prepareForBenchmark(); + c = createClient(); + if (!c) exit(1); + c->obuf = sdscat(c->obuf,"LRANGE mylist 0 599\r\n"); + prepareClientForReply(c,REPLY_MBULK); + createMissingClients(c); + aeMain(config.el); + endBenchmark("LRANGE (first 600 elements)"); + + printf("\n"); + } while(config.loop); + + return 0; +} diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c new file mode 100644 index 000000000..ff0d1f82c --- /dev/null +++ b/src/redis-check-aof.c @@ -0,0 +1,185 @@ +#include "fmacros.h" +#include +#include +#include +#include +#include +#include "config.h" + +#define ERROR(...) 
{ \ + char __buf[1024]; \ + sprintf(__buf, __VA_ARGS__); \ + sprintf(error, "0x%08lx: %s", epos, __buf); \ +} + +static char error[1024]; +static long epos; + +int consumeNewline(char *buf) { + if (strncmp(buf,"\r\n",2) != 0) { + ERROR("Expected \\r\\n, got: %02x%02x",buf[0],buf[1]); + return 0; + } + return 1; +} + +int readLong(FILE *fp, char prefix, long *target) { + char buf[128], *eptr; + epos = ftell(fp); + if (fgets(buf,sizeof(buf),fp) == NULL) { + return 0; + } + if (buf[0] != prefix) { + ERROR("Expected prefix '%c', got: '%c'",buf[0],prefix); + return 0; + } + *target = strtol(buf+1,&eptr,10); + return consumeNewline(eptr); +} + +int readBytes(FILE *fp, char *target, long length) { + long real; + epos = ftell(fp); + real = fread(target,1,length,fp); + if (real != length) { + ERROR("Expected to read %ld bytes, got %ld bytes",length,real); + return 0; + } + return 1; +} + +int readString(FILE *fp, char** target) { + long len; + *target = NULL; + if (!readLong(fp,'$',&len)) { + return 0; + } + + /* Increase length to also consume \r\n */ + len += 2; + *target = (char*)malloc(len); + if (!readBytes(fp,*target,len)) { + return 0; + } + if (!consumeNewline(*target+len-2)) { + return 0; + } + (*target)[len-2] = '\0'; + return 1; +} + +int readArgc(FILE *fp, long *target) { + return readLong(fp,'*',target); +} + +long process(FILE *fp) { + long argc, pos = 0; + int i, multi = 0; + char *str; + + while(1) { + if (!multi) pos = ftell(fp); + if (!readArgc(fp, &argc)) break; + + for (i = 0; i < argc; i++) { + if (!readString(fp,&str)) break; + if (i == 0) { + if (strcasecmp(str, "multi") == 0) { + if (multi++) { + ERROR("Unexpected MULTI"); + break; + } + } else if (strcasecmp(str, "exec") == 0) { + if (--multi) { + ERROR("Unexpected EXEC"); + break; + } + } + } + free(str); + } + + /* Stop if the loop did not finish */ + if (i < argc) { + if (str) free(str); + break; + } + } + + if (feof(fp) && multi && strlen(error) == 0) { + ERROR("Reached EOF before reading EXEC 
for MULTI"); + } + if (strlen(error) > 0) { + printf("%s\n", error); + } + return pos; +} + +int main(int argc, char **argv) { + char *filename; + int fix = 0; + + if (argc < 2) { + printf("Usage: %s [--fix] \n", argv[0]); + exit(1); + } else if (argc == 2) { + filename = argv[1]; + } else if (argc == 3) { + if (strcmp(argv[1],"--fix") != 0) { + printf("Invalid argument: %s\n", argv[1]); + exit(1); + } + filename = argv[2]; + fix = 1; + } else { + printf("Invalid arguments\n"); + exit(1); + } + + FILE *fp = fopen(filename,"r+"); + if (fp == NULL) { + printf("Cannot open file: %s\n", filename); + exit(1); + } + + struct redis_stat sb; + if (redis_fstat(fileno(fp),&sb) == -1) { + printf("Cannot stat file: %s\n", filename); + exit(1); + } + + long size = sb.st_size; + if (size == 0) { + printf("Empty file: %s\n", filename); + exit(1); + } + + long pos = process(fp); + long diff = size-pos; + if (diff > 0) { + if (fix) { + char buf[2]; + printf("This will shrink the AOF from %ld bytes, with %ld bytes, to %ld bytes\n",size,diff,pos); + printf("Continue? [y/N]: "); + if (fgets(buf,sizeof(buf),stdin) == NULL || + strncasecmp(buf,"y",1) != 0) { + printf("Aborting...\n"); + exit(1); + } + if (ftruncate(fileno(fp), pos) == -1) { + printf("Failed to truncate AOF\n"); + exit(1); + } else { + printf("Successfully truncated AOF\n"); + } + } else { + printf("AOF is not valid\n"); + exit(1); + } + } else { + printf("AOF is valid\n"); + } + + fclose(fp); + return 0; +} diff --git a/src/redis-check-dump.c b/src/redis-check-dump.c new file mode 100644 index 000000000..0b002790d --- /dev/null +++ b/src/redis-check-dump.c @@ -0,0 +1,671 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lzf.h" + +/* Object types */ +#define REDIS_STRING 0 +#define REDIS_LIST 1 +#define REDIS_SET 2 +#define REDIS_ZSET 3 +#define REDIS_HASH 4 + +/* Objects encoding. 
Some kinds of objects like Strings and Hashes can be + * internally represented in multiple ways. The 'encoding' field of the object + * is set to one of these values for this object. */ +#define REDIS_ENCODING_RAW 0 /* Raw representation */ +#define REDIS_ENCODING_INT 1 /* Encoded as integer */ +#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */ +#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */ + +/* Object types only used for dumping to disk */ +#define REDIS_EXPIRETIME 253 +#define REDIS_SELECTDB 254 +#define REDIS_EOF 255 + +/* Defines related to the dump file format. To store 32 bits lengths for short + * keys requires a lot of space, so we check the most significant 2 bits of + * the first byte to interpret the length: + * + * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte + * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte + * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow + * 11|000000 this means: specially encoded object will follow. The six bits + * number specify the kind of object that follows. + * See the REDIS_RDB_ENC_* defines. + * + * Lengths up to 63 are stored using a single byte, most DB keys, and many + * values, will fit inside. */ +#define REDIS_RDB_6BITLEN 0 +#define REDIS_RDB_14BITLEN 1 +#define REDIS_RDB_32BITLEN 2 +#define REDIS_RDB_ENCVAL 3 +#define REDIS_RDB_LENERR UINT_MAX + +/* When a length of a string object stored on disk has the first two bits + * set, the remaining six bits specify a special encoding for the object + * according to the following defines: */ +#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ +#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ +#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ +#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */ + +#define ERROR(...) 
{ \ + printf(__VA_ARGS__); \ + exit(1); \ +} + +/* data type to hold offset in file and size */ +typedef struct { + void *data; + unsigned long size; + unsigned long offset; +} pos; + +static unsigned char level = 0; +static pos positions[16]; + +#define CURR_OFFSET (positions[level].offset) + +/* Hold a stack of errors */ +typedef struct { + char error[16][1024]; + unsigned long offset[16]; + unsigned int level; +} errors_t; +static errors_t errors; + +#define SHIFT_ERROR(provided_offset, ...) { \ + sprintf(errors.error[errors.level], __VA_ARGS__); \ + errors.offset[errors.level] = provided_offset; \ + errors.level++; \ +} + +/* Data type to hold opcode with optional key name an success status */ +typedef struct { + char* key; + int type; + char success; +} entry; + +/* Global vars that are actally used as constants. The following double + * values are used for double on-disk serialization, and are initialized + * at runtime to avoid strange compiler optimizations. */ +static double R_Zero, R_PosInf, R_NegInf, R_Nan; + +/* store string types for output */ +static char types[256][16]; + +/* when number of bytes to read is negative, do a peek */ +int readBytes(void *target, long num) { + char peek = (num < 0) ? 1 : 0; + num = (num < 0) ? 
-num : num; + + pos p = positions[level]; + if (p.offset + num > p.size) { + return 0; + } else { + memcpy(target, (void*)((unsigned long)p.data + p.offset), num); + if (!peek) positions[level].offset += num; + } + return 1; +} + +int processHeader() { + char buf[10] = "_________"; + int dump_version; + + if (!readBytes(buf, 9)) { + ERROR("Cannot read header\n"); + } + + /* expect the first 5 bytes to equal REDIS */ + if (memcmp(buf,"REDIS",5) != 0) { + ERROR("Wrong signature in header\n"); + } + + dump_version = (int)strtol(buf + 5, NULL, 10); + if (dump_version != 1) { + ERROR("Unknown RDB format version: %d\n", dump_version); + } + return 1; +} + +int loadType(entry *e) { + uint32_t offset = CURR_OFFSET; + + /* this byte needs to qualify as type */ + unsigned char t; + if (readBytes(&t, 1)) { + if (t <= 4 || t >= 253) { + e->type = t; + return 1; + } else { + SHIFT_ERROR(offset, "Unknown type (0x%02x)", t); + } + } else { + SHIFT_ERROR(offset, "Could not read type"); + } + + /* failure */ + return 0; +} + +int peekType() { + unsigned char t; + if (readBytes(&t, -1) && (t <= 4 || t >= 253)) return t; + return -1; +} + +/* discard time, just consume the bytes */ +int processTime() { + uint32_t offset = CURR_OFFSET; + unsigned char t[4]; + if (readBytes(t, 4)) { + return 1; + } else { + SHIFT_ERROR(offset, "Could not read time"); + } + + /* failure */ + return 0; +} + +uint32_t loadLength(int *isencoded) { + unsigned char buf[2]; + uint32_t len; + int type; + + if (isencoded) *isencoded = 0; + if (!readBytes(buf, 1)) return REDIS_RDB_LENERR; + type = (buf[0] & 0xC0) >> 6; + if (type == REDIS_RDB_6BITLEN) { + /* Read a 6 bit len */ + return buf[0] & 0x3F; + } else if (type == REDIS_RDB_ENCVAL) { + /* Read a 6 bit len encoding type */ + if (isencoded) *isencoded = 1; + return buf[0] & 0x3F; + } else if (type == REDIS_RDB_14BITLEN) { + /* Read a 14 bit len */ + if (!readBytes(buf+1,1)) return REDIS_RDB_LENERR; + return ((buf[0] & 0x3F) << 8) | buf[1]; + } else { + /* 
Read a 32 bit len */ + if (!readBytes(&len, 4)) return REDIS_RDB_LENERR; + return (unsigned int)ntohl(len); + } +} + +/* Read an integer stored with one of the REDIS_RDB_ENC_INT* encodings + * (little endian on disk) and return it as a malloc'ed decimal string. + * Returns NULL on a short read or an unknown encoding. */ +char *loadIntegerObject(int enctype) { + uint32_t offset = CURR_OFFSET; + unsigned char enc[4]; + long long val; + + if (enctype == REDIS_RDB_ENC_INT8) { + uint8_t v; + if (!readBytes(enc, 1)) return NULL; + v = enc[0]; + val = (int8_t)v; + } else if (enctype == REDIS_RDB_ENC_INT16) { + uint16_t v; + if (!readBytes(enc, 2)) return NULL; + v = enc[0]|(enc[1]<<8); + val = (int16_t)v; + } else if (enctype == REDIS_RDB_ENC_INT32) { + uint32_t v; + if (!readBytes(enc, 4)) return NULL; + /* Widen the top byte before shifting: enc[3] promotes to int and + * shifting a value >= 0x80 left by 24 would overflow a signed int. */ + v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|((uint32_t)enc[3]<<24); + val = (int32_t)v; + } else { + SHIFT_ERROR(offset, "Unknown integer encoding (0x%02x)", enctype); + return NULL; + } + + /* convert val into string */ + char *buf; + buf = malloc(sizeof(char) * 128); + sprintf(buf, "%lld", val); + return buf; +} + +/* Read an LZF compressed string (compressed length, uncompressed length, + * compressed payload) and return the decompressed data as a malloc'ed, + * NUL-terminated buffer, or NULL on failure. */ +char* loadLzfStringObject() { + unsigned int slen, clen; + char *c, *s; + + if ((clen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; + if ((slen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; + + c = malloc(clen); + if (!readBytes(c, clen)) { + free(c); + return NULL; + } + + s = malloc(slen+1); + if (lzf_decompress(c,clen,s,slen) == 0) { + free(c); free(s); + return NULL; + } + /* The extra byte allocated above is for the terminator: callers treat the + * result as a C string, so it has to be set explicitly. */ + s[slen] = '\0'; + + free(c); + return s; +} + +/* returns NULL when not processable, char* when valid */ +char* loadStringObject() { + uint32_t offset = CURR_OFFSET; + int isencoded; + uint32_t len; + + len = loadLength(&isencoded); + if (isencoded) { + switch(len) { + case REDIS_RDB_ENC_INT8: + case REDIS_RDB_ENC_INT16: + case REDIS_RDB_ENC_INT32: + return loadIntegerObject(len); + case REDIS_RDB_ENC_LZF: + return loadLzfStringObject(); + default: + /* unknown encoding */ + SHIFT_ERROR(offset, "Unknown string encoding (0x%02x)", len); + return NULL; + } + } + + if (len == REDIS_RDB_LENERR) return NULL; + + char *buf = malloc(sizeof(char) * (len+1)); + buf[len] = '\0'; + if (!readBytes(buf, len)) { + 
free(buf); + return NULL; + } + return buf; +} + +int processStringObject(char** store) { + unsigned long offset = CURR_OFFSET; + char *key = loadStringObject(); + if (key == NULL) { + SHIFT_ERROR(offset, "Error reading string object"); + free(key); + return 0; + } + + if (store != NULL) { + *store = key; + } else { + free(key); + } + return 1; +} + +double* loadDoubleValue() { + char buf[256]; + unsigned char len; + double* val; + + if (!readBytes(&len,1)) return NULL; + + val = malloc(sizeof(double)); + switch(len) { + case 255: *val = R_NegInf; return val; + case 254: *val = R_PosInf; return val; + case 253: *val = R_Nan; return val; + default: + if (!readBytes(buf, len)) { + free(val); + return NULL; + } + buf[len] = '\0'; + sscanf(buf, "%lg", val); + return val; + } +} + +int processDoubleValue(double** store) { + unsigned long offset = CURR_OFFSET; + double *val = loadDoubleValue(); + if (val == NULL) { + SHIFT_ERROR(offset, "Error reading double value"); + free(val); + return 0; + } + + if (store != NULL) { + *store = val; + } else { + free(val); + } + return 1; +} + +int loadPair(entry *e) { + uint32_t offset = CURR_OFFSET; + uint32_t i; + + /* read key first */ + char *key; + if (processStringObject(&key)) { + e->key = key; + } else { + SHIFT_ERROR(offset, "Error reading entry key"); + return 0; + } + + uint32_t length = 0; + if (e->type == REDIS_LIST || + e->type == REDIS_SET || + e->type == REDIS_ZSET || + e->type == REDIS_HASH) { + if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { + SHIFT_ERROR(offset, "Error reading %s length", types[e->type]); + return 0; + } + } + + switch(e->type) { + case REDIS_STRING: + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading entry value"); + return 0; + } + break; + case REDIS_LIST: + case REDIS_SET: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element at index %d (length: %d)", i, length); + return 0; + } + } 
+ break; + case REDIS_ZSET: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); + return 0; + } + offset = CURR_OFFSET; + if (!processDoubleValue(NULL)) { + SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); + return 0; + } + } + break; + case REDIS_HASH: + for (i = 0; i < length; i++) { + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length); + return 0; + } + offset = CURR_OFFSET; + if (!processStringObject(NULL)) { + SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length); + return 0; + } + } + break; + default: + SHIFT_ERROR(offset, "Type not implemented"); + return 0; + } + /* because we're done, we assume success */ + e->success = 1; + return 1; +} + +entry loadEntry() { + entry e = { NULL, -1, 0 }; + uint32_t length, offset[4]; + + /* reset error container */ + errors.level = 0; + + offset[0] = CURR_OFFSET; + if (!loadType(&e)) { + return e; + } + + offset[1] = CURR_OFFSET; + if (e.type == REDIS_SELECTDB) { + if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { + SHIFT_ERROR(offset[1], "Error reading database number"); + return e; + } + if (length > 63) { + SHIFT_ERROR(offset[1], "Database number out of range (%d)", length); + return e; + } + } else if (e.type == REDIS_EOF) { + if (positions[level].offset < positions[level].size) { + SHIFT_ERROR(offset[0], "Unexpected EOF"); + } else { + e.success = 1; + } + return e; + } else { + /* optionally consume expire */ + if (e.type == REDIS_EXPIRETIME) { + if (!processTime()) return e; + if (!loadType(&e)) return e; + } + + offset[1] = CURR_OFFSET; + if (!loadPair(&e)) { + SHIFT_ERROR(offset[1], "Error for type %s", types[e.type]); + return e; + } + } + + /* all entries are followed by a valid type: + * e.g. 
a new entry, SELECTDB, EXPIRE, EOF */ + offset[2] = CURR_OFFSET; + if (peekType() == -1) { + SHIFT_ERROR(offset[2], "Followed by invalid type"); + SHIFT_ERROR(offset[0], "Error for type %s", types[e.type]); + e.success = 0; + } else { + e.success = 1; + } + + return e; +} + +void printCentered(int indent, int width, char* body) { + char head[256], tail[256]; + memset(head, '\0', 256); + memset(tail, '\0', 256); + + memset(head, '=', indent); + memset(tail, '=', width - 2 - indent - strlen(body)); + printf("%s %s %s\n", head, body, tail); +} + +void printValid(int ops, int bytes) { + char body[80]; + sprintf(body, "Processed %d valid opcodes (in %d bytes)", ops, bytes); + printCentered(4, 80, body); +} + +void printSkipped(int bytes, int offset) { + char body[80]; + sprintf(body, "Skipped %d bytes (resuming at 0x%08x)", bytes, offset); + printCentered(4, 80, body); +} + +void printErrorStack(entry *e) { + unsigned int i; + char body[64]; + + if (e->type == -1) { + sprintf(body, "Error trace"); + } else if (e->type >= 253) { + sprintf(body, "Error trace (%s)", types[e->type]); + } else if (!e->key) { + sprintf(body, "Error trace (%s: (unknown))", types[e->type]); + } else { + char tmp[41]; + strncpy(tmp, e->key, 40); + + /* display truncation at the last 3 chars */ + if (strlen(e->key) > 40) { + memset(&tmp[37], '.', 3); + } + + /* display unprintable characters as ? 
*/ + for (i = 0; i < strlen(tmp); i++) { + if (tmp[i] <= 32) tmp[i] = '?'; + } + sprintf(body, "Error trace (%s: %s)", types[e->type], tmp); + } + + printCentered(4, 80, body); + + /* display error stack */ + for (i = 0; i < errors.level; i++) { + printf("0x%08lx - %s\n", errors.offset[i], errors.error[i]); + } +} + +void process() { + int i, num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0; + entry entry; + processHeader(); + + level = 1; + while(positions[0].offset < positions[0].size) { + positions[1] = positions[0]; + + entry = loadEntry(); + if (!entry.success) { + printValid(num_valid_ops, num_valid_bytes); + printErrorStack(&entry); + num_errors++; + num_valid_ops = 0; + num_valid_bytes = 0; + + /* search for next valid entry */ + unsigned long offset = positions[0].offset + 1; + while (!entry.success && offset < positions[0].size) { + positions[1].offset = offset; + + /* find 3 consecutive valid entries */ + for (i = 0; i < 3; i++) { + entry = loadEntry(); + if (!entry.success) break; + } + /* check if we found 3 consecutive valid entries */ + if (i < 3) { + offset++; + } + } + + /* print how many bytes we have skipped to find a new valid opcode */ + if (offset < positions[0].size) { + printSkipped(offset - positions[0].offset, offset); + } + + positions[0].offset = offset; + } else { + num_valid_ops++; + num_valid_bytes += positions[1].offset - positions[0].offset; + + /* advance position */ + positions[0] = positions[1]; + } + } + + /* because there is another potential error, + * print how many valid ops we have processed */ + printValid(num_valid_ops, num_valid_bytes); + + /* expect an eof */ + if (entry.type != REDIS_EOF) { + /* last byte should be EOF, add error */ + errors.level = 0; + SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s", types[entry.type]); + + /* this is an EOF error so reset type */ + entry.type = -1; + printErrorStack(&entry); + + num_errors++; + } + + /* print summary on errors */ + if (num_errors > 0) { + printf("\n"); 
+ printf("Total unprocessable opcodes: %d\n", num_errors); + } +} + +int main(int argc, char **argv) { + /* expect the first argument to be the dump file */ + if (argc <= 1) { + printf("Usage: %s \n", argv[0]); + exit(0); + } + + int fd; + unsigned long size; + struct stat stat; + void *data; + + fd = open(argv[1], O_RDONLY); + if (fd < 1) { + ERROR("Cannot open file: %s\n", argv[1]); + } + if (fstat(fd, &stat) == -1) { + ERROR("Cannot stat: %s\n", argv[1]); + } else { + size = stat.st_size; + } + + data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (data == MAP_FAILED) { + ERROR("Cannot mmap: %s\n", argv[1]); + } + + /* Initialize static vars */ + positions[0].data = data; + positions[0].size = size; + positions[0].offset = 0; + errors.level = 0; + + /* Object types */ + sprintf(types[REDIS_STRING], "STRING"); + sprintf(types[REDIS_LIST], "LIST"); + sprintf(types[REDIS_SET], "SET"); + sprintf(types[REDIS_ZSET], "ZSET"); + sprintf(types[REDIS_HASH], "HASH"); + + /* Object types only used for dumping to disk */ + sprintf(types[REDIS_EXPIRETIME], "EXPIRETIME"); + sprintf(types[REDIS_SELECTDB], "SELECTDB"); + sprintf(types[REDIS_EOF], "EOF"); + + /* Double constants initialization */ + R_Zero = 0.0; + R_PosInf = 1.0/R_Zero; + R_NegInf = -1.0/R_Zero; + R_Nan = R_Zero/R_Zero; + + process(); + + munmap(data, size); + close(fd); + return 0; +} diff --git a/src/redis-cli.c b/src/redis-cli.c new file mode 100644 index 000000000..2daa7c461 --- /dev/null +++ b/src/redis-cli.c @@ -0,0 +1,493 @@ +/* Redis CLI (command line interface) + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "fmacros.h" + +#include +#include +#include +#include +#include + +#include "anet.h" +#include "sds.h" +#include "adlist.h" +#include "zmalloc.h" +#include "linenoise.h" + +#define REDIS_CMD_INLINE 1 +#define REDIS_CMD_BULK 2 +#define REDIS_CMD_MULTIBULK 4 + +#define REDIS_NOTUSED(V) ((void) V) + +static struct config { + char *hostip; + int hostport; + long repeat; + int dbnum; + int argn_from_stdin; + int interactive; + int shutdown; + int monitor_mode; + int pubsub_mode; + int raw_output; + char *auth; +} config; + +static int cliReadReply(int fd); +static void usage(); + +static int cliConnect(void) { + char err[ANET_ERR_LEN]; + static int fd = ANET_ERR; + + if (fd == ANET_ERR) { + fd = anetTcpConnect(err,config.hostip,config.hostport); + if (fd == ANET_ERR) { + fprintf(stderr, "Could not connect to Redis at %s:%d: %s", config.hostip, config.hostport, err); + return -1; + } + anetTcpNoDelay(NULL,fd); + } + return fd; +} + +static sds cliReadLine(int fd) { + sds line = sdsempty(); + + while(1) { + char c; + ssize_t ret; + + ret = read(fd,&c,1); + if (ret == -1) { + sdsfree(line); + return NULL; + } else if ((ret == 0) || (c == '\n')) { + break; + } else { + line = sdscatlen(line,&c,1); + } + } + return sdstrim(line,"\r\n"); +} + +static int cliReadSingleLineReply(int fd, int quiet) { + sds reply = cliReadLine(fd); + + if (reply == NULL) return 1; + if (!quiet) + printf("%s\n", reply); + sdsfree(reply); + return 0; +} + +static void printStringRepr(char *s, int len) { + printf("\""); + while(len--) { + switch(*s) { + case '\\': + case '"': + printf("\\%c",*s); + break; + case '\n': printf("\\n"); break; + case '\r': printf("\\r"); break; + case '\t': printf("\\t"); break; + case '\a': printf("\\a"); break; + case '\b': printf("\\b"); break; + default: + if (isprint(*s)) + printf("%c",*s); + else + printf("\\x%02x",(unsigned char)*s); + break; + } + s++; + } + printf("\"\n"); +} + +static int cliReadBulkReply(int fd) { + sds replylen = 
cliReadLine(fd); + char *reply, crlf[2]; + int bulklen; + + if (replylen == NULL) return 1; + bulklen = atoi(replylen); + if (bulklen == -1) { + sdsfree(replylen); + printf("(nil)\n"); + return 0; + } + reply = zmalloc(bulklen); + anetRead(fd,reply,bulklen); + anetRead(fd,crlf,2); + if (config.raw_output || !isatty(fileno(stdout))) { + if (bulklen && fwrite(reply,bulklen,1,stdout) == 0) { + zfree(reply); + return 1; + } + } else { + /* If you are producing output for the standard output we want + * a more interesting output with quoted characters and so forth */ + printStringRepr(reply,bulklen); + } + zfree(reply); + return 0; +} + +static int cliReadMultiBulkReply(int fd) { + sds replylen = cliReadLine(fd); + int elements, c = 1; + + if (replylen == NULL) return 1; + elements = atoi(replylen); + if (elements == -1) { + sdsfree(replylen); + printf("(nil)\n"); + return 0; + } + if (elements == 0) { + printf("(empty list or set)\n"); + } + while(elements--) { + printf("%d. ", c); + if (cliReadReply(fd)) return 1; + c++; + } + return 0; +} + +static int cliReadReply(int fd) { + char type; + + if (anetRead(fd,&type,1) <= 0) { + if (config.shutdown) return 0; + exit(1); + } + switch(type) { + case '-': + printf("(error) "); + cliReadSingleLineReply(fd,0); + return 1; + case '+': + return cliReadSingleLineReply(fd,0); + case ':': + printf("(integer) "); + return cliReadSingleLineReply(fd,0); + case '$': + return cliReadBulkReply(fd); + case '*': + return cliReadMultiBulkReply(fd); + default: + printf("protocol error, got '%c' as reply type byte\n", type); + return 1; + } +} + +static int selectDb(int fd) { + int retval; + sds cmd; + char type; + + if (config.dbnum == 0) + return 0; + + cmd = sdsempty(); + cmd = sdscatprintf(cmd,"SELECT %d\r\n",config.dbnum); + anetWrite(fd,cmd,sdslen(cmd)); + anetRead(fd,&type,1); + if (type <= 0 || type != '+') return 1; + retval = cliReadSingleLineReply(fd,1); + if (retval) { + return retval; + } + return 0; +} + +static int 
cliSendCommand(int argc, char **argv, int repeat) { + char *command = argv[0]; + int fd, j, retval = 0; + sds cmd; + + config.raw_output = !strcasecmp(command,"info"); + if (!strcasecmp(command,"shutdown")) config.shutdown = 1; + if (!strcasecmp(command,"monitor")) config.monitor_mode = 1; + if (!strcasecmp(command,"subscribe") || + !strcasecmp(command,"psubscribe")) config.pubsub_mode = 1; + if ((fd = cliConnect()) == -1) return 1; + + /* Select db number */ + retval = selectDb(fd); + if (retval) { + fprintf(stderr,"Error setting DB num\n"); + return 1; + } + + /* Build the command to send */ + cmd = sdscatprintf(sdsempty(),"*%d\r\n",argc); + for (j = 0; j < argc; j++) { + cmd = sdscatprintf(cmd,"$%lu\r\n", + (unsigned long)sdslen(argv[j])); + cmd = sdscatlen(cmd,argv[j],sdslen(argv[j])); + cmd = sdscatlen(cmd,"\r\n",2); + } + + while(repeat--) { + anetWrite(fd,cmd,sdslen(cmd)); + while (config.monitor_mode) { + cliReadSingleLineReply(fd,0); + } + + if (config.pubsub_mode) { + printf("Reading messages... 
(press Ctrl-c to quit)\n"); + while (1) { + cliReadReply(fd); + printf("\n"); + } + } + + retval = cliReadReply(fd); + if (retval) { + return retval; + } + } + return 0; +} + +static int parseOptions(int argc, char **argv) { + int i; + + for (i = 1; i < argc; i++) { + int lastarg = i==argc-1; + + if (!strcmp(argv[i],"-h") && !lastarg) { + char *ip = zmalloc(32); + if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) { + printf("Can't resolve %s\n", argv[i]); + exit(1); + } + config.hostip = ip; + i++; + } else if (!strcmp(argv[i],"-h") && lastarg) { + usage(); + } else if (!strcmp(argv[i],"-p") && !lastarg) { + config.hostport = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-r") && !lastarg) { + config.repeat = strtoll(argv[i+1],NULL,10); + i++; + } else if (!strcmp(argv[i],"-n") && !lastarg) { + config.dbnum = atoi(argv[i+1]); + i++; + } else if (!strcmp(argv[i],"-a") && !lastarg) { + config.auth = argv[i+1]; + i++; + } else if (!strcmp(argv[i],"-i")) { + config.interactive = 1; + } else if (!strcmp(argv[i],"-c")) { + config.argn_from_stdin = 1; + } else { + break; + } + } + return i; +} + +static sds readArgFromStdin(void) { + char buf[1024]; + sds arg = sdsempty(); + + while(1) { + int nread = read(fileno(stdin),buf,1024); + + if (nread == 0) break; + else if (nread == -1) { + perror("Reading from standard input"); + exit(1); + } + arg = sdscatlen(arg,buf,nread); + } + return arg; +} + +static void usage() { + fprintf(stderr, "usage: redis-cli [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] [-i] cmd arg1 arg2 arg3 ... argN\n"); + fprintf(stderr, "usage: echo \"argN\" | redis-cli -c [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] cmd arg1 arg2 ... 
arg(N-1)\n"); + fprintf(stderr, "\nIf a pipe from standard input is detected this data is used as last argument.\n\n"); + fprintf(stderr, "example: cat /etc/passwd | redis-cli set my_passwd\n"); + fprintf(stderr, "example: redis-cli get my_passwd\n"); + fprintf(stderr, "example: redis-cli -r 100 lpush mylist x\n"); + fprintf(stderr, "\nRun in interactive mode: redis-cli -i or just don't pass any command\n"); + exit(1); +} + +/* Turn the plain C strings into Sds strings */ +static char **convertToSds(int count, char** args) { + int j; + char **sds = zmalloc(sizeof(char*)*count); + + for(j = 0; j < count; j++) + sds[j] = sdsnew(args[j]); + + return sds; +} + +static char **splitArguments(char *line, int *argc) { + char *p = line; + char *current = NULL; + char **vector = NULL; + + *argc = 0; + while(1) { + /* skip blanks */ + while(*p && isspace(*p)) p++; + if (*p) { + /* get a token */ + int inq=0; /* set to 1 if we are in "quotes" */ + int done = 0; + + if (current == NULL) current = sdsempty(); + while(!done) { + if (inq) { + if (*p == '\\' && *(p+1)) { + char c; + + p++; + switch(*p) { + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'b': c = '\b'; break; + case 'a': c = '\a'; break; + default: c = *p; break; + } + current = sdscatlen(current,&c,1); + } else if (*p == '"') { + done = 1; + } else { + current = sdscatlen(current,p,1); + } + } else { + switch(*p) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\0': + done=1; + break; + case '"': + inq=1; + break; + default: + current = sdscatlen(current,p,1); + break; + } + } + if (*p) p++; + } + /* add the token to the vector */ + vector = zrealloc(vector,((*argc)+1)*sizeof(char*)); + vector[*argc] = current; + (*argc)++; + current = NULL; + } else { + return vector; + } + } +} + +#define LINE_BUFLEN 4096 +static void repl() { + int argc, j; + char *line, **argv; + + while((line = linenoise("redis> ")) != NULL) { + if (line[0] != '\0') { + argv = 
splitArguments(line,&argc); + linenoiseHistoryAdd(line); + if (argc > 0) { + if (strcasecmp(argv[0],"quit") == 0 || + strcasecmp(argv[0],"exit") == 0) + exit(0); + else + cliSendCommand(argc, argv, 1); + } + /* Free the argument vector */ + for (j = 0; j < argc; j++) + sdsfree(argv[j]); + zfree(argv); + } + /* linenoise() returns malloc-ed lines like readline() */ + free(line); + } + exit(0); +} + +int main(int argc, char **argv) { + int firstarg; + char **argvcopy; + + config.hostip = "127.0.0.1"; + config.hostport = 6379; + config.repeat = 1; + config.dbnum = 0; + config.argn_from_stdin = 0; + config.shutdown = 0; + config.interactive = 0; + config.monitor_mode = 0; + config.pubsub_mode = 0; + config.raw_output = 0; + config.auth = NULL; + + firstarg = parseOptions(argc,argv); + argc -= firstarg; + argv += firstarg; + + if (config.auth != NULL) { + char *authargv[2]; + + authargv[0] = "AUTH"; + authargv[1] = config.auth; + cliSendCommand(2, convertToSds(2, authargv), 1); + } + + if (argc == 0 || config.interactive == 1) repl(); + + argvcopy = convertToSds(argc+1, argv); + if (config.argn_from_stdin) { + sds lastarg = readArgFromStdin(); + argvcopy[argc] = lastarg; + argc++; + } + return cliSendCommand(argc, argvcopy, config.repeat); +} diff --git a/src/redis.c b/src/redis.c new file mode 100644 index 000000000..5f539216f --- /dev/null +++ b/src/redis.c @@ -0,0 +1,1516 @@ +/* + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "redis.h" + +#ifdef HAVE_BACKTRACE +#include +#include +#endif /* HAVE_BACKTRACE */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Our shared "common" objects */ + +struct sharedObjectsStruct shared; + +/* Global vars that are actally used as constants. The following double + * values are used for double on-disk serialization, and are initialized + * at runtime to avoid strange compiler optimizations. 
*/ + +double R_Zero, R_PosInf, R_NegInf, R_Nan; + +/*================================= Globals ================================= */ + +/* Global vars */ +struct redisServer server; /* server global state */ +struct redisCommand *commandTable; +struct redisCommand readonlyCommandTable[] = { + {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0}, + {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1}, + {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1}, + 
{"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1}, + {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1}, + {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1}, + {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1}, + {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, + {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0}, + {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1}, + {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + 
{"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1}, + {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, + {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2}, + {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1}, + {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0}, + 
{"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0}, + {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1}, + {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1}, + {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0}, + {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0}, + {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0}, + {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0}, + {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0}, + {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0} +}; + +/*============================ Utility functions ============================ */ + +void redisLog(int level, const char *fmt, ...) { + va_list ap; + FILE *fp; + + fp = (server.logfile == NULL) ? 
stdout : fopen(server.logfile,"a"); + if (!fp) return; + + va_start(ap, fmt); + if (level >= server.verbosity) { + char *c = ".-*#"; /* marker character printed for the message, indexed by level */ + char buf[64]; + time_t now; + + now = time(NULL); + strftime(buf,64,"%d %b %H:%M:%S",localtime(&now)); + fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]); + vfprintf(fp, fmt, ap); + fprintf(fp,"\n"); + fflush(fp); + } + va_end(ap); + + if (server.logfile) fclose(fp); /* close only the file opened above, never stdout */ +} + +/* Redis generally does not try to recover from out of memory conditions + * when allocating objects or strings, it is not clear if it will be possible + * to report this condition to the client since the networking layer itself + * is based on heap allocation for send buffers, so we simply abort. + * At least the code will be simpler to read... + * + * 'msg' is logged at REDIS_WARNING level as a prefix of the out of memory + * message, then the process sleeps one second and calls abort(). */ +void oom(const char *msg) { + redisLog(REDIS_WARNING, "%s: Out of memory\n",msg); + sleep(1); + abort(); +} + +/*====================== Hash table type implementation ==================== */ + +/* This is a hash table type that uses the SDS dynamic strings library as + * keys and redis objects as values (objects can hold SDS strings, + * lists, sets). 
*/ + +void dictVanillaFree(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + zfree(val); +} + +void dictListDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + listRelease((list*)val); +} + +int dictSdsKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + int l1,l2; + DICT_NOTUSED(privdata); + + l1 = sdslen((sds)key1); + l2 = sdslen((sds)key2); + if (l1 != l2) return 0; + return memcmp(key1, key2, l1) == 0; +} + +void dictRedisObjectDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + if (val == NULL) return; /* Values of swapped out keys as set to NULL */ + decrRefCount(val); +} + +void dictSdsDestructor(void *privdata, void *val) +{ + DICT_NOTUSED(privdata); + + sdsfree(val); +} + +int dictObjKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + const robj *o1 = key1, *o2 = key2; + return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); +} + +unsigned int dictObjHash(const void *key) { + const robj *o = key; + return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); +} + +unsigned int dictSdsHash(const void *key) { + return dictGenHashFunction((unsigned char*)key, sdslen((char*)key)); +} + +int dictEncObjKeyCompare(void *privdata, const void *key1, + const void *key2) +{ + robj *o1 = (robj*) key1, *o2 = (robj*) key2; + int cmp; + + if (o1->encoding == REDIS_ENCODING_INT && + o2->encoding == REDIS_ENCODING_INT) + return o1->ptr == o2->ptr; + + o1 = getDecodedObject(o1); + o2 = getDecodedObject(o2); + cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr); + decrRefCount(o1); + decrRefCount(o2); + return cmp; +} + +unsigned int dictEncObjHash(const void *key) { + robj *o = (robj*) key; + + if (o->encoding == REDIS_ENCODING_RAW) { + return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); + } else { + if (o->encoding == REDIS_ENCODING_INT) { + char buf[32]; + int len; + + len = ll2string(buf,32,(long)o->ptr); + return dictGenHashFunction((unsigned char*)buf, len); + } else { + unsigned int hash; + + o 
= getDecodedObject(o); + hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr)); + decrRefCount(o); + return hash; + } + } +} + +/* Sets type: only the keys have a destructor, the val destructor is NULL */ +dictType setDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + NULL /* val destructor */ +}; + +/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ +dictType zsetDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictVanillaFree /* val destructor of malloc(sizeof(double)) */ +}; + +/* Db->dict, keys are sds strings, vals are Redis objects. */ +dictType dbDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + dictSdsDestructor, /* key destructor */ + dictRedisObjectDestructor /* val destructor */ +}; + +/* Db->expires: keys are hashed and compared like in dbDictType, but neither + * keys nor values are freed by this table (both destructors are NULL) */ +dictType keyptrDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL /* val destructor */ +}; + +/* Hash type hash table (note that small hashes are represented with zipmaps) */ +dictType hashDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictRedisObjectDestructor /* val destructor */ +}; + +/* Keylist hash table type has unencoded redis objects as keys and + * lists as values. It's used for blocking operations (BLPOP) and to + * map swapped keys to a list of clients waiting for these keys to be loaded. 
*/ +dictType keylistDictType = { + dictObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictObjKeyCompare, /* key compare */ + dictRedisObjectDestructor, /* key destructor */ + dictListDestructor /* val destructor */ +}; + +int htNeedsResize(dict *dict) { + long long size, used; + + size = dictSlots(dict); + used = dictSize(dict); + return (size && used && size > DICT_HT_INITIAL_SIZE && + (used*100/size < REDIS_HT_MINFILL)); +} + +/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL + * we resize the hash table to save memory */ +void tryResizeHashTables(void) { + int j; + + for (j = 0; j < server.dbnum; j++) { + if (htNeedsResize(server.db[j].dict)) + dictResize(server.db[j].dict); + if (htNeedsResize(server.db[j].expires)) + dictResize(server.db[j].expires); + } +} + +/* Our hash table implementation performs rehashing incrementally while + * we write/read from the hash table. Still if the server is idle, the hash + * table will use two tables for a long time. So we try to use 1 millisecond + * of CPU time at every serverCron() loop in order to rehash some key. */ +void incrementallyRehash(void) { + int j; + + for (j = 0; j < server.dbnum; j++) { + if (dictIsRehashing(server.db[j].dict)) { + dictRehashMilliseconds(server.db[j].dict,1); + break; /* already used our millisecond for this loop... */ + } + } +} + +/* This function is called once a background process of some kind terminates, + * as we want to avoid resizing the hash tables when there is a child in order + * to play well with copy-on-write (otherwise when a resize happens lots of + * memory pages are copied). The goal of this function is to update the ability + * for dict.c to resize the hash tables accordingly to the fact we have o not + * running childs. 
*/ +void updateDictResizePolicy(void) { + if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) + dictEnableResize(); + else + dictDisableResize(); +} + +/* ======================= Cron: called every 100 ms ======================== */ + +int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { + int j, loops = server.cronloops++; + REDIS_NOTUSED(eventLoop); + REDIS_NOTUSED(id); + REDIS_NOTUSED(clientData); + + /* We take a cached value of the unix time in the global state because + * with virtual memory and aging there is to store the current time + * in objects at every object access, and accuracy is not needed. + * To access a global var is faster than calling time(NULL) */ + server.unixtime = time(NULL); + /* We have just 21 bits per object for LRU information. + * So we use an (eventually wrapping) LRU clock with minutes resolution. + * + * When we need to select what object to swap, we compute the minimum + * time distance between the current lruclock and the object last access + * lruclock info. Even if clocks will wrap on overflow, there is + * the interesting property that we are sure that at least + * ABS(A-B) minutes passed between current time and timestamp B. + * + * This is not precise but we don't need at all precision, but just + * something statistically reasonable. + */ + server.lruclock = (time(NULL)/60)&((1<<21)-1); + + /* We received a SIGTERM, shutting down here in a safe way, as it is + * not ok doing so inside the signal handler. 
*/ + if (server.shutdown_asap) { + if (prepareForShutdown() == REDIS_OK) exit(0); + redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information"); + } + + /* Show some info about non-empty databases */ + for (j = 0; j < server.dbnum; j++) { + long long size, used, vkeys; + + size = dictSlots(server.db[j].dict); + used = dictSize(server.db[j].dict); + vkeys = dictSize(server.db[j].expires); + if (!(loops % 50) && (used || vkeys)) { + redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size); + /* dictPrintStats(server.dict); */ + } + } + + /* We don't want to resize the hash tables while a bacground saving + * is in progress: the saving child is created using fork() that is + * implemented with a copy-on-write semantic in most modern systems, so + * if we resize the HT while there is the saving child at work actually + * a lot of memory movements in the parent will cause a lot of pages + * copied. 
*/ + if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) { + if (!(loops % 10)) tryResizeHashTables(); + if (server.activerehashing) incrementallyRehash(); + } + + /* Show information about connected clients */ + if (!(loops % 50)) { + redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use", + listLength(server.clients)-listLength(server.slaves), + listLength(server.slaves), + zmalloc_used_memory()); + } + + /* Close connections of timedout clients */ + if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients) + closeTimedoutClients(); + + /* Check if a background saving or AOF rewrite in progress terminated */ + if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) { + int statloc; + pid_t pid; + + if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { + if (pid == server.bgsavechildpid) { + backgroundSaveDoneHandler(statloc); + } else { + backgroundRewriteDoneHandler(statloc); + } + updateDictResizePolicy(); + } + } else { + /* If there is not a background saving in progress check if + * we have to save now */ + time_t now = time(NULL); + for (j = 0; j < server.saveparamslen; j++) { + struct saveparam *sp = server.saveparams+j; + + if (server.dirty >= sp->changes && + now-server.lastsave > sp->seconds) { + redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...", + sp->changes, sp->seconds); + rdbSaveBackground(server.dbfilename); + break; + } + } + } + + /* Try to expire a few timed out keys. The algorithm used is adaptive and + * will use few CPU cycles if there are few expiring keys, otherwise + * it will get more aggressive to avoid that too much memory is used by + * keys that can be removed from the keyspace. */ + for (j = 0; j < server.dbnum; j++) { + int expired; + redisDb *db = server.db+j; + + /* Continue to expire if at the end of the cycle more than 25% + * of the keys were expired. 
*/ + do { + long num = dictSize(db->expires); + time_t now = time(NULL); + + expired = 0; + if (num > REDIS_EXPIRELOOKUPS_PER_CRON) + num = REDIS_EXPIRELOOKUPS_PER_CRON; + while (num--) { + dictEntry *de; + time_t t; + + if ((de = dictGetRandomKey(db->expires)) == NULL) break; + t = (time_t) dictGetEntryVal(de); + if (now > t) { + sds key = dictGetEntryKey(de); + robj *keyobj = createStringObject(key,sdslen(key)); + + dbDelete(db,keyobj); + decrRefCount(keyobj); + expired++; + server.stat_expiredkeys++; + } + } + } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4); + } + + /* Swap a few keys on disk if we are over the memory limit and VM + * is enbled. Try to free objects from the free list first. */ + if (vmCanSwapOut()) { + while (server.vm_enabled && zmalloc_used_memory() > + server.vm_max_memory) + { + int retval; + + if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; + retval = (server.vm_max_threads == 0) ? + vmSwapOneObjectBlocking() : + vmSwapOneObjectThreaded(); + if (retval == REDIS_ERR && !(loops % 300) && + zmalloc_used_memory() > + (server.vm_max_memory+server.vm_max_memory/10)) + { + redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!"); + } + /* Note that when using threade I/O we free just one object, + * because anyway when the I/O thread in charge to swap this + * object out will finish, the handler of completed jobs + * will try to swap more objects if we are still out of memory. 
*/ + if (retval == REDIS_ERR || server.vm_max_threads > 0) break; + } + } + + /* Check if we should connect to a MASTER */ + if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) { + redisLog(REDIS_NOTICE,"Connecting to MASTER..."); + if (syncWithMaster() == REDIS_OK) { + redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded"); + if (server.appendonly) rewriteAppendOnlyFileBackground(); + } + } + return 100; +} + +/* This function gets called every time Redis is entering the + * main loop of the event driven library, that is, before to sleep + * for ready file descriptors. */ +void beforeSleep(struct aeEventLoop *eventLoop) { + REDIS_NOTUSED(eventLoop); + + /* Awake clients that got all the swapped keys they requested */ + if (server.vm_enabled && listLength(server.io_ready_clients)) { + listIter li; + listNode *ln; + + listRewind(server.io_ready_clients,&li); + while((ln = listNext(&li))) { + redisClient *c = ln->value; + struct redisCommand *cmd; + + /* Resume the client. */ + listDelNode(server.io_ready_clients,ln); + c->flags &= (~REDIS_IO_WAIT); + server.vm_blocked_clients--; + aeCreateFileEvent(server.el, c->fd, AE_READABLE, + readQueryFromClient, c); + cmd = lookupCommand(c->argv[0]->ptr); + redisAssert(cmd != NULL); + call(c,cmd); + resetClient(c); + /* There may be more data to process in the input buffer. 
*/ + if (c->querybuf && sdslen(c->querybuf) > 0) + processInputBuffer(c); + } + } + /* Write the AOF buffer on disk */ + flushAppendOnlyFile(); +} + +/* =========================== Server initialization ======================== */ + +void createSharedObjects(void) { + int j; + + shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n")); + shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n")); + shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n")); + shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n")); + shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n")); + shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n")); + shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n")); + shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n")); + shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n")); + shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n")); + shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n")); + shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n")); + shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew( + "-ERR Operation against a key holding the wrong kind of value\r\n")); + shared.nokeyerr = createObject(REDIS_STRING,sdsnew( + "-ERR no such key\r\n")); + shared.syntaxerr = createObject(REDIS_STRING,sdsnew( + "-ERR syntax error\r\n")); + shared.sameobjecterr = createObject(REDIS_STRING,sdsnew( + "-ERR source and destination objects are the same\r\n")); + shared.outofrangeerr = createObject(REDIS_STRING,sdsnew( + "-ERR index out of range\r\n")); + shared.space = createObject(REDIS_STRING,sdsnew(" ")); + shared.colon = createObject(REDIS_STRING,sdsnew(":")); + shared.plus = createObject(REDIS_STRING,sdsnew("+")); + shared.select0 = createStringObject("select 0\r\n",10); + shared.select1 = createStringObject("select 1\r\n",10); + shared.select2 = createStringObject("select 2\r\n",10); + shared.select3 = createStringObject("select 3\r\n",10); + shared.select4 = 
createStringObject("select 4\r\n",10); + shared.select5 = createStringObject("select 5\r\n",10); + shared.select6 = createStringObject("select 6\r\n",10); + shared.select7 = createStringObject("select 7\r\n",10); + shared.select8 = createStringObject("select 8\r\n",10); + shared.select9 = createStringObject("select 9\r\n",10); + shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13); + shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14); + shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15); + shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18); + shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17); + shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19); + shared.mbulk3 = createStringObject("*3\r\n",4); + shared.mbulk4 = createStringObject("*4\r\n",4); + for (j = 0; j < REDIS_SHARED_INTEGERS; j++) { + shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j); + shared.integers[j]->encoding = REDIS_ENCODING_INT; + } +} + +void initServerConfig() { + server.dbnum = REDIS_DEFAULT_DBNUM; + server.port = REDIS_SERVERPORT; + server.verbosity = REDIS_VERBOSE; + server.maxidletime = REDIS_MAXIDLETIME; + server.saveparams = NULL; + server.logfile = NULL; /* NULL = log on standard output */ + server.bindaddr = NULL; + server.glueoutputbuf = 1; + server.daemonize = 0; + server.appendonly = 0; + server.appendfsync = APPENDFSYNC_EVERYSEC; + server.no_appendfsync_on_rewrite = 0; + server.lastfsync = time(NULL); + server.appendfd = -1; + server.appendseldb = -1; /* Make sure the first time will not match */ + server.pidfile = zstrdup("/var/run/redis.pid"); + server.dbfilename = zstrdup("dump.rdb"); + server.appendfilename = zstrdup("appendonly.aof"); + server.requirepass = NULL; + server.rdbcompression = 1; + server.activerehashing = 1; + server.maxclients = 0; + server.blpop_blocked_clients = 0; + server.maxmemory = 0; + server.vm_enabled = 0; + server.vm_swap_file = 
zstrdup("/tmp/redis-%p.vm"); + server.vm_page_size = 256; /* 256 bytes per page */ + server.vm_pages = 1024*1024*100; /* 104 millions of pages */ + server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */ + server.vm_max_threads = 4; + server.vm_blocked_clients = 0; + server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES; + server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE; + server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES; + server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE; + server.shutdown_asap = 0; + + resetServerSaveParams(); + + appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */ + appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */ + appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */ + /* Replication related */ + server.isslave = 0; + server.masterauth = NULL; + server.masterhost = NULL; + server.masterport = 6379; + server.master = NULL; + server.replstate = REDIS_REPL_NONE; + + /* Double constants initialization */ + R_Zero = 0.0; + R_PosInf = 1.0/R_Zero; + R_NegInf = -1.0/R_Zero; + R_Nan = R_Zero/R_Zero; +} + +void initServer() { + int j; + + signal(SIGHUP, SIG_IGN); + signal(SIGPIPE, SIG_IGN); + setupSigSegvAction(); + + server.devnull = fopen("/dev/null","w"); + if (server.devnull == NULL) { + redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr); + exit(1); + } + server.clients = listCreate(); + server.slaves = listCreate(); + server.monitors = listCreate(); + server.objfreelist = listCreate(); + createSharedObjects(); + server.el = aeCreateEventLoop(); + server.db = zmalloc(sizeof(redisDb)*server.dbnum); + server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr); + if (server.fd == -1) { + redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr); + exit(1); + } + for (j = 0; j < server.dbnum; j++) { + server.db[j].dict = dictCreate(&dbDictType,NULL); + server.db[j].expires = 
dictCreate(&keyptrDictType,NULL); + server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL); + server.db[j].watched_keys = dictCreate(&keylistDictType,NULL); + if (server.vm_enabled) + server.db[j].io_keys = dictCreate(&keylistDictType,NULL); + server.db[j].id = j; + } + server.pubsub_channels = dictCreate(&keylistDictType,NULL); + server.pubsub_patterns = listCreate(); + listSetFreeMethod(server.pubsub_patterns,freePubsubPattern); + listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern); + server.cronloops = 0; + server.bgsavechildpid = -1; + server.bgrewritechildpid = -1; + server.bgrewritebuf = sdsempty(); + server.aofbuf = sdsempty(); + server.lastsave = time(NULL); + server.dirty = 0; + server.stat_numcommands = 0; + server.stat_numconnections = 0; + server.stat_expiredkeys = 0; + server.stat_starttime = time(NULL); + server.unixtime = time(NULL); + aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL); + if (aeCreateFileEvent(server.el, server.fd, AE_READABLE, + acceptHandler, NULL) == AE_ERR) oom("creating file event"); + + if (server.appendonly) { + server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644); + if (server.appendfd == -1) { + redisLog(REDIS_WARNING, "Can't open the append-only file: %s", + strerror(errno)); + exit(1); + } + } + + if (server.vm_enabled) vmInit(); +} + +int qsortRedisCommands(const void *r1, const void *r2) { + return strcasecmp( + ((struct redisCommand*)r1)->name, + ((struct redisCommand*)r2)->name); +} + +void sortCommandTable() { + /* Copy and sort the read-only version of the command table */ + commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable)); + memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable)); + qsort(commandTable, + sizeof(readonlyCommandTable)/sizeof(struct redisCommand), + sizeof(struct redisCommand),qsortRedisCommands); +} + +/* ====================== Commands lookup and execution ===================== */ + +struct redisCommand 
*lookupCommand(char *name) { + struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0}; + return bsearch( + &tmp, + commandTable, + sizeof(readonlyCommandTable)/sizeof(struct redisCommand), + sizeof(struct redisCommand), + qsortRedisCommands); +} + +/* Call() is the core of Redis execution of a command */ +void call(redisClient *c, struct redisCommand *cmd) { + long long dirty; + + dirty = server.dirty; + cmd->proc(c); + dirty = server.dirty-dirty; + + if (server.appendonly && dirty) + feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc); + if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) && + listLength(server.slaves)) + replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc); + if (listLength(server.monitors)) + replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc); + server.stat_numcommands++; +} + +/* If this function gets called we already read a whole + * command, argments are in the client argv/argc fields. + * processCommand() execute the command or prepare the + * server for a bulk read from the client. + * + * If 1 is returned the client is still alive and valid and + * and other operations can be performed by the caller. Otherwise + * if 0 is returned the client was destroied (i.e. after QUIT). */ +int processCommand(redisClient *c) { + struct redisCommand *cmd; + + /* Free some memory if needed (maxmemory setting) */ + if (server.maxmemory) freeMemoryIfNeeded(); + + /* Handle the multi bulk command type. This is an alternative protocol + * supported by Redis in order to receive commands that are composed of + * multiple binary-safe "bulk" arguments. The latency of processing is + * a bit higher but this allows things like multi-sets, so if this + * protocol is used only for MSET and similar commands this is a big win. 
*/ + if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') { + c->multibulk = atoi(((char*)c->argv[0]->ptr)+1); + if (c->multibulk <= 0) { + resetClient(c); + return 1; + } else { + decrRefCount(c->argv[c->argc-1]); + c->argc--; + return 1; + } + } else if (c->multibulk) { + if (c->bulklen == -1) { + if (((char*)c->argv[0]->ptr)[0] != '$') { + addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n")); + resetClient(c); + return 1; + } else { + int bulklen = atoi(((char*)c->argv[0]->ptr)+1); + decrRefCount(c->argv[0]); + if (bulklen < 0 || bulklen > 1024*1024*1024) { + c->argc--; + addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); + resetClient(c); + return 1; + } + c->argc--; + c->bulklen = bulklen+2; /* add two bytes for CR+LF */ + return 1; + } + } else { + c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1)); + c->mbargv[c->mbargc] = c->argv[0]; + c->mbargc++; + c->argc--; + c->multibulk--; + if (c->multibulk == 0) { + robj **auxargv; + int auxargc; + + /* Here we need to swap the multi-bulk argc/argv with the + * normal argc/argv of the client structure. */ + auxargv = c->argv; + c->argv = c->mbargv; + c->mbargv = auxargv; + + auxargc = c->argc; + c->argc = c->mbargc; + c->mbargc = auxargc; + + /* We need to set bulklen to something different than -1 + * in order for the code below to process the command without + * to try to read the last argument of a bulk command as + * a special argument. */ + c->bulklen = 0; + /* continue below and process the command */ + } else { + c->bulklen = -1; + return 1; + } + } + } + /* -- end of multi bulk commands processing -- */ + + /* The QUIT command is handled as a special case. Normal command + * procs are unable to close the client connection safely */ + if (!strcasecmp(c->argv[0]->ptr,"quit")) { + freeClient(c); + return 0; + } + + /* Now lookup the command and check ASAP about trivial error conditions + * such wrong arity, bad command name and so forth. 
*/ + cmd = lookupCommand(c->argv[0]->ptr); + if (!cmd) { + addReplySds(c, + sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n", + (char*)c->argv[0]->ptr)); + resetClient(c); + return 1; + } else if ((cmd->arity > 0 && cmd->arity != c->argc) || + (c->argc < -cmd->arity)) { + addReplySds(c, + sdscatprintf(sdsempty(), + "-ERR wrong number of arguments for '%s' command\r\n", + cmd->name)); + resetClient(c); + return 1; + } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) { + /* This is a bulk command, we have to read the last argument yet. */ + int bulklen = atoi(c->argv[c->argc-1]->ptr); + + decrRefCount(c->argv[c->argc-1]); + if (bulklen < 0 || bulklen > 1024*1024*1024) { + c->argc--; + addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n")); + resetClient(c); + return 1; + } + c->argc--; + c->bulklen = bulklen+2; /* add two bytes for CR+LF */ + /* It is possible that the bulk read is already in the + * buffer. Check this condition and handle it accordingly. + * This is just a fast path, alternative to call processInputBuffer(). + * It's a good idea since the code is small and this condition + * happens most of the times. */ + if ((signed)sdslen(c->querybuf) >= c->bulklen) { + c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2); + c->argc++; + c->querybuf = sdsrange(c->querybuf,c->bulklen,-1); + } else { + /* Otherwise return... there is to read the last argument + * from the socket. */ + return 1; + } + } + /* Let's try to encode the bulk object to save space. 
*/ + if (cmd->flags & REDIS_CMD_BULK) + c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]); + + /* Check if the user is authenticated */ + if (server.requirepass && !c->authenticated && cmd->proc != authCommand) { + addReplySds(c,sdsnew("-ERR operation not permitted\r\n")); + resetClient(c); + return 1; + } + + /* Handle the maxmemory directive */ + if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) && + zmalloc_used_memory() > server.maxmemory) + { + addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n")); + resetClient(c); + return 1; + } + + /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */ + if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0) + && + cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand && + cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) { + addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n")); + resetClient(c); + return 1; + } + + /* Exec the command */ + if (c->flags & REDIS_MULTI && + cmd->proc != execCommand && cmd->proc != discardCommand && + cmd->proc != multiCommand && cmd->proc != watchCommand) + { + queueMultiCommand(c,cmd); + addReply(c,shared.queued); + } else { + if (server.vm_enabled && server.vm_max_threads > 0 && + blockClientOnSwappedKeys(c,cmd)) return 1; + call(c,cmd); + } + + /* Prepare the client for the next command */ + resetClient(c); + return 1; +} + +/*================================== Shutdown =============================== */ + +int prepareForShutdown() { + redisLog(REDIS_WARNING,"User requested shutdown, saving DB..."); + /* Kill the saving child if there is a background saving in progress. + We want to avoid race conditions, for instance our saving child may + overwrite the synchronous saving did by SHUTDOWN. */ + if (server.bgsavechildpid != -1) { + redisLog(REDIS_WARNING,"There is a live saving child. 
Killing it!"); + kill(server.bgsavechildpid,SIGKILL); + rdbRemoveTempFile(server.bgsavechildpid); + } + if (server.appendonly) { + /* Append only file: fsync() the AOF and exit */ + aof_fsync(server.appendfd); + if (server.vm_enabled) unlink(server.vm_swap_file); + } else { + /* Snapshotting. Perform a SYNC SAVE and exit */ + if (rdbSave(server.dbfilename) == REDIS_OK) { + if (server.daemonize) + unlink(server.pidfile); + redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory()); + } else { + /* Ooops.. error saving! The best we can do is to continue + * operating. Note that if there was a background saving process, + * in the next cron() Redis will be notified that the background + * saving aborted, handling special stuff like slaves pending for + * synchronization... */ + redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit"); + return REDIS_ERR; + } + } + redisLog(REDIS_WARNING,"Server exit now, bye bye..."); + return REDIS_OK; +} + +/*================================== Commands =============================== */ + +void authCommand(redisClient *c) { + if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) { + c->authenticated = 1; + addReply(c,shared.ok); + } else { + c->authenticated = 0; + addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n")); + } +} + +void pingCommand(redisClient *c) { + addReply(c,shared.pong); +} + +void echoCommand(redisClient *c) { + addReplyBulk(c,c->argv[1]); +} + +/* Convert an amount of bytes into a human readable string in the form + * of 100B, 2G, 100M, 4K, and so forth. 
*/ +void bytesToHuman(char *s, unsigned long long n) { + double d; + + if (n < 1024) { + /* Bytes */ + sprintf(s,"%lluB",n); + return; + } else if (n < (1024*1024)) { + d = (double)n/(1024); + sprintf(s,"%.2fK",d); + } else if (n < (1024LL*1024*1024)) { + d = (double)n/(1024*1024); + sprintf(s,"%.2fM",d); + } else if (n < (1024LL*1024*1024*1024)) { + d = (double)n/(1024LL*1024*1024); + sprintf(s,"%.2fG",d); + } +} + +/* Create the string returned by the INFO command. This is decoupled + * by the INFO command itself as we need to report the same information + * on memory corruption problems. */ +sds genRedisInfoString(void) { + sds info; + time_t uptime = time(NULL)-server.stat_starttime; + int j; + char hmem[64]; + + bytesToHuman(hmem,zmalloc_used_memory()); + info = sdscatprintf(sdsempty(), + "redis_version:%s\r\n" + "redis_git_sha1:%s\r\n" + "redis_git_dirty:%d\r\n" + "arch_bits:%s\r\n" + "multiplexing_api:%s\r\n" + "process_id:%ld\r\n" + "uptime_in_seconds:%ld\r\n" + "uptime_in_days:%ld\r\n" + "connected_clients:%d\r\n" + "connected_slaves:%d\r\n" + "blocked_clients:%d\r\n" + "used_memory:%zu\r\n" + "used_memory_human:%s\r\n" + "changes_since_last_save:%lld\r\n" + "bgsave_in_progress:%d\r\n" + "last_save_time:%ld\r\n" + "bgrewriteaof_in_progress:%d\r\n" + "total_connections_received:%lld\r\n" + "total_commands_processed:%lld\r\n" + "expired_keys:%lld\r\n" + "hash_max_zipmap_entries:%zu\r\n" + "hash_max_zipmap_value:%zu\r\n" + "pubsub_channels:%ld\r\n" + "pubsub_patterns:%u\r\n" + "vm_enabled:%d\r\n" + "role:%s\r\n" + ,REDIS_VERSION, + redisGitSHA1(), + strtol(redisGitDirty(),NULL,10) > 0, + (sizeof(long) == 8) ? 
"64" : "32", + aeGetApiName(), + (long) getpid(), + uptime, + uptime/(3600*24), + listLength(server.clients)-listLength(server.slaves), + listLength(server.slaves), + server.blpop_blocked_clients, + zmalloc_used_memory(), + hmem, + server.dirty, + server.bgsavechildpid != -1, + server.lastsave, + server.bgrewritechildpid != -1, + server.stat_numconnections, + server.stat_numcommands, + server.stat_expiredkeys, + server.hash_max_zipmap_entries, + server.hash_max_zipmap_value, + dictSize(server.pubsub_channels), + listLength(server.pubsub_patterns), + server.vm_enabled != 0, + server.masterhost == NULL ? "master" : "slave" + ); + if (server.masterhost) { + info = sdscatprintf(info, + "master_host:%s\r\n" + "master_port:%d\r\n" + "master_link_status:%s\r\n" + "master_last_io_seconds_ago:%d\r\n" + ,server.masterhost, + server.masterport, + (server.replstate == REDIS_REPL_CONNECTED) ? + "up" : "down", + server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1 + ); + } + if (server.vm_enabled) { + lockThreadedIO(); + info = sdscatprintf(info, + "vm_conf_max_memory:%llu\r\n" + "vm_conf_page_size:%llu\r\n" + "vm_conf_pages:%llu\r\n" + "vm_stats_used_pages:%llu\r\n" + "vm_stats_swapped_objects:%llu\r\n" + "vm_stats_swappin_count:%llu\r\n" + "vm_stats_swappout_count:%llu\r\n" + "vm_stats_io_newjobs_len:%lu\r\n" + "vm_stats_io_processing_len:%lu\r\n" + "vm_stats_io_processed_len:%lu\r\n" + "vm_stats_io_active_threads:%lu\r\n" + "vm_stats_blocked_clients:%lu\r\n" + ,(unsigned long long) server.vm_max_memory, + (unsigned long long) server.vm_page_size, + (unsigned long long) server.vm_pages, + (unsigned long long) server.vm_stats_used_pages, + (unsigned long long) server.vm_stats_swapped_objects, + (unsigned long long) server.vm_stats_swapins, + (unsigned long long) server.vm_stats_swapouts, + (unsigned long) listLength(server.io_newjobs), + (unsigned long) listLength(server.io_processing), + (unsigned long) listLength(server.io_processed), + (unsigned long) 
server.io_active_threads, + (unsigned long) server.vm_blocked_clients + ); + unlockThreadedIO(); + } + for (j = 0; j < server.dbnum; j++) { + long long keys, vkeys; + + keys = dictSize(server.db[j].dict); + vkeys = dictSize(server.db[j].expires); + if (keys || vkeys) { + info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n", + j, keys, vkeys); + } + } + return info; +} + +void infoCommand(redisClient *c) { + sds info = genRedisInfoString(); + addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", + (unsigned long)sdslen(info))); + addReplySds(c,info); + addReply(c,shared.crlf); +} + +void monitorCommand(redisClient *c) { + /* ignore MONITOR if aleady slave or in monitor mode */ + if (c->flags & REDIS_SLAVE) return; + + c->flags |= (REDIS_SLAVE|REDIS_MONITOR); + c->slaveseldb = 0; + listAddNodeTail(server.monitors,c); + addReply(c,shared.ok); +} + +/* ============================ Maxmemory directive ======================== */ + +/* Try to free one object form the pre-allocated objects free list. + * This is useful under low mem conditions as by default we take 1 million + * free objects allocated. On success REDIS_OK is returned, otherwise + * REDIS_ERR. */ +int tryFreeOneObjectFromFreelist(void) { + robj *o; + + if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex); + if (listLength(server.objfreelist)) { + listNode *head = listFirst(server.objfreelist); + o = listNodeValue(head); + listDelNode(server.objfreelist,head); + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + zfree(o); + return REDIS_OK; + } else { + if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex); + return REDIS_ERR; + } +} + +/* This function gets called when 'maxmemory' is set on the config file to limit + * the max memory used by the server, and we are out of memory. 
+ * This function will try to, in order: + * + * - Free objects from the free list + * - Try to remove keys with an EXPIRE set + * + * It is not possible to free enough memory to reach used-memory < maxmemory + * the server will start refusing commands that will enlarge even more the + * memory usage. + */ +void freeMemoryIfNeeded(void) { + while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) { + int j, k, freed = 0; + + if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue; + for (j = 0; j < server.dbnum; j++) { + int minttl = -1; + robj *minkey = NULL; + struct dictEntry *de; + + if (dictSize(server.db[j].expires)) { + freed = 1; + /* From a sample of three keys drop the one nearest to + * the natural expire */ + for (k = 0; k < 3; k++) { + time_t t; + + de = dictGetRandomKey(server.db[j].expires); + t = (time_t) dictGetEntryVal(de); + if (minttl == -1 || t < minttl) { + minkey = dictGetEntryKey(de); + minttl = t; + } + } + dbDelete(server.db+j,minkey); + } + } + if (!freed) return; /* nothing to free... */ + } +} + +/* =================================== Main! ================================ */ + +#ifdef __linux__ +int linuxOvercommitMemoryValue(void) { + FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r"); + char buf[64]; + + if (!fp) return -1; + if (fgets(buf,64,fp) == NULL) { + fclose(fp); + return -1; + } + fclose(fp); + + return atoi(buf); +} + +void linuxOvercommitMemoryWarning(void) { + if (linuxOvercommitMemoryValue() == 0) { + redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect."); + } +} +#endif /* __linux__ */ + +void daemonize(void) { + int fd; + FILE *fp; + + if (fork() != 0) exit(0); /* parent exits */ + setsid(); /* create a new session */ + + /* Every output goes to /dev/null. 
If Redis is daemonized but + * the 'logfile' is set to 'stdout' in the configuration file + * it will not log at all. */ + if ((fd = open("/dev/null", O_RDWR, 0)) != -1) { + dup2(fd, STDIN_FILENO); + dup2(fd, STDOUT_FILENO); + dup2(fd, STDERR_FILENO); + if (fd > STDERR_FILENO) close(fd); + } + /* Try to write the pid file */ + fp = fopen(server.pidfile,"w"); + if (fp) { + fprintf(fp,"%d\n",getpid()); + fclose(fp); + } +} + +void version() { + printf("Redis server version %s (%s:%d)\n", REDIS_VERSION, + redisGitSHA1(), atoi(redisGitDirty()) > 0); + exit(0); +} + +void usage() { + fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n"); + fprintf(stderr," ./redis-server - (read config from stdin)\n"); + exit(1); +} + +int main(int argc, char **argv) { + time_t start; + + initServerConfig(); + sortCommandTable(); + if (argc == 2) { + if (strcmp(argv[1], "-v") == 0 || + strcmp(argv[1], "--version") == 0) version(); + if (strcmp(argv[1], "--help") == 0) usage(); + resetServerSaveParams(); + loadServerConfig(argv[1]); + } else if ((argc > 2)) { + usage(); + } else { + redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. 
In order to specify a config file use 'redis-server /path/to/redis.conf'"); + } + if (server.daemonize) daemonize(); + initServer(); + redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION); +#ifdef __linux__ + linuxOvercommitMemoryWarning(); +#endif + start = time(NULL); + if (server.appendonly) { + if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK) + redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start); + } else { + if (rdbLoad(server.dbfilename) == REDIS_OK) + redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start); + } + redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); + aeSetBeforeSleepProc(server.el,beforeSleep); + aeMain(server.el); + aeDeleteEventLoop(server.el); + return 0; +} + +/* ============================= Backtrace support ========================= */ + +#ifdef HAVE_BACKTRACE +void *getMcontextEip(ucontext_t *uc) { +#if defined(__FreeBSD__) + return (void*) uc->uc_mcontext.mc_eip; +#elif defined(__dietlibc__) + return (void*) uc->uc_mcontext.eip; +#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) + #if __x86_64__ + return (void*) uc->uc_mcontext->__ss.__rip; + #else + return (void*) uc->uc_mcontext->__ss.__eip; + #endif +#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6) + #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__) + return (void*) uc->uc_mcontext->__ss.__rip; + #else + return (void*) uc->uc_mcontext->__ss.__eip; + #endif +#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__) + return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */ +#elif defined(__ia64__) /* Linux IA64 */ + return (void*) uc->uc_mcontext.sc_ip; +#else + return NULL; +#endif +} + +void segvHandler(int sig, siginfo_t *info, void *secret) { + void *trace[100]; + char **messages = NULL; + int i, trace_size = 0; + ucontext_t *uc = (ucontext_t*) secret; + sds infostring; + REDIS_NOTUSED(info); + + 
redisLog(REDIS_WARNING, + "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig); + infostring = genRedisInfoString(); + redisLog(REDIS_WARNING, "%s",infostring); + /* It's not safe to sdsfree() the returned string under memory + * corruption conditions. Let it leak as we are going to abort */ + + trace_size = backtrace(trace, 100); + /* overwrite sigaction with caller's address */ + if (getMcontextEip(uc) != NULL) { + trace[1] = getMcontextEip(uc); + } + messages = backtrace_symbols(trace, trace_size); + + for (i=1; i +#include +#include +#include +#include +#include +#include + +#include "ae.h" /* Event driven programming library */ +#include "sds.h" /* Dynamic safe strings */ +#include "dict.h" /* Hash tables */ +#include "adlist.h" /* Linked lists */ +#include "zmalloc.h" /* total memory usage aware version of malloc/free */ +#include "anet.h" /* Networking the easy way */ +#include "zipmap.h" /* Compact string -> string data structure */ +#include "ziplist.h" /* Compact list data structure */ +#include "version.h" + +/* Error codes */ +#define REDIS_OK 0 +#define REDIS_ERR -1 + +/* Static server configuration */ +#define REDIS_SERVERPORT 6379 /* TCP port */ +#define REDIS_MAXIDLETIME (60*5) /* default client timeout */ +#define REDIS_IOBUF_LEN 1024 +#define REDIS_LOADBUF_LEN 1024 +#define REDIS_STATIC_ARGS 8 +#define REDIS_DEFAULT_DBNUM 16 +#define REDIS_CONFIGLINE_MAX 1024 +#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */ +#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */ +#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */ +#define REDIS_MAX_WRITE_PER_EVENT (1024*64) +#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */ +#define REDIS_SHARED_INTEGERS 10000 + +/* If more then REDIS_WRITEV_THRESHOLD write packets are pending use writev */ +#define REDIS_WRITEV_THRESHOLD 3 +/* Max number of iovecs used for each writev call */ +#define 
REDIS_WRITEV_IOVEC_COUNT 256 + +/* Hash table parameters */ +#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */ + +/* Command flags */ +#define REDIS_CMD_BULK 1 /* Bulk write command */ +#define REDIS_CMD_INLINE 2 /* Inline command */ +/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with + this flag will return an error when the 'maxmemory' option is set in the + config file and the server is using more than maxmemory bytes of memory. + In short these commands are denied on low memory conditions. */ +#define REDIS_CMD_DENYOOM 4 +#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */ + +/* Object types */ +#define REDIS_STRING 0 +#define REDIS_LIST 1 +#define REDIS_SET 2 +#define REDIS_ZSET 3 +#define REDIS_HASH 4 +#define REDIS_VMPOINTER 8 + +/* Objects encoding. Some kinds of objects like Strings and Hashes can be + * internally represented in multiple ways. The 'encoding' field of the object + * is set to one of these values for this object. */ +#define REDIS_ENCODING_RAW 0 /* Raw representation */ +#define REDIS_ENCODING_INT 1 /* Encoded as integer */ +#define REDIS_ENCODING_HT 2 /* Encoded as hash table */ +#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */ +#define REDIS_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */ +#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */ + +/* Object types only used for dumping to disk */ +#define REDIS_EXPIRETIME 253 +#define REDIS_SELECTDB 254 +#define REDIS_EOF 255 + +/* Defines related to the dump file format. 
To store 32 bits lengths for short + * keys requires a lot of space, so we check the most significant 2 bits of + * the first byte to interpret the length: + * + * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte + * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte + * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow + * 11|000000 this means: specially encoded object will follow. The six bits + * number specify the kind of object that follows. + * See the REDIS_RDB_ENC_* defines. + * + * Lengths up to 63 are stored using a single byte, most DB keys, and many + * values, will fit inside. */ +#define REDIS_RDB_6BITLEN 0 +#define REDIS_RDB_14BITLEN 1 +#define REDIS_RDB_32BITLEN 2 +#define REDIS_RDB_ENCVAL 3 +#define REDIS_RDB_LENERR UINT_MAX + +/* When a length of a string object stored on disk has the first two bits + * set, the remaining six bits specify a special encoding for the object + * according to the following defines: */ +#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ +#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ +#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ +#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */ + +/* Virtual memory object->storage field. */ +#define REDIS_VM_MEMORY 0 /* The object is on memory */ +#define REDIS_VM_SWAPPED 1 /* The object is on disk */ +#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */ +#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */ + +/* Virtual memory static configuration stuff. + * Check vmFindContiguousPages() to know more about these magic numbers. */ +#define REDIS_VM_MAX_NEAR_PAGES 65536 +#define REDIS_VM_MAX_RANDOM_JUMP 4096 +#define REDIS_VM_MAX_THREADS 32 +#define REDIS_THREAD_STACK_SIZE (1024*1024*4) +/* The following is the *percentage* of completed I/O jobs to process when the + * handler is called. 
While Virtual Memory I/O operations are performed by + * threads, these operations must be processed by the main thread when completed + * in order to take effect. */ +#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1 + +/* Client flags */ +#define REDIS_SLAVE 1 /* This client is a slave server */ +#define REDIS_MASTER 2 /* This client is a master server */ +#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */ +#define REDIS_MULTI 8 /* This client is in a MULTI context */ +#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */ +#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */ +#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */ + +/* Slave replication state - slave side */ +#define REDIS_REPL_NONE 0 /* No active replication */ +#define REDIS_REPL_CONNECT 1 /* Must connect to master */ +#define REDIS_REPL_CONNECTED 2 /* Connected to master */ + +/* Slave replication state - from the point of view of master. + * Note that in SEND_BULK and ONLINE state the slave receives new updates + * in its output queue. In the WAIT_BGSAVE state instead the server is waiting + * to start the next background saving in order to send updates to it. */ +#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */ +#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */ +#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */ +#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */ + +/* List related stuff */ +#define REDIS_HEAD 0 +#define REDIS_TAIL 1 + +/* Sort operations */ +#define REDIS_SORT_GET 0 +#define REDIS_SORT_ASC 1 +#define REDIS_SORT_DESC 2 +#define REDIS_SORTKEY_MAX 1024 + +/* Log levels */ +#define REDIS_DEBUG 0 +#define REDIS_VERBOSE 1 +#define REDIS_NOTICE 2 +#define REDIS_WARNING 3 + +/* Anti-warning macro... 
*/
+#define REDIS_NOTUSED(V) ((void)(V))
+
+#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
+#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */
+
+/* Append only defines */
+#define APPENDFSYNC_NO 0
+#define APPENDFSYNC_ALWAYS 1
+#define APPENDFSYNC_EVERYSEC 2
+
+/* Zip structure related defaults */
+#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
+#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
+#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
+#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
+
+/* Sets operations codes */
+#define REDIS_OP_UNION 0
+#define REDIS_OP_DIFF 1
+#define REDIS_OP_INTER 2
+
+/* We can print the stacktrace, so our assert is defined this way.
+ *
+ * Both macros call the reporting function with the stringified argument,
+ * the file name and the line number, and then call _exit(1).
+ *
+ * Each expansion is a single, fully parenthesized expression. Without the
+ * outer parentheses the comma operator in redisPanic() would bind looser
+ * than the surrounding operators: `c ? x : redisPanic(m)` would parse as
+ * `(c ? x : _redisPanic(...)), _exit(1)` and terminate unconditionally. */
+#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
+#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
+void _redisAssert(char *estr, char *file, int line);
+void _redisPanic(char *msg, char *file, int line);
+
+/*-----------------------------------------------------------------------------
+ * Data types
+ *----------------------------------------------------------------------------*/
+
+/* A redis object, that is a type able to hold a string / list / set */
+
+/* The actual Redis Object.
+ *
+ * The four bit-fields below add up to 32 bits (4 + 2 + 4 + 22). */
+typedef struct redisObject {
+    unsigned type:4;
+    unsigned storage:2;     /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
+    unsigned encoding:4;
+    unsigned lru:22;        /* lru time (relative to server.lruclock) */
+    int refcount;
+    void *ptr;
+    /* VM fields are only allocated if VM is active, otherwise the
+     * object allocation function will just allocate
+     * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
+     * Redis without VM active will not have any overhead.
+     *
+     * NOTE(review): no VM-specific fields are declared in this struct, so
+     * the paragraph above looks stale - confirm and drop it if so. */
+} robj;
+
+/* The VM pointer structure - identifies an object in the swap file.
+ *
+ * This object is stored in place of the value
+ * object in the main key->value hash table representing a database.
+ * Note that the first fields (type, storage) are the same as the redisObject
+ * structure so that vmPointer structures can be accessed even when casted
+ * as redisObject structures.
+ *
+ * This is useful as we don't know if a value object is or not on disk, but we
+ * are always able to read obj->storage to check this. For vmPointer
+ * structures "type" is set to REDIS_VMPOINTER (even if without this field
+ * is still possible to check the kind of object from the value of 'storage').*/
+typedef struct vmPointer {
+    unsigned type:4;
+    unsigned storage:2;     /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
+    unsigned notused:26;
+    unsigned int vtype;     /* type of the object stored in the swap file */
+    off_t page;             /* the page at which the object is stored on disk */
+    off_t usedpages;        /* number of pages used on disk */
+} vmpointer;
+
+/* Macro used to initialize a Redis object allocated on the stack.
+ * Note that this macro is taken near the structure definition to make sure
+ * we'll update it when the structure is changed, to avoid bugs like
+ * bug #85 introduced exactly in this way.
+ *
+ * _var is the object itself (not a pointer to it), _ptr the value stored in
+ * its 'ptr' field. Only refcount, type, encoding, ptr and storage are set.
+ *
+ * The expansion is a do { ... } while(0) with NO trailing semicolon: the
+ * semicolon is supplied by the call site, so `initStaticStringObject(o,p);`
+ * is exactly one statement and is safe as the body of an unbraced if/else.
+ * Both arguments are parenthesized so that expressions such as `*objp` or
+ * `base+off` expand as intended. */
+#define initStaticStringObject(_var,_ptr) do { \
+    (_var).refcount = 1; \
+    (_var).type = REDIS_STRING; \
+    (_var).encoding = REDIS_ENCODING_RAW; \
+    (_var).ptr = (_ptr); \
+    (_var).storage = REDIS_VM_MEMORY; \
+} while(0)
+
+typedef struct redisDb {
+    dict *dict;                 /* The keyspace for this DB */
+    dict *expires;              /* Timeout of keys with a timeout set */
+    dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP) */
+    dict *io_keys;              /* Keys with clients waiting for VM I/O */
+    dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
+    int id;
+} redisDb;
+
+/* Client MULTI/EXEC state */
+typedef struct multiCmd {
+    robj **argv;
+    int argc;
+    struct redisCommand *cmd;
+} multiCmd;
+
+typedef struct multiState {
+    multiCmd *commands;     /* Array of MULTI commands */
+    int count;              /* Total number of MULTI commands */
+} multiState;
+
+/* With multiplexing we need to take per-client state.
+ * Clients are taken in a liked list. */ +typedef struct redisClient { + int fd; + redisDb *db; + int dictid; + sds querybuf; + robj **argv, **mbargv; + int argc, mbargc; + int bulklen; /* bulk read len. -1 if not in bulk read mode */ + int multibulk; /* multi bulk command format active */ + list *reply; + int sentlen; + time_t lastinteraction; /* time of the last interaction, used for timeout */ + int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */ + int slaveseldb; /* slave selected db, if this client is a slave */ + int authenticated; /* when requirepass is non-NULL */ + int replstate; /* replication state if this is a slave */ + int repldbfd; /* replication DB file descriptor */ + long repldboff; /* replication DB file offset */ + off_t repldbsize; /* replication DB file size */ + multiState mstate; /* MULTI/EXEC state */ + robj **blocking_keys; /* The key we are waiting to terminate a blocking + * operation such as BLPOP. Otherwise NULL. */ + int blocking_keys_num; /* Number of blocking keys */ + time_t blockingto; /* Blocking operation timeout. If UNIX current time + * is >= blockingto then the operation timed out. */ + list *io_keys; /* Keys this client is waiting to be loaded from the + * swap file in order to continue. 
*/ + list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */ + dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ + list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */ +} redisClient; + +struct saveparam { + time_t seconds; + int changes; +}; + +struct sharedObjectsStruct { + robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space, + *colon, *nullbulk, *nullmultibulk, *queued, + *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr, + *outofrangeerr, *plus, + *select0, *select1, *select2, *select3, *select4, + *select5, *select6, *select7, *select8, *select9, + *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3, + *mbulk4, *psubscribebulk, *punsubscribebulk, + *integers[REDIS_SHARED_INTEGERS]; +}; + +/* Global server state structure */ +struct redisServer { + int port; + int fd; + redisDb *db; + long long dirty; /* changes to DB from the last save */ + list *clients; + list *slaves, *monitors; + char neterr[ANET_ERR_LEN]; + aeEventLoop *el; + int cronloops; /* number of times the cron function run */ + list *objfreelist; /* A list of freed objects to avoid malloc() */ + time_t lastsave; /* Unix time of last save succeeede */ + /* Fields used only for stats */ + time_t stat_starttime; /* server start time */ + long long stat_numcommands; /* number of processed commands */ + long long stat_numconnections; /* number of connections received */ + long long stat_expiredkeys; /* number of expired keys */ + /* Configuration */ + int verbosity; + int glueoutputbuf; + int maxidletime; + int dbnum; + int daemonize; + int appendonly; + int appendfsync; + int no_appendfsync_on_rewrite; + int shutdown_asap; + time_t lastfsync; + int appendfd; + int appendseldb; + char *pidfile; + pid_t bgsavechildpid; + pid_t bgrewritechildpid; + sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */ + sds aofbuf; /* AOF buffer, written before entering the event loop */ + 
struct saveparam *saveparams; + int saveparamslen; + char *logfile; + char *bindaddr; + char *dbfilename; + char *appendfilename; + char *requirepass; + int rdbcompression; + int activerehashing; + /* Replication related */ + int isslave; + char *masterauth; + char *masterhost; + int masterport; + redisClient *master; /* client that is master for this slave */ + int replstate; + unsigned int maxclients; + unsigned long long maxmemory; + unsigned int blpop_blocked_clients; + unsigned int vm_blocked_clients; + /* Sort parameters - qsort_r() is only available under BSD so we + * have to take this state global, in order to pass it to sortCompare() */ + int sort_desc; + int sort_alpha; + int sort_bypattern; + /* Virtual memory configuration */ + int vm_enabled; + char *vm_swap_file; + off_t vm_page_size; + off_t vm_pages; + unsigned long long vm_max_memory; + /* Zip structure config */ + size_t hash_max_zipmap_entries; + size_t hash_max_zipmap_value; + size_t list_max_ziplist_entries; + size_t list_max_ziplist_value; + /* Virtual memory state */ + FILE *vm_fp; + int vm_fd; + off_t vm_next_page; /* Next probably empty page */ + off_t vm_near_pages; /* Number of pages allocated sequentially */ + unsigned char *vm_bitmap; /* Bitmap of free/used pages */ + time_t unixtime; /* Unix time sampled every second. */ + /* Virtual memory I/O threads stuff */ + /* An I/O thread process an element taken from the io_jobs queue and + * put the result of the operation in the io_done list. While the + * job is being processed, it's put on io_processing queue. */ + list *io_newjobs; /* List of VM I/O jobs yet to be processed */ + list *io_processing; /* List of VM I/O jobs being processed */ + list *io_processed; /* List of VM I/O jobs already processed */ + list *io_ready_clients; /* Clients ready to be unblocked. 
All keys loaded */ + pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */ + pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */ + pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */ + pthread_attr_t io_threads_attr; /* attributes for threads creation */ + int io_active_threads; /* Number of running I/O threads */ + int vm_max_threads; /* Max number of I/O threads running at the same time */ + /* Our main thread is blocked on the event loop, locking for sockets ready + * to be read or written, so when a threaded I/O operation is ready to be + * processed by the main thread, the I/O thread will use a unix pipe to + * awake the main thread. The followings are the two pipe FDs. */ + int io_ready_pipe_read; + int io_ready_pipe_write; + /* Virtual memory stats */ + unsigned long long vm_stats_used_pages; + unsigned long long vm_stats_swapped_objects; + unsigned long long vm_stats_swapouts; + unsigned long long vm_stats_swapins; + /* Pubsub */ + dict *pubsub_channels; /* Map channels to list of subscribed clients */ + list *pubsub_patterns; /* A list of pubsub_patterns */ + /* Misc */ + FILE *devnull; + unsigned lruclock:22; /* clock incrementing every minute, for LRU */ + unsigned lruclock_padding:10; +}; + +typedef struct pubsubPattern { + redisClient *client; + robj *pattern; +} pubsubPattern; + +typedef void redisCommandProc(redisClient *c); +typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); +struct redisCommand { + char *name; + redisCommandProc *proc; + int arity; + int flags; + /* Use a function to determine which keys need to be loaded + * in the background prior to executing this command. Takes precedence + * over vm_firstkey and others, ignored when NULL */ + redisVmPreloadProc *vm_preload_proc; + /* What keys should be loaded in background when calling this command? 
*/ + int vm_firstkey; /* The first argument that's a key (0 = no keys) */ + int vm_lastkey; /* THe last argument that's a key */ + int vm_keystep; /* The step between first and last key */ +}; + +struct redisFunctionSym { + char *name; + unsigned long pointer; +}; + +typedef struct _redisSortObject { + robj *obj; + union { + double score; + robj *cmpobj; + } u; +} redisSortObject; + +typedef struct _redisSortOperation { + int type; + robj *pattern; +} redisSortOperation; + +/* ZSETs use a specialized version of Skiplists */ + +typedef struct zskiplistNode { + struct zskiplistNode **forward; + struct zskiplistNode *backward; + unsigned int *span; + double score; + robj *obj; +} zskiplistNode; + +typedef struct zskiplist { + struct zskiplistNode *header, *tail; + unsigned long length; + int level; +} zskiplist; + +typedef struct zset { + dict *dict; + zskiplist *zsl; +} zset; + +/* VM threaded I/O request message */ +#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */ +#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */ +#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */ +typedef struct iojob { + int type; /* Request type, REDIS_IOJOB_* */ + redisDb *db;/* Redis database */ + robj *key; /* This I/O request is about swapping this key */ + robj *id; /* Unique identifier of this job: + this is the object to swap for REDIS_IOREQ_*_SWAP, or the + vmpointer objct for REDIS_IOREQ_LOAD. */ + robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this + * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */ + off_t page; /* Swap page where to read/write the object */ + off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */ + int canceled; /* True if this command was canceled by blocking side of VM */ + pthread_t thread; /* ID of the thread processing this entry */ +} iojob; + +/* Structure to hold list iteration abstraction. 
*/
+typedef struct {
+    robj *subject;
+    unsigned char encoding;
+    unsigned char direction; /* Iteration direction */
+    unsigned char *zi;
+    listNode *ln;
+} listTypeIterator;
+
+/* Structure for an entry while iterating over a list. */
+typedef struct {
+    listTypeIterator *li;
+    unsigned char *zi;  /* Entry in ziplist */
+    listNode *ln;       /* Entry in linked list */
+} listTypeEntry;
+
+/* Structure to hold hash iteration abstraction. Note that iteration over
+ * hashes involves both fields and values. Because it is possible that
+ * not both are required, store pointers in the iterator to avoid
+ * unnecessary memory allocation for fields/values. */
+typedef struct {
+    int encoding;
+    unsigned char *zi;
+    unsigned char *zk, *zv;
+    unsigned int zklen, zvlen;
+
+    dictIterator *di;
+    dictEntry *de;
+} hashTypeIterator;
+
+#define REDIS_HASH_KEY 1
+#define REDIS_HASH_VALUE 2
+
+/*-----------------------------------------------------------------------------
+ * Extern declarations
+ *----------------------------------------------------------------------------*/
+
+/* Everything in this section is a declaration only: the objects themselves
+ * must be defined exactly once in a .c file. hashDictType is declared
+ * 'extern' like its siblings; without the keyword every file including this
+ * header would get its own tentative definition, which only links when the
+ * toolchain merges common symbols and fails with -fno-common. */
+extern struct redisServer server;
+extern struct sharedObjectsStruct shared;
+extern dictType setDictType;
+extern dictType zsetDictType;
+extern double R_Zero, R_PosInf, R_NegInf, R_Nan;
+extern dictType hashDictType;
+
+/*-----------------------------------------------------------------------------
+ * Functions prototypes
+ *----------------------------------------------------------------------------*/
+
+/* networking.c -- Networking and Client related operations */
+redisClient *createClient(int fd);
+void closeTimedoutClients(void);
+void freeClient(redisClient *c);
+void resetClient(redisClient *c);
+void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReply(redisClient *c, robj *obj);
+void addReplySds(redisClient *c, sds s);
+void processInputBuffer(redisClient *c);
+void acceptHandler(aeEventLoop
*el, int fd, void *privdata, int mask); +void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask); +void addReplyBulk(redisClient *c, robj *obj); +void addReplyBulkCString(redisClient *c, char *s); +void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); +void addReply(redisClient *c, robj *obj); +void addReplySds(redisClient *c, sds s); +void addReplyDouble(redisClient *c, double d); +void addReplyLongLong(redisClient *c, long long ll); +void addReplyUlong(redisClient *c, unsigned long ul); +void *dupClientReplyValue(void *o); + +/* List data type */ +void listTypeTryConversion(robj *subject, robj *value); +void listTypePush(robj *subject, robj *value, int where); +robj *listTypePop(robj *subject, int where); +unsigned long listTypeLength(robj *subject); +listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction); +void listTypeReleaseIterator(listTypeIterator *li); +int listTypeNext(listTypeIterator *li, listTypeEntry *entry); +robj *listTypeGet(listTypeEntry *entry); +void listTypeInsert(listTypeEntry *entry, robj *value, int where); +int listTypeEqual(listTypeEntry *entry, robj *o); +void listTypeDelete(listTypeEntry *entry); +void listTypeConvert(robj *subject, int enc); +void unblockClientWaitingData(redisClient *c); +int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele); +void popGenericCommand(redisClient *c, int where); + +/* MULTI/EXEC/WATCH... 
*/ +void unwatchAllKeys(redisClient *c); +void initClientMultiState(redisClient *c); +void freeClientMultiState(redisClient *c); +void queueMultiCommand(redisClient *c, struct redisCommand *cmd); +void touchWatchedKey(redisDb *db, robj *key); +void touchWatchedKeysOnFlush(int dbid); + +/* Redis object implementation */ +void decrRefCount(void *o); +void incrRefCount(robj *o); +void freeStringObject(robj *o); +void freeListObject(robj *o); +void freeSetObject(robj *o); +void freeZsetObject(robj *o); +void freeHashObject(robj *o); +robj *createObject(int type, void *ptr); +robj *createStringObject(char *ptr, size_t len); +robj *dupStringObject(robj *o); +robj *tryObjectEncoding(robj *o); +robj *getDecodedObject(robj *o); +size_t stringObjectLen(robj *o); +int tryFreeOneObjectFromFreelist(void); +robj *createStringObjectFromLongLong(long long value); +robj *createListObject(void); +robj *createZiplistObject(void); +robj *createSetObject(void); +robj *createHashObject(void); +robj *createZsetObject(void); +int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg); +int checkType(redisClient *c, robj *o, int type); +int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg); +int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg); +int getLongLongFromObject(robj *o, long long *target); +char *strEncoding(int encoding); +int compareStringObjects(robj *a, robj *b); +int equalStringObjects(robj *a, robj *b); + +/* Replication */ +void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc); +void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc); +int syncWithMaster(void); +void updateSlavesWaitingBgsave(int bgsaveerr); + +/* RDB persistence */ +int rdbLoad(char *filename); +int rdbSaveBackground(char *filename); +void rdbRemoveTempFile(pid_t childpid); +int rdbSave(char *filename); +int rdbSaveObject(FILE *fp, robj *o); +off_t 
rdbSavedObjectPages(robj *o, FILE *fp); +off_t rdbSavedObjectLen(robj *o, FILE *fp); +robj *rdbLoadObject(int type, FILE *fp); +void backgroundSaveDoneHandler(int statloc); + +/* AOF persistence */ +void flushAppendOnlyFile(void); +void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc); +void aofRemoveTempFile(pid_t childpid); +int rewriteAppendOnlyFileBackground(void); +int loadAppendOnlyFile(char *filename); +void stopAppendOnly(void); +int startAppendOnly(void); +void backgroundRewriteDoneHandler(int statloc); + +/* Sorted sets data type */ +zskiplist *zslCreate(void); +void zslFree(zskiplist *zsl); +void zslInsert(zskiplist *zsl, double score, robj *obj); + +/* Core functions */ +void freeMemoryIfNeeded(void); +int processCommand(redisClient *c); +void setupSigSegvAction(void); +struct redisCommand *lookupCommand(char *name); +void call(redisClient *c, struct redisCommand *cmd); +int prepareForShutdown(); +void redisLog(int level, const char *fmt, ...); +void usage(); +void updateDictResizePolicy(void); +int htNeedsResize(dict *dict); +void oom(const char *msg); + +/* Virtual Memory */ +void vmInit(void); +void vmMarkPagesFree(off_t page, off_t count); +robj *vmLoadObject(robj *o); +robj *vmPreviewObject(robj *o); +int vmSwapOneObjectBlocking(void); +int vmSwapOneObjectThreaded(void); +int vmCanSwapOut(void); +void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask); +void vmCancelThreadedIOJob(robj *o); +void lockThreadedIO(void); +void unlockThreadedIO(void); +int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db); +void freeIOJob(iojob *j); +void queueIOJob(iojob *j); +int vmWriteObjectOnSwap(robj *o, off_t page); +robj *vmReadObjectFromSwap(off_t page, int type); +void waitEmptyIOJobsQueue(void); +void vmReopenSwapFile(void); +int vmFreePage(off_t page); +void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); +void 
execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv); +int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd); +int dontWaitForSwappedKey(redisClient *c, robj *key); +void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key); +vmpointer *vmSwapObjectBlocking(robj *val); + +/* Hash data type */ +void convertToRealHash(robj *o); +void hashTypeTryConversion(robj *subject, robj **argv, int start, int end); +void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2); +robj *hashTypeGet(robj *o, robj *key); +int hashTypeExists(robj *o, robj *key); +int hashTypeSet(robj *o, robj *key, robj *value); +int hashTypeDelete(robj *o, robj *key); +unsigned long hashTypeLength(robj *o); +hashTypeIterator *hashTypeInitIterator(robj *subject); +void hashTypeReleaseIterator(hashTypeIterator *hi); +int hashTypeNext(hashTypeIterator *hi); +robj *hashTypeCurrent(hashTypeIterator *hi, int what); +robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key); + +/* Pub / Sub */ +int pubsubUnsubscribeAllChannels(redisClient *c, int notify); +int pubsubUnsubscribeAllPatterns(redisClient *c, int notify); +void freePubsubPattern(void *p); +int listMatchPubsubPattern(void *a, void *b); + +/* Utility functions */ +int stringmatchlen(const char *pattern, int patternLen, + const char *string, int stringLen, int nocase); +int stringmatch(const char *pattern, const char *string, int nocase); +long long memtoll(const char *p, int *err); +int ll2string(char *s, size_t len, long long value); +int isStringRepresentableAsLong(sds s, long *longval); + +/* Configuration */ +void loadServerConfig(char *filename); +void appendServerSaveParams(time_t seconds, int changes); +void resetServerSaveParams(); + +/* db.c -- Keyspace access API */ +int removeExpire(redisDb *db, robj *key); +int expireIfNeeded(redisDb *db, robj *key); +int deleteIfVolatile(redisDb *db, robj *key); +time_t getExpire(redisDb *db, robj *key); +int setExpire(redisDb 
*db, robj *key, time_t when); +robj *lookupKey(redisDb *db, robj *key); +robj *lookupKeyRead(redisDb *db, robj *key); +robj *lookupKeyWrite(redisDb *db, robj *key); +robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply); +robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply); +int dbAdd(redisDb *db, robj *key, robj *val); +int dbReplace(redisDb *db, robj *key, robj *val); +int dbExists(redisDb *db, robj *key); +robj *dbRandomKey(redisDb *db); +int dbDelete(redisDb *db, robj *key); +long long emptyDb(); +int selectDb(redisClient *c, int id); + +/* Git SHA1 */ +char *redisGitSHA1(void); +char *redisGitDirty(void); + +/* Commands prototypes */ +void authCommand(redisClient *c); +void pingCommand(redisClient *c); +void echoCommand(redisClient *c); +void setCommand(redisClient *c); +void setnxCommand(redisClient *c); +void setexCommand(redisClient *c); +void getCommand(redisClient *c); +void delCommand(redisClient *c); +void existsCommand(redisClient *c); +void incrCommand(redisClient *c); +void decrCommand(redisClient *c); +void incrbyCommand(redisClient *c); +void decrbyCommand(redisClient *c); +void selectCommand(redisClient *c); +void randomkeyCommand(redisClient *c); +void keysCommand(redisClient *c); +void dbsizeCommand(redisClient *c); +void lastsaveCommand(redisClient *c); +void saveCommand(redisClient *c); +void bgsaveCommand(redisClient *c); +void bgrewriteaofCommand(redisClient *c); +void shutdownCommand(redisClient *c); +void moveCommand(redisClient *c); +void renameCommand(redisClient *c); +void renamenxCommand(redisClient *c); +void lpushCommand(redisClient *c); +void rpushCommand(redisClient *c); +void lpushxCommand(redisClient *c); +void rpushxCommand(redisClient *c); +void linsertCommand(redisClient *c); +void lpopCommand(redisClient *c); +void rpopCommand(redisClient *c); +void llenCommand(redisClient *c); +void lindexCommand(redisClient *c); +void lrangeCommand(redisClient *c); +void ltrimCommand(redisClient *c); +void 
typeCommand(redisClient *c); +void lsetCommand(redisClient *c); +void saddCommand(redisClient *c); +void sremCommand(redisClient *c); +void smoveCommand(redisClient *c); +void sismemberCommand(redisClient *c); +void scardCommand(redisClient *c); +void spopCommand(redisClient *c); +void srandmemberCommand(redisClient *c); +void sinterCommand(redisClient *c); +void sinterstoreCommand(redisClient *c); +void sunionCommand(redisClient *c); +void sunionstoreCommand(redisClient *c); +void sdiffCommand(redisClient *c); +void sdiffstoreCommand(redisClient *c); +void syncCommand(redisClient *c); +void flushdbCommand(redisClient *c); +void flushallCommand(redisClient *c); +void sortCommand(redisClient *c); +void lremCommand(redisClient *c); +void rpoplpushcommand(redisClient *c); +void infoCommand(redisClient *c); +void mgetCommand(redisClient *c); +void monitorCommand(redisClient *c); +void expireCommand(redisClient *c); +void expireatCommand(redisClient *c); +void getsetCommand(redisClient *c); +void ttlCommand(redisClient *c); +void slaveofCommand(redisClient *c); +void debugCommand(redisClient *c); +void msetCommand(redisClient *c); +void msetnxCommand(redisClient *c); +void zaddCommand(redisClient *c); +void zincrbyCommand(redisClient *c); +void zrangeCommand(redisClient *c); +void zrangebyscoreCommand(redisClient *c); +void zcountCommand(redisClient *c); +void zrevrangeCommand(redisClient *c); +void zcardCommand(redisClient *c); +void zremCommand(redisClient *c); +void zscoreCommand(redisClient *c); +void zremrangebyscoreCommand(redisClient *c); +void multiCommand(redisClient *c); +void execCommand(redisClient *c); +void discardCommand(redisClient *c); +void blpopCommand(redisClient *c); +void brpopCommand(redisClient *c); +void appendCommand(redisClient *c); +void substrCommand(redisClient *c); +void zrankCommand(redisClient *c); +void zrevrankCommand(redisClient *c); +void hsetCommand(redisClient *c); +void hsetnxCommand(redisClient *c); +void hgetCommand(redisClient 
*c); +void hmsetCommand(redisClient *c); +void hmgetCommand(redisClient *c); +void hdelCommand(redisClient *c); +void hlenCommand(redisClient *c); +void zremrangebyrankCommand(redisClient *c); +void zunionstoreCommand(redisClient *c); +void zinterstoreCommand(redisClient *c); +void hkeysCommand(redisClient *c); +void hvalsCommand(redisClient *c); +void hgetallCommand(redisClient *c); +void hexistsCommand(redisClient *c); +void configCommand(redisClient *c); +void hincrbyCommand(redisClient *c); +void subscribeCommand(redisClient *c); +void unsubscribeCommand(redisClient *c); +void psubscribeCommand(redisClient *c); +void punsubscribeCommand(redisClient *c); +void publishCommand(redisClient *c); +void watchCommand(redisClient *c); +void unwatchCommand(redisClient *c); + +#endif diff --git a/src/release.c b/src/release.c new file mode 100644 index 000000000..64186ec4e --- /dev/null +++ b/src/release.c @@ -0,0 +1,13 @@ +/* Every time the Redis Git SHA1 or Dirty status changes only this + * small file is recompiled, as we access this information in all the other + * files using these functions. */ + +#include "release.h" + +char *redisGitSHA1(void) { + return REDIS_GIT_SHA1; +} + +char *redisGitDirty(void) { + return REDIS_GIT_DIRTY; +} diff --git a/src/replication.c b/src/replication.c new file mode 100644 index 000000000..ecb04ce1a --- /dev/null +++ b/src/replication.c @@ -0,0 +1,475 @@ +#include "redis.h" + +#include <sys/time.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> + +void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { + listNode *ln; + listIter li; + int outc = 0, j; + robj **outv; + /* We need 1+(ARGS*3) objects since commands are using the new protocol + * and we need 1 object for the first "*<count>\r\n" multibulk count, then + * for every additional object we have "$<count>\r\n" + object + "\r\n".
*/ + robj *static_outv[REDIS_STATIC_ARGS*3+1]; + robj *lenobj; + + if (argc <= REDIS_STATIC_ARGS) { + outv = static_outv; + } else { + outv = zmalloc(sizeof(robj*)*(argc*3+1)); + } + + lenobj = createObject(REDIS_STRING, + sdscatprintf(sdsempty(), "*%d\r\n", argc)); + lenobj->refcount = 0; + outv[outc++] = lenobj; + for (j = 0; j < argc; j++) { + lenobj = createObject(REDIS_STRING, + sdscatprintf(sdsempty(),"$%lu\r\n", + (unsigned long) stringObjectLen(argv[j]))); + lenobj->refcount = 0; + outv[outc++] = lenobj; + outv[outc++] = argv[j]; + outv[outc++] = shared.crlf; + } + + /* Increment all the refcounts at start and decrement at end in order to + * be sure to free objects if there is no slave in a replication state + * able to be feed with commands */ + for (j = 0; j < outc; j++) incrRefCount(outv[j]); + listRewind(slaves,&li); + while((ln = listNext(&li))) { + redisClient *slave = ln->value; + + /* Don't feed slaves that are still waiting for BGSAVE to start */ + if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue; + + /* Feed all the other slaves, MONITORs and so on */ + if (slave->slaveseldb != dictid) { + robj *selectcmd; + + switch(dictid) { + case 0: selectcmd = shared.select0; break; + case 1: selectcmd = shared.select1; break; + case 2: selectcmd = shared.select2; break; + case 3: selectcmd = shared.select3; break; + case 4: selectcmd = shared.select4; break; + case 5: selectcmd = shared.select5; break; + case 6: selectcmd = shared.select6; break; + case 7: selectcmd = shared.select7; break; + case 8: selectcmd = shared.select8; break; + case 9: selectcmd = shared.select9; break; + default: + selectcmd = createObject(REDIS_STRING, + sdscatprintf(sdsempty(),"select %d\r\n",dictid)); + selectcmd->refcount = 0; + break; + } + addReply(slave,selectcmd); + slave->slaveseldb = dictid; + } + for (j = 0; j < outc; j++) addReply(slave,outv[j]); + } + for (j = 0; j < outc; j++) decrRefCount(outv[j]); + if (outv != static_outv) zfree(outv); +} + +void 
replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
+    /* Builds one reply of the form
+     *
+     *   +<sec>.<usec> [(db <dictid>) ]<arg> <arg> ...\r\n
+     *
+     * and queues it, via addReply(), to every client in 'monitors'.
+     * The "(db N)" part is only emitted when dictid != 0. */
+    listNode *ln;
+    listIter li;
+    int j;
+    sds cmdrepr = sdsnew("+");
+    robj *cmdobj;
+    struct timeval tv;
+
+    gettimeofday(&tv,NULL);
+    /* tv_usec is a count of microseconds, so it must be zero padded to six
+     * digits to be a valid fractional part: 5 usec is ".000005", not ".5". */
+    cmdrepr = sdscatprintf(cmdrepr,"%ld.%06ld ",(long)tv.tv_sec,(long)tv.tv_usec);
+    if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
+
+    for (j = 0; j < argc; j++) {
+        if (argv[j]->encoding == REDIS_ENCODING_INT) {
+            /* Integer encoded objects carry the value in the pointer. */
+            cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
+        } else {
+            cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
+                        sdslen(argv[j]->ptr));
+        }
+        if (j != argc-1)
+            cmdrepr = sdscatlen(cmdrepr," ",1);
+    }
+    cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
+    cmdobj = createObject(REDIS_STRING,cmdrepr);
+
+    listRewind(monitors,&li);
+    while((ln = listNext(&li))) {
+        redisClient *monitor = ln->value;
+        addReply(monitor,cmdobj);
+    }
+    /* Drop our own reference to the reply object. */
+    decrRefCount(cmdobj);
+}
+
+/* Write exactly 'size' bytes from 'ptr' to 'fd'.
+ *
+ * The descriptor is polled with aeWait(fd,AE_WRITABLE,1000) between
+ * attempts. Returns 'size' on success. Returns -1 if write() fails, or with
+ * errno set to ETIMEDOUT once more than timeout+1 seconds (as measured by
+ * time()) have elapsed since the call started. */
+int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nwritten, ret = size;
+    time_t start = time(NULL);
+
+    timeout++;
+    while(size) {
+        if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
+            nwritten = write(fd,ptr,size);
+            if (nwritten == -1) return -1;
+            ptr += nwritten;
+            size -= nwritten;
+        }
+        if ((time(NULL)-start) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return ret;
+}
+
+/* Read exactly 'size' bytes from 'fd' into 'ptr'.
+ *
+ * The descriptor is polled with aeWait(fd,AE_READABLE,1000) between
+ * attempts. Returns the number of bytes read (== the requested size) on
+ * success. Returns -1 if read() fails, if the peer closes the connection
+ * before 'size' bytes arrived, or with errno set to ETIMEDOUT once more
+ * than timeout+1 seconds have elapsed since the call started. */
+int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread, totread = 0;
+    time_t start = time(NULL);
+
+    timeout++;
+    while(size) {
+        if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
+            nread = read(fd,ptr,size);
+            /* -1 is an error, 0 is end of file. A descriptor at EOF stays
+             * readable forever, so treating 0 as progress would busy loop
+             * here until the timeout fires: fail right away instead. */
+            if (nread <= 0) return -1;
+            ptr += nread;
+            size -= nread;
+            totread += nread;
+        }
+        if ((time(NULL)-start) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return totread;
+}
+
+int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread = 0;
+
+    size--;
+    while(size) {
+        char c;
+
+        if (syncRead(fd,&c,1,timeout) == -1) return -1;
+        if (c == '\n') {
+            *ptr = '\0';
+            if (nread && *(ptr-1) == '\r') *(ptr-1) =
'\0'; + return nread; + } else { + *ptr++ = c; + *ptr = '\0'; + nread++; + /* Account for the byte just stored: 'size' is the room left in the caller's buffer, and the loop must stop when it runs out instead of writing past the end on an over-long line. */ + size--; + } + } + return nread; +} + +/* SYNC command: register client 'c' as a slave and attach it to a BGSAVE. It joins the one in progress if another slave is already accumulating differences for it, waits for the next one otherwise, or starts a new one when no BGSAVE is running. */ +void syncCommand(redisClient *c) { + /* ignore SYNC if already slave or in monitor mode */ + if (c->flags & REDIS_SLAVE) return; + + /* SYNC can't be issued when the server has pending data to send to + * the client about already issued commands. We need a fresh reply + * buffer registering the differences between the BGSAVE and the current + * dataset, so that we can copy to other slaves if needed. */ + if (listLength(c->reply) != 0) { + addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n")); + return; + } + + redisLog(REDIS_NOTICE,"Slave ask for synchronization"); + /* Here we need to check if there is a background saving operation + * in progress, or if it is required to start one */ + if (server.bgsavechildpid != -1) { + /* Ok a background save is in progress. Let's check if it is a good + * one for replication, i.e. if there is another slave that is + * registering differences since the server forked to save */ + redisClient *slave; + listNode *ln; + listIter li; + + listRewind(server.slaves,&li); + while((ln = listNext(&li))) { + slave = ln->value; + if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break; + } + if (ln) { + /* Perfect, the server is already registering differences for + * another slave. Set the right state, and copy the buffer.
*/ + listRelease(c->reply); + c->reply = listDup(slave->reply); + c->replstate = REDIS_REPL_WAIT_BGSAVE_END; + redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC"); + } else { + /* No way, we need to wait for the next BGSAVE in order to + * register differences */ + c->replstate = REDIS_REPL_WAIT_BGSAVE_START; + redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC"); + } + } else { + /* Ok we don't have a BGSAVE in progress, let's start one */ + redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC"); + if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { + redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE"); + addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n")); + return; + } + c->replstate = REDIS_REPL_WAIT_BGSAVE_END; + } + c->repldbfd = -1; + c->flags |= REDIS_SLAVE; + c->slaveseldb = 0; + listAddNodeTail(server.slaves,c); + return; +} + +/* Writable-event handler that streams the dump file to a slave. On the first call (repldboff == 0) it writes the bulk count header; on every call it reads up to REDIS_IOBUF_LEN bytes of the file at offset repldboff, writes them to 'fd' and advances repldboff by the bytes actually written. When repldboff reaches repldbsize the file is closed, the slave is marked ONLINE and the writable handler is replaced by sendReplyToClient. Any read or write failure frees the slave client. */ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { + redisClient *slave = privdata; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + char buf[REDIS_IOBUF_LEN]; + ssize_t nwritten, buflen; + + if (slave->repldboff == 0) { + /* Write the bulk count before transferring the DB. In theory here + * we don't know how much room there is in the output buffer of the + * socket, but in practice SO_SNDLOWAT (the minimum count for output + * operations) will never be smaller than the few bytes we need. */ + sds bulkcount; + + bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long) + slave->repldbsize); + if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount)) + { + sdsfree(bulkcount); + freeClient(slave); + return; + } + sdsfree(bulkcount); + } + /* Always re-seek to the offset reached so far before reading the next chunk. */ lseek(slave->repldbfd,slave->repldboff,SEEK_SET); + buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); + if (buflen <= 0) { + redisLog(REDIS_WARNING,"Read error sending DB to slave: %s", + (buflen == 0) ?
"premature EOF" : strerror(errno)); + freeClient(slave); + return; + } + if ((nwritten = write(fd,buf,buflen)) == -1) { + redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s", + strerror(errno)); + freeClient(slave); + return; + } + /* A partial write is fine: only the bytes actually sent are counted, and the next call seeks back to repldboff. */ slave->repldboff += nwritten; + if (slave->repldboff == slave->repldbsize) { + /* Whole file sent: switch the slave to the normal reply handler. */ close(slave->repldbfd); + slave->repldbfd = -1; + aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); + slave->replstate = REDIS_REPL_ONLINE; + if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, + sendReplyToClient, slave) == AE_ERR) { + freeClient(slave); + return; + } + addReplySds(slave,sdsempty()); + redisLog(REDIS_NOTICE,"Synchronization with slave succeeded"); + } +} + +/* This function is called at the end of every background saving. + * The argument bgsaveerr is REDIS_OK if the background saving succeeded + * otherwise REDIS_ERR is passed to the function. + * + * The goal of this function is to handle slaves waiting for a successful + * background saving in order to perform non-blocking synchronization. */ +void updateSlavesWaitingBgsave(int bgsaveerr) { + listNode *ln; + int startbgsave = 0; + listIter li; + + listRewind(server.slaves,&li); + while((ln = listNext(&li))) { + redisClient *slave = ln->value; + + if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) { + /* This slave missed the BGSAVE that just ended: request a new one and mark the slave as waiting for its end. */ startbgsave = 1; + slave->replstate = REDIS_REPL_WAIT_BGSAVE_END; + } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) { + struct redis_stat buf; + + if (bgsaveerr != REDIS_OK) { + freeClient(slave); + redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error"); + continue; + } + if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 || + redis_fstat(slave->repldbfd,&buf) == -1) { + freeClient(slave); + redisLog(REDIS_WARNING,"SYNC failed.
Can't open/stat DB after BGSAVE: %s", strerror(errno)); + continue; + } + slave->repldboff = 0; + slave->repldbsize = buf.st_size; + slave->replstate = REDIS_REPL_SEND_BULK; + aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); + if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) { + freeClient(slave); + continue; + } + } + } + if (startbgsave) { + if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { + listIter li; + + listRewind(server.slaves,&li); + redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed"); + while((ln = listNext(&li))) { + redisClient *slave = ln->value; + + /* The first pass of this function already moved every slave that was waiting for a BGSAVE to start into WAIT_BGSAVE_END, and every slave that was in WAIT_BGSAVE_END before the call has since been switched to SEND_BULK or freed. So WAIT_BGSAVE_END is the state to look for here: testing for WAIT_BGSAVE_START would match nobody and leave those slaves waiting forever. */ + if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) + freeClient(slave); + } + } + } +} + +/* Blocking synchronization with the master at server.masterhost:server.masterport: connect, AUTH if server.masterauth is set, send SYNC, receive the dump into a temp file, move it over server.dbfilename, reload it and register the connection as server.master. + * Returns REDIS_OK on success, REDIS_ERR on any failure. */ +int syncWithMaster(void) { + char buf[1024], tmpfile[256], authcmd[1024]; + long dumpsize; + int fd = anetTcpConnect(NULL,server.masterhost,server.masterport); + int dfd, maxtries = 5; + + if (fd == -1) { + redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s", + strerror(errno)); + return REDIS_ERR; + } + + /* AUTH with the master if required. */ + if(server.masterauth) { + snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth); + /* Send what is actually in the buffer: snprintf truncates a very long password, so a length computed from masterauth could run past the end of authcmd. */ + if (syncWrite(fd, authcmd, strlen(authcmd), 5) == -1) { + close(fd); + redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s", + strerror(errno)); + return REDIS_ERR; + } + /* Read the AUTH result.
*/ + if (syncReadLine(fd,buf,1024,3600) == -1) { + close(fd); + redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s", + strerror(errno)); + return REDIS_ERR; + } + if (buf[0] != '+') { + close(fd); + redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?"); + return REDIS_ERR; + } + } + + /* Issue the SYNC command */ + if (syncWrite(fd,"SYNC \r\n",7,5) == -1) { + close(fd); + redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s", + strerror(errno)); + return REDIS_ERR; + } + /* Read the bulk write count */ + if (syncReadLine(fd,buf,1024,3600) == -1) { + close(fd); + redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s", + strerror(errno)); + return REDIS_ERR; + } + if (buf[0] != '$') { + close(fd); + redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?"); + return REDIS_ERR; + } + dumpsize = strtol(buf+1,NULL,10); + redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize); + /* Read the bulk write data on a temp file */ + while(maxtries--) { + snprintf(tmpfile,256, + "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid()); + dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); + if (dfd != -1) break; + sleep(1); + } + if (dfd == -1) { + close(fd); + redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno)); + return REDIS_ERR; + } + while(dumpsize) { + int nread, nwritten; + + nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024); + if (nread == -1) { + redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s", + strerror(errno)); + close(fd); + close(dfd); + return REDIS_ERR; + } + nwritten = write(dfd,buf,nread); + if (nwritten == -1) { + redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno)); + close(fd); + close(dfd); + return REDIS_ERR; + } + dumpsize -= nread; + } + close(dfd); + 
if (rename(tmpfile,server.dbfilename) == -1) { + redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno)); + unlink(tmpfile); + close(fd); + return REDIS_ERR; + } + emptyDb(); + if (rdbLoad(server.dbfilename) != REDIS_OK) { + redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk"); + close(fd); + return REDIS_ERR; + } + server.master = createClient(fd); + server.master->flags |= REDIS_MASTER; + server.master->authenticated = 1; + server.replstate = REDIS_REPL_CONNECTED; + return REDIS_OK; +} + +void slaveofCommand(redisClient *c) { + if (!strcasecmp(c->argv[1]->ptr,"no") && + !strcasecmp(c->argv[2]->ptr,"one")) { + if (server.masterhost) { + sdsfree(server.masterhost); + server.masterhost = NULL; + if (server.master) freeClient(server.master); + server.replstate = REDIS_REPL_NONE; + redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)"); + } + } else { + sdsfree(server.masterhost); + server.masterhost = sdsdup(c->argv[1]->ptr); + server.masterport = atoi(c->argv[2]->ptr); + if (server.master) freeClient(server.master); + server.replstate = REDIS_REPL_CONNECT; + redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)", + server.masterhost, server.masterport); + } + addReply(c,shared.ok); +} diff --git a/src/sds.c b/src/sds.c new file mode 100644 index 000000000..5e67f0443 --- /dev/null +++ b/src/sds.c @@ -0,0 +1,384 @@ +/* SDSLib, A C dynamic strings library + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define SDS_ABORT_ON_OOM + +#include "sds.h" +#include +#include +#include +#include +#include +#include "zmalloc.h" + +static void sdsOomAbort(void) { + fprintf(stderr,"SDS: Out Of Memory (SDS_ABORT_ON_OOM defined)\n"); + abort(); +} + +sds sdsnewlen(const void *init, size_t initlen) { + struct sdshdr *sh; + + sh = zmalloc(sizeof(struct sdshdr)+initlen+1); +#ifdef SDS_ABORT_ON_OOM + if (sh == NULL) sdsOomAbort(); +#else + if (sh == NULL) return NULL; +#endif + sh->len = initlen; + sh->free = 0; + if (initlen) { + if (init) memcpy(sh->buf, init, initlen); + else memset(sh->buf,0,initlen); + } + sh->buf[initlen] = '\0'; + return (char*)sh->buf; +} + +sds sdsempty(void) { + return sdsnewlen("",0); +} + +sds sdsnew(const char *init) { + size_t initlen = (init == NULL) ? 
0 : strlen(init); + return sdsnewlen(init, initlen); +} + +size_t sdslen(const sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + return sh->len; +} + +sds sdsdup(const sds s) { + return sdsnewlen(s, sdslen(s)); +} + +void sdsfree(sds s) { + if (s == NULL) return; + zfree(s-sizeof(struct sdshdr)); +} + +size_t sdsavail(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + return sh->free; +} + +void sdsupdatelen(sds s) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + int reallen = strlen(s); + sh->free += (sh->len-reallen); + sh->len = reallen; +} + +static sds sdsMakeRoomFor(sds s, size_t addlen) { + struct sdshdr *sh, *newsh; + size_t free = sdsavail(s); + size_t len, newlen; + + if (free >= addlen) return s; + len = sdslen(s); + sh = (void*) (s-(sizeof(struct sdshdr))); + newlen = (len+addlen)*2; + newsh = zrealloc(sh, sizeof(struct sdshdr)+newlen+1); +#ifdef SDS_ABORT_ON_OOM + if (newsh == NULL) sdsOomAbort(); +#else + if (newsh == NULL) return NULL; +#endif + + newsh->free = newlen - len; + return newsh->buf; +} + +sds sdscatlen(sds s, void *t, size_t len) { + struct sdshdr *sh; + size_t curlen = sdslen(s); + + s = sdsMakeRoomFor(s,len); + if (s == NULL) return NULL; + sh = (void*) (s-(sizeof(struct sdshdr))); + memcpy(s+curlen, t, len); + sh->len = curlen+len; + sh->free = sh->free-len; + s[curlen+len] = '\0'; + return s; +} + +sds sdscat(sds s, char *t) { + return sdscatlen(s, t, strlen(t)); +} + +sds sdscpylen(sds s, char *t, size_t len) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + size_t totlen = sh->free+sh->len; + + if (totlen < len) { + s = sdsMakeRoomFor(s,len-sh->len); + if (s == NULL) return NULL; + sh = (void*) (s-(sizeof(struct sdshdr))); + totlen = sh->free+sh->len; + } + memcpy(s, t, len); + s[len] = '\0'; + sh->len = len; + sh->free = totlen-len; + return s; +} + +sds sdscpy(sds s, char *t) { + return sdscpylen(s, t, strlen(t)); +} + +sds sdscatprintf(sds s, const char *fmt, ...) 
{ + va_list ap; + char *buf, *t; + size_t buflen = 16; + + while(1) { + buf = zmalloc(buflen); +#ifdef SDS_ABORT_ON_OOM + if (buf == NULL) sdsOomAbort(); +#else + if (buf == NULL) return NULL; +#endif + buf[buflen-2] = '\0'; + va_start(ap, fmt); + vsnprintf(buf, buflen, fmt, ap); + va_end(ap); + if (buf[buflen-2] != '\0') { + zfree(buf); + buflen *= 2; + continue; + } + break; + } + t = sdscat(s, buf); + zfree(buf); + return t; +} + +sds sdstrim(sds s, const char *cset) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + char *start, *end, *sp, *ep; + size_t len; + + sp = start = s; + ep = end = s+sdslen(s)-1; + while(sp <= end && strchr(cset, *sp)) sp++; + while(ep > start && strchr(cset, *ep)) ep--; + len = (sp > ep) ? 0 : ((ep-sp)+1); + if (sh->buf != sp) memmove(sh->buf, sp, len); + sh->buf[len] = '\0'; + sh->free = sh->free+(sh->len-len); + sh->len = len; + return s; +} + +sds sdsrange(sds s, int start, int end) { + struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr))); + size_t newlen, len = sdslen(s); + + if (len == 0) return s; + if (start < 0) { + start = len+start; + if (start < 0) start = 0; + } + if (end < 0) { + end = len+end; + if (end < 0) end = 0; + } + newlen = (start > end) ? 0 : (end-start)+1; + if (newlen != 0) { + if (start >= (signed)len) start = len-1; + if (end >= (signed)len) end = len-1; + newlen = (start > end) ? 0 : (end-start)+1; + } else { + start = 0; + } + if (start != 0) memmove(sh->buf, sh->buf+start, newlen); + sh->buf[newlen] = 0; + sh->free = sh->free+(sh->len-newlen); + sh->len = newlen; + return s; +} + +void sdstolower(sds s) { + int len = sdslen(s), j; + + for (j = 0; j < len; j++) s[j] = tolower(s[j]); +} + +void sdstoupper(sds s) { + int len = sdslen(s), j; + + for (j = 0; j < len; j++) s[j] = toupper(s[j]); +} + +int sdscmp(sds s1, sds s2) { + size_t l1, l2, minlen; + int cmp; + + l1 = sdslen(s1); + l2 = sdslen(s2); + minlen = (l1 < l2) ? 
l1 : l2; + cmp = memcmp(s1,s2,minlen); + if (cmp == 0) return l1-l2; + return cmp; +} + +/* Split 's' with separator in 'sep'. An array + * of sds strings is returned. *count will be set + * by reference to the number of tokens returned. + * + * On out of memory, zero length separator or negative + * string length, NULL is returned and *count is not set. + * For a zero length string an empty array is returned + * with *count set to 0; it must still be released with + * sdsfreesplitres(). + * + * Note that 'sep' is able to split a string using + * a multi-character separator. For example + * sdssplit("foo_-_bar","_-_"); will return two + * elements "foo" and "bar". + * + * This version of the function is binary-safe but + * requires length arguments. sdssplit() is just the + * same function but for zero-terminated strings. + */ +sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) { + int elements = 0, slots = 5, start = 0, j; + sds *tokens; + + /* Validate the arguments before allocating, so the early return cannot leak the tokens array. */ + if (seplen < 1 || len < 0) return NULL; + tokens = zmalloc(sizeof(sds)*slots); +#ifdef SDS_ABORT_ON_OOM + if (tokens == NULL) sdsOomAbort(); +#endif + if (tokens == NULL) return NULL; + if (len == 0) { + *count = 0; + return tokens; + } + for (j = 0; j < (len-(seplen-1)); j++) { + /* make sure there is room for the next element and the final one */ + if (slots < elements+2) { + sds *newtokens; + + slots *= 2; + newtokens = zrealloc(tokens,sizeof(sds)*slots); + if (newtokens == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + tokens = newtokens; + } + /* search the separator */ + if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) { + tokens[elements] = sdsnewlen(s+start,j-start); + if (tokens[elements] == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + elements++; + start = j+seplen; + j = j+seplen-1; /* skip the separator */ + } + } + /* Add the final element. We are sure there is room in the tokens array.
*/ + tokens[elements] = sdsnewlen(s+start,len-start); + if (tokens[elements] == NULL) { +#ifdef SDS_ABORT_ON_OOM + sdsOomAbort(); +#else + goto cleanup; +#endif + } + elements++; + *count = elements; + return tokens; + +#ifndef SDS_ABORT_ON_OOM +cleanup: + { + int i; + for (i = 0; i < elements; i++) sdsfree(tokens[i]); + zfree(tokens); + return NULL; + } +#endif +} + +void sdsfreesplitres(sds *tokens, int count) { + if (!tokens) return; + while(count--) + sdsfree(tokens[count]); + zfree(tokens); +} + +sds sdsfromlonglong(long long value) { + char buf[32], *p; + unsigned long long v; + + v = (value < 0) ? -value : value; + p = buf+31; /* point to the last character */ + do { + *p-- = '0'+(v%10); + v /= 10; + } while(v); + if (value < 0) *p-- = '-'; + p++; + return sdsnewlen(p,32-(p-buf)); +} + +sds sdscatrepr(sds s, char *p, size_t len) { + s = sdscatlen(s,"\"",1); + while(len--) { + switch(*p) { + case '\\': + case '"': + s = sdscatprintf(s,"\\%c",*p); + break; + case '\n': s = sdscatlen(s,"\\n",1); break; + case '\r': s = sdscatlen(s,"\\r",1); break; + case '\t': s = sdscatlen(s,"\\t",1); break; + case '\a': s = sdscatlen(s,"\\a",1); break; + case '\b': s = sdscatlen(s,"\\b",1); break; + default: + if (isprint(*p)) + s = sdscatprintf(s,"%c",*p); + else + s = sdscatprintf(s,"\\x%02x",(unsigned char)*p); + break; + } + p++; + } + return sdscatlen(s,"\"",1); +} diff --git a/src/sds.h b/src/sds.h new file mode 100644 index 000000000..ef3a418f2 --- /dev/null +++ b/src/sds.h @@ -0,0 +1,74 @@ +/* SDSLib, A C dynamic strings library + * + * Copyright (c) 2006-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __SDS_H +#define __SDS_H + +#include + +typedef char *sds; + +struct sdshdr { + int len; + int free; + char buf[]; +}; + +sds sdsnewlen(const void *init, size_t initlen); +sds sdsnew(const char *init); +sds sdsempty(); +size_t sdslen(const sds s); +sds sdsdup(const sds s); +void sdsfree(sds s); +size_t sdsavail(sds s); +sds sdscatlen(sds s, void *t, size_t len); +sds sdscat(sds s, char *t); +sds sdscpylen(sds s, char *t, size_t len); +sds sdscpy(sds s, char *t); + +#ifdef __GNUC__ +sds sdscatprintf(sds s, const char *fmt, ...) 
+ __attribute__((format(printf, 2, 3))); +#else +sds sdscatprintf(sds s, const char *fmt, ...); +#endif + +sds sdstrim(sds s, const char *cset); +sds sdsrange(sds s, int start, int end); +void sdsupdatelen(sds s); +int sdscmp(sds s1, sds s2); +sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count); +void sdsfreesplitres(sds *tokens, int count); +void sdstolower(sds s); +void sdstoupper(sds s); +sds sdsfromlonglong(long long value); +sds sdscatrepr(sds s, char *p, size_t len); + +#endif diff --git a/src/sha1.c b/src/sha1.c new file mode 100644 index 000000000..2c50433e8 --- /dev/null +++ b/src/sha1.c @@ -0,0 +1,276 @@ + +/* from valgrind tests */ + +/* ================ sha1.c ================ */ +/* +SHA-1 in C +By Steve Reid +100% Public Domain + +Test Vectors (from FIPS PUB 180-1) +"abc" + A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D +"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 +A million repetitions of "a" + 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F +*/ + +/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ +/* #define SHA1HANDSOFF * Copies data before messing with it. 
*/ + +#define SHA1HANDSOFF + +#include +#include +#include /* for u_int*_t */ +#if defined(__sun) +#include "solarisfixes.h" +#endif +#include "sha1.h" + +#ifndef BYTE_ORDER +#if (BSD >= 199103) +# include +#else +#if defined(linux) || defined(__linux__) +# include +#else +#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */ +#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ +#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/ + +#if defined(vax) || defined(ns32000) || defined(sun386) || defined(__i386__) || \ + defined(MIPSEL) || defined(_MIPSEL) || defined(BIT_ZERO_ON_RIGHT) || \ + defined(__alpha__) || defined(__alpha) +#define BYTE_ORDER LITTLE_ENDIAN +#endif + +#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \ + defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \ + defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\ + defined(apollo) || defined(__convex__) || defined(_CRAY) || \ + defined(__hppa) || defined(__hp9000) || \ + defined(__hp9000s300) || defined(__hp9000s700) || \ + defined (BIT_ZERO_ON_LEFT) || defined(m68k) || defined(__sparc) +#define BYTE_ORDER BIG_ENDIAN +#endif +#endif /* linux */ +#endif /* BSD */ +#endif /* BYTE_ORDER */ + +#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER) +#if (__BYTE_ORDER == __LITTLE_ENDIAN) +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN +#endif +#endif + +#if !defined(BYTE_ORDER) || \ + (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \ + BYTE_ORDER != PDP_ENDIAN) + /* you must determine what the correct bit order is for + * your compiler - the next line is an intentional error + * which will force your compiles to bomb until you fix + * the above macros. + */ +#error "Undefined or invalid BYTE_ORDER" +#endif + +#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) + +/* blk0() and blk() perform the initial expand. 
*/ +/* I got the idea of expanding during the round function from SSLeay */ +#if BYTE_ORDER == LITTLE_ENDIAN +#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ + |(rol(block->l[i],8)&0x00FF00FF)) +#elif BYTE_ORDER == BIG_ENDIAN +#define blk0(i) block->l[i] +#else +#error "Endianness not defined!" +#endif +#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ + ^block->l[(i+2)&15]^block->l[i&15],1)) + +/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ +#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); +#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); +#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); +#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); + + +/* Hash a single 512-bit block. This is the core of the algorithm. */ + +void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]) +{ +u_int32_t a, b, c, d, e; +typedef union { + unsigned char c[64]; + u_int32_t l[16]; +} CHAR64LONG16; +#ifdef SHA1HANDSOFF +CHAR64LONG16 block[1]; /* use array to appear as a pointer */ + memcpy(block, buffer, 64); +#else + /* The following had better never be used because it causes the + * pointer-to-const buffer to be cast into a pointer to non-const. + * And the result is written through. I threw a "const" in, hoping + * this will cause a diagnostic. + */ +CHAR64LONG16* block = (const CHAR64LONG16*)buffer; +#endif + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + /* 4 rounds of 20 operations each. Loop unrolled. 
*/ + R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); + R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); + R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); + R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); + R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); + R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); + R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); + R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); + R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); + R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); + R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); + R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); + R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); + R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); + R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); + R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); + R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); + R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); + R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); + R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + /* Wipe variables */ + a = b = c = d = e = 0; +#ifdef SHA1HANDSOFF + memset(block, '\0', sizeof(block)); +#endif +} + + +/* SHA1Init - Initialize new context */ + +void SHA1Init(SHA1_CTX* context) +{ + /* SHA1 initialization constants */ + context->state[0] = 0x67452301; + context->state[1] = 0xEFCDAB89; + context->state[2] = 0x98BADCFE; + context->state[3] = 0x10325476; 
+ context->state[4] = 0xC3D2E1F0; + context->count[0] = context->count[1] = 0; +} + + +/* Run your data through this. */ + +void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len) +{ +u_int32_t i; +u_int32_t j; + + j = context->count[0]; + if ((context->count[0] += len << 3) < j) + context->count[1]++; + context->count[1] += (len>>29); + j = (j >> 3) & 63; + if ((j + len) > 63) { + memcpy(&context->buffer[j], data, (i = 64-j)); + SHA1Transform(context->state, context->buffer); + for ( ; i + 63 < len; i += 64) { + SHA1Transform(context->state, &data[i]); + } + j = 0; + } + else i = 0; + memcpy(&context->buffer[j], &data[i], len - i); +} + + +/* Add padding and return the message digest. */ + +void SHA1Final(unsigned char digest[20], SHA1_CTX* context) +{ +unsigned i; +unsigned char finalcount[8]; +unsigned char c; + +#if 0 /* untested "improvement" by DHR */ + /* Convert context->count to a sequence of bytes + * in finalcount. Second element first, but + * big-endian order within element. + * But we do it all backwards. + */ + unsigned char *fcp = &finalcount[8]; + + for (i = 0; i < 2; i++) + { + u_int32_t t = context->count[i]; + int j; + + for (j = 0; j < 4; t >>= 8, j++) + *--fcp = (unsigned char) t + } +#else + for (i = 0; i < 8; i++) { + finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] + >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ + } +#endif + c = 0200; + SHA1Update(context, &c, 1); + while ((context->count[0] & 504) != 448) { + c = 0000; + SHA1Update(context, &c, 1); + } + SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ + for (i = 0; i < 20; i++) { + digest[i] = (unsigned char) + ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); + } + /* Wipe variables */ + memset(context, '\0', sizeof(*context)); + memset(&finalcount, '\0', sizeof(finalcount)); +} +/* ================ end of sha1.c ================ */ + +#if 0 +#define BUFSIZE 4096 + +int +main(int argc, char **argv) +{ + SHA1_CTX ctx; + unsigned char hash[20], buf[BUFSIZE]; + int i; + + for(i=0;i +100% Public Domain +*/ + +typedef struct { + u_int32_t state[5]; + u_int32_t count[2]; + unsigned char buffer[64]; +} SHA1_CTX; + +void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]); +void SHA1Init(SHA1_CTX* context); +void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len); +void SHA1Final(unsigned char digest[20], SHA1_CTX* context); diff --git a/src/solarisfixes.h b/src/solarisfixes.h new file mode 100644 index 000000000..ce8e7b6fd --- /dev/null +++ b/src/solarisfixes.h @@ -0,0 +1,21 @@ +/* Solaris specific fixes */ + +#if defined(__GNUC__) +#undef isnan +#define isnan(x) \ + __extension__({ __typeof (x) __x_a = (x); \ + __builtin_expect(__x_a != __x_a, 0); }) + +#undef isfinite +#define isfinite(x) \ + __extension__ ({ __typeof (x) __x_f = (x); \ + __builtin_expect(!isnan(__x_f - __x_f), 1); }) + +#undef isinf +#define isinf(x) \ + __extension__ ({ __typeof (x) __x_i = (x); \ + __builtin_expect(!isnan(__x_i) && !isfinite(__x_i), 0); }) + +#define u_int uint +#define u_int32_t uint32_t +#endif /* __GNUC__ */ diff --git a/src/sort.c b/src/sort.c new file mode 100644 index 000000000..0bc86b474 --- /dev/null +++ b/src/sort.c @@ -0,0 +1,383 @@ +#include "redis.h" +#include "pqsort.h" /* Partial qsort for 
SORT+LIMIT */ + +/* Allocate a sort operation of the given 'type' that refers to 'pattern'. The pattern pointer is stored as is, not copied. */ +redisSortOperation *createSortOperation(int type, robj *pattern) { + redisSortOperation *so = zmalloc(sizeof(*so)); + so->type = type; + so->pattern = pattern; + return so; +} + +/* Return the value associated to the key with a name obtained + * substituting the first occurrence of '*' in 'pattern' with 'subst'. + * The returned object will always have its refcount increased by 1 + * when it is non-NULL. */ +robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) { + char *p, *f; + sds spat, ssub; + robj keyobj, fieldobj, *o; + int prefixlen, sublen, postfixlen, fieldlen; + /* Exploit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */ + struct { + int len; + int free; + char buf[REDIS_SORTKEY_MAX+1]; + } keyname, fieldname; + + /* If the pattern is "#" return the substitution object itself in order + * to implement the "SORT ... GET #" feature. */ + spat = pattern->ptr; + if (spat[0] == '#' && spat[1] == '\0') { + incrRefCount(subst); + return subst; + } + + /* The substitution object may be specially encoded. If so we create + * a decoded object on the fly. Otherwise getDecodedObject will just + * increment the ref count, that we'll decrement later. */ + subst = getDecodedObject(subst); + + ssub = subst->ptr; + if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) { + /* The reference taken by getDecodedObject above must be dropped on this exit path too, as on every other one. */ + decrRefCount(subst); + return NULL; + } + p = strchr(spat,'*'); + if (!p) { + decrRefCount(subst); + return NULL; + } + + /* Find out if we're dealing with a hash dereference.
*/ + if ((f = strstr(p+1, "->")) != NULL) { + fieldlen = sdslen(spat)-(f-spat); + /* this also copies \0 character */ + memcpy(fieldname.buf,f+2,fieldlen-1); + fieldname.len = fieldlen-2; + } else { + fieldlen = 0; + } + + prefixlen = p-spat; + sublen = sdslen(ssub); + postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen; + memcpy(keyname.buf,spat,prefixlen); + memcpy(keyname.buf+prefixlen,ssub,sublen); + memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen); + keyname.buf[prefixlen+sublen+postfixlen] = '\0'; + keyname.len = prefixlen+sublen+postfixlen; + decrRefCount(subst); + + /* Lookup substituted key */ + initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(struct sdshdr))); + o = lookupKeyRead(db,&keyobj); + if (o == NULL) return NULL; + + if (fieldlen > 0) { + if (o->type != REDIS_HASH || fieldname.len < 1) return NULL; + + /* Retrieve value from hash by the field name. This operation + * already increases the refcount of the returned object. */ + initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(struct sdshdr))); + o = hashTypeGet(o, &fieldobj); + } else { + if (o->type != REDIS_STRING) return NULL; + + /* Every object that this function returns needs to have its refcount + * increased. sortCommand decreases it again. */ + incrRefCount(o); + } + + return o; +} + +/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with + * the additional parameter is not standard but a BSD-specific we have to + * pass sorting parameters via the global 'server' structure */ +int sortCompare(const void *s1, const void *s2) { + const redisSortObject *so1 = s1, *so2 = s2; + int cmp; + + if (!server.sort_alpha) { + /* Numeric sorting. 
Here it's trivial as we precomputed scores */ + if (so1->u.score > so2->u.score) { + cmp = 1; + } else if (so1->u.score < so2->u.score) { + cmp = -1; + } else { + cmp = 0; + } + } else { + /* Alphanumeric sorting */ + if (server.sort_bypattern) { + if (!so1->u.cmpobj || !so2->u.cmpobj) { + /* At least one compare object is NULL */ + if (so1->u.cmpobj == so2->u.cmpobj) + cmp = 0; + else if (so1->u.cmpobj == NULL) + cmp = -1; + else + cmp = 1; + } else { + /* We have both the objects, use strcoll */ + cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr); + } + } else { + /* Compare elements directly. */ + cmp = compareStringObjects(so1->obj,so2->obj); + } + } + return server.sort_desc ? -cmp : cmp; +} + +/* The SORT command is the most complex command in Redis. Warning: this code + * is optimized for speed and a bit less for readability */ +void sortCommand(redisClient *c) { + list *operations; + unsigned int outputlen = 0; + int desc = 0, alpha = 0; + int limit_start = 0, limit_count = -1, start, end; + int j, dontsort = 0, vectorlen; + int getop = 0; /* GET operation counter */ + robj *sortval, *sortby = NULL, *storekey = NULL; + redisSortObject *vector; /* Resulting vector to sort */ + + /* Lookup the key to sort. It must be of the right types */ + sortval = lookupKeyRead(c->db,c->argv[1]); + if (sortval == NULL) { + addReply(c,shared.emptymultibulk); + return; + } + if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST && + sortval->type != REDIS_ZSET) + { + addReply(c,shared.wrongtypeerr); + return; + } + + /* Create a list of operations to perform for every sorted element. 
+ * Operations can be GET/DEL/INCR/DECR */ + operations = listCreate(); + listSetFreeMethod(operations,zfree); + j = 2; + + /* Now we need to protect sortval incrementing its count, in the future + * SORT may have options able to overwrite/delete keys during the sorting + * and the sorted key itself may get destroied */ + incrRefCount(sortval); + + /* The SORT command has an SQL-alike syntax, parse it */ + while(j < c->argc) { + int leftargs = c->argc-j-1; + if (!strcasecmp(c->argv[j]->ptr,"asc")) { + desc = 0; + } else if (!strcasecmp(c->argv[j]->ptr,"desc")) { + desc = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) { + alpha = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) { + limit_start = atoi(c->argv[j+1]->ptr); + limit_count = atoi(c->argv[j+2]->ptr); + j+=2; + } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) { + storekey = c->argv[j+1]; + j++; + } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) { + sortby = c->argv[j+1]; + /* If the BY pattern does not contain '*', i.e. it is constant, + * we don't need to sort nor to lookup the weight keys. 
*/ + if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1; + j++; + } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) { + listAddNodeTail(operations,createSortOperation( + REDIS_SORT_GET,c->argv[j+1])); + getop++; + j++; + } else { + decrRefCount(sortval); + listRelease(operations); + addReply(c,shared.syntaxerr); + return; + } + j++; + } + + /* Load the sorting vector with all the objects to sort */ + switch(sortval->type) { + case REDIS_LIST: vectorlen = listTypeLength(sortval); break; + case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break; + case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break; + default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */ + } + vector = zmalloc(sizeof(redisSortObject)*vectorlen); + j = 0; + + if (sortval->type == REDIS_LIST) { + listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL); + listTypeEntry entry; + while(listTypeNext(li,&entry)) { + vector[j].obj = listTypeGet(&entry); + vector[j].u.score = 0; + vector[j].u.cmpobj = NULL; + j++; + } + listTypeReleaseIterator(li); + } else { + dict *set; + dictIterator *di; + dictEntry *setele; + + if (sortval->type == REDIS_SET) { + set = sortval->ptr; + } else { + zset *zs = sortval->ptr; + set = zs->dict; + } + + di = dictGetIterator(set); + while((setele = dictNext(di)) != NULL) { + vector[j].obj = dictGetEntryKey(setele); + vector[j].u.score = 0; + vector[j].u.cmpobj = NULL; + j++; + } + dictReleaseIterator(di); + } + redisAssert(j == vectorlen); + + /* Now it's time to load the right scores in the sorting vector */ + if (dontsort == 0) { + for (j = 0; j < vectorlen; j++) { + robj *byval; + if (sortby) { + /* lookup value to sort by */ + byval = lookupKeyByPattern(c->db,sortby,vector[j].obj); + if (!byval) continue; + } else { + /* use object itself to sort by */ + byval = vector[j].obj; + } + + if (alpha) { + if (sortby) vector[j].u.cmpobj = getDecodedObject(byval); + } else { + if (byval->encoding == 
REDIS_ENCODING_RAW) { + vector[j].u.score = strtod(byval->ptr,NULL); + } else if (byval->encoding == REDIS_ENCODING_INT) { + /* Don't need to decode the object if it's + * integer-encoded (the only encoding supported) so + * far. We can just cast it */ + vector[j].u.score = (long)byval->ptr; + } else { + redisAssert(1 != 1); + } + } + + /* when the object was retrieved using lookupKeyByPattern, + * its refcount needs to be decreased. */ + if (sortby) { + decrRefCount(byval); + } + } + } + + /* We are ready to sort the vector... perform a bit of sanity check + * on the LIMIT option too. We'll use a partial version of quicksort. */ + start = (limit_start < 0) ? 0 : limit_start; + end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1; + if (start >= vectorlen) { + start = vectorlen-1; + end = vectorlen-2; + } + if (end >= vectorlen) end = vectorlen-1; + + if (dontsort == 0) { + server.sort_desc = desc; + server.sort_alpha = alpha; + server.sort_bypattern = sortby ? 1 : 0; + if (sortby && (start != 0 || end != vectorlen-1)) + pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end); + else + qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare); + } + + /* Send command output to the output buffer, performing the specified + * GET/DEL/INCR/DECR operations if any. */ + outputlen = getop ? 
getop*(end-start+1) : end-start+1; + if (storekey == NULL) { + /* STORE option not specified, sent the sorting result to client */ + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen)); + for (j = start; j <= end; j++) { + listNode *ln; + listIter li; + + if (!getop) addReplyBulk(c,vector[j].obj); + listRewind(operations,&li); + while((ln = listNext(&li))) { + redisSortOperation *sop = ln->value; + robj *val = lookupKeyByPattern(c->db,sop->pattern, + vector[j].obj); + + if (sop->type == REDIS_SORT_GET) { + if (!val) { + addReply(c,shared.nullbulk); + } else { + addReplyBulk(c,val); + decrRefCount(val); + } + } else { + redisAssert(sop->type == REDIS_SORT_GET); /* always fails */ + } + } + } + } else { + robj *sobj = createZiplistObject(); + + /* STORE option specified, set the sorting result as a List object */ + for (j = start; j <= end; j++) { + listNode *ln; + listIter li; + + if (!getop) { + listTypePush(sobj,vector[j].obj,REDIS_TAIL); + } else { + listRewind(operations,&li); + while((ln = listNext(&li))) { + redisSortOperation *sop = ln->value; + robj *val = lookupKeyByPattern(c->db,sop->pattern, + vector[j].obj); + + if (sop->type == REDIS_SORT_GET) { + if (!val) val = createStringObject("",0); + + /* listTypePush does an incrRefCount, so we should take care + * care of the incremented refcount caused by either + * lookupKeyByPattern or createStringObject("",0) */ + listTypePush(sobj,val,REDIS_TAIL); + decrRefCount(val); + } else { + /* always fails */ + redisAssert(sop->type == REDIS_SORT_GET); + } + } + } + } + dbReplace(c->db,storekey,sobj); + /* Note: we add 1 because the DB is dirty anyway since even if the + * SORT result is empty a new key is set and maybe the old content + * replaced. 
*/ + server.dirty += 1+outputlen; + addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen)); + } + + /* Cleanup */ + if (sortval->type == REDIS_LIST) + for (j = 0; j < vectorlen; j++) + decrRefCount(vector[j].obj); + decrRefCount(sortval); + listRelease(operations); + for (j = 0; j < vectorlen; j++) { + if (alpha && vector[j].u.cmpobj) + decrRefCount(vector[j].u.cmpobj); + } + zfree(vector); +} + + diff --git a/src/t_hash.c b/src/t_hash.c new file mode 100644 index 000000000..3f5fd6e16 --- /dev/null +++ b/src/t_hash.c @@ -0,0 +1,397 @@ +#include "redis.h" + +#include + +/*----------------------------------------------------------------------------- + * Hash type API + *----------------------------------------------------------------------------*/ + +/* Check the length of a number of objects to see if we need to convert a + * zipmap to a real hash. Note that we only check string encoded objects + * as their string length can be queried in constant time. */ +void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) { + int i; + if (subject->encoding != REDIS_ENCODING_ZIPMAP) return; + + for (i = start; i <= end; i++) { + if (argv[i]->encoding == REDIS_ENCODING_RAW && + sdslen(argv[i]->ptr) > server.hash_max_zipmap_value) + { + convertToRealHash(subject); + return; + } + } +} + +/* Encode given objects in-place when the hash uses a dict. */ +void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) { + if (subject->encoding == REDIS_ENCODING_HT) { + if (o1) *o1 = tryObjectEncoding(*o1); + if (o2) *o2 = tryObjectEncoding(*o2); + } +} + +/* Get the value from a hash identified by key. Returns either a string + * object or NULL if the value cannot be found. The refcount of the object + * is always increased by 1 when the value was found. 
*/
+robj *hashTypeGet(robj *o, robj *key) {
+    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        robj *found = NULL;
+        unsigned char *vstr;
+        unsigned int vstrlen;
+        /* The zipmap is queried by raw bytes, so work on a decoded key and
+         * drop the extra reference once the lookup is done. */
+        robj *deckey = getDecodedObject(key);
+
+        if (zipmapGet(o->ptr,deckey->ptr,sdslen(deckey->ptr),&vstr,&vstrlen))
+            found = createStringObject((char*)vstr,vstrlen);
+        decrRefCount(deckey);
+        return found;
+    } else {
+        dictEntry *entry = dictFind(o->ptr,key);
+        robj *found;
+
+        if (entry == NULL) return NULL;
+        /* The dict keeps owning the value: hand out an extra reference. */
+        found = dictGetEntryVal(entry);
+        incrRefCount(found);
+        return found;
+    }
+}
+
+/* Tell whether 'key' is a field of the hash 'o'.
+ * The result is 1 when the field is present, 0 otherwise. */
+int hashTypeExists(robj *o, robj *key) {
+    int exists;
+
+    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        robj *deckey = getDecodedObject(key);
+
+        exists = zipmapExists(o->ptr,deckey->ptr,sdslen(deckey->ptr)) ? 1 : 0;
+        decrRefCount(deckey);
+    } else {
+        exists = (dictFind(o->ptr,key) != NULL) ? 1 : 0;
+    }
+    return exists;
+}
+
+/* Store 'value' under 'key' in the hash 'o', replacing any previous value.
+ * The result is 0 when a new field was inserted and 1 when an existing
+ * field was updated. */
+int hashTypeSet(robj *o, robj *key, robj *value) {
+    int update = 0;
+    robj *deckey, *decval;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        /* Dict encoding: the table stores the objects themselves, so it
+         * needs its own references. The key is referenced only when a new
+         * entry was created, the value in both cases. */
+        if (dictReplace(o->ptr,key,value))
+            incrRefCount(key);
+        else
+            update = 1;
+        incrRefCount(value);
+        return update;
+    }
+
+    /* Zipmap encoding: the bytes of decoded key and value are handed to
+     * zipmapSet, then the temporary references are released. */
+    deckey = getDecodedObject(key);
+    decval = getDecodedObject(value);
+    o->ptr = zipmapSet(o->ptr,
+        deckey->ptr,sdslen(deckey->ptr),
+        decval->ptr,sdslen(decval->ptr), &update);
+    decrRefCount(deckey);
+    decrRefCount(decval);
+
+    /* Too many entries for a zipmap? Upgrade to a real hash table. */
+    if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
+        convertToRealHash(o);
+    return update;
+}
+
+/* Delete an element from a hash.
+ * Return 1 on deleted and 0 on not found.
*/
+int hashTypeDelete(robj *o, robj *key) {
+    int deleted = 0;
+
+    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        robj *deckey = getDecodedObject(key);
+
+        o->ptr = zipmapDel(o->ptr,deckey->ptr,sdslen(deckey->ptr), &deleted);
+        decrRefCount(deckey);
+        return deleted;
+    }
+
+    deleted = (dictDelete((dict*)o->ptr,key) == DICT_OK);
+    /* After every successful delete, see if the table can be shrunk. */
+    if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
+    return deleted;
+}
+
+/* Number of fields currently stored in the hash 'o'. */
+unsigned long hashTypeLength(robj *o) {
+    if (o->encoding == REDIS_ENCODING_ZIPMAP)
+        return zipmapLen((unsigned char*)o->ptr);
+    else
+        return dictSize((dict*)o->ptr);
+}
+
+/* Allocate an iterator over the hash 'subject'. The iterator remembers the
+ * encoding seen at creation time and must be released with
+ * hashTypeReleaseIterator(). */
+hashTypeIterator *hashTypeInitIterator(robj *subject) {
+    hashTypeIterator *iter = zmalloc(sizeof(*iter));
+
+    iter->encoding = subject->encoding;
+    switch (iter->encoding) {
+    case REDIS_ENCODING_ZIPMAP:
+        iter->zi = zipmapRewind(subject->ptr);
+        break;
+    case REDIS_ENCODING_HT:
+        iter->di = dictGetIterator(subject->ptr);
+        break;
+    default:
+        /* Neither of the two encodings handled here. */
+        redisAssert(NULL);
+        break;
+    }
+    return iter;
+}
+
+/* Free an iterator created by hashTypeInitIterator(). Only the dict
+ * encoding owns a nested iterator that has to be released as well. */
+void hashTypeReleaseIterator(hashTypeIterator *hi) {
+    if (hi->encoding == REDIS_ENCODING_HT)
+        dictReleaseIterator(hi->di);
+    zfree(hi);
+}
+
+/* Advance the iterator to the next entry of the hash. The result is
+ * REDIS_OK when an entry is available and REDIS_ERR once the end has been
+ * reached. */
+int hashTypeNext(hashTypeIterator *hi) {
+    if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
+        hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
+            &hi->zv, &hi->zvlen);
+        return (hi->zi == NULL) ? REDIS_ERR : REDIS_OK;
+    }
+
+    hi->de = dictNext(hi->di);
+    return (hi->de == NULL) ? REDIS_ERR : REDIS_OK;
+}
+
+/* Get key or value object at current iteration position.
+ * This increases the refcount of the field object by 1.
*/
+robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
+    robj *o;
+    if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
+        /* Zipmap entries are raw bytes: build a new string object. */
+        if (what & REDIS_HASH_KEY) {
+            o = createStringObject((char*)hi->zk,hi->zklen);
+        } else {
+            o = createStringObject((char*)hi->zv,hi->zvlen);
+        }
+    } else {
+        /* Dict entries are already objects: return them with an extra
+         * reference for the caller. */
+        if (what & REDIS_HASH_KEY) {
+            o = dictGetEntryKey(hi->de);
+        } else {
+            o = dictGetEntryVal(hi->de);
+        }
+        incrRefCount(o);
+    }
+    return o;
+}
+
+/* Look up 'key' for writing. When it does not exist a new hash object is
+ * created and added to the DB. When it exists but is not a hash, a wrong
+ * type error is sent to the client and NULL is returned. */
+robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
+    robj *o = lookupKeyWrite(c->db,key);
+    if (o == NULL) {
+        o = createHashObject();
+        dbAdd(c->db,key,o);
+    } else {
+        if (o->type != REDIS_HASH) {
+            addReply(c,shared.wrongtypeerr);
+            return NULL;
+        }
+    }
+    return o;
+}
+
+/* Convert the hash 'o' to the REDIS_ENCODING_HT encoding: every zipmap
+ * entry is copied into a newly created dict as a pair of string objects,
+ * then the old zipmap is freed. */
+void convertToRealHash(robj *o) {
+    unsigned char *key, *val, *p, *zm = o->ptr;
+    unsigned int klen, vlen;
+    dict *dict = dictCreate(&hashDictType,NULL);
+
+    redisAssert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
+    p = zipmapRewind(zm);
+    while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
+        robj *keyobj, *valobj;
+
+        keyobj = createStringObject((char*)key,klen);
+        valobj = createStringObject((char*)val,vlen);
+        keyobj = tryObjectEncoding(keyobj);
+        valobj = tryObjectEncoding(valobj);
+        dictAdd(dict,keyobj,valobj);
+    }
+    o->encoding = REDIS_ENCODING_HT;
+    o->ptr = dict;
+    zfree(zm);
+}
+
+/*-----------------------------------------------------------------------------
+ * Hash type commands
+ *----------------------------------------------------------------------------*/
+
+/* HSET: argv[1] is the key, argv[2] the field, argv[3] the value. Replies
+ * with shared.czero when an existing field was updated and shared.cone
+ * when a new field was created. */
+void hsetCommand(redisClient *c) {
+    int update;
+    robj *o;
+
+    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,3);
+    hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
+    update = hashTypeSet(o,c->argv[2],c->argv[3]);
+    addReply(c, update ? shared.czero : shared.cone);
+    server.dirty++;
+}
+
+/* HSETNX: like HSET but the field is only set when it does not exist yet.
+ * Replies with shared.cone when the field was set, shared.czero otherwise. */
+void hsetnxCommand(redisClient *c) {
+    robj *o;
+    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,3);
+
+    if (hashTypeExists(o, c->argv[2])) {
+        addReply(c, shared.czero);
+    } else {
+        hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
+        hashTypeSet(o,c->argv[2],c->argv[3]);
+        addReply(c, shared.cone);
+        server.dirty++;
+    }
+}
+
+/* HMSET: set every field/value pair found in argv[2..argc-1]. */
+void hmsetCommand(redisClient *c) {
+    int i;
+    robj *o;
+
+    /* Command name + key + pairs must give an even argument count. */
+    if ((c->argc % 2) == 1) {
+        addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
+        return;
+    }
+
+    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,c->argc-1);
+    for (i = 2; i < c->argc; i += 2) {
+        hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
+        hashTypeSet(o,c->argv[i],c->argv[i+1]);
+    }
+    addReply(c, shared.ok);
+    server.dirty++;
+}
+
+/* HINCRBY: add the integer in argv[3] to the field argv[2], treating a
+ * missing field as 0, and reply with the new value. */
+void hincrbyCommand(redisClient *c) {
+    long long value, incr;
+    robj *o, *current, *new;
+
+    if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
+    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
+    if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
+        if (getLongLongFromObjectOrReply(c,current,&value,
+            "hash value is not an integer") != REDIS_OK) {
+            decrRefCount(current);
+            return;
+        }
+        decrRefCount(current);
+    } else {
+        value = 0;
+    }
+
+    value += incr;
+    new = createStringObjectFromLongLong(value);
+    hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
+    hashTypeSet(o,c->argv[2],new);
+    decrRefCount(new);
+    addReplyLongLong(c,value);
+    server.dirty++;
+}
+
+/* HGET: reply with the value of field argv[2], or a null bulk when the
+ * key or the field does not exist. */
+void hgetCommand(redisClient *c) {
+    robj *o, *value;
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
+        addReplyBulk(c,value);
+        decrRefCount(value);
+    } else {
+        addReply(c,shared.nullbulk);
+    }
+}
+
+/* HMGET: reply with one bulk per requested field (argv[2..argc-1]), using a
+ * null bulk for every field that cannot be found. */
+void hmgetCommand(redisClient *c) {
+    int i;
+    robj *o, *value;
+    o = lookupKeyRead(c->db,c->argv[1]);
+    if (o != NULL && o->type != REDIS_HASH) {
+        addReply(c,shared.wrongtypeerr);
+        /* Stop here: the error is the whole reply, and the object must not
+         * be handed to hashTypeGet() since it is not a hash. */
+        return;
+    }
+
+    /* Note the check for o != NULL happens inside the loop. This is
+     * done because objects that cannot be found are considered to be
+     * an empty hash. The reply should then be a series of NULLs. */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
+    for (i = 2; i < c->argc; i++) {
+        if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
+            addReplyBulk(c,value);
+            decrRefCount(value);
+        } else {
+            addReply(c,shared.nullbulk);
+        }
+    }
+}
+
+/* HDEL: remove field argv[2]. The key itself is deleted from the DB when
+ * the hash becomes empty. */
+void hdelCommand(redisClient *c) {
+    robj *o;
+    if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    if (hashTypeDelete(o,c->argv[2])) {
+        if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
+        addReply(c,shared.cone);
+        server.dirty++;
+    } else {
+        addReply(c,shared.czero);
+    }
+}
+
+/* HLEN: reply with the number of fields in the hash. */
+void hlenCommand(redisClient *c) {
+    robj *o;
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    addReplyUlong(c,hashTypeLength(o));
+}
+
+/* Shared implementation of HKEYS, HVALS and HGETALL. 'flags' is a mask of
+ * REDIS_HASH_KEY and REDIS_HASH_VALUE selecting what is emitted for every
+ * entry. */
+void genericHgetallCommand(redisClient *c, int flags) {
+    robj *o, *lenobj, *obj;
+    unsigned long count = 0;
+    hashTypeIterator *hi;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+        || checkType(c,o,REDIS_HASH)) return;
+
+    /* The number of elements is not known yet: queue a placeholder object
+     * for the multi bulk header now and fill in its content after the
+     * iteration. */
+    lenobj = createObject(REDIS_STRING,NULL);
+    addReply(c,lenobj);
+    decrRefCount(lenobj);
+
+    hi = hashTypeInitIterator(o);
+    while (hashTypeNext(hi) != REDIS_ERR) {
+        if (flags & REDIS_HASH_KEY) {
+            obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
+            addReplyBulk(c,obj);
+            decrRefCount(obj);
+            count++;
+        }
+        if (flags & REDIS_HASH_VALUE) {
+            obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
+            addReplyBulk(c,obj);
+            decrRefCount(obj);
+            count++;
+        }
+    }
+    hashTypeReleaseIterator(hi);
+
+    lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
+}
+
+/* HKEYS: reply with the field names only. */
+void hkeysCommand(redisClient *c) {
+    genericHgetallCommand(c,REDIS_HASH_KEY);
+}
+
+/* HVALS: reply with the values only. */
+void hvalsCommand(redisClient *c) {
+    genericHgetallCommand(c,REDIS_HASH_VALUE);
+}
+
+/* HGETALL: reply with field names and values. */
+void hgetallCommand(redisClient *c) {
+    genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
+}
+
+/* HEXISTS: reply with shared.cone when field argv[2] exists, shared.czero
+ * otherwise. */
+void hexistsCommand(redisClient *c) {
+    robj *o;
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,o,REDIS_HASH)) return;
+
+    addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
+}
diff --git a/src/t_list.c b/src/t_list.c
new file mode 100644
index 000000000..ec8b30c3f
--- /dev/null
+++ b/src/t_list.c
@@ -0,0 +1,829 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * List API
+ *----------------------------------------------------------------------------*/
+
+/* Check the argument length to see if it requires us to convert the ziplist
+ * to a real list. Only check raw-encoded objects because integer encoded
+ * objects are never too long. */
+void listTypeTryConversion(robj *subject, robj *value) {
+    if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
+    if (value->encoding == REDIS_ENCODING_RAW &&
+        sdslen(value->ptr) > server.list_max_ziplist_value)
+            listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+}
+
+/* Push 'value' on the head or tail of the list 'subject' according to
+ * 'where' (REDIS_HEAD or REDIS_TAIL), converting a ziplist to a linked list
+ * first when the value or the number of entries requires it. */
+void listTypePush(robj *subject, robj *value, int where) {
+    /* Check if we need to convert the ziplist */
+    listTypeTryConversion(subject,value);
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
+        ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
+            listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
+        value = getDecodedObject(value);
+        subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
+        decrRefCount(value);
+    } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        if (where == REDIS_HEAD) {
+            listAddNodeHead(subject->ptr,value);
+        } else {
+            listAddNodeTail(subject->ptr,value);
+        }
+        /* The list node now references the object. */
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Remove and return the element at the head or tail of 'subject', or NULL
+ * when there is no such element. The returned object is either newly
+ * created (ziplist) or has its refcount incremented before the node is
+ * deleted (linked list). */
+robj *listTypePop(robj *subject, int where) {
+    robj *value = NULL;
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *p;
+        unsigned char *vstr;
+        unsigned int vlen;
+        long long vlong;
+        int pos = (where == REDIS_HEAD) ? 0 : -1;
+        p = ziplistIndex(subject->ptr,pos);
+        if (ziplistGet(p,&vstr,&vlen,&vlong)) {
+            if (vstr) {
+                value = createStringObject((char*)vstr,vlen);
+            } else {
+                value = createStringObjectFromLongLong(vlong);
+            }
+            /* We only need to delete an element when it exists */
+            subject->ptr = ziplistDelete(subject->ptr,&p);
+        }
+    } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        list *list = subject->ptr;
+        listNode *ln;
+        if (where == REDIS_HEAD) {
+            ln = listFirst(list);
+        } else {
+            ln = listLast(list);
+        }
+        if (ln != NULL) {
+            value = listNodeValue(ln);
+            incrRefCount(value);
+            listDelNode(list,ln);
+        }
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return value;
+}
+
+/* Number of elements in the list 'subject'. */
+unsigned long listTypeLength(robj *subject) {
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        return ziplistLen(subject->ptr);
+    } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        return listLength((list*)subject->ptr);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Initialize an iterator at the specified index.
*/
+listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
+    listTypeIterator *iter = zmalloc(sizeof(*iter));
+
+    iter->subject = subject;
+    iter->encoding = subject->encoding;
+    iter->direction = direction;
+
+    /* Position the encoding specific cursor on the requested element. */
+    switch (iter->encoding) {
+    case REDIS_ENCODING_ZIPLIST:
+        iter->zi = ziplistIndex(subject->ptr,index);
+        break;
+    case REDIS_ENCODING_LINKEDLIST:
+        iter->ln = listIndex(subject->ptr,index);
+        break;
+    default:
+        redisPanic("Unknown list encoding");
+    }
+    return iter;
+}
+
+/* Release an iterator obtained from listTypeInitIterator(). */
+void listTypeReleaseIterator(listTypeIterator *li) {
+    zfree(li);
+}
+
+/* Store the current position of the iterator in 'entry', then move the
+ * iterator one step in its direction. The result is 1 when 'entry' refers
+ * to an actual element and 0 when the iteration is over. */
+int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
+    /* The list must not have been converted since the iterator was made. */
+    redisAssert(li->subject->encoding == li->encoding);
+
+    entry->li = li;
+
+    if (li->encoding == REDIS_ENCODING_ZIPLIST) {
+        entry->zi = li->zi;
+        if (entry->zi == NULL) return 0;
+        li->zi = (li->direction == REDIS_TAIL) ?
+            ziplistNext(li->subject->ptr,li->zi) :
+            ziplistPrev(li->subject->ptr,li->zi);
+        return 1;
+    }
+
+    if (li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        entry->ln = li->ln;
+        if (entry->ln == NULL) return 0;
+        li->ln = (li->direction == REDIS_TAIL) ? li->ln->next : li->ln->prev;
+        return 1;
+    }
+
+    redisPanic("Unknown list encoding");
+    return 0;
+}
+
+/* Return entry or NULL at the current position of the iterator.
*/
+robj *listTypeGet(listTypeEntry *entry) {
+    listTypeIterator *li = entry->li;
+    robj *value = NULL;
+    if (li->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *vstr;
+        unsigned int vlen;
+        long long vlong;
+        redisAssert(entry->zi != NULL);
+        /* A ziplist entry is either a string or an integer: build a new
+         * object from whichever representation ziplistGet reports. */
+        if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
+            if (vstr) {
+                value = createStringObject((char*)vstr,vlen);
+            } else {
+                value = createStringObjectFromLongLong(vlong);
+            }
+        }
+    } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        redisAssert(entry->ln != NULL);
+        value = listNodeValue(entry->ln);
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return value;
+}
+
+/* Insert 'value' next to the element referenced by 'entry': after it when
+ * 'where' is REDIS_TAIL, before it otherwise. For a linked list the new
+ * node references 'value', so its refcount is incremented; for a ziplist
+ * only the bytes of the decoded value are handed to the ziplist. */
+void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
+    robj *subject = entry->li->subject;
+    if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
+        value = getDecodedObject(value);
+        if (where == REDIS_TAIL) {
+            unsigned char *next = ziplistNext(subject->ptr,entry->zi);
+
+            /* When we insert after the current element, but the current element
+             * is the tail of the list, we need to do a push. */
+            if (next == NULL) {
+                /* ziplistPush is given a ZIPLIST_* position, as done by
+                 * listTypePush, rather than the REDIS_* direction. */
+                subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),ZIPLIST_TAIL);
+            } else {
+                subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
+            }
+        } else {
+            subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
+        }
+        decrRefCount(value);
+    } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        if (where == REDIS_TAIL) {
+            listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
+        } else {
+            listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
+        }
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Compare the given object with the entry at the current position.
*/ +int listTypeEqual(listTypeEntry *entry, robj *o) { + listTypeIterator *li = entry->li; + if (li->encoding == REDIS_ENCODING_ZIPLIST) { + redisAssert(o->encoding == REDIS_ENCODING_RAW); + return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr)); + } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { + return equalStringObjects(o,listNodeValue(entry->ln)); + } else { + redisPanic("Unknown list encoding"); + } +} + +/* Delete the element pointed to. */ +void listTypeDelete(listTypeEntry *entry) { + listTypeIterator *li = entry->li; + if (li->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *p = entry->zi; + li->subject->ptr = ziplistDelete(li->subject->ptr,&p); + + /* Update position of the iterator depending on the direction */ + if (li->direction == REDIS_TAIL) + li->zi = p; + else + li->zi = ziplistPrev(li->subject->ptr,p); + } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) { + listNode *next; + if (li->direction == REDIS_TAIL) + next = entry->ln->next; + else + next = entry->ln->prev; + listDelNode(li->subject->ptr,entry->ln); + li->ln = next; + } else { + redisPanic("Unknown list encoding"); + } +} + +void listTypeConvert(robj *subject, int enc) { + listTypeIterator *li; + listTypeEntry entry; + redisAssert(subject->type == REDIS_LIST); + + if (enc == REDIS_ENCODING_LINKEDLIST) { + list *l = listCreate(); + listSetFreeMethod(l,decrRefCount); + + /* listTypeGet returns a robj with incremented refcount */ + li = listTypeInitIterator(subject,0,REDIS_TAIL); + while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry)); + listTypeReleaseIterator(li); + + subject->encoding = REDIS_ENCODING_LINKEDLIST; + zfree(subject->ptr); + subject->ptr = l; + } else { + redisPanic("Unsupported list conversion"); + } +} + +/*----------------------------------------------------------------------------- + * List Commands + *----------------------------------------------------------------------------*/ + +void pushGenericCommand(redisClient *c, int 
where) { + robj *lobj = lookupKeyWrite(c->db,c->argv[1]); + if (lobj == NULL) { + if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) { + addReply(c,shared.cone); + return; + } + lobj = createZiplistObject(); + dbAdd(c->db,c->argv[1],lobj); + } else { + if (lobj->type != REDIS_LIST) { + addReply(c,shared.wrongtypeerr); + return; + } + if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) { + addReply(c,shared.cone); + return; + } + } + listTypePush(lobj,c->argv[2],where); + addReplyLongLong(c,listTypeLength(lobj)); + server.dirty++; +} + +void lpushCommand(redisClient *c) { + pushGenericCommand(c,REDIS_HEAD); +} + +void rpushCommand(redisClient *c) { + pushGenericCommand(c,REDIS_TAIL); +} + +void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) { + robj *subject; + listTypeIterator *iter; + listTypeEntry entry; + int inserted = 0; + + if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,subject,REDIS_LIST)) return; + + if (refval != NULL) { + /* Note: we expect refval to be string-encoded because it is *not* the + * last argument of the multi-bulk LINSERT. */ + redisAssert(refval->encoding == REDIS_ENCODING_RAW); + + /* We're not sure if this value can be inserted yet, but we cannot + * convert the list inside the iterator. We don't want to loop over + * the list twice (once to see if the value can be inserted and once + * to do the actual insert), so we assume this value can be inserted + * and convert the ziplist to a regular list if necessary. */ + listTypeTryConversion(subject,val); + + /* Seek refval from head to tail */ + iter = listTypeInitIterator(subject,0,REDIS_TAIL); + while (listTypeNext(iter,&entry)) { + if (listTypeEqual(&entry,refval)) { + listTypeInsert(&entry,val,where); + inserted = 1; + break; + } + } + listTypeReleaseIterator(iter); + + if (inserted) { + /* Check if the length exceeds the ziplist length threshold. 
*/ + if (subject->encoding == REDIS_ENCODING_ZIPLIST && + ziplistLen(subject->ptr) > server.list_max_ziplist_entries) + listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); + server.dirty++; + } else { + /* Notify client of a failed insert */ + addReply(c,shared.cnegone); + return; + } + } else { + listTypePush(subject,val,where); + server.dirty++; + } + + addReplyUlong(c,listTypeLength(subject)); +} + +void lpushxCommand(redisClient *c) { + pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD); +} + +void rpushxCommand(redisClient *c) { + pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL); +} + +void linsertCommand(redisClient *c) { + if (strcasecmp(c->argv[2]->ptr,"after") == 0) { + pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_TAIL); + } else if (strcasecmp(c->argv[2]->ptr,"before") == 0) { + pushxGenericCommand(c,c->argv[3],c->argv[4],REDIS_HEAD); + } else { + addReply(c,shared.syntaxerr); + } +} + +void llenCommand(redisClient *c) { + robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero); + if (o == NULL || checkType(c,o,REDIS_LIST)) return; + addReplyUlong(c,listTypeLength(o)); +} + +void lindexCommand(redisClient *c) { + robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk); + if (o == NULL || checkType(c,o,REDIS_LIST)) return; + int index = atoi(c->argv[2]->ptr); + robj *value = NULL; + + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *p; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + p = ziplistIndex(o->ptr,index); + if (ziplistGet(p,&vstr,&vlen,&vlong)) { + if (vstr) { + value = createStringObject((char*)vstr,vlen); + } else { + value = createStringObjectFromLongLong(vlong); + } + addReplyBulk(c,value); + decrRefCount(value); + } else { + addReply(c,shared.nullbulk); + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + listNode *ln = listIndex(o->ptr,index); + if (ln != NULL) { + value = listNodeValue(ln); + addReplyBulk(c,value); + } else { + addReply(c,shared.nullbulk); + } + } else { + 
redisPanic("Unknown list encoding"); + } +} + +void lsetCommand(redisClient *c) { + robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr); + if (o == NULL || checkType(c,o,REDIS_LIST)) return; + int index = atoi(c->argv[2]->ptr); + robj *value = c->argv[3]; + + listTypeTryConversion(o,value); + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + unsigned char *p, *zl = o->ptr; + p = ziplistIndex(zl,index); + if (p == NULL) { + addReply(c,shared.outofrangeerr); + } else { + o->ptr = ziplistDelete(o->ptr,&p); + value = getDecodedObject(value); + o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr)); + decrRefCount(value); + addReply(c,shared.ok); + server.dirty++; + } + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + listNode *ln = listIndex(o->ptr,index); + if (ln == NULL) { + addReply(c,shared.outofrangeerr); + } else { + decrRefCount((robj*)listNodeValue(ln)); + listNodeValue(ln) = value; + incrRefCount(value); + addReply(c,shared.ok); + server.dirty++; + } + } else { + redisPanic("Unknown list encoding"); + } +} + +void popGenericCommand(redisClient *c, int where) { + robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk); + if (o == NULL || checkType(c,o,REDIS_LIST)) return; + + robj *value = listTypePop(o,where); + if (value == NULL) { + addReply(c,shared.nullbulk); + } else { + addReplyBulk(c,value); + decrRefCount(value); + if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]); + server.dirty++; + } +} + +void lpopCommand(redisClient *c) { + popGenericCommand(c,REDIS_HEAD); +} + +void rpopCommand(redisClient *c) { + popGenericCommand(c,REDIS_TAIL); +} + +void lrangeCommand(redisClient *c) { + robj *o, *value; + int start = atoi(c->argv[2]->ptr); + int end = atoi(c->argv[3]->ptr); + int llen; + int rangelen, j; + listTypeEntry entry; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL + || checkType(c,o,REDIS_LIST)) return; + llen = listTypeLength(o); + + /* convert negative indexes */ + if (start < 0) 
start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + + /* indexes sanity checks */ + if (start > end || start >= llen) { + /* Out of range start or start > end result in empty list */ + addReply(c,shared.emptymultibulk); + return; + } + if (end >= llen) end = llen-1; + rangelen = (end-start)+1; + + /* Return the result in form of a multi-bulk reply */ + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen)); + listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL); + for (j = 0; j < rangelen; j++) { + redisAssert(listTypeNext(li,&entry)); + value = listTypeGet(&entry); + addReplyBulk(c,value); + decrRefCount(value); + } + listTypeReleaseIterator(li); +} + +void ltrimCommand(redisClient *c) { + robj *o; + int start = atoi(c->argv[2]->ptr); + int end = atoi(c->argv[3]->ptr); + int llen; + int j, ltrim, rtrim; + list *list; + listNode *ln; + + if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL || + checkType(c,o,REDIS_LIST)) return; + llen = listTypeLength(o); + + /* convert negative indexes */ + if (start < 0) start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + + /* indexes sanity checks */ + if (start > end || start >= llen) { + /* Out of range start or start > end result in empty list */ + ltrim = llen; + rtrim = 0; + } else { + if (end >= llen) end = llen-1; + ltrim = start; + rtrim = llen-end-1; + } + + /* Remove list elements to perform the trim */ + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + o->ptr = ziplistDeleteRange(o->ptr,0,ltrim); + o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim); + } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { + list = o->ptr; + for (j = 0; j < ltrim; j++) { + ln = listFirst(list); + listDelNode(list,ln); + } + for (j = 0; j < rtrim; j++) { + ln = listLast(list); + listDelNode(list,ln); + } + } else { + redisPanic("Unknown list encoding"); + } + if (listTypeLength(o) == 0) 
dbDelete(c->db,c->argv[1]); + server.dirty++; + addReply(c,shared.ok); +} + +void lremCommand(redisClient *c) { + robj *subject, *obj = c->argv[3]; + int toremove = atoi(c->argv[2]->ptr); + int removed = 0; + listTypeEntry entry; + + subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero); + if (subject == NULL || checkType(c,subject,REDIS_LIST)) return; + + /* Make sure obj is raw when we're dealing with a ziplist */ + if (subject->encoding == REDIS_ENCODING_ZIPLIST) + obj = getDecodedObject(obj); + + listTypeIterator *li; + if (toremove < 0) { + toremove = -toremove; + li = listTypeInitIterator(subject,-1,REDIS_HEAD); + } else { + li = listTypeInitIterator(subject,0,REDIS_TAIL); + } + + while (listTypeNext(li,&entry)) { + if (listTypeEqual(&entry,obj)) { + listTypeDelete(&entry); + server.dirty++; + removed++; + if (toremove && removed == toremove) break; + } + } + listTypeReleaseIterator(li); + + /* Clean up raw encoded object */ + if (subject->encoding == REDIS_ENCODING_ZIPLIST) + decrRefCount(obj); + + if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]); + addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed)); +} + +/* This is the semantic of this command: + * RPOPLPUSH srclist dstlist: + * IF LLEN(srclist) > 0 + * element = RPOP srclist + * LPUSH dstlist element + * RETURN element + * ELSE + * RETURN nil + * END + * END + * + * The idea is to be able to get an element from a list in a reliable way + * since the element is not just returned but pushed against another list + * as well. This command was originally proposed by Ezra Zygmuntowicz. 
+ */
+void rpoplpushcommand(redisClient *c) {
+    robj *sobj, *value;
+    /* argv[1] is the source key. Return right away when the lookup yields
+     * NULL or checkType() flags a non-list value (both helpers are defined
+     * elsewhere and presumably send the reply themselves - confirm). */
+    if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,sobj,REDIS_LIST)) return;
+
+    if (listTypeLength(sobj) == 0) {
+        /* Nothing to pop: reply with a null bulk and modify nothing */
+        addReply(c,shared.nullbulk);
+    } else {
+        /* argv[2] is the destination key. Its type is checked *before* the
+         * pop, so a wrong-type destination leaves the source untouched. */
+        robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
+        if (dobj && checkType(c,dobj,REDIS_LIST)) return;
+        value = listTypePop(sobj,REDIS_TAIL);
+
+        /* Add the element to the target list (unless it's directly
+         * passed to some BLPOP-ing client) */
+        if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
+            /* Create the list if the key does not exist */
+            if (!dobj) {
+                dobj = createZiplistObject();
+                dbAdd(c->db,c->argv[2],dobj);
+            }
+            listTypePush(dobj,value,REDIS_HEAD);
+        }
+
+        /* Send the element to the client as reply as well */
+        addReplyBulk(c,value);
+
+        /* listTypePop returns an object with its refcount incremented */
+        decrRefCount(value);
+
+        /* Delete the source list when it is empty */
+        if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
+        server.dirty++;
+    }
+}
+
+/*-----------------------------------------------------------------------------
+ * Blocking POP operations
+ *----------------------------------------------------------------------------*/
+
+/* Currently Redis blocking operations support is limited to list POP ops,
+ * so the current implementation is not fully generic, but it is also not
+ * completely specific so it will not require a rewrite to support new
+ * kinds of blocking operations in the future.
+ *
+ * Still it's important to note that list blocking operations can be already
+ * used as a notification mechanism in order to implement other blocking
+ * operations at application level, so there must be very strong evidence
+ * of usefulness and generality before new blocking operations are implemented.
+ *
+ * This is how the current blocking POP works, using BLPOP as an example:
+ * - If the user calls BLPOP and the key exists and contains a non-empty list
+ *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
+ *   if there is no need to block.
+ * - If instead BLPOP is called and the key does not exist or the list is
+ *   empty we need to block. In order to do so we remove the notification for
+ *   new data to read in the client socket (so that we'll not serve new
+ *   requests if the blocking request is not served). Also we put the client
+ *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
+ *   blocked on each key.
+ * - If a PUSH operation against a key with blocked clients waiting is
+ *   performed, we serve the first in the list: basically instead of pushing
+ *   the new element inside the list we return it to the (first / oldest)
+ *   blocking client, unblock the client, and remove it from the list.
+ *
+ * The above comment and the source code should be enough in order to understand
+ * the implementation and modify / fix it later.
+ */
+
+/* Set a client in blocking mode for the specified keys, with the specified
+ * timeout.
+ *
+ * 'keys' points to 'numkeys' key objects. 'timeout' is stored as-is in
+ * c->blockingto; this function never interprets it. Every key is recorded
+ * on two sides: in c->blocking_keys (client -> keys) and in the per-db
+ * dictionary c->db->blocking_keys (key -> list of waiting clients). */
+void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
+    dictEntry *de;
+    list *l;
+    int j;
+
+    c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
+    c->blocking_keys_num = numkeys;
+    c->blockingto = timeout;
+    for (j = 0; j < numkeys; j++) {
+        /* Add the key in the client structure, to map clients -> keys */
+        c->blocking_keys[j] = keys[j];
+        incrRefCount(keys[j]);
+
+        /* And in the other "side", to map keys -> clients */
+        de = dictFind(c->db->blocking_keys,keys[j]);
+        if (de == NULL) {
+            int retval;
+
+            /* For every key we take a list of clients blocked for it */
+            l = listCreate();
+            retval = dictAdd(c->db->blocking_keys,keys[j],l);
+            /* Second reference: the dictionary now stores the key as well */
+            incrRefCount(keys[j]);
+            redisAssert(retval == DICT_OK);
+        } else {
+            l = dictGetEntryVal(de);
+        }
+        /* Appended at the tail, so the list stays in arrival order */
+        listAddNodeTail(l,c);
+    }
+    /* Mark the client as a blocked client */
+    c->flags |= REDIS_BLOCKED;
+    server.blpop_blocked_clients++;
+}
+
+/* Unblock a client that's waiting in a blocking operation such as BLPOP.
+ *
+ * Reverses the bookkeeping done by blockForKeys(): the client is removed
+ * from the waiting list of every key it was blocked on, its own key array
+ * is released, the REDIS_BLOCKED flag is cleared and the global counter of
+ * blocked clients is decremented. Asserts that the client is blocked. */
+void unblockClientWaitingData(redisClient *c) {
+    dictEntry *de;
+    list *l;
+    int j;
+
+    redisAssert(c->blocking_keys != NULL);
+    /* The client may wait for multiple keys, so unblock it for every key. */
+    for (j = 0; j < c->blocking_keys_num; j++) {
+        /* Remove this client from the list of clients waiting for this key. */
+        de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
+        redisAssert(de != NULL);
+        l = dictGetEntryVal(de);
+        listDelNode(l,listSearchKey(l,c));
+        /* If the list is empty we need to remove it to avoid wasting memory */
+        if (listLength(l) == 0)
+            dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
+        /* Drop the reference blockForKeys() took for c->blocking_keys[j] */
+        decrRefCount(c->blocking_keys[j]);
+    }
+    /* Cleanup the client structure */
+    zfree(c->blocking_keys);
+    c->blocking_keys = NULL;
+    c->flags &= (~REDIS_BLOCKED);
+    server.blpop_blocked_clients--;
+    /* We want to process data if there is some command waiting
+     * in the input buffer. Note that this is safe even if
+     * unblockClientWaitingData() gets called from freeClient() because
+     * freeClient() will be smart enough to call this function
+     * *after* c->querybuf was set to NULL. */
+    if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
+}
+
+/* This should be called from any function PUSHing into lists.
+ * 'c' is the "pushing client", 'key' is the key it is pushing data against,
+ * 'ele' is the element pushed.
+ *
+ * If the function returns 0 there was no client waiting for a list push
+ * against this key.
+ *
+ * If the function returns 1 there was a client waiting for a list push
+ * against this key, the element was passed to this client thus it's not
+ * needed to actually add it to the list and the caller should return asap.
*/ +int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) { + struct dictEntry *de; + redisClient *receiver; + list *l; + listNode *ln; + + de = dictFind(c->db->blocking_keys,key); + if (de == NULL) return 0; + l = dictGetEntryVal(de); + ln = listFirst(l); + redisAssert(ln != NULL); + receiver = ln->value; + + addReplySds(receiver,sdsnew("*2\r\n")); + addReplyBulk(receiver,key); + addReplyBulk(receiver,ele); + unblockClientWaitingData(receiver); + return 1; +} + +/* Blocking RPOP/LPOP */ +void blockingPopGenericCommand(redisClient *c, int where) { + robj *o; + time_t timeout; + int j; + + for (j = 1; j < c->argc-1; j++) { + o = lookupKeyWrite(c->db,c->argv[j]); + if (o != NULL) { + if (o->type != REDIS_LIST) { + addReply(c,shared.wrongtypeerr); + return; + } else { + if (listTypeLength(o) != 0) { + /* If the list contains elements fall back to the usual + * non-blocking POP operation */ + robj *argv[2], **orig_argv; + int orig_argc; + + /* We need to alter the command arguments before to call + * popGenericCommand() as the command takes a single key. */ + orig_argv = c->argv; + orig_argc = c->argc; + argv[1] = c->argv[j]; + c->argv = argv; + c->argc = 2; + + /* Also the return value is different, we need to output + * the multi bulk reply header and the key name. The + * "real" command will add the last element (the value) + * for us. If this souds like an hack to you it's just + * because it is... 
*/ + addReplySds(c,sdsnew("*2\r\n")); + addReplyBulk(c,argv[1]); + popGenericCommand(c,where); + + /* Fix the client structure with the original stuff */ + c->argv = orig_argv; + c->argc = orig_argc; + return; + } + } + } + } + /* If the list is empty or the key does not exists we must block */ + timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10); + if (timeout > 0) timeout += time(NULL); + blockForKeys(c,c->argv+1,c->argc-2,timeout); +} + +void blpopCommand(redisClient *c) { + blockingPopGenericCommand(c,REDIS_HEAD); +} + +void brpopCommand(redisClient *c) { + blockingPopGenericCommand(c,REDIS_TAIL); +} diff --git a/src/t_set.c b/src/t_set.c new file mode 100644 index 000000000..808ef268e --- /dev/null +++ b/src/t_set.c @@ -0,0 +1,349 @@ +#include "redis.h" + +/*----------------------------------------------------------------------------- + * Set Commands + *----------------------------------------------------------------------------*/ + +void saddCommand(redisClient *c) { + robj *set; + + set = lookupKeyWrite(c->db,c->argv[1]); + if (set == NULL) { + set = createSetObject(); + dbAdd(c->db,c->argv[1],set); + } else { + if (set->type != REDIS_SET) { + addReply(c,shared.wrongtypeerr); + return; + } + } + if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) { + incrRefCount(c->argv[2]); + server.dirty++; + addReply(c,shared.cone); + } else { + addReply(c,shared.czero); + } +} + +void sremCommand(redisClient *c) { + robj *set; + + if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,set,REDIS_SET)) return; + + if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) { + server.dirty++; + if (htNeedsResize(set->ptr)) dictResize(set->ptr); + if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); + addReply(c,shared.cone); + } else { + addReply(c,shared.czero); + } +} + +void smoveCommand(redisClient *c) { + robj *srcset, *dstset; + + srcset = lookupKeyWrite(c->db,c->argv[1]); + dstset = lookupKeyWrite(c->db,c->argv[2]); + + /* If the source 
key does not exist return 0, if it's of the wrong type + * raise an error */ + if (srcset == NULL || srcset->type != REDIS_SET) { + addReply(c, srcset ? shared.wrongtypeerr : shared.czero); + return; + } + /* Error if the destination key is not a set as well */ + if (dstset && dstset->type != REDIS_SET) { + addReply(c,shared.wrongtypeerr); + return; + } + /* Remove the element from the source set */ + if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) { + /* Key not found in the src set! return zero */ + addReply(c,shared.czero); + return; + } + if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset) + dbDelete(c->db,c->argv[1]); + server.dirty++; + /* Add the element to the destination set */ + if (!dstset) { + dstset = createSetObject(); + dbAdd(c->db,c->argv[2],dstset); + } + if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK) + incrRefCount(c->argv[3]); + addReply(c,shared.cone); +} + +void sismemberCommand(redisClient *c) { + robj *set; + + if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,set,REDIS_SET)) return; + + if (dictFind(set->ptr,c->argv[2])) + addReply(c,shared.cone); + else + addReply(c,shared.czero); +} + +void scardCommand(redisClient *c) { + robj *o; + dict *s; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,o,REDIS_SET)) return; + + s = o->ptr; + addReplyUlong(c,dictSize(s)); +} + +void spopCommand(redisClient *c) { + robj *set; + dictEntry *de; + + if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,set,REDIS_SET)) return; + + de = dictGetRandomKey(set->ptr); + if (de == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *ele = dictGetEntryKey(de); + + addReplyBulk(c,ele); + dictDelete(set->ptr,ele); + if (htNeedsResize(set->ptr)) dictResize(set->ptr); + if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]); + server.dirty++; + } +} + +void srandmemberCommand(redisClient *c) { + robj *set; + dictEntry *de; + + if 
((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,set,REDIS_SET)) return; + + de = dictGetRandomKey(set->ptr); + if (de == NULL) { + addReply(c,shared.nullbulk); + } else { + robj *ele = dictGetEntryKey(de); + + addReplyBulk(c,ele); + } +} + +int qsortCompareSetsByCardinality(const void *s1, const void *s2) { + dict **d1 = (void*) s1, **d2 = (void*) s2; + + return dictSize(*d1)-dictSize(*d2); +} + +void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) { + dict **dv = zmalloc(sizeof(dict*)*setsnum); + dictIterator *di; + dictEntry *de; + robj *lenobj = NULL, *dstset = NULL; + unsigned long j, cardinality = 0; + + for (j = 0; j < setsnum; j++) { + robj *setobj; + + setobj = dstkey ? + lookupKeyWrite(c->db,setskeys[j]) : + lookupKeyRead(c->db,setskeys[j]); + if (!setobj) { + zfree(dv); + if (dstkey) { + if (dbDelete(c->db,dstkey)) + server.dirty++; + addReply(c,shared.czero); + } else { + addReply(c,shared.emptymultibulk); + } + return; + } + if (setobj->type != REDIS_SET) { + zfree(dv); + addReply(c,shared.wrongtypeerr); + return; + } + dv[j] = setobj->ptr; + } + /* Sort sets from the smallest to largest, this will improve our + * algorithm's performace */ + qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality); + + /* The first thing we should output is the total number of elements... 
+ * since this is a multi-bulk write, but at this stage we don't know + * the intersection set size, so we use a trick, append an empty object + * to the output list and save the pointer to later modify it with the + * right length */ + if (!dstkey) { + lenobj = createObject(REDIS_STRING,NULL); + addReply(c,lenobj); + decrRefCount(lenobj); + } else { + /* If we have a target key where to store the resulting set + * create this key with an empty set inside */ + dstset = createSetObject(); + } + + /* Iterate all the elements of the first (smallest) set, and test + * the element against all the other sets, if at least one set does + * not include the element it is discarded */ + di = dictGetIterator(dv[0]); + + while((de = dictNext(di)) != NULL) { + robj *ele; + + for (j = 1; j < setsnum; j++) + if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break; + if (j != setsnum) + continue; /* at least one set does not contain the member */ + ele = dictGetEntryKey(de); + if (!dstkey) { + addReplyBulk(c,ele); + cardinality++; + } else { + dictAdd(dstset->ptr,ele,NULL); + incrRefCount(ele); + } + } + dictReleaseIterator(di); + + if (dstkey) { + /* Store the resulting set into the target, if the intersection + * is not an empty set. 
*/ + dbDelete(c->db,dstkey); + if (dictSize((dict*)dstset->ptr) > 0) { + dbAdd(c->db,dstkey,dstset); + addReplyLongLong(c,dictSize((dict*)dstset->ptr)); + } else { + decrRefCount(dstset); + addReply(c,shared.czero); + } + server.dirty++; + } else { + lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality); + } + zfree(dv); +} + +void sinterCommand(redisClient *c) { + sinterGenericCommand(c,c->argv+1,c->argc-1,NULL); +} + +void sinterstoreCommand(redisClient *c) { + sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]); +} + +void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) { + dict **dv = zmalloc(sizeof(dict*)*setsnum); + dictIterator *di; + dictEntry *de; + robj *dstset = NULL; + int j, cardinality = 0; + + for (j = 0; j < setsnum; j++) { + robj *setobj; + + setobj = dstkey ? + lookupKeyWrite(c->db,setskeys[j]) : + lookupKeyRead(c->db,setskeys[j]); + if (!setobj) { + dv[j] = NULL; + continue; + } + if (setobj->type != REDIS_SET) { + zfree(dv); + addReply(c,shared.wrongtypeerr); + return; + } + dv[j] = setobj->ptr; + } + + /* We need a temp set object to store our union. 
If the dstkey + * is not NULL (that is, we are inside an SUNIONSTORE operation) then + * this set object will be the resulting object to set into the target key*/ + dstset = createSetObject(); + + /* Iterate all the elements of all the sets, add every element a single + * time to the result set */ + for (j = 0; j < setsnum; j++) { + if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */ + if (!dv[j]) continue; /* non existing keys are like empty sets */ + + di = dictGetIterator(dv[j]); + + while((de = dictNext(di)) != NULL) { + robj *ele; + + /* dictAdd will not add the same element multiple times */ + ele = dictGetEntryKey(de); + if (op == REDIS_OP_UNION || j == 0) { + if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) { + incrRefCount(ele); + cardinality++; + } + } else if (op == REDIS_OP_DIFF) { + if (dictDelete(dstset->ptr,ele) == DICT_OK) { + cardinality--; + } + } + } + dictReleaseIterator(di); + + /* result set is empty? Exit asap. */ + if (op == REDIS_OP_DIFF && cardinality == 0) break; + } + + /* Output the content of the resulting set, if not in STORE mode */ + if (!dstkey) { + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality)); + di = dictGetIterator(dstset->ptr); + while((de = dictNext(di)) != NULL) { + robj *ele; + + ele = dictGetEntryKey(de); + addReplyBulk(c,ele); + } + dictReleaseIterator(di); + decrRefCount(dstset); + } else { + /* If we have a target key where to store the resulting set + * create this key with the result set inside */ + dbDelete(c->db,dstkey); + if (dictSize((dict*)dstset->ptr) > 0) { + dbAdd(c->db,dstkey,dstset); + addReplyLongLong(c,dictSize((dict*)dstset->ptr)); + } else { + decrRefCount(dstset); + addReply(c,shared.czero); + } + server.dirty++; + } + zfree(dv); +} + +void sunionCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION); +} + +void sunionstoreCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION); 
+} + +void sdiffCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF); +} + +void sdiffstoreCommand(redisClient *c) { + sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF); +} diff --git a/src/t_string.c b/src/t_string.c new file mode 100644 index 000000000..eaaec05be --- /dev/null +++ b/src/t_string.c @@ -0,0 +1,251 @@ +#include "redis.h" + +/*----------------------------------------------------------------------------- + * String Commands + *----------------------------------------------------------------------------*/ + +void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) { + int retval; + long seconds = 0; /* initialized to avoid an harmness warning */ + + if (expire) { + if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK) + return; + if (seconds <= 0) { + addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n")); + return; + } + } + + touchWatchedKey(c->db,key); + if (nx) deleteIfVolatile(c->db,key); + retval = dbAdd(c->db,key,val); + if (retval == REDIS_ERR) { + if (!nx) { + dbReplace(c->db,key,val); + incrRefCount(val); + } else { + addReply(c,shared.czero); + return; + } + } else { + incrRefCount(val); + } + server.dirty++; + removeExpire(c->db,key); + if (expire) setExpire(c->db,key,time(NULL)+seconds); + addReply(c, nx ? 
shared.cone : shared.ok); +} + +void setCommand(redisClient *c) { + setGenericCommand(c,0,c->argv[1],c->argv[2],NULL); +} + +void setnxCommand(redisClient *c) { + setGenericCommand(c,1,c->argv[1],c->argv[2],NULL); +} + +void setexCommand(redisClient *c) { + setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]); +} + +int getGenericCommand(redisClient *c) { + robj *o; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL) + return REDIS_OK; + + if (o->type != REDIS_STRING) { + addReply(c,shared.wrongtypeerr); + return REDIS_ERR; + } else { + addReplyBulk(c,o); + return REDIS_OK; + } +} + +void getCommand(redisClient *c) { + getGenericCommand(c); +} + +void getsetCommand(redisClient *c) { + if (getGenericCommand(c) == REDIS_ERR) return; + dbReplace(c->db,c->argv[1],c->argv[2]); + incrRefCount(c->argv[2]); + server.dirty++; + removeExpire(c->db,c->argv[1]); +} + +void mgetCommand(redisClient *c) { + int j; + + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1)); + for (j = 1; j < c->argc; j++) { + robj *o = lookupKeyRead(c->db,c->argv[j]); + if (o == NULL) { + addReply(c,shared.nullbulk); + } else { + if (o->type != REDIS_STRING) { + addReply(c,shared.nullbulk); + } else { + addReplyBulk(c,o); + } + } + } +} + +void msetGenericCommand(redisClient *c, int nx) { + int j, busykeys = 0; + + if ((c->argc % 2) == 0) { + addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n")); + return; + } + /* Handle the NX flag. The MSETNX semantic is to return zero and don't + * set nothing at all if at least one already key exists. 
*/ + if (nx) { + for (j = 1; j < c->argc; j += 2) { + if (lookupKeyWrite(c->db,c->argv[j]) != NULL) { + busykeys++; + } + } + } + if (busykeys) { + addReply(c, shared.czero); + return; + } + + for (j = 1; j < c->argc; j += 2) { + c->argv[j+1] = tryObjectEncoding(c->argv[j+1]); + dbReplace(c->db,c->argv[j],c->argv[j+1]); + incrRefCount(c->argv[j+1]); + removeExpire(c->db,c->argv[j]); + } + server.dirty += (c->argc-1)/2; + addReply(c, nx ? shared.cone : shared.ok); +} + +void msetCommand(redisClient *c) { + msetGenericCommand(c,0); +} + +void msetnxCommand(redisClient *c) { + msetGenericCommand(c,1); +} + +void incrDecrCommand(redisClient *c, long long incr) { + long long value; + robj *o; + + o = lookupKeyWrite(c->db,c->argv[1]); + if (o != NULL && checkType(c,o,REDIS_STRING)) return; + if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return; + + value += incr; + o = createStringObjectFromLongLong(value); + dbReplace(c->db,c->argv[1],o); + server.dirty++; + addReply(c,shared.colon); + addReply(c,o); + addReply(c,shared.crlf); +} + +void incrCommand(redisClient *c) { + incrDecrCommand(c,1); +} + +void decrCommand(redisClient *c) { + incrDecrCommand(c,-1); +} + +void incrbyCommand(redisClient *c) { + long long incr; + + if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; + incrDecrCommand(c,incr); +} + +void decrbyCommand(redisClient *c) { + long long incr; + + if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return; + incrDecrCommand(c,-incr); +} + +void appendCommand(redisClient *c) { + int retval; + size_t totlen; + robj *o; + + o = lookupKeyWrite(c->db,c->argv[1]); + if (o == NULL) { + /* Create the key */ + retval = dbAdd(c->db,c->argv[1],c->argv[2]); + incrRefCount(c->argv[2]); + totlen = stringObjectLen(c->argv[2]); + } else { + if (o->type != REDIS_STRING) { + addReply(c,shared.wrongtypeerr); + return; + } + /* If the object is specially encoded or shared we have to make + * a copy */ + if 
(o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
+            robj *decoded = getDecodedObject(o);
+
+            o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
+            decrRefCount(decoded);
+            dbReplace(c->db,c->argv[1],o);
+        }
+        /* APPEND! */
+        if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
+            o->ptr = sdscatlen(o->ptr,
+                c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
+        } else {
+            o->ptr = sdscatprintf(o->ptr, "%ld",
+                (unsigned long) c->argv[2]->ptr);
+        }
+        totlen = sdslen(o->ptr);
+    }
+    server.dirty++;
+    addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
+}
+
+void substrCommand(redisClient *c) {
+    robj *o;
+    long start = atoi(c->argv[2]->ptr);
+    long end = atoi(c->argv[3]->ptr);
+    size_t rangelen, strlen;
+    sds range;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,o,REDIS_STRING)) return;
+
+    o = getDecodedObject(o);
+    strlen = sdslen(o->ptr);
+
+    /* convert negative indexes */
+    if (start < 0) start = strlen+start;
+    if (end < 0) end = strlen+end;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || (size_t)start >= strlen) {
+        /* Out of range start or start > end result in null reply */
+        addReply(c,shared.nullbulk);
+        decrRefCount(o);
+        return;
+    }
+    if ((size_t)end >= strlen) end = strlen-1;
+    rangelen = (end-start)+1;
+
+    /* Return the result */
+    addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
+    range = sdsnewlen((char*)o->ptr+start,rangelen);
+    addReplySds(c,range);
+    addReply(c,shared.crlf);
+    decrRefCount(o);
+}
+
+
diff --git a/src/t_zset.c b/src/t_zset.c
new file mode 100644
index 000000000..de32a8eed
--- /dev/null
+++ b/src/t_zset.c
@@ -0,0 +1,985 @@
+#include "redis.h"
+
+#include <math.h>
+
+/*-----------------------------------------------------------------------------
+ * Sorted set API
+ *----------------------------------------------------------------------------*/
+
+/* ZSETs are ordered sets using two data structures to hold the
same elements + * in order to get O(log(N)) INSERT and REMOVE operations into a sorted + * data structure. + * + * The elements are added to an hash table mapping Redis objects to scores. + * At the same time the elements are added to a skip list mapping scores + * to Redis objects (so objects are sorted by scores in this "view"). */ + +/* This skiplist implementation is almost a C translation of the original + * algorithm described by William Pugh in "Skip Lists: A Probabilistic + * Alternative to Balanced Trees", modified in three ways: + * a) this implementation allows for repeated values. + * b) the comparison is not just by key (our 'score') but by satellite data. + * c) there is a back pointer, so it's a doubly linked list with the back + * pointers being only at "level 1". This allows to traverse the list + * from tail to head, useful for ZREVRANGE. */ + +zskiplistNode *zslCreateNode(int level, double score, robj *obj) { + zskiplistNode *zn = zmalloc(sizeof(*zn)); + + zn->forward = zmalloc(sizeof(zskiplistNode*) * level); + if (level > 1) + zn->span = zmalloc(sizeof(unsigned int) * (level - 1)); + else + zn->span = NULL; + zn->score = score; + zn->obj = obj; + return zn; +} + +zskiplist *zslCreate(void) { + int j; + zskiplist *zsl; + + zsl = zmalloc(sizeof(*zsl)); + zsl->level = 1; + zsl->length = 0; + zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL); + for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) { + zsl->header->forward[j] = NULL; + + /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */ + if (j < ZSKIPLIST_MAXLEVEL-1) + zsl->header->span[j] = 0; + } + zsl->header->backward = NULL; + zsl->tail = NULL; + return zsl; +} + +void zslFreeNode(zskiplistNode *node) { + decrRefCount(node->obj); + zfree(node->forward); + zfree(node->span); + zfree(node); +} + +void zslFree(zskiplist *zsl) { + zskiplistNode *node = zsl->header->forward[0], *next; + + zfree(zsl->header->forward); + zfree(zsl->header->span); + zfree(zsl->header); + while(node) { + next = 
node->forward[0]; + zslFreeNode(node); + node = next; + } + zfree(zsl); +} + +int zslRandomLevel(void) { + int level = 1; + while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF)) + level += 1; + return (levelheader; + for (i = zsl->level-1; i >= 0; i--) { + /* store rank that is crossed to reach the insert position */ + rank[i] = i == (zsl->level-1) ? 0 : rank[i+1]; + + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,obj) < 0))) { + rank[i] += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + update[i] = x; + } + /* we assume the key is not already inside, since we allow duplicated + * scores, and the re-insertion of score and redis object should never + * happpen since the caller of zslInsert() should test in the hash table + * if the element is already inside or not. */ + level = zslRandomLevel(); + if (level > zsl->level) { + for (i = zsl->level; i < level; i++) { + rank[i] = 0; + update[i] = zsl->header; + update[i]->span[i-1] = zsl->length; + } + zsl->level = level; + } + x = zslCreateNode(level,score,obj); + for (i = 0; i < level; i++) { + x->forward[i] = update[i]->forward[i]; + update[i]->forward[i] = x; + + /* update span covered by update[i] as x is inserted here */ + if (i > 0) { + x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]); + update[i]->span[i-1] = (rank[0] - rank[i]) + 1; + } + } + + /* increment span for untouched levels */ + for (i = level; i < zsl->level; i++) { + update[i]->span[i-1]++; + } + + x->backward = (update[0] == zsl->header) ? 
NULL : update[0]; + if (x->forward[0]) + x->forward[0]->backward = x; + else + zsl->tail = x; + zsl->length++; +} + +/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */ +void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { + int i; + for (i = 0; i < zsl->level; i++) { + if (update[i]->forward[i] == x) { + if (i > 0) { + update[i]->span[i-1] += x->span[i-1] - 1; + } + update[i]->forward[i] = x->forward[i]; + } else { + /* invariant: i > 0, because update[0]->forward[0] + * is always equal to x */ + update[i]->span[i-1] -= 1; + } + } + if (x->forward[0]) { + x->forward[0]->backward = x->backward; + } else { + zsl->tail = x->backward; + } + while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL) + zsl->level--; + zsl->length--; +} + +/* Delete an element with matching score/object from the skiplist. */ +int zslDelete(zskiplist *zsl, double score, robj *obj) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,obj) < 0))) + x = x->forward[i]; + update[i] = x; + } + /* We may have multiple elements with the same score, what we need + * is to find the element with both the right score and object. */ + x = x->forward[0]; + if (x && score == x->score && equalStringObjects(x->obj,obj)) { + zslDeleteNode(zsl, x, update); + zslFreeNode(x); + return 1; + } else { + return 0; /* not found */ + } + return 0; /* not found */ +} + +/* Delete all the elements with score between min and max from the skiplist. + * Min and mx are inclusive, so a score >= min || score <= max is deleted. + * Note that this function takes the reference to the hash table view of the + * sorted set, in order to remove the elements from the hash table too. 
*/ +unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && x->forward[i]->score < min) + x = x->forward[i]; + update[i] = x; + } + /* We may have multiple elements with the same score, what we need + * is to find the element with both the right score and object. */ + x = x->forward[0]; + while (x && x->score <= max) { + zskiplistNode *next = x->forward[0]; + zslDeleteNode(zsl, x, update); + dictDelete(dict,x->obj); + zslFreeNode(x); + removed++; + x = next; + } + return removed; /* not found */ +} + +/* Delete all the elements with rank between start and end from the skiplist. + * Start and end are inclusive. Note that start and end need to be 1-based */ +unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long traversed = 0, removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) { + traversed += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + update[i] = x; + } + + traversed++; + x = x->forward[0]; + while (x && traversed <= end) { + zskiplistNode *next = x->forward[0]; + zslDeleteNode(zsl, x, update); + dictDelete(dict,x->obj); + zslFreeNode(x); + removed++; + traversed++; + x = next; + } + return removed; +} + +/* Find the first node having a score equal or greater than the specified one. + * Returns NULL if there is no match. 
*/ +zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) { + zskiplistNode *x; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && x->forward[i]->score < score) + x = x->forward[i]; + } + /* We may have multiple elements with the same score, what we need + * is to find the element with both the right score and object. */ + return x->forward[0]; +} + +/* Find the rank for an element by both score and key. + * Returns 0 when the element cannot be found, rank otherwise. + * Note that the rank is 1-based due to the span of zsl->header to the + * first element. */ +unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) { + zskiplistNode *x; + unsigned long rank = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && + (x->forward[i]->score < score || + (x->forward[i]->score == score && + compareStringObjects(x->forward[i]->obj,o) <= 0))) { + rank += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + + /* x might be equal to zsl->header, so test if obj is non-NULL */ + if (x->obj && equalStringObjects(x->obj,o)) { + return rank; + } + } + return 0; +} + +/* Finds an element by its rank. The rank argument needs to be 1-based. */ +zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) { + zskiplistNode *x; + unsigned long traversed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank) + { + traversed += i > 0 ? x->span[i-1] : 1; + x = x->forward[i]; + } + if (traversed == rank) { + return x; + } + } + return NULL; +} + +/*----------------------------------------------------------------------------- + * Sorted set commands + *----------------------------------------------------------------------------*/ + +/* This generic command implements both ZADD and ZINCRBY. 
+ * scoreval is the score if the operation is a ZADD (doincrement == 0) or + * the increment if the operation is a ZINCRBY (doincrement == 1). */ +void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) { + robj *zsetobj; + zset *zs; + double *score; + + if (isnan(scoreval)) { + addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n")); + return; + } + + zsetobj = lookupKeyWrite(c->db,key); + if (zsetobj == NULL) { + zsetobj = createZsetObject(); + dbAdd(c->db,key,zsetobj); + } else { + if (zsetobj->type != REDIS_ZSET) { + addReply(c,shared.wrongtypeerr); + return; + } + } + zs = zsetobj->ptr; + + /* Ok now since we implement both ZADD and ZINCRBY here the code + * needs to handle the two different conditions. It's all about setting + * '*score', that is, the new score to set, to the right value. */ + score = zmalloc(sizeof(double)); + if (doincrement) { + dictEntry *de; + + /* Read the old score. If the element was not present starts from 0 */ + de = dictFind(zs->dict,ele); + if (de) { + double *oldscore = dictGetEntryVal(de); + *score = *oldscore + scoreval; + } else { + *score = scoreval; + } + if (isnan(*score)) { + addReplySds(c, + sdsnew("-ERR resulting score is Not A Number (nan)\r\n")); + zfree(score); + /* Note that we don't need to check if the zset may be empty and + * should be removed here, as we can only obtain Nan as score if + * there was already an element in the sorted set. */ + return; + } + } else { + *score = scoreval; + } + + /* What follows is a simple remove and re-insert operation that is common + * to both ZADD and ZINCRBY... 
*/ + if (dictAdd(zs->dict,ele,score) == DICT_OK) { + /* case 1: New element */ + incrRefCount(ele); /* added to hash */ + zslInsert(zs->zsl,*score,ele); + incrRefCount(ele); /* added to skiplist */ + server.dirty++; + if (doincrement) + addReplyDouble(c,*score); + else + addReply(c,shared.cone); + } else { + dictEntry *de; + double *oldscore; + + /* case 2: Score update operation */ + de = dictFind(zs->dict,ele); + redisAssert(de != NULL); + oldscore = dictGetEntryVal(de); + if (*score != *oldscore) { + int deleted; + + /* Remove and insert the element in the skip list with new score */ + deleted = zslDelete(zs->zsl,*oldscore,ele); + redisAssert(deleted != 0); + zslInsert(zs->zsl,*score,ele); + incrRefCount(ele); + /* Update the score in the hash table */ + dictReplace(zs->dict,ele,score); + server.dirty++; + } else { + zfree(score); + } + if (doincrement) + addReplyDouble(c,*score); + else + addReply(c,shared.czero); + } +} + +void zaddCommand(redisClient *c) { + double scoreval; + + if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return; + zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0); +} + +void zincrbyCommand(redisClient *c) { + double scoreval; + + if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return; + zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1); +} + +void zremCommand(redisClient *c) { + robj *zsetobj; + zset *zs; + dictEntry *de; + double *oldscore; + int deleted; + + if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,zsetobj,REDIS_ZSET)) return; + + zs = zsetobj->ptr; + de = dictFind(zs->dict,c->argv[2]); + if (de == NULL) { + addReply(c,shared.czero); + return; + } + /* Delete from the skiplist */ + oldscore = dictGetEntryVal(de); + deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]); + redisAssert(deleted != 0); + + /* Delete from the hash table */ + dictDelete(zs->dict,c->argv[2]); + if (htNeedsResize(zs->dict)) dictResize(zs->dict); + if 
(dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); + server.dirty++; + addReply(c,shared.cone); +} + +void zremrangebyscoreCommand(redisClient *c) { + double min; + double max; + long deleted; + robj *zsetobj; + zset *zs; + + if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) || + (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return; + + if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,zsetobj,REDIS_ZSET)) return; + + zs = zsetobj->ptr; + deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict); + if (htNeedsResize(zs->dict)) dictResize(zs->dict); + if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); + server.dirty += deleted; + addReplyLongLong(c,deleted); +} + +void zremrangebyrankCommand(redisClient *c) { + long start; + long end; + int llen; + long deleted; + robj *zsetobj; + zset *zs; + + if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) || + (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return; + + if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,zsetobj,REDIS_ZSET)) return; + zs = zsetobj->ptr; + llen = zs->zsl->length; + + /* convert negative indexes */ + if (start < 0) start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + + /* indexes sanity checks */ + if (start > end || start >= llen) { + addReply(c,shared.czero); + return; + } + if (end >= llen) end = llen-1; + + /* increment start and end because zsl*Rank functions + * use 1-based rank */ + deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict); + if (htNeedsResize(zs->dict)) dictResize(zs->dict); + if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]); + server.dirty += deleted; + addReplyLongLong(c, deleted); +} + +typedef struct { + dict *dict; + double weight; +} zsetopsrc; + +int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) { + zsetopsrc *d1 
= (void*) s1, *d2 = (void*) s2; + unsigned long size1, size2; + size1 = d1->dict ? dictSize(d1->dict) : 0; + size2 = d2->dict ? dictSize(d2->dict) : 0; + return size1 - size2; +} + +#define REDIS_AGGR_SUM 1 +#define REDIS_AGGR_MIN 2 +#define REDIS_AGGR_MAX 3 +#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e)) + +inline static void zunionInterAggregate(double *target, double val, int aggregate) { + if (aggregate == REDIS_AGGR_SUM) { + *target = *target + val; + } else if (aggregate == REDIS_AGGR_MIN) { + *target = val < *target ? val : *target; + } else if (aggregate == REDIS_AGGR_MAX) { + *target = val > *target ? val : *target; + } else { + /* safety net */ + redisPanic("Unknown ZUNION/INTER aggregate type"); + } +} + +void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) { + int i, j, setnum; + int aggregate = REDIS_AGGR_SUM; + zsetopsrc *src; + robj *dstobj; + zset *dstzset; + dictIterator *di; + dictEntry *de; + + /* expect setnum input keys to be given */ + setnum = atoi(c->argv[2]->ptr); + if (setnum < 1) { + addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n")); + return; + } + + /* test if the expected number of keys would overflow */ + if (3+setnum > c->argc) { + addReply(c,shared.syntaxerr); + return; + } + + /* read keys to be used for input */ + src = zmalloc(sizeof(zsetopsrc) * setnum); + for (i = 0, j = 3; i < setnum; i++, j++) { + robj *obj = lookupKeyWrite(c->db,c->argv[j]); + if (!obj) { + src[i].dict = NULL; + } else { + if (obj->type == REDIS_ZSET) { + src[i].dict = ((zset*)obj->ptr)->dict; + } else if (obj->type == REDIS_SET) { + src[i].dict = (obj->ptr); + } else { + zfree(src); + addReply(c,shared.wrongtypeerr); + return; + } + } + + /* default all weights to 1 */ + src[i].weight = 1.0; + } + + /* parse optional extra arguments */ + if (j < c->argc) { + int remaining = c->argc - j; + + while (remaining) { + if (remaining >= (setnum + 1) && 
!strcasecmp(c->argv[j]->ptr,"weights")) { + j++; remaining--; + for (i = 0; i < setnum; i++, j++, remaining--) { + if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK) + return; + } + } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) { + j++; remaining--; + if (!strcasecmp(c->argv[j]->ptr,"sum")) { + aggregate = REDIS_AGGR_SUM; + } else if (!strcasecmp(c->argv[j]->ptr,"min")) { + aggregate = REDIS_AGGR_MIN; + } else if (!strcasecmp(c->argv[j]->ptr,"max")) { + aggregate = REDIS_AGGR_MAX; + } else { + zfree(src); + addReply(c,shared.syntaxerr); + return; + } + j++; remaining--; + } else { + zfree(src); + addReply(c,shared.syntaxerr); + return; + } + } + } + + /* sort sets from the smallest to largest, this will improve our + * algorithm's performance */ + qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality); + + dstobj = createZsetObject(); + dstzset = dstobj->ptr; + + if (op == REDIS_OP_INTER) { + /* skip going over all entries if the smallest zset is NULL or empty */ + if (src[0].dict && dictSize(src[0].dict) > 0) { + /* precondition: as src[0].dict is non-empty and the zsets are ordered + * from small to large, all src[i > 0].dict are non-empty too */ + di = dictGetIterator(src[0].dict); + while((de = dictNext(di)) != NULL) { + double *score = zmalloc(sizeof(double)), value; + *score = src[0].weight * zunionInterDictValue(de); + + for (j = 1; j < setnum; j++) { + dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de)); + if (other) { + value = src[j].weight * zunionInterDictValue(other); + zunionInterAggregate(score, value, aggregate); + } else { + break; + } + } + + /* skip entry when not present in every source dict */ + if (j != setnum) { + zfree(score); + } else { + robj *o = dictGetEntryKey(de); + dictAdd(dstzset->dict,o,score); + incrRefCount(o); /* added to dictionary */ + zslInsert(dstzset->zsl,*score,o); + incrRefCount(o); /* added to skiplist */ + } + } + dictReleaseIterator(di); + } 
+ } else if (op == REDIS_OP_UNION) { + for (i = 0; i < setnum; i++) { + if (!src[i].dict) continue; + + di = dictGetIterator(src[i].dict); + while((de = dictNext(di)) != NULL) { + /* skip key when already processed */ + if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue; + + double *score = zmalloc(sizeof(double)), value; + *score = src[i].weight * zunionInterDictValue(de); + + /* because the zsets are sorted by size, its only possible + * for sets at larger indices to hold this entry */ + for (j = (i+1); j < setnum; j++) { + dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de)); + if (other) { + value = src[j].weight * zunionInterDictValue(other); + zunionInterAggregate(score, value, aggregate); + } + } + + robj *o = dictGetEntryKey(de); + dictAdd(dstzset->dict,o,score); + incrRefCount(o); /* added to dictionary */ + zslInsert(dstzset->zsl,*score,o); + incrRefCount(o); /* added to skiplist */ + } + dictReleaseIterator(di); + } + } else { + /* unknown operator */ + redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION); + } + + dbDelete(c->db,dstkey); + if (dstzset->zsl->length) { + dbAdd(c->db,dstkey,dstobj); + addReplyLongLong(c, dstzset->zsl->length); + server.dirty++; + } else { + decrRefCount(dstobj); + addReply(c, shared.czero); + } + zfree(src); +} + +void zunionstoreCommand(redisClient *c) { + zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION); +} + +void zinterstoreCommand(redisClient *c) { + zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER); +} + +void zrangeGenericCommand(redisClient *c, int reverse) { + robj *o; + long start; + long end; + int withscores = 0; + int llen; + int rangelen, j; + zset *zsetobj; + zskiplist *zsl; + zskiplistNode *ln; + robj *ele; + + if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) || + (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return; + + if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) { + withscores = 1; + } else if (c->argc 
>= 5) { + addReply(c,shared.syntaxerr); + return; + } + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL + || checkType(c,o,REDIS_ZSET)) return; + zsetobj = o->ptr; + zsl = zsetobj->zsl; + llen = zsl->length; + + /* convert negative indexes */ + if (start < 0) start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + + /* indexes sanity checks */ + if (start > end || start >= llen) { + /* Out of range start or start > end result in empty list */ + addReply(c,shared.emptymultibulk); + return; + } + if (end >= llen) end = llen-1; + rangelen = (end-start)+1; + + /* check if starting point is trivial, before searching + * the element in log(N) time */ + if (reverse) { + ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start); + } else { + ln = start == 0 ? + zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1); + } + + /* Return the result in form of a multi-bulk reply */ + addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n", + withscores ? (rangelen*2) : rangelen)); + for (j = 0; j < rangelen; j++) { + ele = ln->obj; + addReplyBulk(c,ele); + if (withscores) + addReplyDouble(c,ln->score); + ln = reverse ? ln->backward : ln->forward[0]; + } +} + +void zrangeCommand(redisClient *c) { + zrangeGenericCommand(c,0); +} + +void zrevrangeCommand(redisClient *c) { + zrangeGenericCommand(c,1); +} + +/* This command implements both ZRANGEBYSCORE and ZCOUNT. + * If justcount is non-zero, just the count is returned. */ +void genericZrangebyscoreCommand(redisClient *c, int justcount) { + robj *o; + double min, max; + int minex = 0, maxex = 0; /* are min or max exclusive? */ + int offset = 0, limit = -1; + int withscores = 0; + int badsyntax = 0; + + /* Parse the min-max interval. If one of the values is prefixed + * by the "(" character, it's considered "open". 
For instance + * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max + * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */ + if (((char*)c->argv[2]->ptr)[0] == '(') { + min = strtod((char*)c->argv[2]->ptr+1,NULL); + minex = 1; + } else { + min = strtod(c->argv[2]->ptr,NULL); + } + if (((char*)c->argv[3]->ptr)[0] == '(') { + max = strtod((char*)c->argv[3]->ptr+1,NULL); + maxex = 1; + } else { + max = strtod(c->argv[3]->ptr,NULL); + } + + /* Parse "WITHSCORES": note that if the command was called with + * the name ZCOUNT then we are sure that c->argc == 4, so we'll never + * enter the following paths to parse WITHSCORES and LIMIT. */ + if (c->argc == 5 || c->argc == 8) { + if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0) + withscores = 1; + else + badsyntax = 1; + } + if (c->argc != (4 + withscores) && c->argc != (7 + withscores)) + badsyntax = 1; + if (badsyntax) { + addReplySds(c, + sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n")); + return; + } + + /* Parse "LIMIT" */ + if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) { + addReply(c,shared.syntaxerr); + return; + } else if (c->argc == (7 + withscores)) { + offset = atoi(c->argv[5]->ptr); + limit = atoi(c->argv[6]->ptr); + if (offset < 0) offset = 0; + } + + /* Ok, lookup the key and get the range */ + o = lookupKeyRead(c->db,c->argv[1]); + if (o == NULL) { + addReply(c,justcount ? shared.czero : shared.emptymultibulk); + } else { + if (o->type != REDIS_ZSET) { + addReply(c,shared.wrongtypeerr); + } else { + zset *zsetobj = o->ptr; + zskiplist *zsl = zsetobj->zsl; + zskiplistNode *ln; + robj *ele, *lenobj = NULL; + unsigned long rangelen = 0; + + /* Get the first node with the score >= min, or with + * score > min if 'minex' is true. */ + ln = zslFirstWithScore(zsl,min); + while (minex && ln && ln->score == min) ln = ln->forward[0]; + + if (ln == NULL) { + /* No element matching the speciifed interval */ + addReply(c,justcount ? 
shared.czero : shared.emptymultibulk); + return; + } + + /* We don't know in advance how many matching elements there + * are in the list, so we push this object that will represent + * the multi-bulk length in the output buffer, and will "fix" + * it later */ + if (!justcount) { + lenobj = createObject(REDIS_STRING,NULL); + addReply(c,lenobj); + decrRefCount(lenobj); + } + + while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) { + if (offset) { + offset--; + ln = ln->forward[0]; + continue; + } + if (limit == 0) break; + if (!justcount) { + ele = ln->obj; + addReplyBulk(c,ele); + if (withscores) + addReplyDouble(c,ln->score); + } + ln = ln->forward[0]; + rangelen++; + if (limit > 0) limit--; + } + if (justcount) { + addReplyLongLong(c,(long)rangelen); + } else { + lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n", + withscores ? (rangelen*2) : rangelen); + } + } + } +} + +void zrangebyscoreCommand(redisClient *c) { + genericZrangebyscoreCommand(c,0); +} + +void zcountCommand(redisClient *c) { + genericZrangebyscoreCommand(c,1); +} + +void zcardCommand(redisClient *c) { + robj *o; + zset *zs; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + checkType(c,o,REDIS_ZSET)) return; + + zs = o->ptr; + addReplyUlong(c,zs->zsl->length); +} + +void zscoreCommand(redisClient *c) { + robj *o; + zset *zs; + dictEntry *de; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,o,REDIS_ZSET)) return; + + zs = o->ptr; + de = dictFind(zs->dict,c->argv[2]); + if (!de) { + addReply(c,shared.nullbulk); + } else { + double *score = dictGetEntryVal(de); + + addReplyDouble(c,*score); + } +} + +void zrankGenericCommand(redisClient *c, int reverse) { + robj *o; + zset *zs; + zskiplist *zsl; + dictEntry *de; + unsigned long rank; + double *score; + + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL || + checkType(c,o,REDIS_ZSET)) return; + + zs = o->ptr; + zsl = zs->zsl; + de = 
dictFind(zs->dict,c->argv[2]); + if (!de) { + addReply(c,shared.nullbulk); + return; + } + + score = dictGetEntryVal(de); + rank = zslistTypeGetRank(zsl, *score, c->argv[2]); + if (rank) { + if (reverse) { + addReplyLongLong(c, zsl->length - rank); + } else { + addReplyLongLong(c, rank-1); + } + } else { + addReply(c,shared.nullbulk); + } +} + +void zrankCommand(redisClient *c) { + zrankGenericCommand(c, 0); +} + +void zrevrankCommand(redisClient *c) { + zrankGenericCommand(c, 1); +} diff --git a/src/util.c b/src/util.c new file mode 100644 index 000000000..cc2794f6a --- /dev/null +++ b/src/util.c @@ -0,0 +1,223 @@ +#include "redis.h" +#include +#include + +/* Glob-style pattern matching. */ +int stringmatchlen(const char *pattern, int patternLen, + const char *string, int stringLen, int nocase) +{ + while(patternLen) { + switch(pattern[0]) { + case '*': + while (pattern[1] == '*') { + pattern++; + patternLen--; + } + if (patternLen == 1) + return 1; /* match */ + while(stringLen) { + if (stringmatchlen(pattern+1, patternLen-1, + string, stringLen, nocase)) + return 1; /* match */ + string++; + stringLen--; + } + return 0; /* no match */ + break; + case '?': + if (stringLen == 0) + return 0; /* no match */ + string++; + stringLen--; + break; + case '[': + { + int not, match; + + pattern++; + patternLen--; + not = pattern[0] == '^'; + if (not) { + pattern++; + patternLen--; + } + match = 0; + while(1) { + if (pattern[0] == '\\') { + pattern++; + patternLen--; + if (pattern[0] == string[0]) + match = 1; + } else if (pattern[0] == ']') { + break; + } else if (patternLen == 0) { + pattern--; + patternLen++; + break; + } else if (pattern[1] == '-' && patternLen >= 3) { + int start = pattern[0]; + int end = pattern[2]; + int c = string[0]; + if (start > end) { + int t = start; + start = end; + end = t; + } + if (nocase) { + start = tolower(start); + end = tolower(end); + c = tolower(c); + } + pattern += 2; + patternLen -= 2; + if (c >= start && c <= end) + match = 1; + } 
else { + if (!nocase) { + if (pattern[0] == string[0]) + match = 1; + } else { + if (tolower((int)pattern[0]) == tolower((int)string[0])) + match = 1; + } + } + pattern++; + patternLen--; + } + if (not) + match = !match; + if (!match) + return 0; /* no match */ + string++; + stringLen--; + break; + } + case '\\': + if (patternLen >= 2) { + pattern++; + patternLen--; + } + /* fall through */ + default: + if (!nocase) { + if (pattern[0] != string[0]) + return 0; /* no match */ + } else { + if (tolower((int)pattern[0]) != tolower((int)string[0])) + return 0; /* no match */ + } + string++; + stringLen--; + break; + } + pattern++; + patternLen--; + if (stringLen == 0) { + while(*pattern == '*') { + pattern++; + patternLen--; + } + break; + } + } + if (patternLen == 0 && stringLen == 0) + return 1; + return 0; +} + +int stringmatch(const char *pattern, const char *string, int nocase) { + return stringmatchlen(pattern,strlen(pattern),string,strlen(string),nocase); +} + +/* Convert a string representing an amount of memory into the number of + * bytes, so for instance memtoll("1Gi") will return 1073741824 that is + * (1024*1024*1024). + * + * On parsing error, if *err is not NULL, it's set to 1, otherwise it's + * set to 0 */ +long long memtoll(const char *p, int *err) { + const char *u; + char buf[128]; + long mul; /* unit multiplier */ + long long val; + unsigned int digits; + + if (err) *err = 0; + /* Search the first non digit character. 
*/ + u = p; + if (*u == '-') u++; + while(*u && isdigit(*u)) u++; + if (*u == '\0' || !strcasecmp(u,"b")) { + mul = 1; + } else if (!strcasecmp(u,"k")) { + mul = 1000; + } else if (!strcasecmp(u,"kb")) { + mul = 1024; + } else if (!strcasecmp(u,"m")) { + mul = 1000*1000; + } else if (!strcasecmp(u,"mb")) { + mul = 1024*1024; + } else if (!strcasecmp(u,"g")) { + mul = 1000L*1000*1000; + } else if (!strcasecmp(u,"gb")) { + mul = 1024L*1024*1024; + } else { + if (err) *err = 1; + mul = 1; + } + digits = u-p; + if (digits >= sizeof(buf)) { + if (err) *err = 1; + return LLONG_MAX; + } + memcpy(buf,p,digits); + buf[digits] = '\0'; + val = strtoll(buf,NULL,10); + return val*mul; +} + +/* Convert a long long into a string. Returns the number of + * characters needed to represent the number, that can be shorter if passed + * buffer length is not enough to store the whole number. */ +int ll2string(char *s, size_t len, long long value) { + char buf[32], *p; + unsigned long long v; + size_t l; + + if (len == 0) return 0; + v = (value < 0) ? -value : value; + p = buf+31; /* point to the last character */ + do { + *p-- = '0'+(v%10); + v /= 10; + } while(v); + if (value < 0) *p-- = '-'; + p++; + l = 32-(p-buf); + if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */ + memcpy(s,p,l); + s[l] = '\0'; + return l; +} + +/* Check if the nul-terminated string 's' can be represented by a long + * (that is, is a number that fits into long without any other space or + * character before or after the digits). + * + * If so, the function returns REDIS_OK and *longval is set to the value + * of the number. 
Otherwise REDIS_ERR is returned */ +int isStringRepresentableAsLong(sds s, long *longval) { + char buf[32], *endptr; + long value; + int slen; + + value = strtol(s, &endptr, 10); + if (endptr[0] != '\0') return REDIS_ERR; + slen = ll2string(buf,32,value); + + /* If the number converted back into a string is not identical + * then it's not possible to encode the string as integer */ + if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR; + if (longval) *longval = value; + return REDIS_OK; +} diff --git a/src/version.h b/src/version.h new file mode 100644 index 000000000..86d422474 --- /dev/null +++ b/src/version.h @@ -0,0 +1 @@ +#define REDIS_VERSION "2.1.1" diff --git a/src/vm.c b/src/vm.c new file mode 100644 index 000000000..1aaa57eb5 --- /dev/null +++ b/src/vm.c @@ -0,0 +1,1126 @@ +#include "redis.h" + +#include +#include +#include +#include + +/* Virtual Memory is composed mainly of two subsystems: + * - Blocking Virutal Memory + * - Threaded Virtual Memory I/O + * The two parts are not fully decoupled, but functions are split among two + * different sections of the source code (delimited by comments) in order to + * make more clear what functionality is about the blocking VM and what about + * the threaded (not blocking) VM. + * + * Redis VM design: + * + * Redis VM is a blocking VM (one that blocks reading swapped values from + * disk into memory when a value swapped out is needed in memory) that is made + * unblocking by trying to examine the command argument vector in order to + * load in background values that will likely be needed in order to exec + * the command. The command is executed only once all the relevant keys + * are loaded into memory. + * + * This basically is almost as simple of a blocking VM, but almost as parallel + * as a fully non-blocking VM. + */ + +/* =================== Virtual Memory - Blocking Side ====================== */ + +/* Create a VM pointer object. 
This kind of objects are used in place of + * values in the key -> value hash table, for swapped out objects. */ +vmpointer *createVmPointer(int vtype) { + vmpointer *vp = zmalloc(sizeof(vmpointer)); + + vp->type = REDIS_VMPOINTER; + vp->storage = REDIS_VM_SWAPPED; + vp->vtype = vtype; + return vp; +} + +void vmInit(void) { + off_t totsize; + int pipefds[2]; + size_t stacksize; + struct flock fl; + + if (server.vm_max_threads != 0) + zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */ + + redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file); + /* Try to open the old swap file, otherwise create it */ + if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) { + server.vm_fp = fopen(server.vm_swap_file,"w+b"); + } + if (server.vm_fp == NULL) { + redisLog(REDIS_WARNING, + "Can't open the swap file: %s. Exiting.", + strerror(errno)); + exit(1); + } + server.vm_fd = fileno(server.vm_fp); + /* Lock the swap file for writing, this is useful in order to avoid + * another instance to use the same swap file for a config error. */ + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = fl.l_len = 0; + if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) { + redisLog(REDIS_WARNING, + "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno)); + exit(1); + } + /* Initialize */ + server.vm_next_page = 0; + server.vm_near_pages = 0; + server.vm_stats_used_pages = 0; + server.vm_stats_swapped_objects = 0; + server.vm_stats_swapouts = 0; + server.vm_stats_swapins = 0; + totsize = server.vm_pages*server.vm_page_size; + redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize); + if (ftruncate(server.vm_fd,totsize) == -1) { + redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. 
Exiting.", + strerror(errno)); + exit(1); + } else { + redisLog(REDIS_NOTICE,"Swap file allocated with success"); + } + server.vm_bitmap = zmalloc((server.vm_pages+7)/8); + redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages", + (long long) (server.vm_pages+7)/8, server.vm_pages); + memset(server.vm_bitmap,0,(server.vm_pages+7)/8); + + /* Initialize threaded I/O (used by Virtual Memory) */ + server.io_newjobs = listCreate(); + server.io_processing = listCreate(); + server.io_processed = listCreate(); + server.io_ready_clients = listCreate(); + pthread_mutex_init(&server.io_mutex,NULL); + pthread_mutex_init(&server.obj_freelist_mutex,NULL); + pthread_mutex_init(&server.io_swapfile_mutex,NULL); + server.io_active_threads = 0; + if (pipe(pipefds) == -1) { + redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting." + ,strerror(errno)); + exit(1); + } + server.io_ready_pipe_read = pipefds[0]; + server.io_ready_pipe_write = pipefds[1]; + redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR); + /* LZF requires a lot of stack */ + pthread_attr_init(&server.io_threads_attr); + pthread_attr_getstacksize(&server.io_threads_attr, &stacksize); + while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2; + pthread_attr_setstacksize(&server.io_threads_attr, stacksize); + /* Listen for events in the threaded I/O pipe */ + if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE, + vmThreadedIOCompletedJob, NULL) == AE_ERR) + oom("creating file event"); +} + +/* Mark the page as used */ +void vmMarkPageUsed(off_t page) { + off_t byte = page/8; + int bit = page&7; + redisAssert(vmFreePage(page) == 1); + server.vm_bitmap[byte] |= 1<= server.vm_pages) { + this -= server.vm_pages; + if (this == 0) { + /* Just overflowed, what we found on tail is no longer + * interesting, as it's no longer contiguous. 
*/ + numfree = 0; + } + } + if (vmFreePage(this)) { + /* This is a free page */ + numfree++; + /* Already got N free pages? Return to the caller, with success */ + if (numfree == n) { + *first = this-(n-1); + server.vm_next_page = this+1; + redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first); + return REDIS_OK; + } + } else { + /* The current one is not a free page */ + numfree = 0; + } + + /* Fast-forward if the current page is not free and we already + * searched enough near this place. */ + since_jump++; + if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) { + offset += random() % REDIS_VM_MAX_RANDOM_JUMP; + since_jump = 0; + /* Note that even if we rewind after the jump, we are don't need + * to make sure numfree is set to zero as we only jump *if* it + * is set to zero. */ + } else { + /* Otherwise just check the next page */ + offset++; + } + } + return REDIS_ERR; +} + +/* Write the specified object at the specified page of the swap file */ +int vmWriteObjectOnSwap(robj *o, off_t page) { + if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex); + if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) { + if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); + redisLog(REDIS_WARNING, + "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s", + strerror(errno)); + return REDIS_ERR; + } + rdbSaveObject(server.vm_fp,o); + fflush(server.vm_fp); + if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); + return REDIS_OK; +} + +/* Transfers the 'val' object to disk. Store all the information + * a 'vmpointer' object containing all the information needed to load the + * object back later is returned. + * + * If we can't find enough contiguous empty pages to swap the object on disk + * NULL is returned. 
*/ +vmpointer *vmSwapObjectBlocking(robj *val) { + off_t pages = rdbSavedObjectPages(val,NULL); + off_t page; + vmpointer *vp; + + redisAssert(val->storage == REDIS_VM_MEMORY); + redisAssert(val->refcount == 1); + if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL; + if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL; + + vp = createVmPointer(val->type); + vp->page = page; + vp->usedpages = pages; + decrRefCount(val); /* Deallocate the object from memory. */ + vmMarkPagesUsed(page,pages); + redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)", + (void*) val, + (unsigned long long) page, (unsigned long long) pages); + server.vm_stats_swapped_objects++; + server.vm_stats_swapouts++; + return vp; +} + +robj *vmReadObjectFromSwap(off_t page, int type) { + robj *o; + + if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex); + if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) { + redisLog(REDIS_WARNING, + "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s", + strerror(errno)); + _exit(1); + } + o = rdbLoadObject(type,server.vm_fp); + if (o == NULL) { + redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno)); + _exit(1); + } + if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex); + return o; +} + +/* Load the specified object from swap to memory. + * The newly allocated object is returned. + * + * If preview is true the unserialized object is returned to the caller but + * the pages are not marked as freed, nor the vp object is freed. 
*/ +robj *vmGenericLoadObject(vmpointer *vp, int preview) { + robj *val; + + redisAssert(vp->type == REDIS_VMPOINTER && + (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING)); + val = vmReadObjectFromSwap(vp->page,vp->vtype); + if (!preview) { + redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp); + vmMarkPagesFree(vp->page,vp->usedpages); + zfree(vp); + server.vm_stats_swapped_objects--; + } else { + redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp); + } + server.vm_stats_swapins++; + return val; +} + +/* Plain object loading, from swap to memory. + * + * 'o' is actually a redisVmPointer structure that will be freed by the call. + * The return value is the loaded object. */ +robj *vmLoadObject(robj *o) { + /* If we are loading the object in background, stop it, we + * need to load this object synchronously ASAP. */ + if (o->storage == REDIS_VM_LOADING) + vmCancelThreadedIOJob(o); + return vmGenericLoadObject((vmpointer*)o,0); +} + +/* Just load the value on disk, without to modify the key. + * This is useful when we want to perform some operation on the value + * without to really bring it from swap to memory, like while saving the + * dataset or rewriting the append only log. */ +robj *vmPreviewObject(robj *o) { + return vmGenericLoadObject((vmpointer*)o,1); +} + +/* How a good candidate is this object for swapping? + * The better candidate it is, the greater the returned value. + * + * Currently we try to perform a fast estimation of the object size in + * memory, and combine it with aging informations. + * + * Basically swappability = idle-time * log(estimated size) + * + * Bigger objects are preferred over smaller objects, but not + * proportionally, this is why we use the logarithm. This algorithm is + * just a first try and will probably be tuned later. */ +double computeObjectSwappability(robj *o) { + /* actual age can be >= minage, but not < minage. 
As we use wrapping + * 21 bit clocks with minutes resolution for the LRU. */ + time_t minage = abs(server.lruclock - o->lru); + long asize = 0, elesize; + robj *ele; + list *l; + listNode *ln; + dict *d; + struct dictEntry *de; + int z; + + if (minage <= 0) return 0; + switch(o->type) { + case REDIS_STRING: + if (o->encoding != REDIS_ENCODING_RAW) { + asize = sizeof(*o); + } else { + asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2; + } + break; + case REDIS_LIST: + if (o->encoding == REDIS_ENCODING_ZIPLIST) { + asize = sizeof(*o)+ziplistSize(o->ptr); + } else { + l = o->ptr; + ln = listFirst(l); + asize = sizeof(list); + if (ln) { + ele = ln->value; + elesize = (ele->encoding == REDIS_ENCODING_RAW) ? + (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); + asize += (sizeof(listNode)+elesize)*listLength(l); + } + } + break; + case REDIS_SET: + case REDIS_ZSET: + z = (o->type == REDIS_ZSET); + d = z ? ((zset*)o->ptr)->dict : o->ptr; + + asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); + if (z) asize += sizeof(zset)-sizeof(dict); + if (dictSize(d)) { + de = dictGetRandomKey(d); + ele = dictGetEntryKey(de); + elesize = (ele->encoding == REDIS_ENCODING_RAW) ? + (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); + asize += (sizeof(struct dictEntry)+elesize)*dictSize(d); + if (z) asize += sizeof(zskiplistNode)*dictSize(d); + } + break; + case REDIS_HASH: + if (o->encoding == REDIS_ENCODING_ZIPMAP) { + unsigned char *p = zipmapRewind((unsigned char*)o->ptr); + unsigned int len = zipmapLen((unsigned char*)o->ptr); + unsigned int klen, vlen; + unsigned char *key, *val; + + if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) { + klen = 0; + vlen = 0; + } + asize = len*(klen+vlen+3); + } else if (o->encoding == REDIS_ENCODING_HT) { + d = o->ptr; + asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d)); + if (dictSize(d)) { + de = dictGetRandomKey(d); + ele = dictGetEntryKey(de); + elesize = (ele->encoding == REDIS_ENCODING_RAW) ? 
+ (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); + ele = dictGetEntryVal(de); + elesize = (ele->encoding == REDIS_ENCODING_RAW) ? + (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o); + asize += (sizeof(struct dictEntry)+elesize)*dictSize(d); + } + } + break; + } + return (double)minage*log(1+asize); +} + +/* Try to swap an object that's a good candidate for swapping. + * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible + * to swap any object at all. + * + * If 'usethreaded' is true, Redis will try to swap the object in background + * using I/O threads. */ +int vmSwapOneObject(int usethreads) { + int j, i; + struct dictEntry *best = NULL; + double best_swappability = 0; + redisDb *best_db = NULL; + robj *val; + sds key; + + for (j = 0; j < server.dbnum; j++) { + redisDb *db = server.db+j; + /* Why maxtries is set to 100? + * Because this way (usually) we'll find 1 object even if just 1% - 2% + * are swappable objects */ + int maxtries = 100; + + if (dictSize(db->dict) == 0) continue; + for (i = 0; i < 5; i++) { + dictEntry *de; + double swappability; + + if (maxtries) maxtries--; + de = dictGetRandomKey(db->dict); + val = dictGetEntryVal(de); + /* Only swap objects that are currently in memory. + * + * Also don't swap shared objects: not a good idea in general and + * we need to ensure that the main thread does not touch the + * object while the I/O thread is using it, but we can't + * control other keys without adding additional mutex. 
*/ + if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) { + if (maxtries) i--; /* don't count this try */ + continue; + } + swappability = computeObjectSwappability(val); + if (!best || swappability > best_swappability) { + best = de; + best_swappability = swappability; + best_db = db; + } + } + } + if (best == NULL) return REDIS_ERR; + key = dictGetEntryKey(best); + val = dictGetEntryVal(best); + + redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f", + key, best_swappability); + + /* Swap it */ + if (usethreads) { + robj *keyobj = createStringObject(key,sdslen(key)); + vmSwapObjectThreaded(keyobj,val,best_db); + decrRefCount(keyobj); + return REDIS_OK; + } else { + vmpointer *vp; + + if ((vp = vmSwapObjectBlocking(val)) != NULL) { + dictGetEntryVal(best) = vp; + return REDIS_OK; + } else { + return REDIS_ERR; + } + } +} + +int vmSwapOneObjectBlocking() { + return vmSwapOneObject(0); +} + +int vmSwapOneObjectThreaded() { + return vmSwapOneObject(1); +} + +/* Return true if it's safe to swap out objects in a given moment. + * Basically we don't want to swap objects out while there is a BGSAVE + * or a BGAEOREWRITE running in backgroud. */ +int vmCanSwapOut(void) { + return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1); +} + +/* =================== Virtual Memory - Threaded I/O ======================= */ + +void freeIOJob(iojob *j) { + if ((j->type == REDIS_IOJOB_PREPARE_SWAP || + j->type == REDIS_IOJOB_DO_SWAP || + j->type == REDIS_IOJOB_LOAD) && j->val != NULL) + { + /* we fix the storage type, otherwise decrRefCount() will try to + * kill the I/O thread Job (that does no longer exists). */ + if (j->val->storage == REDIS_VM_SWAPPING) + j->val->storage = REDIS_VM_MEMORY; + decrRefCount(j->val); + } + decrRefCount(j->key); + zfree(j); +} + +/* Every time a thread finished a Job, it writes a byte into the write side + * of an unix pipe in order to "awake" the main thread, and this function + * is called. 
*/ +void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, + int mask) +{ + char buf[1]; + int retval, processed = 0, toprocess = -1, trytoswap = 1; + REDIS_NOTUSED(el); + REDIS_NOTUSED(mask); + REDIS_NOTUSED(privdata); + + /* For every byte we read in the read side of the pipe, there is one + * I/O job completed to process. */ + while((retval = read(fd,buf,1)) == 1) { + iojob *j; + listNode *ln; + struct dictEntry *de; + + redisLog(REDIS_DEBUG,"Processing I/O completed job"); + + /* Get the processed element (the oldest one) */ + lockThreadedIO(); + redisAssert(listLength(server.io_processed) != 0); + if (toprocess == -1) { + toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100; + if (toprocess <= 0) toprocess = 1; + } + ln = listFirst(server.io_processed); + j = ln->value; + listDelNode(server.io_processed,ln); + unlockThreadedIO(); + /* If this job is marked as canceled, just ignore it */ + if (j->canceled) { + freeIOJob(j); + continue; + } + /* Post process it in the main thread, as there are things we + * can do just here to avoid race conditions and/or invasive locks */ + redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr); + de = dictFind(j->db->dict,j->key->ptr); + redisAssert(de != NULL); + if (j->type == REDIS_IOJOB_LOAD) { + redisDb *db; + vmpointer *vp = dictGetEntryVal(de); + + /* Key loaded, bring it at home */ + vmMarkPagesFree(vp->page,vp->usedpages); + redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)", + (unsigned char*) j->key->ptr); + server.vm_stats_swapped_objects--; + server.vm_stats_swapins++; + dictGetEntryVal(de) = j->val; + incrRefCount(j->val); + db = j->db; + /* Handle clients waiting for this key to be loaded. */ + handleClientsBlockedOnSwappedKey(db,j->key); + freeIOJob(j); + zfree(vp); + } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { + /* Now we know the amount of pages required to swap this object. 
+ * Let's find some space for it, and queue this task again + * rebranded as REDIS_IOJOB_DO_SWAP. */ + if (!vmCanSwapOut() || + vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) + { + /* Ooops... no space or we can't swap as there is + * a fork()ed Redis trying to save stuff on disk. */ + j->val->storage = REDIS_VM_MEMORY; /* undo operation */ + freeIOJob(j); + } else { + /* Note that we need to mark this pages as used now, + * if the job will be canceled, we'll mark them as freed + * again. */ + vmMarkPagesUsed(j->page,j->pages); + j->type = REDIS_IOJOB_DO_SWAP; + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + } + } else if (j->type == REDIS_IOJOB_DO_SWAP) { + vmpointer *vp; + + /* Key swapped. We can finally free some memory. */ + if (j->val->storage != REDIS_VM_SWAPPING) { + vmpointer *vp = (vmpointer*) j->id; + printf("storage: %d\n",vp->storage); + printf("key->name: %s\n",(char*)j->key->ptr); + printf("val: %p\n",(void*)j->val); + printf("val->type: %d\n",j->val->type); + printf("val->ptr: %s\n",(char*)j->val->ptr); + } + redisAssert(j->val->storage == REDIS_VM_SWAPPING); + vp = createVmPointer(j->val->type); + vp->page = j->page; + vp->usedpages = j->pages; + dictGetEntryVal(de) = vp; + /* Fix the storage otherwise decrRefCount will attempt to + * remove the associated I/O job */ + j->val->storage = REDIS_VM_MEMORY; + decrRefCount(j->val); + redisLog(REDIS_DEBUG, + "VM: object %s swapped out at %lld (%lld pages) (threaded)", + (unsigned char*) j->key->ptr, + (unsigned long long) j->page, (unsigned long long) j->pages); + server.vm_stats_swapped_objects++; + server.vm_stats_swapouts++; + freeIOJob(j); + /* Put a few more swap requests in queue if we are still + * out of memory */ + if (trytoswap && vmCanSwapOut() && + zmalloc_used_memory() > server.vm_max_memory) + { + int more = 1; + while(more) { + lockThreadedIO(); + more = listLength(server.io_newjobs) < + (unsigned) server.vm_max_threads; + unlockThreadedIO(); + /* Don't waste CPU time 
if swappable objects are rare. */ + if (vmSwapOneObjectThreaded() == REDIS_ERR) { + trytoswap = 0; + break; + } + } + } + } + processed++; + if (processed == toprocess) return; + } + if (retval < 0 && errno != EAGAIN) { + redisLog(REDIS_WARNING, + "WARNING: read(2) error in vmThreadedIOCompletedJob() %s", + strerror(errno)); + } +} + +void lockThreadedIO(void) { + pthread_mutex_lock(&server.io_mutex); +} + +void unlockThreadedIO(void) { + pthread_mutex_unlock(&server.io_mutex); +} + +/* Remove the specified object from the threaded I/O queue if still not + * processed, otherwise make sure to flag it as canceled. */ +void vmCancelThreadedIOJob(robj *o) { + list *lists[3] = { + server.io_newjobs, /* 0 */ + server.io_processing, /* 1 */ + server.io_processed /* 2 */ + }; + int i; + + redisAssert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING); +again: + lockThreadedIO(); + /* Search for a matching object in one of the queues */ + for (i = 0; i < 3; i++) { + listNode *ln; + listIter li; + + listRewind(lists[i],&li); + while ((ln = listNext(&li)) != NULL) { + iojob *job = ln->value; + + if (job->canceled) continue; /* Skip this, already canceled. */ + if (job->id == o) { + redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n", + (void*)job, (char*)job->key->ptr, job->type, i); + /* Mark the pages as free since the swap didn't happened + * or happened but is now discarded. */ + if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP) + vmMarkPagesFree(job->page,job->pages); + /* Cancel the job. It depends on the list the job is + * living in. */ + switch(i) { + case 0: /* io_newjobs */ + /* If the job was yet not processed the best thing to do + * is to remove it from the queue at all */ + freeIOJob(job); + listDelNode(lists[i],ln); + break; + case 1: /* io_processing */ + /* Oh Shi- the thread is messing with the Job: + * + * Probably it's accessing the object if this is a + * PREPARE_SWAP or DO_SWAP job. 
+ * If it's a LOAD job it may be reading from disk and + * if we don't wait for the job to terminate before to + * cancel it, maybe in a few microseconds data can be + * corrupted in this pages. So the short story is: + * + * Better to wait for the job to move into the + * next queue (processed)... */ + + /* We try again and again until the job is completed. */ + unlockThreadedIO(); + /* But let's wait some time for the I/O thread + * to finish with this job. After all this condition + * should be very rare. */ + usleep(1); + goto again; + case 2: /* io_processed */ + /* The job was already processed, that's easy... + * just mark it as canceled so that we'll ignore it + * when processing completed jobs. */ + job->canceled = 1; + break; + } + /* Finally we have to adjust the storage type of the object + * in order to "UNDO" the operaiton. */ + if (o->storage == REDIS_VM_LOADING) + o->storage = REDIS_VM_SWAPPED; + else if (o->storage == REDIS_VM_SWAPPING) + o->storage = REDIS_VM_MEMORY; + unlockThreadedIO(); + redisLog(REDIS_DEBUG,"*** DONE"); + return; + } + } + } + unlockThreadedIO(); + printf("Not found: %p\n", (void*)o); + redisAssert(1 != 1); /* We should never reach this */ +} + +void *IOThreadEntryPoint(void *arg) { + iojob *j; + listNode *ln; + REDIS_NOTUSED(arg); + + pthread_detach(pthread_self()); + while(1) { + /* Get a new job to process */ + lockThreadedIO(); + if (listLength(server.io_newjobs) == 0) { + /* No new jobs in queue, exit. 
*/ + redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do", + (long) pthread_self()); + server.io_active_threads--; + unlockThreadedIO(); + return NULL; + } + ln = listFirst(server.io_newjobs); + j = ln->value; + listDelNode(server.io_newjobs,ln); + /* Add the job in the processing queue */ + j->thread = pthread_self(); + listAddNodeTail(server.io_processing,j); + ln = listLast(server.io_processing); /* We use ln later to remove it */ + unlockThreadedIO(); + redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'", + (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr); + + /* Process the Job */ + if (j->type == REDIS_IOJOB_LOAD) { + vmpointer *vp = (vmpointer*)j->id; + j->val = vmReadObjectFromSwap(j->page,vp->vtype); + } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) { + FILE *fp = fopen("/dev/null","w+"); + j->pages = rdbSavedObjectPages(j->val,fp); + fclose(fp); + } else if (j->type == REDIS_IOJOB_DO_SWAP) { + if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR) + j->canceled = 1; + } + + /* Done: insert the job into the processed queue */ + redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)", + (long) pthread_self(), (void*)j, (char*)j->key->ptr); + lockThreadedIO(); + listDelNode(server.io_processing,ln); + listAddNodeTail(server.io_processed,j); + unlockThreadedIO(); + + /* Signal the main thread there is new stuff to process */ + redisAssert(write(server.io_ready_pipe_write,"x",1) == 1); + } + return NULL; /* never reached */ +} + +void spawnIOThread(void) { + pthread_t thread; + sigset_t mask, omask; + int err; + + sigemptyset(&mask); + sigaddset(&mask,SIGCHLD); + sigaddset(&mask,SIGHUP); + sigaddset(&mask,SIGPIPE); + pthread_sigmask(SIG_SETMASK, &mask, &omask); + while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) { + redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s", + strerror(err)); + usleep(1000000); + } + pthread_sigmask(SIG_SETMASK, &omask, NULL); + 
server.io_active_threads++; +} + +/* We need to wait for the last thread to exit before we are able to + * fork() in order to BGSAVE or BGREWRITEAOF. */ +void waitEmptyIOJobsQueue(void) { + while(1) { + int io_processed_len; + + lockThreadedIO(); + if (listLength(server.io_newjobs) == 0 && + listLength(server.io_processing) == 0 && + server.io_active_threads == 0) + { + unlockThreadedIO(); + return; + } + /* While waiting for empty jobs queue condition we post-process some + * finshed job, as I/O threads may be hanging trying to write against + * the io_ready_pipe_write FD but there are so much pending jobs that + * it's blocking. */ + io_processed_len = listLength(server.io_processed); + unlockThreadedIO(); + if (io_processed_len) { + vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0); + usleep(1000); /* 1 millisecond */ + } else { + usleep(10000); /* 10 milliseconds */ + } + } +} + +void vmReopenSwapFile(void) { + /* Note: we don't close the old one as we are in the child process + * and don't want to mess at all with the original file object. */ + server.vm_fp = fopen(server.vm_swap_file,"r+b"); + if (server.vm_fp == NULL) { + redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. 
Exiting.", + server.vm_swap_file); + _exit(1); + } + server.vm_fd = fileno(server.vm_fp); +} + +/* This function must be called while with threaded IO locked */ +void queueIOJob(iojob *j) { + redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n", + (void*)j, j->type, (char*)j->key->ptr); + listAddNodeTail(server.io_newjobs,j); + if (server.io_active_threads < server.vm_max_threads) + spawnIOThread(); +} + +int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) { + iojob *j; + + j = zmalloc(sizeof(*j)); + j->type = REDIS_IOJOB_PREPARE_SWAP; + j->db = db; + j->key = key; + incrRefCount(key); + j->id = j->val = val; + incrRefCount(val); + j->canceled = 0; + j->thread = (pthread_t) -1; + val->storage = REDIS_VM_SWAPPING; + + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + return REDIS_OK; +} + +/* ============ Virtual Memory - Blocking clients on missing keys =========== */ + +/* This function makes the clinet 'c' waiting for the key 'key' to be loaded. + * If there is not already a job loading the key, it is craeted. + * The key is added to the io_keys list in the client structure, and also + * in the hash table mapping swapped keys to waiting clients, that is, + * server.io_waited_keys. */ +int waitForSwappedKey(redisClient *c, robj *key) { + struct dictEntry *de; + robj *o; + list *l; + + /* If the key does not exist or is already in RAM we don't need to + * block the client at all. */ + de = dictFind(c->db->dict,key->ptr); + if (de == NULL) return 0; + o = dictGetEntryVal(de); + if (o->storage == REDIS_VM_MEMORY) { + return 0; + } else if (o->storage == REDIS_VM_SWAPPING) { + /* We were swapping the key, undo it! */ + vmCancelThreadedIOJob(o); + return 0; + } + + /* OK: the key is either swapped, or being loaded just now. */ + + /* Add the key to the list of keys this client is waiting for. + * This maps clients to keys they are waiting for. 
*/ + listAddNodeTail(c->io_keys,key); + incrRefCount(key); + + /* Add the client to the swapped keys => clients waiting map. */ + de = dictFind(c->db->io_keys,key); + if (de == NULL) { + int retval; + + /* For every key we take a list of clients blocked for it */ + l = listCreate(); + retval = dictAdd(c->db->io_keys,key,l); + incrRefCount(key); + redisAssert(retval == DICT_OK); + } else { + l = dictGetEntryVal(de); + } + listAddNodeTail(l,c); + + /* Are we already loading the key from disk? If not create a job */ + if (o->storage == REDIS_VM_SWAPPED) { + iojob *j; + vmpointer *vp = (vmpointer*)o; + + o->storage = REDIS_VM_LOADING; + j = zmalloc(sizeof(*j)); + j->type = REDIS_IOJOB_LOAD; + j->db = c->db; + j->id = (robj*)vp; + j->key = key; + incrRefCount(key); + j->page = vp->page; + j->val = NULL; + j->canceled = 0; + j->thread = (pthread_t) -1; + lockThreadedIO(); + queueIOJob(j); + unlockThreadedIO(); + } + return 1; +} + +/* Preload keys for any command with first, last and step values for + * the command keys prototype, as defined in the command table. */ +void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int j, last; + if (cmd->vm_firstkey == 0) return; + last = cmd->vm_lastkey; + if (last < 0) last = argc+last; + for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) { + redisAssert(j < argc); + waitForSwappedKey(c,argv[j]); + } +} + +/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands. + * Note that the number of keys to preload is user-defined, so we need to + * apply a sanity check against argc. */ +void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int i, num; + REDIS_NOTUSED(cmd); + + num = atoi(argv[2]->ptr); + if (num > (argc-3)) return; + for (i = 0; i < num; i++) { + waitForSwappedKey(c,argv[3+i]); + } +} + +/* Preload keys needed to execute the entire MULTI/EXEC block. 
+ * + * This function is called by blockClientOnSwappedKeys when EXEC is issued, + * and will block the client when any command requires a swapped out value. */ +void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) { + int i, margc; + struct redisCommand *mcmd; + robj **margv; + REDIS_NOTUSED(cmd); + REDIS_NOTUSED(argc); + REDIS_NOTUSED(argv); + + if (!(c->flags & REDIS_MULTI)) return; + for (i = 0; i < c->mstate.count; i++) { + mcmd = c->mstate.commands[i].cmd; + margc = c->mstate.commands[i].argc; + margv = c->mstate.commands[i].argv; + + if (mcmd->vm_preload_proc != NULL) { + mcmd->vm_preload_proc(c,mcmd,margc,margv); + } else { + waitForMultipleSwappedKeys(c,mcmd,margc,margv); + } + } +} + +/* Is this client attempting to run a command against swapped keys? + * If so, block it ASAP, load the keys in background, then resume it. + * + * The important idea about this function is that it can fail! If keys will + * still be swapped when the client is resumed, this key lookups will + * just block loading keys from disk. In practical terms this should only + * happen with SORT BY command or if there is a bug in this function. + * + * Return 1 if the client is marked as blocked, 0 if the client can + * continue as the keys it is going to access appear to be in memory. */ +int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) { + if (cmd->vm_preload_proc != NULL) { + cmd->vm_preload_proc(c,cmd,c->argc,c->argv); + } else { + waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv); + } + + /* If the client was blocked for at least one key, mark it as blocked. */ + if (listLength(c->io_keys)) { + c->flags |= REDIS_IO_WAIT; + aeDeleteFileEvent(server.el,c->fd,AE_READABLE); + server.vm_blocked_clients++; + return 1; + } else { + return 0; + } +} + +/* Remove the 'key' from the list of blocked keys for a given client. 
+ * + * The function returns 1 when there are no longer blocking keys after + * the current one was removed (and the client can be unblocked). */ +int dontWaitForSwappedKey(redisClient *c, robj *key) { + list *l; + listNode *ln; + listIter li; + struct dictEntry *de; + + /* Remove the key from the list of keys this client is waiting for. */ + listRewind(c->io_keys,&li); + while ((ln = listNext(&li)) != NULL) { + if (equalStringObjects(ln->value,key)) { + listDelNode(c->io_keys,ln); + break; + } + } + redisAssert(ln != NULL); + + /* Remove the client form the key => waiting clients map. */ + de = dictFind(c->db->io_keys,key); + redisAssert(de != NULL); + l = dictGetEntryVal(de); + ln = listSearchKey(l,c); + redisAssert(ln != NULL); + listDelNode(l,ln); + if (listLength(l) == 0) + dictDelete(c->db->io_keys,key); + + return listLength(c->io_keys) == 0; +} + +/* Every time we now a key was loaded back in memory, we handle clients + * waiting for this key if any. */ +void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) { + struct dictEntry *de; + list *l; + listNode *ln; + int len; + + de = dictFind(db->io_keys,key); + if (!de) return; + + l = dictGetEntryVal(de); + len = listLength(l); + /* Note: we can't use something like while(listLength(l)) as the list + * can be freed by the calling function when we remove the last element. */ + while (len--) { + ln = listFirst(l); + redisClient *c = ln->value; + + if (dontWaitForSwappedKey(c,key)) { + /* Put the client in the list of clients ready to go as we + * loaded all the keys about it. */ + listAddNodeTail(server.io_ready_clients,c); + } + } +} diff --git a/src/ziplist.c b/src/ziplist.c new file mode 100644 index 000000000..4b9d0fadc --- /dev/null +++ b/src/ziplist.c @@ -0,0 +1,959 @@ +/* Memory layout of a ziplist, containing "foo", "bar", "quux": + * "foo""bar""quux" + * + * is an unsigned integer to hold the number of bytes that + * the ziplist occupies. 
This is stored to not have to traverse the ziplist + * to know the new length when pushing. + * + * is the number of items in the ziplist. When this value is + * greater than 254, we need to traverse the entire list to know + * how many items it holds. + * + * is the number of bytes occupied by a single entry. When this + * number is greater than 253, the length will occupy 5 bytes, where + * the extra bytes contain an unsigned integer to hold the length. + */ + +#include +#include +#include +#include +#include +#include +#include "zmalloc.h" +#include "ziplist.h" + +/* Important note: the ZIP_END value is used to depict the end of the + * ziplist structure. When a pointer contains an entry, the first couple + * of bytes contain the encoded length of the previous entry. This length + * is encoded as ZIP_ENC_RAW length, so the first two bits will contain 00 + * and the byte will therefore never have a value of 255. */ +#define ZIP_END 255 +#define ZIP_BIGLEN 254 + +/* Entry encoding */ +#define ZIP_ENC_RAW 0 +#define ZIP_ENC_INT16 1 +#define ZIP_ENC_INT32 2 +#define ZIP_ENC_INT64 3 +#define ZIP_ENCODING(p) ((p)[0] >> 6) + +/* Length encoding for raw entries */ +#define ZIP_LEN_INLINE 0 +#define ZIP_LEN_UINT16 1 +#define ZIP_LEN_UINT32 2 + +/* Utility macros */ +#define ZIPLIST_BYTES(zl) (*((uint32_t*)(zl))) +#define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t)))) +#define ZIPLIST_LENGTH(zl) (*((uint16_t*)((zl)+sizeof(uint32_t)*2))) +#define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t)) +#define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE) +#define ZIPLIST_ENTRY_TAIL(zl) ((zl)+ZIPLIST_TAIL_OFFSET(zl)) +#define ZIPLIST_ENTRY_END(zl) ((zl)+ZIPLIST_BYTES(zl)-1) + +/* We know a positive increment can only be 1 because entries can only be + * pushed one at a time. 
*/ +#define ZIPLIST_INCR_LENGTH(zl,incr) { \ + if (ZIPLIST_LENGTH(zl) < UINT16_MAX) ZIPLIST_LENGTH(zl)+=incr; } + +typedef struct zlentry { + unsigned int prevrawlensize, prevrawlen; + unsigned int lensize, len; + unsigned int headersize; + unsigned char encoding; + unsigned char *p; +} zlentry; + +/* Return bytes needed to store integer encoded by 'encoding' */ +static unsigned int zipEncodingSize(unsigned char encoding) { + if (encoding == ZIP_ENC_INT16) { + return sizeof(int16_t); + } else if (encoding == ZIP_ENC_INT32) { + return sizeof(int32_t); + } else if (encoding == ZIP_ENC_INT64) { + return sizeof(int64_t); + } + assert(NULL); +} + +/* Decode the encoded length pointed by 'p'. If a pointer to 'lensize' is + * provided, it is set to the number of bytes required to encode the length. */ +static unsigned int zipDecodeLength(unsigned char *p, unsigned int *lensize) { + unsigned char encoding = ZIP_ENCODING(p), lenenc; + unsigned int len; + + if (encoding == ZIP_ENC_RAW) { + lenenc = (p[0] >> 4) & 0x3; + if (lenenc == ZIP_LEN_INLINE) { + len = p[0] & 0xf; + if (lensize) *lensize = 1; + } else if (lenenc == ZIP_LEN_UINT16) { + len = p[1] | (p[2] << 8); + if (lensize) *lensize = 3; + } else { + len = p[1] | (p[2] << 8) | (p[3] << 16) | (p[4] << 24); + if (lensize) *lensize = 5; + } + } else { + len = zipEncodingSize(encoding); + if (lensize) *lensize = 1; + } + return len; +} + +/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns + * the amount of bytes required to encode such a length. 
*/ +static unsigned int zipEncodeLength(unsigned char *p, char encoding, unsigned int rawlen) { + unsigned char len = 1, lenenc, buf[5]; + if (encoding == ZIP_ENC_RAW) { + if (rawlen <= 0xf) { + if (!p) return len; + lenenc = ZIP_LEN_INLINE; + buf[0] = rawlen; + } else if (rawlen <= 0xffff) { + len += 2; + if (!p) return len; + lenenc = ZIP_LEN_UINT16; + buf[1] = (rawlen ) & 0xff; + buf[2] = (rawlen >> 8) & 0xff; + } else { + len += 4; + if (!p) return len; + lenenc = ZIP_LEN_UINT32; + buf[1] = (rawlen ) & 0xff; + buf[2] = (rawlen >> 8) & 0xff; + buf[3] = (rawlen >> 16) & 0xff; + buf[4] = (rawlen >> 24) & 0xff; + } + buf[0] = (lenenc << 4) | (buf[0] & 0xf); + } + if (!p) return len; + + /* Apparently we need to store the length in 'p' */ + buf[0] = (encoding << 6) | (buf[0] & 0x3f); + memcpy(p,buf,len); + return len; +} + +/* Decode the length of the previous element stored at "p". */ +static unsigned int zipPrevDecodeLength(unsigned char *p, unsigned int *lensize) { + unsigned int len = *p; + if (len < ZIP_BIGLEN) { + if (lensize) *lensize = 1; + } else { + if (lensize) *lensize = 1+sizeof(len); + memcpy(&len,p+1,sizeof(len)); + } + return len; +} + +/* Encode the length of the previous entry and write it to "p". Return the + * number of bytes needed to encode this length if "p" is NULL. */ +static unsigned int zipPrevEncodeLength(unsigned char *p, unsigned int len) { + if (p == NULL) { + return (len < ZIP_BIGLEN) ? 1 : sizeof(len)+1; + } else { + if (len < ZIP_BIGLEN) { + p[0] = len; + return 1; + } else { + p[0] = ZIP_BIGLEN; + memcpy(p+1,&len,sizeof(len)); + return 1+sizeof(len); + } + } +} + +/* Return the difference in number of bytes needed to store the new length + * "len" on the entry pointed to by "p". 
*/ +static int zipPrevLenByteDiff(unsigned char *p, unsigned int len) { + unsigned int prevlensize; + zipPrevDecodeLength(p,&prevlensize); + return zipPrevEncodeLength(NULL,len)-prevlensize; +} + +/* Check if string pointed to by 'entry' can be encoded as an integer. + * Stores the integer value in 'v' and its encoding in 'encoding'. + * Warning: this function requires a NULL-terminated string! */ +static int zipTryEncoding(unsigned char *entry, long long *v, unsigned char *encoding) { + long long value; + char *eptr; + + if (entry[0] == '-' || (entry[0] >= '0' && entry[0] <= '9')) { + value = strtoll((char*)entry,&eptr,10); + if (eptr[0] != '\0') return 0; + if (value >= INT16_MIN && value <= INT16_MAX) { + *encoding = ZIP_ENC_INT16; + } else if (value >= INT32_MIN && value <= INT32_MAX) { + *encoding = ZIP_ENC_INT32; + } else { + *encoding = ZIP_ENC_INT64; + } + *v = value; + return 1; + } + return 0; +} + +/* Store integer 'value' at 'p', encoded as 'encoding' */ +static void zipSaveInteger(unsigned char *p, int64_t value, unsigned char encoding) { + int16_t i16; + int32_t i32; + int64_t i64; + if (encoding == ZIP_ENC_INT16) { + i16 = value; + memcpy(p,&i16,sizeof(i16)); + } else if (encoding == ZIP_ENC_INT32) { + i32 = value; + memcpy(p,&i32,sizeof(i32)); + } else if (encoding == ZIP_ENC_INT64) { + i64 = value; + memcpy(p,&i64,sizeof(i64)); + } else { + assert(NULL); + } +} + +/* Read integer encoded as 'encoding' from 'p' */ +static int64_t zipLoadInteger(unsigned char *p, unsigned char encoding) { + int16_t i16; + int32_t i32; + int64_t i64, ret; + if (encoding == ZIP_ENC_INT16) { + memcpy(&i16,p,sizeof(i16)); + ret = i16; + } else if (encoding == ZIP_ENC_INT32) { + memcpy(&i32,p,sizeof(i32)); + ret = i32; + } else if (encoding == ZIP_ENC_INT64) { + memcpy(&i64,p,sizeof(i64)); + ret = i64; + } else { + assert(NULL); + } + return ret; +} + +/* Return a struct with all information about an entry. 
*/ +static zlentry zipEntry(unsigned char *p) { + zlentry e; + e.prevrawlen = zipPrevDecodeLength(p,&e.prevrawlensize); + e.len = zipDecodeLength(p+e.prevrawlensize,&e.lensize); + e.headersize = e.prevrawlensize+e.lensize; + e.encoding = ZIP_ENCODING(p+e.prevrawlensize); + e.p = p; + return e; +} + +/* Return the total number of bytes used by the entry at "p". */ +static unsigned int zipRawEntryLength(unsigned char *p) { + zlentry e = zipEntry(p); + return e.headersize + e.len; +} + +/* Create a new empty ziplist. */ +unsigned char *ziplistNew(void) { + unsigned int bytes = ZIPLIST_HEADER_SIZE+1; + unsigned char *zl = zmalloc(bytes); + ZIPLIST_BYTES(zl) = bytes; + ZIPLIST_TAIL_OFFSET(zl) = ZIPLIST_HEADER_SIZE; + ZIPLIST_LENGTH(zl) = 0; + zl[bytes-1] = ZIP_END; + return zl; +} + +/* Resize the ziplist. */ +static unsigned char *ziplistResize(unsigned char *zl, unsigned int len) { + zl = zrealloc(zl,len); + ZIPLIST_BYTES(zl) = len; + zl[len-1] = ZIP_END; + return zl; +} + +/* Delete "num" entries, starting at "p". Returns pointer to the ziplist. */ +static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) { + unsigned int i, totlen, deleted = 0; + int nextdiff = 0; + zlentry first = zipEntry(p); + for (i = 0; p[0] != ZIP_END && i < num; i++) { + p += zipRawEntryLength(p); + deleted++; + } + + totlen = p-first.p; + if (totlen > 0) { + if (p[0] != ZIP_END) { + /* Tricky: storing the prevlen in this entry might reduce or + * increase the number of bytes needed, compared to the current + * prevlen. Note that we can always store this length because + * it was previously stored by an entry that is being deleted. 
*/ + nextdiff = zipPrevLenByteDiff(p,first.prevrawlen); + zipPrevEncodeLength(p-nextdiff,first.prevrawlen); + + /* Update offset for tail */ + ZIPLIST_TAIL_OFFSET(zl) -= totlen+nextdiff; + + /* Move tail to the front of the ziplist */ + memmove(first.p,p-nextdiff,ZIPLIST_BYTES(zl)-(p-zl)-1+nextdiff); + } else { + /* The entire tail was deleted. No need to move memory. */ + ZIPLIST_TAIL_OFFSET(zl) = (first.p-zl)-first.prevrawlen; + } + + /* Resize and update length */ + zl = ziplistResize(zl, ZIPLIST_BYTES(zl)-totlen+nextdiff); + ZIPLIST_INCR_LENGTH(zl,-deleted); + } + return zl; +} + +/* Insert item at "p". */ +static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { + unsigned int curlen = ZIPLIST_BYTES(zl), reqlen, prevlen = 0; + unsigned int offset, nextdiff = 0; + unsigned char *tail; + unsigned char encoding = ZIP_ENC_RAW; + long long value; + zlentry entry; + + /* Find out prevlen for the entry that is inserted. */ + if (p[0] != ZIP_END) { + entry = zipEntry(p); + prevlen = entry.prevrawlen; + } else { + tail = ZIPLIST_ENTRY_TAIL(zl); + if (tail[0] != ZIP_END) { + prevlen = zipRawEntryLength(tail); + } + } + + /* See if the entry can be encoded */ + if (zipTryEncoding(s,&value,&encoding)) { + reqlen = zipEncodingSize(encoding); + } else { + reqlen = slen; + } + + /* We need space for both the length of the previous entry and + * the length of the payload. */ + reqlen += zipPrevEncodeLength(NULL,prevlen); + reqlen += zipEncodeLength(NULL,encoding,slen); + + /* When the insert position is not equal to the tail, we need to + * make sure that the next entry can hold this entry's length in + * its prevlen field. */ + nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0; + + /* Store offset because a realloc may change the address of zl. */ + offset = p-zl; + zl = ziplistResize(zl,curlen+reqlen+nextdiff); + p = zl+offset; + + /* Apply memory move when necessary and update tail offset. 
*/ + if (p[0] != ZIP_END) { + /* Subtract one because of the ZIP_END bytes */ + memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff); + /* Encode this entry's raw length in the next entry. */ + zipPrevEncodeLength(p+reqlen,reqlen); + /* Update offset for tail */ + ZIPLIST_TAIL_OFFSET(zl) += reqlen+nextdiff; + } else { + /* This element will be the new tail. */ + ZIPLIST_TAIL_OFFSET(zl) = p-zl; + } + + /* Write the entry */ + p += zipPrevEncodeLength(p,prevlen); + p += zipEncodeLength(p,encoding,slen); + if (encoding != ZIP_ENC_RAW) { + zipSaveInteger(p,value,encoding); + } else { + memcpy(p,s,slen); + } + ZIPLIST_INCR_LENGTH(zl,1); + return zl; +} + +unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) { + unsigned char *p; + p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl); + return __ziplistInsert(zl,p,s,slen); +} + +/* Returns an offset to use for iterating with ziplistNext. When the given + * index is negative, the list is traversed back to front. When the list + * doesn't contain an element at the provided index, NULL is returned. */ +unsigned char *ziplistIndex(unsigned char *zl, int index) { + unsigned char *p; + zlentry entry; + if (index < 0) { + index = (-index)-1; + p = ZIPLIST_ENTRY_TAIL(zl); + if (p[0] != ZIP_END) { + entry = zipEntry(p); + while (entry.prevrawlen > 0 && index--) { + p -= entry.prevrawlen; + entry = zipEntry(p); + } + } + } else { + p = ZIPLIST_ENTRY_HEAD(zl); + while (p[0] != ZIP_END && index--) { + p += zipRawEntryLength(p); + } + } + return (p[0] == ZIP_END || index > 0) ? NULL : p; +} + +/* Return pointer to next entry in ziplist. */ +unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) { + ((void) zl); + + /* "p" could be equal to ZIP_END, caused by ziplistDelete, + * and we should return NULL. Otherwise, we should return NULL + * when the *next* element is ZIP_END (there is no next entry). 
*/ + if (p[0] == ZIP_END) { + return NULL; + } else { + p = p+zipRawEntryLength(p); + return (p[0] == ZIP_END) ? NULL : p; + } +} + +/* Return pointer to previous entry in ziplist. */ +unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) { + zlentry entry; + + /* Iterating backwards from ZIP_END should return the tail. When "p" is + * equal to the first element of the list, we're already at the head, + * and should return NULL. */ + if (p[0] == ZIP_END) { + p = ZIPLIST_ENTRY_TAIL(zl); + return (p[0] == ZIP_END) ? NULL : p; + } else if (p == ZIPLIST_ENTRY_HEAD(zl)) { + return NULL; + } else { + entry = zipEntry(p); + return p-entry.prevrawlen; + } +} + +/* Get entry pointer to by 'p' and store in either 'e' or 'v' depending + * on the encoding of the entry. 'e' is always set to NULL to be able + * to find out whether the string pointer or the integer value was set. + * Return 0 if 'p' points to the end of the zipmap, 1 otherwise. */ +unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *slen, long long *sval) { + zlentry entry; + if (p == NULL || p[0] == ZIP_END) return 0; + if (sstr) *sstr = NULL; + + entry = zipEntry(p); + if (entry.encoding == ZIP_ENC_RAW) { + if (sstr) { + *slen = entry.len; + *sstr = p+entry.headersize; + } + } else { + if (sval) { + *sval = zipLoadInteger(p+entry.headersize,entry.encoding); + } + } + return 1; +} + +/* Insert an entry at "p". */ +unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) { + return __ziplistInsert(zl,p,s,slen); +} + +/* Delete a single entry from the ziplist, pointed to by *p. + * Also update *p in place, to be able to iterate over the + * ziplist, while deleting entries. 
*/ +unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) { + unsigned int offset = *p-zl; + zl = __ziplistDelete(zl,*p,1); + + /* Store pointer to current element in p, because ziplistDelete will + * do a realloc which might result in a different "zl"-pointer. + * When the delete direction is back to front, we might delete the last + * entry and end up with "p" pointing to ZIP_END, so check this. */ + *p = zl+offset; + return zl; +} + +/* Delete a range of entries from the ziplist. */ +unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num) { + unsigned char *p = ziplistIndex(zl,index); + return (p == NULL) ? zl : __ziplistDelete(zl,p,num); +} + +/* Compare entry pointer to by 'p' with 'entry'. Return 1 if equal. */ +unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int slen) { + zlentry entry; + unsigned char sencoding; + long long zval, sval; + if (p[0] == ZIP_END) return 0; + + entry = zipEntry(p); + if (entry.encoding == ZIP_ENC_RAW) { + /* Raw compare */ + if (entry.len == slen) { + return memcmp(p+entry.headersize,sstr,slen) == 0; + } else { + return 0; + } + } else { + /* Try to compare encoded values */ + if (zipTryEncoding(sstr,&sval,&sencoding)) { + if (entry.encoding == sencoding) { + zval = zipLoadInteger(p+entry.headersize,entry.encoding); + return zval == sval; + } + } + } + return 0; +} + +/* Return length of ziplist. */ +unsigned int ziplistLen(unsigned char *zl) { + unsigned int len = 0; + if (ZIPLIST_LENGTH(zl) < UINT16_MAX) { + len = ZIPLIST_LENGTH(zl); + } else { + unsigned char *p = zl+ZIPLIST_HEADER_SIZE; + while (*p != ZIP_END) { + p += zipRawEntryLength(p); + len++; + } + + /* Re-store length if small enough */ + if (len < UINT16_MAX) ZIPLIST_LENGTH(zl) = len; + } + return len; +} + +/* Return size in bytes of ziplist. 
*/ +unsigned int ziplistSize(unsigned char *zl) { + return ZIPLIST_BYTES(zl); +} + +void ziplistRepr(unsigned char *zl) { + unsigned char *p; + zlentry entry; + + printf("{total bytes %d} {length %u}\n",ZIPLIST_BYTES(zl), ZIPLIST_LENGTH(zl)); + p = ZIPLIST_ENTRY_HEAD(zl); + while(*p != ZIP_END) { + entry = zipEntry(p); + printf("{offset %ld, header %u, payload %u} ",p-zl,entry.headersize,entry.len); + p += entry.headersize; + if (entry.encoding == ZIP_ENC_RAW) { + fwrite(p,entry.len,1,stdout); + } else { + printf("%lld", zipLoadInteger(p,entry.encoding)); + } + printf("\n"); + p += entry.len; + } + printf("{end}\n\n"); +} + +#ifdef ZIPLIST_TEST_MAIN +#include + +unsigned char *createList() { + unsigned char *zl = ziplistNew(); + zl = ziplistPush(zl, (unsigned char*)"foo", 3, ZIPLIST_TAIL); + zl = ziplistPush(zl, (unsigned char*)"quux", 4, ZIPLIST_TAIL); + zl = ziplistPush(zl, (unsigned char*)"hello", 5, ZIPLIST_HEAD); + zl = ziplistPush(zl, (unsigned char*)"1024", 4, ZIPLIST_TAIL); + return zl; +} + +unsigned char *createIntList() { + unsigned char *zl = ziplistNew(); + char buf[32]; + + sprintf(buf, "100"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "128000"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "-100"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); + sprintf(buf, "4294967296"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_HEAD); + sprintf(buf, "non integer"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + sprintf(buf, "much much longer non integer"); + zl = ziplistPush(zl, (unsigned char*)buf, strlen(buf), ZIPLIST_TAIL); + return zl; +} + +long long usec(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; +} + +void stress(int pos, int num, int maxsize, int dnum) { + int i,j,k; + unsigned char *zl; + char posstr[2][5] = { "HEAD", "TAIL" }; + 
long long start; + for (i = 0; i < maxsize; i+=dnum) { + zl = ziplistNew(); + for (j = 0; j < i; j++) { + zl = ziplistPush(zl,(unsigned char*)"quux",4,ZIPLIST_TAIL); + } + + /* Do num times a push+pop from pos */ + start = usec(); + for (k = 0; k < num; k++) { + zl = ziplistPush(zl,(unsigned char*)"quux",4,pos); + zl = ziplistDeleteRange(zl,0,1); + } + printf("List size: %8d, bytes: %8d, %dx push+pop (%s): %6lld usec\n", + i,ZIPLIST_BYTES(zl),num,posstr[pos],usec()-start); + zfree(zl); + } +} + +void pop(unsigned char *zl, int where) { + unsigned char *p, *vstr; + unsigned int vlen; + long long vlong; + + p = ziplistIndex(zl,where == ZIPLIST_HEAD ? 0 : -1); + if (ziplistGet(p,&vstr,&vlen,&vlong)) { + if (where == ZIPLIST_HEAD) + printf("Pop head: "); + else + printf("Pop tail: "); + + if (vstr) + fwrite(vstr,vlen,1,stdout); + else + printf("%lld", vlong); + + printf("\n"); + ziplistDeleteRange(zl,-1,1); + } else { + printf("ERROR: Could not pop\n"); + exit(1); + } +} + +int main(int argc, char **argv) { + unsigned char *zl, *p; + unsigned char *entry; + unsigned int elen; + long long value; + + zl = createIntList(); + ziplistRepr(zl); + + zl = createList(); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + pop(zl,ZIPLIST_HEAD); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + pop(zl,ZIPLIST_TAIL); + ziplistRepr(zl); + + printf("Get element at index 3:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 3); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index 3\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index 4 (out of range):\n"); + { + zl = createList(); + p = ziplistIndex(zl, 4); + if (p == NULL) { + printf("No entry\n"); + } else { + printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); + return 1; + } + printf("\n"); + } + + 
printf("Get element at index -1 (last element):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index -1\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index -4 (first element):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -4); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("ERROR: Could not access index -4\n"); + return 1; + } + if (entry) { + fwrite(entry,elen,1,stdout); + printf("\n"); + } else { + printf("%lld\n", value); + } + printf("\n"); + } + + printf("Get element at index -5 (reverse out of range):\n"); + { + zl = createList(); + p = ziplistIndex(zl, -5); + if (p == NULL) { + printf("No entry\n"); + } else { + printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl); + return 1; + } + printf("\n"); + } + + printf("Iterate list from 0 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 0); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate list from 1 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate list from 2 to end:\n"); + { + zl = createList(); + p = ziplistIndex(zl, 2); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate starting out of range:\n"); + { + zl = 
createList(); + p = ziplistIndex(zl, 4); + if (!ziplistGet(p, &entry, &elen, &value)) { + printf("No entry\n"); + } else { + printf("ERROR\n"); + } + printf("\n"); + } + + printf("Iterate from back to front:\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + p = ziplistPrev(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Iterate from back to front, deleting all items:\n"); + { + zl = createList(); + p = ziplistIndex(zl, -1); + while (ziplistGet(p, &entry, &elen, &value)) { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld", value); + } + zl = ziplistDelete(zl,&p); + p = ziplistPrev(zl,p); + printf("\n"); + } + printf("\n"); + } + + printf("Delete inclusive range 0,0:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 0, 1); + ziplistRepr(zl); + } + + printf("Delete inclusive range 0,1:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 0, 2); + ziplistRepr(zl); + } + + printf("Delete inclusive range 1,2:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 1, 2); + ziplistRepr(zl); + } + + printf("Delete with start index out of range:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 5, 1); + ziplistRepr(zl); + } + + printf("Delete with num overflow:\n"); + { + zl = createList(); + zl = ziplistDeleteRange(zl, 1, 5); + ziplistRepr(zl); + } + + printf("Delete foo while iterating:\n"); + { + zl = createList(); + p = ziplistIndex(zl,0); + while (ziplistGet(p,&entry,&elen,&value)) { + if (entry && strncmp("foo",(char*)entry,elen) == 0) { + printf("Delete foo\n"); + zl = ziplistDelete(zl,&p); + } else { + printf("Entry: "); + if (entry) { + fwrite(entry,elen,1,stdout); + } else { + printf("%lld",value); + } + p = ziplistNext(zl,p); + printf("\n"); + } + } + printf("\n"); + ziplistRepr(zl); + } + + printf("Create long list 
and check indices:\n"); + { + zl = ziplistNew(); + char buf[32]; + int i,len; + for (i = 0; i < 1000; i++) { + len = sprintf(buf,"%d",i); + zl = ziplistPush(zl,(unsigned char*)buf,len,ZIPLIST_TAIL); + } + for (i = 0; i < 1000; i++) { + p = ziplistIndex(zl,i); + assert(ziplistGet(p,NULL,NULL,&value)); + assert(i == value); + + p = ziplistIndex(zl,-i-1); + assert(ziplistGet(p,NULL,NULL,&value)); + assert(999-i == value); + } + printf("SUCCESS\n\n"); + } + + printf("Compare strings with ziplist entries:\n"); + { + zl = createList(); + p = ziplistIndex(zl,0); + if (!ziplistCompare(p,(unsigned char*)"hello",5)) { + printf("ERROR: not \"hello\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"hella",5)) { + printf("ERROR: \"hella\"\n"); + return 1; + } + + p = ziplistIndex(zl,3); + if (!ziplistCompare(p,(unsigned char*)"1024",4)) { + printf("ERROR: not \"1024\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"1025",4)) { + printf("ERROR: \"1025\"\n"); + return 1; + } + printf("SUCCESS\n"); + } + + printf("Stress with variable ziplist size:\n"); + { + stress(ZIPLIST_HEAD,100000,16384,256); + stress(ZIPLIST_TAIL,100000,16384,256); + } + + return 0; +} + +#endif diff --git a/src/ziplist.h b/src/ziplist.h new file mode 100644 index 000000000..311257256 --- /dev/null +++ b/src/ziplist.h @@ -0,0 +1,15 @@ +#define ZIPLIST_HEAD 0 +#define ZIPLIST_TAIL 1 + +unsigned char *ziplistNew(void); +unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where); +unsigned char *ziplistIndex(unsigned char *zl, int index); +unsigned char *ziplistNext(unsigned char *zl, unsigned char *p); +unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p); +unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval); +unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen); +unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p); +unsigned 
char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num); +unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen); +unsigned int ziplistLen(unsigned char *zl); +unsigned int ziplistSize(unsigned char *zl); diff --git a/src/zipmap.c b/src/zipmap.c new file mode 100644 index 000000000..35faeabef --- /dev/null +++ b/src/zipmap.c @@ -0,0 +1,455 @@ +/* String -> String Map data structure optimized for size. + * This file implements a data structure mapping strings to other strings + * implementing an O(n) lookup data structure designed to be very memory + * efficient. + * + * The Redis Hash type uses this data structure for hashes composed of a small + * number of elements, to switch to an hash table once a given number of + * elements is reached. + * + * Given that many times Redis Hashes are used to represent objects composed + * of few fields, this is a very big win in terms of used memory. + * + * -------------------------------------------------------------------------- + * + * Copyright (c) 2009-2010, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Memory layout of a zipmap, for the map "foo" => "bar", "hello" => "world": + * + * <zmlen><len>"foo"<len><free>"bar"<len>"hello"<len><free>"world" + * + * <zmlen> is 1 byte length that holds the current size of the zipmap. + * When the zipmap length is greater than or equal to 254, this value + * is not used and the zipmap needs to be traversed to find out the length. + * + * <len> is the length of the following string (key or value). + * <len> lengths are encoded in a single value or in a 5 bytes value. + * If the first byte value (as an unsigned 8 bit value) is between 0 and + * 252, it's a single-byte length. If it is 253 then a four bytes unsigned + * integer follows (in the host byte ordering). A value of 255 is used to + * signal the end of the hash. The special value 254 is used to mark + * empty space that can be used to add new key/value pairs. + * + * <free> is the number of free unused bytes + * after the string, resulting from modification of values associated to a + * key (for instance if "foo" is set to "bar", and later "foo" will be set to + * "hi", I'll have a free byte to use if the value will enlarge again later, + * or even in order to add a key/value pair if it fits. 
+ * + * is always an unsigned 8 bit number, because if after an + * update operation there are more than a few free bytes, the zipmap will be + * reallocated to make sure it is as small as possible. + * + * The most compact representation of the above two elements hash is actually: + * + * "\x02\x03foo\x03\x00bar\x05hello\x05\x00world\xff" + * + * Note that because keys and values are prefixed length "objects", + * the lookup will take O(N) where N is the number of elements + * in the zipmap and *not* the number of bytes needed to represent the zipmap. + * This lowers the constant times considerably. + */ + +#include +#include +#include +#include "zmalloc.h" + +#define ZIPMAP_BIGLEN 254 +#define ZIPMAP_END 255 + +/* The following defines the max value for the field described in the + * comments above, that is, the max number of trailing bytes in a value. */ +#define ZIPMAP_VALUE_MAX_FREE 4 + +/* The following macro returns the number of bytes needed to encode the length + * for the integer value _l, that is, 1 byte for lengths < ZIPMAP_BIGLEN and + * 5 bytes for all the other lengths. */ +#define ZIPMAP_LEN_BYTES(_l) (((_l) < ZIPMAP_BIGLEN) ? 1 : sizeof(unsigned int)+1) + +/* Create a new empty zipmap. */ +unsigned char *zipmapNew(void) { + unsigned char *zm = zmalloc(2); + + zm[0] = 0; /* Length */ + zm[1] = ZIPMAP_END; + return zm; +} + +/* Decode the encoded length pointed by 'p' */ +static unsigned int zipmapDecodeLength(unsigned char *p) { + unsigned int len = *p; + + if (len < ZIPMAP_BIGLEN) return len; + memcpy(&len,p+1,sizeof(unsigned int)); + return len; +} + +/* Encode the length 'l' writing it in 'p'. If p is NULL it just returns + * the amount of bytes required to encode such a length. 
*/ +static unsigned int zipmapEncodeLength(unsigned char *p, unsigned int len) { + if (p == NULL) { + return ZIPMAP_LEN_BYTES(len); + } else { + if (len < ZIPMAP_BIGLEN) { + p[0] = len; + return 1; + } else { + p[0] = ZIPMAP_BIGLEN; + memcpy(p+1,&len,sizeof(len)); + return 1+sizeof(len); + } + } +} + +/* Search for a matching key, returning a pointer to the entry inside the + * zipmap. Returns NULL if the key is not found. + * + * If NULL is returned, and totlen is not NULL, it is set to the entire + * size of the zimap, so that the calling function will be able to + * reallocate the original zipmap to make room for more entries. */ +static unsigned char *zipmapLookupRaw(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned int *totlen) { + unsigned char *p = zm+1, *k = NULL; + unsigned int l,llen; + + while(*p != ZIPMAP_END) { + unsigned char free; + + /* Match or skip the key */ + l = zipmapDecodeLength(p); + llen = zipmapEncodeLength(NULL,l); + if (k == NULL && l == klen && !memcmp(p+llen,key,l)) { + /* Only return when the user doesn't care + * for the total length of the zipmap. 
*/ + if (totlen != NULL) { + k = p; + } else { + return p; + } + } + p += llen+l; + /* Skip the value as well */ + l = zipmapDecodeLength(p); + p += zipmapEncodeLength(NULL,l); + free = p[0]; + p += l+1+free; /* +1 to skip the free byte */ + } + if (totlen != NULL) *totlen = (unsigned int)(p-zm)+1; + return k; +} + +static unsigned long zipmapRequiredLength(unsigned int klen, unsigned int vlen) { + unsigned int l; + + l = klen+vlen+3; + if (klen >= ZIPMAP_BIGLEN) l += 4; + if (vlen >= ZIPMAP_BIGLEN) l += 4; + return l; +} + +/* Return the total amount used by a key (encoded length + payload) */ +static unsigned int zipmapRawKeyLength(unsigned char *p) { + unsigned int l = zipmapDecodeLength(p); + return zipmapEncodeLength(NULL,l) + l; +} + +/* Return the total amount used by a value + * (encoded length + single byte free count + payload) */ +static unsigned int zipmapRawValueLength(unsigned char *p) { + unsigned int l = zipmapDecodeLength(p); + unsigned int used; + + used = zipmapEncodeLength(NULL,l); + used += p[used] + 1 + l; + return used; +} + +/* If 'p' points to a key, this function returns the total amount of + * bytes used to store this entry (entry = key + associated value + trailing + * free space if any). */ +static unsigned int zipmapRawEntryLength(unsigned char *p) { + unsigned int l = zipmapRawKeyLength(p); + return l + zipmapRawValueLength(p+l); +} + +static inline unsigned char *zipmapResize(unsigned char *zm, unsigned int len) { + zm = zrealloc(zm, len); + zm[len-1] = ZIPMAP_END; + return zm; +} + +/* Set key to value, creating the key if it does not already exist. + * If 'update' is not NULL, *update is set to 1 if the key was + * already preset, otherwise to 0. 
+ *
+ * The zipmap may be resized by this call: the function returns the zipmap
+ * pointer to use from now on, and the caller must replace its own pointer
+ * with the returned one. */
+unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update) {
+    unsigned int zmlen, offset;
+    unsigned int freelen, reqlen = zipmapRequiredLength(klen,vlen);
+    unsigned int empty, vempty;
+    unsigned char *p;
+
+    freelen = reqlen;
+    if (update) *update = 0;
+    p = zipmapLookupRaw(zm,key,klen,&zmlen);
+    if (p == NULL) {
+        /* Key not found: enlarge */
+        zm = zipmapResize(zm, zmlen+reqlen);
+        /* The new entry starts where the old end-of-zipmap byte was. */
+        p = zm+zmlen-1;
+        zmlen = zmlen+reqlen;
+
+        /* Increase zipmap length (this is an insert) */
+        if (zm[0] < ZIPMAP_BIGLEN) zm[0]++;
+    } else {
+        /* Key found. Is there enough space for the new value? */
+        /* Compute the total length: */
+        if (update) *update = 1;
+        freelen = zipmapRawEntryLength(p);
+        if (freelen < reqlen) {
+            /* Store the offset of this key within the current zipmap, so
+             * it can be resized. Then, move the tail backwards so this
+             * pair fits at the current position. */
+            offset = p-zm;
+            zm = zipmapResize(zm, zmlen-freelen+reqlen);
+            p = zm+offset;
+
+            /* The +1 in the number of bytes to be moved is caused by the
+             * end-of-zipmap byte. Note: the *original* zmlen is used. */
+            memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1));
+            zmlen = zmlen-freelen+reqlen;
+            freelen = reqlen;
+        }
+    }
+
+    /* We now have a suitable block where the key/value entry can
+     * be written. If there is too much free space, move the tail
+     * of the zipmap a few bytes to the front and shrink the zipmap,
+     * as we want zipmaps to be very space efficient. */
+    empty = freelen-reqlen;
+    if (empty >= ZIPMAP_VALUE_MAX_FREE) {
+        /* First, move the tail bytes to the front, then resize
+         * the zipmap to be 'empty' bytes smaller. */
+        offset = p-zm;
+        memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1));
+        zmlen -= empty;
+        zm = zipmapResize(zm, zmlen);
+        p = zm+offset;
+        vempty = 0;
+    } else {
+        vempty = empty;
+    }
+
+    /* Just write the key + value and we are done. */
+    /* Key: */
+    p += zipmapEncodeLength(p,klen);
+    memcpy(p,key,klen);
+    p += klen;
+    /* Value: */
+    p += zipmapEncodeLength(p,vlen);
+    *p++ = vempty;
+    memcpy(p,val,vlen);
+    return zm;
+}
+
+/* Remove the specified key. If 'deleted' is not NULL the pointed integer is
+ * set to 0 if the key was not found, to 1 if it was found and deleted.
+ * The zipmap is resized when the key is found: the function returns the
+ * zipmap pointer the caller must use from now on. */
+unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted) {
+    unsigned int zmlen, freelen;
+    unsigned char *p = zipmapLookupRaw(zm,key,klen,&zmlen);
+    if (p) {
+        freelen = zipmapRawEntryLength(p);
+        /* Move the tail over the removed entry. The +1 excludes the
+         * end-of-zipmap byte, which zipmapResize() writes again. */
+        memmove(p, p+freelen, zmlen-((p-zm)+freelen+1));
+        zm = zipmapResize(zm, zmlen-freelen);
+
+        /* Decrease zipmap length */
+        if (zm[0] < ZIPMAP_BIGLEN) zm[0]--;
+
+        if (deleted) *deleted = 1;
+    } else {
+        if (deleted) *deleted = 0;
+    }
+    return zm;
+}
+
+/* Call this before iterating through the elements via zipmapNext().
+ * It returns zm+1, that is, the position right after the first byte of
+ * the zipmap. */
+unsigned char *zipmapRewind(unsigned char *zm) {
+    return zm+1;
+}
+
+/* This function is used to iterate through all the zipmap elements.
+ * In the first call the first argument is the pointer to the zipmap + 1.
+ * In the next calls what zipmapNext returns is used as first argument.
+ * NULL is returned when the end of the zipmap is reached.
+ *
+ * 'key' and 'value' may be NULL when the caller is not interested in them.
+ * When 'key' is not NULL 'klen' must not be NULL either, and the same holds
+ * for 'value' and 'vlen', as the lengths are stored unconditionally.
+ * The returned key and value pointers point inside the zipmap itself.
+ *
+ * Example:
+ *
+ * unsigned char *i = zipmapRewind(my_zipmap);
+ * while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) {
+ *     printf("%d bytes key at %p\n", klen, key);
+ *     printf("%d bytes value at %p\n", vlen, value);
+ * }
+ */
+unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen) {
+    if (zm[0] == ZIPMAP_END) return NULL;
+    if (key) {
+        *key = zm;
+        *klen = zipmapDecodeLength(zm);
+        *key += ZIPMAP_LEN_BYTES(*klen);
+    }
+    zm += zipmapRawKeyLength(zm);
+    if (value) {
+        /* +1 skips the free count byte stored after the encoded length. */
+        *value = zm+1;
+        *vlen = zipmapDecodeLength(zm);
+        *value += ZIPMAP_LEN_BYTES(*vlen);
+    }
+    zm += zipmapRawValueLength(zm);
+    return zm;
+}
+
+/* Search a key and retrieve the pointer and len of the associated value.
+ * If the key is found the function returns 1, otherwise 0.
+ * On success *value points to the value inside the zipmap (nothing is
+ * copied) and *vlen is set to its length. When the key is not found
+ * 'value' and 'vlen' are not written. Both pointers must be non NULL. */
+int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen) {
+    unsigned char *p;
+
+    if ((p = zipmapLookupRaw(zm,key,klen,NULL)) == NULL) return 0;
+    p += zipmapRawKeyLength(p);
+    *vlen = zipmapDecodeLength(p);
+    /* Skip the encoded length and the free count byte (+1). */
+    *value = p + ZIPMAP_LEN_BYTES(*vlen) + 1;
+    return 1;
+}
+
+/* Return 1 if the key exists, otherwise 0 is returned. */
+int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen) {
+    return zipmapLookupRaw(zm,key,klen,NULL) != NULL;
+}
+
+/* Return the number of entries inside a zipmap.
+ * When the first byte of the zipmap is less than ZIPMAP_BIGLEN it is the
+ * number of entries. Otherwise the entries are counted by iterating over
+ * the whole zipmap. */
+unsigned int zipmapLen(unsigned char *zm) {
+    unsigned int len = 0;
+    if (zm[0] < ZIPMAP_BIGLEN) {
+        len = zm[0];
+    } else {
+        unsigned char *p = zipmapRewind(zm);
+        while((p = zipmapNext(p,NULL,NULL,NULL,NULL)) != NULL) len++;
+
+        /* Re-store length if small enough */
+        if (len < ZIPMAP_BIGLEN) zm[0] = len;
+    }
+    return len;
+}
+
+/* Print a representation of the zipmap pointed by 'p' on standard output:
+ * the first byte, then for every entry the key length and payload followed
+ * by the value length and payload. The free bytes after a value, if any,
+ * are shown as dots between square brackets. */
+void zipmapRepr(unsigned char *p) {
+    unsigned int l;
+
+    printf("{status %u}",*p++);
+    while(1) {
+        if (p[0] == ZIPMAP_END) {
+            printf("{end}");
+            break;
+        } else {
+            unsigned char e;
+
+            l = zipmapDecodeLength(p);
+            printf("{key %u}",l);
+            p += zipmapEncodeLength(NULL,l);
+            fwrite(p,l,1,stdout);
+            p += l;
+
+            l = zipmapDecodeLength(p);
+            printf("{value %u}",l);
+            p += zipmapEncodeLength(NULL,l);
+            /* 'e' is the free count byte of this value. */
+            e = *p++;
+            fwrite(p,l,1,stdout);
+            p += l+e;
+            if (e) {
+                printf("[");
+                while(e--) printf(".");
+                printf("]");
+            }
+        }
+    }
+    printf("\n");
+}
+
+/* Test program, only compiled when ZIPMAP_TEST_MAIN is defined. It sets,
+ * updates and deletes a few keys, printing the zipmap after each step, then
+ * performs lookups and iterates over all the entries. */
+#ifdef ZIPMAP_TEST_MAIN
+int main(void) {
+    unsigned char *zm;
+
+    zm = zipmapNew();
+
+    zm = zipmapSet(zm,(unsigned char*) "name",4, (unsigned char*) "foo",3,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "surname",7, (unsigned char*) "foo",3,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "age",3, (unsigned char*) "foo",3,NULL);
+    zipmapRepr(zm);
+
+    zm = zipmapSet(zm,(unsigned char*) "hello",5, (unsigned char*) "world!",6,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "bar",3,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "!",1,NULL);
+    zipmapRepr(zm);
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "12345",5,NULL);
+    zipmapRepr(zm);
+    zm = zipmapSet(zm,(unsigned char*) "new",3, (unsigned char*) "xx",2,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "noval",5, (unsigned char*) "",0,NULL);
+    zipmapRepr(zm);
+    zm = zipmapDel(zm,(unsigned char*) "new",3,NULL);
+    zipmapRepr(zm);
+
+    printf("\nLook up large key:\n");
+    {
+        unsigned char buf[512];
+        unsigned char *value;
+        unsigned int vlen, i;
+        for (i = 0; i < 512; i++) buf[i] = 'a';
+
+        zm = zipmapSet(zm,buf,512,(unsigned char*) "long",4,NULL);
+        if (zipmapGet(zm,buf,512,&value,&vlen)) {
+            printf(" is associated to the %d bytes value: %.*s\n",
+                vlen, vlen, value);
+        }
+    }
+
+    printf("\nPerform a direct lookup:\n");
+    {
+        unsigned char *value;
+        unsigned int vlen;
+
+        if (zipmapGet(zm,(unsigned char*) "foo",3,&value,&vlen)) {
+            printf(" foo is associated to the %d bytes value: %.*s\n",
+                vlen, vlen, value);
+        }
+    }
+    printf("\nIterate trought elements:\n");
+    {
+        unsigned char *i = zipmapRewind(zm);
+        unsigned char *key, *value;
+        unsigned int klen, vlen;
+
+        while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) {
+            printf(" %d:%.*s => %d:%.*s\n", klen, klen, key, vlen, vlen, value);
+        }
+    }
+    return 0;
+}
+#endif
diff --git a/src/zipmap.h b/src/zipmap.h
new file mode 100644
index 000000000..e5f6c9f28
--- /dev/null
+++ b/src/zipmap.h
@@ -0,0 +1,48 @@
+/* String -> String Map data structure optimized for size.
+ *
+ * See zipmap.c for more info.
+ *
+ * --------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/* Include guard: the macro tested here must be the same one defined on the
+ * next line, otherwise the guard never takes effect. */
+#ifndef _ZIPMAP_H
+#define _ZIPMAP_H
+
+/* Creation. NOTE(review): zipmapNew() presumably returns a new empty
+ * zipmap -- see its definition in zipmap.c. */
+unsigned char *zipmapNew(void);
+
+/* Modification. Both functions may resize the zipmap and return the pointer
+ * that the caller must use from now on in place of 'zm'. */
+unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update);
+unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted);
+
+/* Iteration: call zipmapRewind() once, then zipmapNext() until it returns
+ * NULL. */
+unsigned char *zipmapRewind(unsigned char *zm);
+unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen);
+
+/* Lookup: zipmapGet() and zipmapExists() return 1 if the key is found,
+ * otherwise 0. zipmapLen() returns the number of entries. */
+int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen);
+int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen);
+unsigned int zipmapLen(unsigned char *zm);
+
+/* Debugging: print a representation of the zipmap on standard output. */
+void zipmapRepr(unsigned char *p);
+
+#endif
diff --git a/src/zmalloc.c b/src/zmalloc.c
new file mode 100644
index 000000000..8658376a3
--- /dev/null
+++ b/src/zmalloc.c
@@ -0,0 +1,158 @@
+/* zmalloc - total amount of allocated memory aware version of malloc()
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "config.h"
+
+/* Size of the header stored in front of every allocation when
+ * HAVE_MALLOC_SIZE is not defined: it holds the size requested by the
+ * caller, so that zrealloc() and zfree() can update the used memory count. */
+#if defined(__sun)
+#define PREFIX_SIZE sizeof(long long)
+#else
+#define PREFIX_SIZE sizeof(size_t)
+#endif
+
+/* Add '__n', rounded up to a multiple of sizeof(long), to used_memory.
+ * The mutex is taken only when thread safeness was enabled. */
+#define increment_used_memory(__n) do { \
+    size_t _n = (__n); \
+    if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \
+    if (zmalloc_thread_safe) { \
+        pthread_mutex_lock(&used_memory_mutex); \
+        used_memory += _n; \
+        pthread_mutex_unlock(&used_memory_mutex); \
+    } else { \
+        used_memory += _n; \
+    } \
+} while(0)
+
+/* Subtract '__n', rounded up to a multiple of sizeof(long), from
+ * used_memory. Same locking rule as increment_used_memory(). */
+#define decrement_used_memory(__n) do { \
+    size_t _n = (__n); \
+    if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \
+    if (zmalloc_thread_safe) { \
+        pthread_mutex_lock(&used_memory_mutex); \
+        used_memory -= _n; \
+        pthread_mutex_unlock(&used_memory_mutex); \
+    } else { \
+        used_memory -= _n; \
+    } \
+} while(0)
+
+static size_t used_memory = 0;
+static int zmalloc_thread_safe = 0;
+pthread_mutex_t used_memory_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Report the failed allocation on stderr and abort the process. */
+static void zmalloc_oom(size_t size) {
+    fprintf(stderr, "zmalloc: Out of memory trying to allocate %zu bytes\n",
+        size);
+    fflush(stderr);
+    abort();
+}
+
+/* Allocate 'size' bytes and account for them in used_memory. The process is
+ * aborted via zmalloc_oom() if the allocation fails, so NULL is never
+ * returned.
+ * NOTE(review): PREFIX_SIZE extra bytes are requested even when
+ * HAVE_MALLOC_SIZE is defined and no header is written, while zrealloc()
+ * asks for 'size' bytes only in that case. */
+void *zmalloc(size_t size) {
+    void *ptr = malloc(size+PREFIX_SIZE);
+
+    if (!ptr) zmalloc_oom(size);
+#ifdef HAVE_MALLOC_SIZE
+    increment_used_memory(redis_malloc_size(ptr));
+    return ptr;
+#else
+    /* Store the requested size in the header and return the memory after it */
+    *((size_t*)ptr) = size;
+    increment_used_memory(size+PREFIX_SIZE);
+    return (char*)ptr+PREFIX_SIZE;
+#endif
+}
+
+/* Resize the allocation pointed by 'ptr' to 'size' bytes, updating
+ * used_memory. A NULL 'ptr' makes the call equivalent to zmalloc(size).
+ * The process is aborted via zmalloc_oom() if the reallocation fails. */
+void *zrealloc(void *ptr, size_t size) {
+#ifndef HAVE_MALLOC_SIZE
+    void *realptr;
+#endif
+    size_t oldsize;
+    void *newptr;
+
+    if (ptr == NULL) return zmalloc(size);
+#ifdef HAVE_MALLOC_SIZE
+    oldsize = redis_malloc_size(ptr);
+    newptr = realloc(ptr,size);
+    if (!newptr) zmalloc_oom(size);
+
+    decrement_used_memory(oldsize);
+    increment_used_memory(redis_malloc_size(newptr));
+    return newptr;
+#else
+    realptr = (char*)ptr-PREFIX_SIZE;
+    oldsize = *((size_t*)realptr);
+    newptr = realloc(realptr,size+PREFIX_SIZE);
+    if (!newptr) zmalloc_oom(size);
+
+    *((size_t*)newptr) = size;
+    /* The header is part of both the old and the new allocation, so it is
+     * left out of both terms. */
+    decrement_used_memory(oldsize);
+    increment_used_memory(size);
+    return (char*)newptr+PREFIX_SIZE;
+#endif
+}
+
+/* Free memory obtained from zmalloc(), zrealloc() or zstrdup() and subtract
+ * it from used_memory. A NULL 'ptr' is ignored. */
+void zfree(void *ptr) {
+#ifndef HAVE_MALLOC_SIZE
+    void *realptr;
+    size_t oldsize;
+#endif
+
+    if (ptr == NULL) return;
+#ifdef HAVE_MALLOC_SIZE
+    decrement_used_memory(redis_malloc_size(ptr));
+    free(ptr);
+#else
+    realptr = (char*)ptr-PREFIX_SIZE;
+    oldsize = *((size_t*)realptr);
+    decrement_used_memory(oldsize+PREFIX_SIZE);
+    free(realptr);
+#endif
+}
+
+/* Return a copy of the null terminated string 's' allocated with zmalloc().
+ * The copy must be released with zfree(). */
+char *zstrdup(const char *s) {
+    size_t l = strlen(s)+1;
+    char *p = zmalloc(l);
+
+    memcpy(p,s,l);
+    return p;
+}
+
+/* Return the current value of used_memory, reading it under the mutex when
+ * thread safeness was enabled. */
+size_t zmalloc_used_memory(void) {
+    size_t um;
+
+    if (zmalloc_thread_safe) pthread_mutex_lock(&used_memory_mutex);
+    um = used_memory;
+    if (zmalloc_thread_safe) pthread_mutex_unlock(&used_memory_mutex);
+    return um;
+}
+
+/* From now on protect every access to used_memory with the mutex. */
+void zmalloc_enable_thread_safeness(void) {
+    zmalloc_thread_safe = 1;
+}
diff --git a/src/zmalloc.h b/src/zmalloc.h
new file mode 100644
index 000000000..193e7eda5
--- /dev/null
+++ b/src/zmalloc.h
@@ -0,0 +1,41 @@
+/* zmalloc - total amount of allocated memory aware version of malloc()
+ *
+ * Copyright (c) 2009-2010,
Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ZMALLOC_H
+#define _ZMALLOC_H
+
+/* Allocation functions that keep track of the total amount of memory
+ * allocated. NOTE(review): size_t is used below but no header declaring it
+ * is included here -- presumably includers pull in <stddef.h> or
+ * <stdlib.h> first; confirm. */
+void *zmalloc(size_t size);
+void *zrealloc(void *ptr, size_t size);
+void zfree(void *ptr);
+char *zstrdup(const char *s);
+
+/* Accounting: read the tracked total, and turn on locking around it. */
+size_t zmalloc_used_memory(void);
+void zmalloc_enable_thread_safeness(void);
+
+#endif /* _ZMALLOC_H */
-- cgit v1.2.1