diff options
92 files changed, 6015 insertions, 1345 deletions
diff --git a/.gitignore b/.gitignore index d3b1c2f24..3d346fbcf 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ dump.rdb redis-benchmark redis-check-aof +redis-check-rdb redis-check-dump redis-cli redis-sentinel diff --git a/CONTRIBUTING b/CONTRIBUTING index f7b6836f7..b416b9561 100644 --- a/CONTRIBUTING +++ b/CONTRIBUTING @@ -20,7 +20,7 @@ each source file that you contribute. # How to provide a patch for a new feature -1. Drop a message to the Redis Google Group with a proposal of semantics/API. +1. If it is a major feature or a semantical change, write an RCP (Redis Change Proposal). Check the documentation here: https://github.com/redis/redis-rcp 2. If in step 1 you get an acknowledge from the project leaders, use the following procedure to submit a patch: @@ -31,4 +31,6 @@ each source file that you contribute. d. Initiate a pull request on github ( http://help.github.com/send-pull-requests/ ) e. Done :) +For minor fixes just open a pull request on Github. + Thanks! @@ -26,6 +26,25 @@ After building Redis is a good idea to test it, using: % make test +Fixing build problems with dependencies or cached build options +—-------- +Redis has some dependencies which are included into the "deps" directory. +"make" does not rebuild dependencies automatically, even if something in the +source code of dependencies is changes. + +When you update the source code with `git pull` or when code inside the +dependencies tree is modified in any other way, make sure to use the following +command in order to really clean everything and rebuild from scratch: + + make distclean + +This will clean: jemalloc, lua, hiredis, linenoise. + +Also if you force certain build options like 32bit target, no C compiler +optimizations (for debugging purposes), and other similar build time options, +those options are cached indefinitely until you issue a "make distclean" +command. 
+ Fixing problems building 32 bit binaries --------- diff --git a/deps/Makefile b/deps/Makefile index 1f623ea7b..71f6d3a2c 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -58,7 +58,7 @@ ifeq ($(uname_S),SunOS) LUA_CFLAGS= -D__C99FEATURES__=1 endif -LUA_CFLAGS+= -O2 -Wall -DLUA_ANSI -DENABLE_CJSON_GLOBAL $(CFLAGS) +LUA_CFLAGS+= -O2 -Wall -DLUA_ANSI -DENABLE_CJSON_GLOBAL -DREDIS_STATIC='' $(CFLAGS) LUA_LDFLAGS+= $(LDFLAGS) # lua's Makefile defines AR="ar rcu", which is unusual, and makes it more # challenging to cross-compile lua (and redis). These defines make it easier diff --git a/deps/lua/src/lua_cmsgpack.c b/deps/lua/src/lua_cmsgpack.c index 32b9a0bc3..e13f053d2 100644 --- a/deps/lua/src/lua_cmsgpack.c +++ b/deps/lua/src/lua_cmsgpack.c @@ -13,26 +13,27 @@ #define LUACMSGPACK_COPYRIGHT "Copyright (C) 2012, Salvatore Sanfilippo" #define LUACMSGPACK_DESCRIPTION "MessagePack C implementation for Lua" -#define LUACMSGPACK_MAX_NESTING 16 /* Max tables nesting. */ - /* Allows a preprocessor directive to override MAX_NESTING */ #ifndef LUACMSGPACK_MAX_NESTING - #define LUACMSGPACK_MAX_NESTING 16 -#endif - -#if (_XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L) - #define IS_FINITE(x) isfinite(x) -#else - #define IS_FINITE(x) ((x) == (x) && (x) + 1 > (x)) + #define LUACMSGPACK_MAX_NESTING 16 /* Max tables nesting. */ #endif /* Check if float or double can be an integer without loss of precision */ -#define IS_INT_TYPE_EQUIVALENT(x, T) (IS_FINITE(x) && (T)(x) == (x)) +#define IS_INT_TYPE_EQUIVALENT(x, T) (!isinf(x) && (T)(x) == (x)) #define IS_INT64_EQUIVALENT(x) IS_INT_TYPE_EQUIVALENT(x, int64_t) #define IS_INT_EQUIVALENT(x) IS_INT_TYPE_EQUIVALENT(x, int) -#if LUA_VERSION_NUM < 503 +/* If size of pointer is equal to a 4 byte integer, we're on 32 bits. 
*/ +#if UINTPTR_MAX == UINT_MAX + #define BITS_32 1 +#else + #define BITS_32 0 +#endif + +#if BITS_32 + #define lua_pushunsigned(L, n) lua_pushnumber(L, n) +#else #define lua_pushunsigned(L, n) lua_pushinteger(L, n) #endif @@ -72,7 +73,7 @@ static void memrevifle(void *ptr, size_t len) { int test = 1; unsigned char *testp = (unsigned char*) &test; - if (testp[0] == 0) return; /* Big endian, nothign to do. */ + if (testp[0] == 0) return; /* Big endian, nothing to do. */ len /= 2; while(len--) { aux = *p; @@ -84,7 +85,7 @@ static void memrevifle(void *ptr, size_t len) { } /* ---------------------------- String buffer ---------------------------------- - * This is a simple implementation of string buffers. The only opereation + * This is a simple implementation of string buffers. The only operation * supported is creating empty buffers and appending bytes to it. * The string buffer uses 2x preallocation on every realloc for O(N) append * behavior. */ @@ -108,7 +109,7 @@ static mp_buf *mp_buf_new(lua_State *L) { mp_buf *buf = NULL; /* Old size = 0; new size = sizeof(*buf) */ - buf = (mp_buf*)mp_realloc(L, buf, 0, sizeof(*buf)); + buf = (mp_buf*)mp_realloc(L, NULL, 0, sizeof(*buf)); buf->L = L; buf->b = NULL; @@ -143,7 +144,7 @@ void mp_buf_free(mp_buf *buf) { * be used to report errors. */ #define MP_CUR_ERROR_NONE 0 -#define MP_CUR_ERROR_EOF 1 /* Not enough data to complete opereation. */ +#define MP_CUR_ERROR_EOF 1 /* Not enough data to complete operation. */ #define MP_CUR_ERROR_BADFMT 2 /* Bad data format */ typedef struct mp_cur { @@ -160,7 +161,7 @@ static void mp_cur_init(mp_cur *cursor, const unsigned char *s, size_t len) { #define mp_cur_consume(_c,_len) do { _c->p += _len; _c->left -= _len; } while(0) -/* When there is not enough room we set an error in the cursor and return, this +/* When there is not enough room we set an error in the cursor and return. This * is very common across the code so we have a macro to make the code look * a bit simpler. 
*/ #define mp_cur_need(_c,_len) do { \ @@ -253,7 +254,7 @@ static void mp_encode_int(mp_buf *buf, int64_t n) { } } else { if (n >= -32) { - b[0] = ((char)n); /* negative fixnum */ + b[0] = ((signed char)n); /* negative fixnum */ enclen = 1; } else if (n >= -128) { b[0] = 0xd0; /* int 8 */ @@ -350,7 +351,11 @@ static void mp_encode_lua_bool(lua_State *L, mp_buf *buf) { /* Lua 5.3 has a built in 64-bit integer type */ static void mp_encode_lua_integer(lua_State *L, mp_buf *buf) { +#if (LUA_VERSION_NUM < 503) && BITS_32 + lua_Number i = lua_tonumber(L,-1); +#else lua_Integer i = lua_tointeger(L,-1); +#endif mp_encode_int(buf, (int64_t)i); } @@ -392,7 +397,7 @@ static void mp_encode_lua_table_as_map(lua_State *L, mp_buf *buf, int level) { /* First step: count keys into table. No other way to do it with the * Lua API, we need to iterate a first time. Note that an alternative * would be to do a single run, and then hack the buffer to insert the - * map opcodes for message pack. Too hachish for this lib. */ + * map opcodes for message pack. Too hackish for this lib. */ lua_pushnil(L); while(lua_next(L,-2)) { lua_pop(L,1); /* remove value, keep key for next iteration. */ @@ -432,11 +437,12 @@ static int table_is_an_array(lua_State *L) { lua_pop(L,1); /* Stack: ... key */ /* The <= 0 check is valid here because we're comparing indexes. 
*/ #if LUA_VERSION_NUM < 503 - if (!lua_isnumber(L,-1) || (n = lua_tonumber(L, -1)) <= 0 || - !IS_INT_EQUIVALENT(n)) { + if ((LUA_TNUMBER != lua_type(L,-1)) || (n = lua_tonumber(L, -1)) <= 0 || + !IS_INT_EQUIVALENT(n)) #else - if (!lua_isinteger(L,-1) || (n = lua_tointeger(L, -1)) <= 0) { + if (!lua_isinteger(L,-1) || (n = lua_tointeger(L, -1)) <= 0) #endif + { lua_settop(L, stacktop); return 0; } @@ -473,7 +479,7 @@ static void mp_encode_lua_null(lua_State *L, mp_buf *buf) { static void mp_encode_lua_type(lua_State *L, mp_buf *buf, int level) { int t = lua_type(L,-1); - /* Limit the encoding of nested tables to a specfiied maximum depth, so that + /* Limit the encoding of nested tables to a specified maximum depth, so that * we survive when called against circular references in tables. */ if (t == LUA_TTABLE && level == LUACMSGPACK_MAX_NESTING) t = LUA_TNIL; switch(t) { @@ -536,6 +542,7 @@ static int mp_pack(lua_State *L) { void mp_decode_to_lua_type(lua_State *L, mp_cur *c); void mp_decode_to_lua_array(lua_State *L, mp_cur *c, size_t len) { + assert(len <= UINT_MAX); int index = 1; lua_newtable(L); @@ -548,6 +555,7 @@ void mp_decode_to_lua_array(lua_State *L, mp_cur *c, size_t len) { } void mp_decode_to_lua_hash(lua_State *L, mp_cur *c, size_t len) { + assert(len <= UINT_MAX); lua_newtable(L); while(len--) { mp_decode_to_lua_type(L,c); /* key */ @@ -580,7 +588,7 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { break; case 0xd0: /* int 8 */ mp_cur_need(c,2); - lua_pushinteger(L,(char)c->p[1]); + lua_pushinteger(L,(signed char)c->p[1]); mp_cur_consume(c,2); break; case 0xcd: /* uint 16 */ @@ -630,7 +638,11 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { break; case 0xd3: /* int 64 */ mp_cur_need(c,9); +#if LUA_VERSION_NUM < 503 + lua_pushnumber(L, +#else lua_pushinteger(L, +#endif ((int64_t)c->p[1] << 56) | ((int64_t)c->p[2] << 48) | ((int64_t)c->p[3] << 40) | @@ -687,13 +699,14 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { case 0xdb: /* 
raw 32 */ mp_cur_need(c,5); { - size_t l = (c->p[1] << 24) | - (c->p[2] << 16) | - (c->p[3] << 8) | - c->p[4]; - mp_cur_need(c,5+l); - lua_pushlstring(L,(char*)c->p+5,l); - mp_cur_consume(c,5+l); + size_t l = ((size_t)c->p[1] << 24) | + ((size_t)c->p[2] << 16) | + ((size_t)c->p[3] << 8) | + (size_t)c->p[4]; + mp_cur_consume(c,5); + mp_cur_need(c,l); + lua_pushlstring(L,(char*)c->p,l); + mp_cur_consume(c,l); } break; case 0xdc: /* array 16 */ @@ -707,10 +720,10 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { case 0xdd: /* array 32 */ mp_cur_need(c,5); { - size_t l = (c->p[1] << 24) | - (c->p[2] << 16) | - (c->p[3] << 8) | - c->p[4]; + size_t l = ((size_t)c->p[1] << 24) | + ((size_t)c->p[2] << 16) | + ((size_t)c->p[3] << 8) | + (size_t)c->p[4]; mp_cur_consume(c,5); mp_decode_to_lua_array(L,c,l); } @@ -726,10 +739,10 @@ void mp_decode_to_lua_type(lua_State *L, mp_cur *c) { case 0xdf: /* map 32 */ mp_cur_need(c,5); { - size_t l = (c->p[1] << 24) | - (c->p[2] << 16) | - (c->p[3] << 8) | - c->p[4]; + size_t l = ((size_t)c->p[1] << 24) | + ((size_t)c->p[2] << 16) | + ((size_t)c->p[3] << 8) | + (size_t)c->p[4]; mp_cur_consume(c,5); mp_decode_to_lua_hash(L,c,l); } @@ -818,15 +831,15 @@ static int mp_unpack(lua_State *L) { } static int mp_unpack_one(lua_State *L) { - int offset = luaL_optint(L, 2, 0); + int offset = luaL_optinteger(L, 2, 0); /* Variable pop because offset may not exist */ lua_pop(L, lua_gettop(L)-1); return mp_unpack_full(L, 1, offset); } static int mp_unpack_limit(lua_State *L) { - int limit = luaL_checkint(L, 2); - int offset = luaL_optint(L, 3, 0); + int limit = luaL_checkinteger(L, 2); + int offset = luaL_optinteger(L, 3, 0); /* Variable pop because offset may not exist */ lua_pop(L, lua_gettop(L)-1); diff --git a/redis.conf b/redis.conf index 81dcf5a86..38e258698 100644 --- a/redis.conf +++ b/redis.conf @@ -36,6 +36,17 @@ # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 
daemonize no +# If you run Redis from upstart or systemd, Redis can interact with your +# supervision tree. Options: +# supervised no - no supervision interaction +# supervised upstart - signal upstart by putting Redis into SIGSTOP mode +# supervised systemd - signal systemd by writing READY=1 to $NOTIFY_SOCKET +# supervised auto - detect upstart or systemd method based on +# UPSTART_JOB or NOTIFY_SOCKET environment variables +# Note: these supervision methods only signal "process is ready." +# They do not enable continuous liveness pings back to your supervisor. +supervised no + # When running daemonized, Redis writes a pid file in /var/run/redis.pid by # default. You can specify a custom pid file location here. pidfile /var/run/redis.pid @@ -242,6 +253,10 @@ slave-read-only yes # Replication SYNC strategy: disk or socket. # +# ------------------------------------------------------- +# WARNING: DISKLESS REPLICATION IS EXPERIMENTAL CURRENTLY +# ------------------------------------------------------- +# # New slaves and reconnecting slaves that are not able to continue the replication # process just receiving differences, need to do what is called a "full # synchronization". An RDB file is transmitted from the master to the slaves. @@ -268,7 +283,7 @@ slave-read-only yes repl-diskless-sync no # When diskless replication is enabled, it is possible to configure the delay -# the server waits in order to spawn the child that trnasfers the RDB via socket +# the server waits in order to spawn the child that transfers the RDB via socket # to the slaves. 
# # This is important since once the transfer starts, it is not possible to serve @@ -615,6 +630,12 @@ lua-time-limit 5000 ################################ REDIS CLUSTER ############################### # +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# WARNING EXPERIMENTAL: Redis Cluster is considered to be stable code, however +# in order to mark it as "mature" we need to wait for a non trivial percentage +# of users to deploy it in production. +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# # Normal Redis instances can't be part of a Redis Cluster; only nodes that are # started as cluster nodes can. In order to start a Redis instance as a # cluster node enable the cluster support uncommenting the following: @@ -756,7 +777,7 @@ slowlog-max-len 128 # By default latency monitoring is disabled since it is mostly not needed # if you don't have latency issues, and collecting data has a performance # impact, that while very small, can be measured under big load. Latency -# monitoring can easily be enalbed at runtime using the command +# monitoring can easily be enabled at runtime using the command # "CONFIG SET latency-monitor-threshold <milliseconds>" if needed. latency-monitor-threshold 0 @@ -814,11 +835,36 @@ notify-keyspace-events "" hash-max-ziplist-entries 512 hash-max-ziplist-value 64 -# Similarly to hashes, small lists are also encoded in a special way in order -# to save a lot of space. The special representation is only used when -# you are under the following limits: -list-max-ziplist-entries 512 -list-max-ziplist-value 64 +# Lists are also encoded in a special way to save a lot of space. +# The number of entries allowed per internal list node can be specified +# as a fixed maximum size or a maximum number of elements. 
+# For a fixed maximum size, use -5 through -1, meaning: +# -5: max size: 64 Kb <-- not recommended for normal workloads +# -4: max size: 32 Kb <-- not recommended +# -3: max size: 16 Kb <-- probably not recommended +# -2: max size: 8 Kb <-- good +# -1: max size: 4 Kb <-- good +# Positive numbers mean store up to _exactly_ that number of elements +# per list node. +# The highest performing option is usually -2 (8 Kb size) or -1 (4 Kb size), +# but if your use case is unique, adjust the settings as necessary. +list-max-ziplist-size -2 + +# Lists may also be compressed. +# Compress depth is the number of quicklist ziplist nodes from *each* side of +# the list to *exclude* from compression. The head and tail of the list +# are always uncompressed for fast push/pop operations. Settings are: +# 0: disable all list compression +# 1: depth 1 means "don't start compressing until after 1 node into the list, +# going from either the head or tail" +# So: [head]->node->node->...->node->[tail] +# [head], [tail] will always be uncompressed; inner nodes will compress. +# 2: [head]->[next]->node->node->...->node->[prev]->[tail] +# 2 here means: don't compress head or head->next or tail->prev or tail, +# but compress all nodes between them. +# 3: [head]->[next]->[next]->node->node->...->node->[prev]->[prev]->[tail] +# etc. 
+list-compress-depth 0 # Sets have a special encoding in just one case: when a set is composed # of just strings that happen to be integers in radix 10 in the range diff --git a/src/Makefile b/src/Makefile index 96af74afa..271ab34d8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -18,7 +18,7 @@ OPTIMIZATION?=-O2 DEPENDENCY_TARGETS=hiredis linenoise lua # Default settings -STD=-std=c99 -pedantic +STD=-std=c99 -pedantic -DREDIS_STATIC='' WARN=-Wall -W OPT=$(OPTIMIZATION) @@ -46,6 +46,10 @@ ifeq ($(USE_JEMALLOC),yes) MALLOC=jemalloc endif +ifeq ($(USE_JEMALLOC),no) + MALLOC=libc +endif + # Override default settings if possible -include .make-settings @@ -58,7 +62,7 @@ ifeq ($(uname_S),SunOS) # SunOS INSTALL=cp -pf FINAL_CFLAGS+= -D__EXTENSIONS__ -D_XPG6 - FINAL_LIBS+= -ldl -lnsl -lsocket -lresolv -lpthread + FINAL_LIBS+= -ldl -lnsl -lsocket -lresolv -lpthread -lrt else ifeq ($(uname_S),Darwin) # Darwin (nothing to do) @@ -113,17 +117,16 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o 
redis-check-rdb.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o sds.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o REDIS_BENCHMARK_NAME=redis-benchmark REDIS_BENCHMARK_OBJ=ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o redis-benchmark.o -REDIS_CHECK_DUMP_NAME=redis-check-dump -REDIS_CHECK_DUMP_OBJ=redis-check-dump.o lzf_c.o lzf_d.o crc64.o +REDIS_CHECK_RDB_NAME=redis-check-rdb REDIS_CHECK_AOF_NAME=redis-check-aof REDIS_CHECK_AOF_OBJ=redis-check-aof.o -all: $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_DUMP_NAME) $(REDIS_CHECK_AOF_NAME) +all: $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) @echo "" @echo "Hint: It's a good idea to run 'make test' ;)" @echo "" @@ -174,6 +177,10 @@ $(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ) $(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME) $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) +# redis-check-rdb +$(REDIS_CHECK_RDB_NAME): $(REDIS_SERVER_NAME) + $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(REDIS_CHECK_RDB_NAME) + # redis-cli $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ) $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/linenoise/linenoise.o $(FINAL_LIBS) @@ -182,10 +189,6 @@ $(REDIS_CLI_NAME): $(REDIS_CLI_OBJ) $(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ) $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a $(FINAL_LIBS) -# redis-check-dump -$(REDIS_CHECK_DUMP_NAME): $(REDIS_CHECK_DUMP_OBJ) - $(REDIS_LD) -o $@ $^ $(FINAL_LIBS) - # redis-check-aof $(REDIS_CHECK_AOF_NAME): $(REDIS_CHECK_AOF_OBJ) $(REDIS_LD) -o $@ $^ $(FINAL_LIBS) @@ -197,7 +200,7 @@ $(REDIS_CHECK_AOF_NAME): $(REDIS_CHECK_AOF_OBJ) $(REDIS_CC) -c $< clean: - rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_DUMP_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html + rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) 
$(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html .PHONY: clean @@ -221,6 +224,10 @@ lcov: @geninfo -o redis.info . @genhtml --legend -o lcov-html redis.info +test-sds: sds.c sds.h + $(REDIS_CC) sds.c zmalloc.c -DSDS_TEST_MAIN -o /tmp/sds_test + /tmp/sds_test + .PHONY: lcov bench: $(REDIS_BENCHMARK_NAME) @@ -249,5 +256,6 @@ install: all $(REDIS_INSTALL) $(REDIS_SERVER_NAME) $(INSTALL_BIN) $(REDIS_INSTALL) $(REDIS_BENCHMARK_NAME) $(INSTALL_BIN) $(REDIS_INSTALL) $(REDIS_CLI_NAME) $(INSTALL_BIN) - $(REDIS_INSTALL) $(REDIS_CHECK_DUMP_NAME) $(INSTALL_BIN) + $(REDIS_INSTALL) $(REDIS_CHECK_RDB_NAME) $(INSTALL_BIN) $(REDIS_INSTALL) $(REDIS_CHECK_AOF_NAME) $(INSTALL_BIN) + @ln -sf $(REDIS_SERVER_NAME) $(INSTALL_BIN)/$(REDIS_SENTINEL_NAME) diff --git a/src/anet.c b/src/anet.c index 1e5d85495..76e9b67ae 100644 --- a/src/anet.c +++ b/src/anet.c @@ -589,6 +589,23 @@ error: return -1; } +/* Format an IP,port pair into something easy to parse. If IP is IPv6 + * (matches for ":"), the ip is surrounded by []. IP and port are just + * separated by colons. This the standard to display addresses within Redis. */ +int anetFormatAddr(char *buf, size_t buf_len, char *ip, int port) { + return snprintf(buf,buf_len, strchr(ip,':') ? + "[%s]:%d" : "%s:%d", ip, port); +} + +/* Like anetFormatAddr() but extract ip and port from the socket's peer. 
*/ +int anetFormatPeer(int fd, char *buf, size_t buf_len) { + char ip[INET6_ADDRSTRLEN]; + int port; + + anetPeerToString(fd,ip,sizeof(ip),&port); + return anetFormatAddr(buf, buf_len, ip, port); +} + int anetSockName(int fd, char *ip, size_t ip_len, int *port) { struct sockaddr_storage sa; socklen_t salen = sizeof(sa); @@ -610,3 +627,11 @@ int anetSockName(int fd, char *ip, size_t ip_len, int *port) { } return 0; } + +int anetFormatSock(int fd, char *fmt, size_t fmt_len) { + char ip[INET6_ADDRSTRLEN]; + int port; + + anetSockName(fd,ip,sizeof(ip),&port); + return anetFormatAddr(fmt, fmt_len, ip, port); +} diff --git a/src/anet.h b/src/anet.h index b94a0cd17..ea9c77f2e 100644 --- a/src/anet.h +++ b/src/anet.h @@ -70,5 +70,8 @@ int anetSendTimeout(char *err, int fd, long long ms); int anetPeerToString(int fd, char *ip, size_t ip_len, int *port); int anetKeepAlive(char *err, int fd, int interval); int anetSockName(int fd, char *ip, size_t ip_len, int *port); +int anetFormatAddr(char *fmt, size_t fmt_len, char *ip, int port); +int anetFormatPeer(int fd, char *fmt, size_t fmt_len); +int anetFormatSock(int fd, char *fmt, size_t fmt_len); #endif @@ -770,52 +770,29 @@ int rioWriteBulkObject(rio *r, robj *obj) { int rewriteListObject(rio *r, robj *key, robj *o) { long long count = 0, items = listTypeLength(o); - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *zl = o->ptr; - unsigned char *p = ziplistIndex(zl,0); - unsigned char *vstr; - unsigned int vlen; - long long vlong; + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + quicklist *list = o->ptr; + quicklistIter *li = quicklistGetIterator(list, AL_START_HEAD); + quicklistEntry entry; - while(ziplistGet(p,&vstr,&vlen,&vlong)) { + while (quicklistNext(li,&entry)) { if (count == 0) { int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? 
REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; - if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; if (rioWriteBulkObject(r,key) == 0) return 0; } - if (vstr) { - if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0; - } else { - if (rioWriteBulkLongLong(r,vlong) == 0) return 0; - } - p = ziplistNext(zl,p); - if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; - items--; - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = o->ptr; - listNode *ln; - listIter li; - listRewind(list,&li); - while((ln = listNext(&li))) { - robj *eleobj = listNodeValue(ln); - - if (count == 0) { - int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ? - REDIS_AOF_REWRITE_ITEMS_PER_CMD : items; - - if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0; - if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0; - if (rioWriteBulkObject(r,key) == 0) return 0; + if (entry.value) { + if (rioWriteBulkString(r,(char*)entry.value,entry.sz) == 0) return 0; + } else { + if (rioWriteBulkLongLong(r,entry.longval) == 0) return 0; } - if (rioWriteBulkObject(r,eleobj) == 0) return 0; if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0; items--; } + quicklistReleaseIterator(li); } else { redisPanic("Unknown list encoding"); } @@ -1105,6 +1082,7 @@ int rewriteAppendOnlyFile(char *filename) { } } dictReleaseIterator(di); + di = NULL; } /* Do an initial slow fsync here while the parent is still sending diff --git a/src/bitops.c b/src/bitops.c index 94c7f3537..4c8662244 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -70,16 +70,19 @@ size_t redisPopcount(void *s, long count) { count--; } - /* Count bits 16 bytes at a time */ + /* Count bits 28 bytes at a time */ p4 = (uint32_t*)p; - while(count>=16) { - uint32_t aux1, aux2, aux3, aux4; + while(count>=28) { + uint32_t aux1, aux2, aux3, aux4, aux5, aux6, aux7; aux1 = *p4++; aux2 = *p4++; aux3 = *p4++; aux4 = *p4++; - count -= 16; + aux5 = *p4++; + aux6 = *p4++; + 
aux7 = *p4++; + count -= 28; aux1 = aux1 - ((aux1 >> 1) & 0x55555555); aux1 = (aux1 & 0x33333333) + ((aux1 >> 2) & 0x33333333); @@ -89,10 +92,19 @@ size_t redisPopcount(void *s, long count) { aux3 = (aux3 & 0x33333333) + ((aux3 >> 2) & 0x33333333); aux4 = aux4 - ((aux4 >> 1) & 0x55555555); aux4 = (aux4 & 0x33333333) + ((aux4 >> 2) & 0x33333333); - bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24) + - ((((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24) + - ((((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24) + - ((((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24); + aux5 = aux5 - ((aux5 >> 1) & 0x55555555); + aux5 = (aux5 & 0x33333333) + ((aux5 >> 2) & 0x33333333); + aux6 = aux6 - ((aux6 >> 1) & 0x55555555); + aux6 = (aux6 & 0x33333333) + ((aux6 >> 2) & 0x33333333); + aux7 = aux7 - ((aux7 >> 1) & 0x55555555); + aux7 = (aux7 & 0x33333333) + ((aux7 >> 2) & 0x33333333); + bits += ((((aux1 + (aux1 >> 4)) & 0x0F0F0F0F) + + ((aux2 + (aux2 >> 4)) & 0x0F0F0F0F) + + ((aux3 + (aux3 >> 4)) & 0x0F0F0F0F) + + ((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) + + ((aux5 + (aux5 >> 4)) & 0x0F0F0F0F) + + ((aux6 + (aux6 >> 4)) & 0x0F0F0F0F) + + ((aux7 + (aux7 >> 4)) & 0x0F0F0F0F))* 0x01010101) >> 24; } /* Count the remaining bytes. */ p = (unsigned char*)p4; @@ -348,7 +360,7 @@ void bitopCommand(redisClient *c) { * can take a fast path that performs much better than the * vanilla algorithm. */ j = 0; - if (minlen && numkeys <= 16) { + if (minlen >= sizeof(unsigned long)*4 && numkeys <= 16) { unsigned long *lp[16]; unsigned long *lres = (unsigned long*) res; diff --git a/src/cluster.c b/src/cluster.c index 608f8d2c5..2da0ed5f6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -40,6 +40,7 @@ #include <sys/socket.h> #include <sys/stat.h> #include <sys/file.h> +#include <math.h> /* A global reference to myself is handy to make code more clear. 
* Myself always points to server.cluster->myself, that is, the clusterNode @@ -479,6 +480,7 @@ void clusterInit(void) { * the IP address via MEET messages. */ myself->port = server.port; + server.cluster->mf_end = 0; resetManualFailover(); } @@ -593,7 +595,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { if (cfd == ANET_ERR) { if (errno != EWOULDBLOCK) redisLog(REDIS_VERBOSE, - "Accepting cluster node: %s", server.neterr); + "Error accepting cluster node: %s", server.neterr); return; } anetNonBlock(NULL,cfd); @@ -782,8 +784,11 @@ int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) { for (j = 0; j < master->numslaves; j++) { if (master->slaves[j] == slave) { - memmove(master->slaves+j,master->slaves+(j+1), - (master->numslaves-1)-j); + if ((j+1) < master->numslaves) { + int remaining_slaves = (master->numslaves - j) - 1; + memmove(master->slaves+j,master->slaves+(j+1), + (sizeof(*master->slaves) * remaining_slaves)); + } master->numslaves--; return REDIS_OK; } @@ -818,15 +823,30 @@ int clusterCountNonFailingSlaves(clusterNode *n) { return okslaves; } +/* Low level cleanup of the node structure. Only called by clusterDelNode(). */ void freeClusterNode(clusterNode *n) { sds nodename; + int j; + + /* If the node is a master with associated slaves, we have to set + * all the slaves->slaveof fields to NULL (unknown). */ + if (nodeIsMaster(n)) { + for (j = 0; j < n->numslaves; j++) + n->slaves[j]->slaveof = NULL; + } + + /* Remove this node from the list of slaves of its master. */ + if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); + /* Unlink from the set of nodes. */ nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN); redisAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK); sdsfree(nodename); - if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n); + + /* Release link and associated data structures. 
*/ + if (n->link) freeClusterLink(n->link); listRelease(n->fail_reports); + zfree(n->slaves); zfree(n); } @@ -839,11 +859,16 @@ int clusterAddNode(clusterNode *node) { return (retval == DICT_OK) ? REDIS_OK : REDIS_ERR; } -/* Remove a node from the cluster: - * 1) Mark all the nodes handled by it as unassigned. - * 2) Remove all the failure reports sent by this node. - * 3) Free the node, that will in turn remove it from the hash table - * and from the list of slaves of its master, if it is a slave node. +/* Remove a node from the cluster. The function performs the high level + * cleanup, calling freeClusterNode() for the low level cleanup. + * Here we do the following: + * + * 1) Mark all the slots handled by it as unassigned. + * 2) Remove all the failure reports sent by this node and referenced by + * other nodes. + * 3) Free the node with freeClusterNode() that will in turn remove it + * from the hash table and from the list of slaves of its master, if + * it is a slave node. */ void clusterDelNode(clusterNode *delnode) { int j; @@ -870,11 +895,7 @@ void clusterDelNode(clusterNode *delnode) { } dictReleaseIterator(di); - /* 3) Remove this node from its master's slaves if needed. */ - if (nodeIsSlave(delnode) && delnode->slaveof) - clusterNodeRemoveSlave(delnode->slaveof,delnode); - - /* 4) Free the node, unlinking it from the cluster. */ + /* 3) Free the node, unlinking it from the cluster. */ freeClusterNode(delnode); } @@ -1118,6 +1139,7 @@ int clusterStartHandshake(char *ip, int port) { /* Set norm_ip as the normalized string representation of the node * IP address. */ + memset(norm_ip,0,REDIS_IP_STR_LEN); if (sa.ss_family == AF_INET) inet_ntop(AF_INET, (void*)&(((struct sockaddr_in *)&sa)->sin_addr), @@ -1232,7 +1254,7 @@ void nodeIp2String(char *buf, clusterLink *link) { * The function returns 0 if the node address is still the same, * otherwise 1 is returned. 
*/ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { - char ip[REDIS_IP_STR_LEN]; + char ip[REDIS_IP_STR_LEN] = {0}; /* We don't proceed if the link is the same as the sender link, as this * function is designed to see if the node link is consistent with the @@ -1463,7 +1485,8 @@ int clusterProcessPacket(clusterLink *link) { /* Perform sanity checks */ if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ - if (ntohs(hdr->ver) != 0) return 1; /* Can't handle versions other than 0.*/ + if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) + return 1; /* Can't handle versions other than the current one.*/ if (totlen > sdslen(link->rcvbuf)) return 1; if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) @@ -1482,7 +1505,8 @@ int clusterProcessPacket(clusterLink *link) { } else if (type == CLUSTERMSG_TYPE_PUBLISH) { uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - explen += sizeof(clusterMsgDataPublish) + + explen += sizeof(clusterMsgDataPublish) - + 8 + ntohl(hdr->data.publish.msg.channel_len) + ntohl(hdr->data.publish.msg.message_len); if (totlen != explen) return 1; @@ -1543,8 +1567,12 @@ int clusterProcessPacket(clusterLink *link) { * later if we changed address, and those nodes will use our * official address to connect to us. So by obtaining this address * from the socket is a simple way to discover / update our own - * address in the cluster without it being hardcoded in the config. */ - if (type == CLUSTERMSG_TYPE_MEET) { + * address in the cluster without it being hardcoded in the config. + * + * However if we don't have an address at all, we update the address + * even with a normal PING packet. If it's wrong it will be fixed + * by MEET later. 
*/ + if (type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') { char ip[REDIS_IP_STR_LEN]; if (anetSockName(link->fd,ip,sizeof(ip),NULL) != -1 && @@ -1603,7 +1631,7 @@ int clusterProcessPacket(clusterLink *link) { } /* Free this node as we already have it. This will * cause the link to be freed as well. */ - freeClusterNode(link->node); + clusterDelNode(link->node); return 0; } @@ -2010,7 +2038,8 @@ void clusterBroadcastMessage(void *buf, size_t len) { dictReleaseIterator(di); } -/* Build the message header */ +/* Build the message header. hdr must point to a buffer at least + * sizeof(clusterMsg) in bytes. */ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { int totlen = 0; uint64_t offset; @@ -2024,6 +2053,7 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { myself->slaveof : myself; memset(hdr,0,sizeof(*hdr)); + hdr->ver = htons(CLUSTER_PROTO_VER); hdr->sig[0] = 'R'; hdr->sig[1] = 'C'; hdr->sig[2] = 'm'; @@ -2070,40 +2100,90 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip informations. */ void clusterSendPing(clusterLink *link, int type) { - unsigned char buf[sizeof(clusterMsg)]; - clusterMsg *hdr = (clusterMsg*) buf; - int gossipcount = 0, totlen; - /* freshnodes is the number of nodes we can still use to populate the - * gossip section of the ping packet. Basically we start with the nodes - * we have in memory minus two (ourself and the node we are sending the - * message to). Every time we add a node we decrement the counter, so when - * it will drop to <= zero we know there is no more gossip info we can - * send. */ + unsigned char *buf; + clusterMsg *hdr; + int gossipcount = 0; /* Number of gossip sections added so far. */ + int wanted; /* Number of gossip sections we want to append if possible. */ + int totlen; /* Total packet length. 
*/ + /* freshnodes is the max number of nodes we can hope to append at all: + * nodes available minus two (ourself and the node we are sending the + * message to). However practically there may be less valid nodes since + * nodes in handshake state, disconnected, are not considered. */ int freshnodes = dictSize(server.cluster->nodes)-2; + /* How many gossip sections we want to add? 1/10 of the number of nodes + * and anyway at least 3. Why 1/10? + * + * If we have N masters, with N/10 entries, and we consider that in + * node_timeout we exchange with each other node at least 4 packets + * (we ping in the worst case in node_timeout/2 time, and we also + * receive two pings from the host), we have a total of 8 packets + * in the node_timeout*2 falure reports validity time. So we have + * that, for a single PFAIL node, we can expect to receive the following + * number of failure reports (in the specified window of time): + * + * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS: + * + * PROB = probability of being featured in a single gossip entry, + * which is 1 / NUM_OF_NODES. + * ENTRIES = 10. + * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS. + * + * If we assume we have just masters (so num of nodes and num of masters + * is the same), with 1/10 we always get over the majority, and specifically + * 80% of the number of nodes, to account for many masters failing at the + * same time. + * + * Since we have non-voting slaves that lower the probability of an entry + * to feature our node, we set the number of entires per packet as + * 10% of the total nodes we have. */ + wanted = floor(dictSize(server.cluster->nodes)/10); + if (wanted < 3) wanted = 3; + if (wanted > freshnodes) wanted = freshnodes; + + /* Compute the maxium totlen to allocate our buffer. We'll fix the totlen + * later according to the number of gossip sections we really were able + * to put inside the packet. 
*/ + totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += (sizeof(clusterMsgDataGossip)*wanted); + /* Note: clusterBuildMessageHdr() expects the buffer to be always at least + * sizeof(clusterMsg) or more. */ + if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg); + buf = zcalloc(totlen); + hdr = (clusterMsg*) buf; + + /* Populate the header. */ if (link->node && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); clusterBuildMessageHdr(hdr,type); /* Populate the gossip fields */ - while(freshnodes > 0 && gossipcount < 3) { + int maxiterations = wanted*3; + while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { dictEntry *de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); clusterMsgDataGossip *gossip; int j; + /* Don't include this node: the whole packet header is about us + * already, so we just gossip about other nodes. */ + if (this == myself) continue; + + /* Give a bias to FAIL/PFAIL nodes. */ + if (maxiterations > wanted*2 && + !(this->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) + continue; + /* In the gossip section don't include: - * 1) Myself. - * 2) Nodes in HANDSHAKE state. + * 1) Nodes in HANDSHAKE state. * 3) Nodes with the NOADDR flag set. * 4) Disconnected nodes if they don't have configured slots. */ - if (this == myself || - this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) || + if (this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) || (this->link == NULL && this->numslots == 0)) { - freshnodes--; /* otherwise we may loop forever. */ - continue; + freshnodes--; /* Tecnically not correct, but saves CPU. */ + continue; } /* Check if we already added this node */ @@ -2122,13 +2202,19 @@ void clusterSendPing(clusterLink *link, int type) { memcpy(gossip->ip,this->ip,sizeof(this->ip)); gossip->port = htons(this->port); gossip->flags = htons(this->flags); + gossip->notused1 = 0; + gossip->notused2 = 0; gossipcount++; } + + /* Ready to send... 
fix the totlen fiend and queue the message in the + * output buffer. */ totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); totlen += (sizeof(clusterMsgDataGossip)*gossipcount); hdr->count = htons(gossipcount); hdr->totlen = htonl(totlen); clusterSendMessage(link,buf,totlen); + zfree(buf); } /* Send a PONG packet to every connected node that's not in handshake state @@ -2184,7 +2270,7 @@ void clusterSendPublish(clusterLink *link, robj *channel, robj *message) { clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_PUBLISH); totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); - totlen += sizeof(clusterMsgDataPublish) + channel_len + message_len; + totlen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len; hdr->data.publish.msg.channel_len = htonl(channel_len); hdr->data.publish.msg.message_len = htonl(message_len); @@ -2517,7 +2603,7 @@ void clusterHandleSlaveFailover(void) { /* Compute the failover timeout (the max time we have to send votes * and wait for replies), and the failover retry time (the time to wait - * before waiting again. + * before trying to get voted again). * * Timeout is MIN(NODE_TIMEOUT*2,2000) milliseconds. * Retry is two times the Timeout. @@ -2775,6 +2861,7 @@ void clusterHandleSlaveMigration(int max_slaves) { } } } + dictReleaseIterator(di); /* Step 4: perform the migration if there is a target, and if I'm the * candidate. */ @@ -2896,7 +2983,7 @@ void clusterCron(void) { /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. 
*/ if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - freeClusterNode(node); + clusterDelNode(node); continue; } @@ -3883,10 +3970,7 @@ void clusterCommand(redisClient *c) { server.cluster->stats_bus_messages_sent, server.cluster->stats_bus_messages_received ); - addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", - (unsigned long)sdslen(info))); - addReplySds(c,info); - addReply(c,shared.crlf); + addReplyBulkSds(c, info); } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) { int retval = clusterSaveConfig(1); @@ -4010,6 +4094,18 @@ void clusterCommand(redisClient *c) { addReplyBulkCString(c,ni); sdsfree(ni); } + } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") && + c->argc == 3) + { + /* CLUSTER COUNT-FAILURE-REPORTS <NODE ID> */ + clusterNode *n = clusterLookupNode(c->argv[2]->ptr); + + if (!n) { + addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); + return; + } else { + addReplyLongLong(c,clusterNodeFailureReportsCount(n)); + } } else if (!strcasecmp(c->argv[1]->ptr,"failover") && (c->argc == 2 || c->argc == 3)) { @@ -4408,7 +4504,7 @@ try_again: /* Check if the key is here. If not we reply with success as there is * nothing to migrate (for instance the key expired in the meantime), but * we include such information in the reply string. 
*/ - if ((o = lookupKeyRead(c->db,c->argv[3])) == NULL) { + if ((o = lookupKeyWrite(c->db,c->argv[3])) == NULL) { addReplySds(c,sdsnew("+NOKEY\r\n")); return; } diff --git a/src/cluster.h b/src/cluster.h index b05a30ded..ef5caf0d6 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -163,10 +163,11 @@ typedef struct { char nodename[REDIS_CLUSTER_NAMELEN]; uint32_t ping_sent; uint32_t pong_received; - char ip[REDIS_IP_STR_LEN]; /* IP address last time it was seen */ - uint16_t port; /* port last time it was seen */ - uint16_t flags; - uint32_t notused; /* for 64 bit alignment */ + char ip[REDIS_IP_STR_LEN]; /* IP address last time it was seen */ + uint16_t port; /* port last time it was seen */ + uint16_t flags; /* node->flags copy */ + uint16_t notused1; /* Some room for future improvements. */ + uint32_t notused2; } clusterMsgDataGossip; typedef struct { @@ -176,7 +177,10 @@ typedef struct { typedef struct { uint32_t channel_len; uint32_t message_len; - unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */ + /* We can't reclare bulk_data as bulk_data[] since this structure is + * nested. The 8 bytes are removed from the count during the message + * length computation. */ + unsigned char bulk_data[8]; } clusterMsgDataPublish; typedef struct { @@ -208,6 +212,7 @@ union clusterMsgData { } update; }; +#define CLUSTER_PROTO_VER 0 /* Cluster bus protocol version. */ typedef struct { char sig[4]; /* Siganture "RCmb" (Redis Cluster message bus). 
*/ diff --git a/src/config.c b/src/config.c index 05cb7c9fe..8255a56b7 100644 --- a/src/config.c +++ b/src/config.c @@ -60,6 +60,8 @@ clientBufferLimitsConfig clientBufferLimitsDefaults[REDIS_CLIENT_TYPE_COUNT] = { * Config file parsing *----------------------------------------------------------------------------*/ +int supervisedToMode(const char *str); + int yesnotoi(char *s) { if (!strcasecmp(s,"yes")) return 1; else if (!strcasecmp(s,"no")) return 0; @@ -397,9 +399,13 @@ void loadServerConfigFromString(char *config) { } else if (!strcasecmp(argv[0],"hash-max-ziplist-value") && argc == 2) { server.hash_max_ziplist_value = memtoll(argv[1], NULL); } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){ - server.list_max_ziplist_entries = memtoll(argv[1], NULL); + /* DEAD OPTION */ } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2) { - server.list_max_ziplist_value = memtoll(argv[1], NULL); + /* DEAD OPTION */ + } else if (!strcasecmp(argv[0],"list-max-ziplist-size") && argc == 2) { + server.list_max_ziplist_size = atoi(argv[1]); + } else if (!strcasecmp(argv[0],"list-compress-depth") && argc == 2) { + server.list_compress_depth = atoi(argv[1]); } else if (!strcasecmp(argv[0],"set-max-intset-entries") && argc == 2) { server.set_max_intset_entries = memtoll(argv[1], NULL); } else if (!strcasecmp(argv[0],"zset-max-ziplist-entries") && argc == 2) { @@ -529,6 +535,15 @@ void loadServerConfigFromString(char *config) { goto loaderr; } server.notify_keyspace_events = flags; + } else if (!strcasecmp(argv[0],"supervised") && argc == 2) { + int mode = supervisedToMode(argv[1]); + + if (mode == -1) { + err = "Invalid option for 'supervised'. " + "Allowed values: 'upstart', 'systemd', 'auto', or 'no'"; + goto loaderr; + } + server.supervised_mode = mode; } else if (!strcasecmp(argv[0],"sentinel")) { /* argc == 1 is handled by main() as we need to enter the sentinel * mode ASAP. 
*/ @@ -795,12 +810,12 @@ void configSetCommand(redisClient *c) { } else if (!strcasecmp(c->argv[2]->ptr,"hash-max-ziplist-value")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; server.hash_max_ziplist_value = ll; - } else if (!strcasecmp(c->argv[2]->ptr,"list-max-ziplist-entries")) { + } else if (!strcasecmp(c->argv[2]->ptr,"list-max-ziplist-size")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; - server.list_max_ziplist_entries = ll; - } else if (!strcasecmp(c->argv[2]->ptr,"list-max-ziplist-value")) { + server.list_max_ziplist_size = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"list-compress-depth")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; - server.list_max_ziplist_value = ll; + server.list_compress_depth = ll; } else if (!strcasecmp(c->argv[2]->ptr,"set-max-intset-entries")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; server.set_max_intset_entries = ll; @@ -1004,6 +1019,47 @@ badfmt: /* Bad format errors */ } \ } while(0); +char *maxmemoryToString() { + char *s; + switch(server.maxmemory_policy) { + case REDIS_MAXMEMORY_VOLATILE_LRU: s = "volatile-lru"; break; + case REDIS_MAXMEMORY_VOLATILE_TTL: s = "volatile-ttl"; break; + case REDIS_MAXMEMORY_VOLATILE_RANDOM: s = "volatile-random"; break; + case REDIS_MAXMEMORY_ALLKEYS_LRU: s = "allkeys-lru"; break; + case REDIS_MAXMEMORY_ALLKEYS_RANDOM: s = "allkeys-random"; break; + case REDIS_MAXMEMORY_NO_EVICTION: s = "noeviction"; break; + default: s = "unknown"; break; + } + return s; +} + +int supervisedToMode(const char *str) { + int mode; + if (!strcasecmp(str,"upstart")) { + mode = REDIS_SUPERVISED_UPSTART; + } else if (!strcasecmp(str,"systemd")) { + mode = REDIS_SUPERVISED_SYSTEMD; + } else if (!strcasecmp(str,"auto")) { + mode = REDIS_SUPERVISED_AUTODETECT; + } else if (!strcasecmp(str,"no")) { + mode = REDIS_SUPERVISED_NONE; + } else { + mode = -1; + } + return mode; +} + +char 
*supervisedToString(void) { + char *s; + switch(server.supervised_mode) { + case REDIS_SUPERVISED_UPSTART: s = "upstart"; break; + case REDIS_SUPERVISED_SYSTEMD: s = "systemd"; break; + case REDIS_SUPERVISED_AUTODETECT: s = "auto"; break; + case REDIS_SUPERVISED_NONE: s = "no"; break; + default: s = "no"; break; + } + return s; +} void configGetCommand(redisClient *c) { robj *o = c->argv[2]; void *replylen = addDeferredMultiBulkLength(c); @@ -1033,10 +1089,10 @@ void configGetCommand(redisClient *c) { server.hash_max_ziplist_entries); config_get_numerical_field("hash-max-ziplist-value", server.hash_max_ziplist_value); - config_get_numerical_field("list-max-ziplist-entries", - server.list_max_ziplist_entries); - config_get_numerical_field("list-max-ziplist-value", - server.list_max_ziplist_value); + config_get_numerical_field("list-max-ziplist-size", + server.list_max_ziplist_size); + config_get_numerical_field("list-compress-depth", + server.list_compress_depth); config_get_numerical_field("set-max-intset-entries", server.set_max_intset_entries); config_get_numerical_field("zset-max-ziplist-entries", @@ -1112,19 +1168,8 @@ void configGetCommand(redisClient *c) { matches++; } if (stringmatch(pattern,"maxmemory-policy",0)) { - char *s; - - switch(server.maxmemory_policy) { - case REDIS_MAXMEMORY_VOLATILE_LRU: s = "volatile-lru"; break; - case REDIS_MAXMEMORY_VOLATILE_TTL: s = "volatile-ttl"; break; - case REDIS_MAXMEMORY_VOLATILE_RANDOM: s = "volatile-random"; break; - case REDIS_MAXMEMORY_ALLKEYS_LRU: s = "allkeys-lru"; break; - case REDIS_MAXMEMORY_ALLKEYS_RANDOM: s = "allkeys-random"; break; - case REDIS_MAXMEMORY_NO_EVICTION: s = "noeviction"; break; - default: s = "unknown"; break; /* too harmless to panic */ - } addReplyBulkCString(c,"maxmemory-policy"); - addReplyBulkCString(c,s); + addReplyBulkCString(c,maxmemoryToString()); matches++; } if (stringmatch(pattern,"appendfsync",0)) { @@ -1170,6 +1215,11 @@ void configGetCommand(redisClient *c) { 
addReplyBulkCString(c,s); matches++; } + if (stringmatch(pattern,"supervised",0)) { + addReplyBulkCString(c,"supervised"); + addReplyBulkCString(c,supervisedToString()); + matches++; + } if (stringmatch(pattern,"client-output-buffer-limit",0)) { sds buf = sdsempty(); int j; @@ -1854,8 +1904,8 @@ int rewriteConfig(char *path) { rewriteConfigNotifykeyspaceeventsOption(state); rewriteConfigNumericalOption(state,"hash-max-ziplist-entries",server.hash_max_ziplist_entries,REDIS_HASH_MAX_ZIPLIST_ENTRIES); rewriteConfigNumericalOption(state,"hash-max-ziplist-value",server.hash_max_ziplist_value,REDIS_HASH_MAX_ZIPLIST_VALUE); - rewriteConfigNumericalOption(state,"list-max-ziplist-entries",server.list_max_ziplist_entries,REDIS_LIST_MAX_ZIPLIST_ENTRIES); - rewriteConfigNumericalOption(state,"list-max-ziplist-value",server.list_max_ziplist_value,REDIS_LIST_MAX_ZIPLIST_VALUE); + rewriteConfigNumericalOption(state,"list-max-ziplist-size",server.list_max_ziplist_size,REDIS_LIST_MAX_ZIPLIST_SIZE); + rewriteConfigNumericalOption(state,"list-compress-depth",server.list_compress_depth,REDIS_LIST_COMPRESS_DEPTH); rewriteConfigNumericalOption(state,"set-max-intset-entries",server.set_max_intset_entries,REDIS_SET_MAX_INTSET_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-entries",server.zset_max_ziplist_entries,REDIS_ZSET_MAX_ZIPLIST_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-value",server.zset_max_ziplist_value,REDIS_ZSET_MAX_ZIPLIST_VALUE); @@ -1865,6 +1915,12 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"hz",server.hz,REDIS_DEFAULT_HZ); rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync,REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC); rewriteConfigYesNoOption(state,"aof-load-truncated",server.aof_load_truncated,REDIS_DEFAULT_AOF_LOAD_TRUNCATED); + rewriteConfigEnumOption(state,"supervised",server.supervised_mode, + "upstart", REDIS_SUPERVISED_UPSTART, + "systemd", 
REDIS_SUPERVISED_SYSTEMD, + "auto", REDIS_SUPERVISED_AUTODETECT, + "no", REDIS_SUPERVISED_NONE, + NULL, REDIS_SUPERVISED_NONE); if (server.sentinel_mode) rewriteConfigSentinelOption(state); /* Step 3: remove all the orphaned lines in the old file, that is, lines diff --git a/src/config.h b/src/config.h index 57d07599a..1ed8ef301 100644 --- a/src/config.h +++ b/src/config.h @@ -48,6 +48,7 @@ #define HAVE_PROC_STAT 1 #define HAVE_PROC_MAPS 1 #define HAVE_PROC_SMAPS 1 +#define HAVE_PROC_SOMAXCONN 1 #endif /* Test for task_info() */ @@ -56,10 +57,15 @@ #endif /* Test for backtrace() */ -#if defined(__APPLE__) || defined(__linux__) +#if defined(__APPLE__) || (defined(__linux__) && defined(__GLIBC__)) #define HAVE_BACKTRACE 1 #endif +/* MSG_NOSIGNAL. */ +#ifdef __linux__ +#define HAVE_MSG_NOSIGNAL 1 +#endif + /* Test for polling API */ #ifdef __linux__ #define HAVE_EPOLL 1 @@ -112,7 +118,7 @@ #define USE_SETPROCTITLE #endif -#if (defined __linux || defined __APPLE__) +#if ((defined __linux && defined(__GLIBC__)) || defined __APPLE__) #define USE_SETPROCTITLE #define INIT_SETPROCTITLE_REPLACEMENT void spt_init(int argc, char *argv[]); diff --git a/src/crc64.c b/src/crc64.c index ecdba90e0..f1f764922 100644 --- a/src/crc64.c +++ b/src/crc64.c @@ -181,9 +181,13 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) { } /* Test main */ -#ifdef TEST_MAIN +#ifdef REDIS_TEST #include <stdio.h> -int main(void) { + +#define UNUSED(x) (void)(x) +int crc64Test(int argc, char *argv[]) { + UNUSED(argc); + UNUSED(argv); printf("e9c6d914c4b8d9ca == %016llx\n", (unsigned long long) crc64(0,(unsigned char*)"123456789",9)); return 0; diff --git a/src/crc64.h b/src/crc64.h index ab375d3f4..c9fca519d 100644 --- a/src/crc64.h +++ b/src/crc64.h @@ -5,4 +5,8 @@ uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); +#ifdef REDIS_TEST +int crc64Test(int argc, char *argv[]); +#endif + #endif @@ -60,7 +60,32 @@ robj *lookupKey(redisDb *db, robj *key) { robj 
*lookupKeyRead(redisDb *db, robj *key) { robj *val; - expireIfNeeded(db,key); + if (expireIfNeeded(db,key) == 1) { + /* Key expired. If we are in the context of a master, expireIfNeeded() + * returns 0 only when the key does not exist at all, so it's save + * to return NULL ASAP. */ + if (server.masterhost == NULL) return NULL; + + /* However if we are in the context of a slave, expireIfNeeded() will + * not really try to expire the key, it only returns information + * about the "logical" status of the key: key expiring is up to the + * master in order to have a consistent view of master's data set. + * + * However, if the command caller is not the master, and as additional + * safety measure, the command invoked is a read-only command, we can + * safely return NULL here, and provide a more consistent behavior + * to clients accessign expired values in a read-only fashion, that + * will say the key as non exisitng. + * + * Notably this covers GETs when slaves are used to scale reads. */ + if (server.current_client && + server.current_client != server.master && + server.current_client->cmd && + server.current_client->cmd->flags & REDIS_CMD_READONLY) + { + return NULL; + } + } val = lookupKey(db,key); if (val == NULL) server.stat_keyspace_misses++; @@ -381,7 +406,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } else if (o->type == REDIS_ZSET) { key = dictGetKey(de); incrRefCount(key); - val = createStringObjectFromLongDouble(*(double*)dictGetVal(de)); + val = createStringObjectFromLongDouble(*(double*)dictGetVal(de),0); } else { redisPanic("Type not handled in SCAN callback."); } @@ -425,8 +450,8 @@ void scanGenericCommand(redisClient *c, robj *o, unsigned long cursor) { list *keys = listCreate(); listNode *node, *nextnode; long count = 10; - sds pat; - int patlen, use_pattern = 0; + sds pat = NULL; + int patlen = 0, use_pattern = 0; dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object @@ -877,7 +902,7 @@ void 
expireGenericCommand(redisClient *c, long long basetime, int unit) { when += basetime; /* No key, return zero. */ - if (lookupKeyRead(c->db,key) == NULL) { + if (lookupKeyWrite(c->db,key) == NULL) { addReply(c,shared.czero); return; } diff --git a/src/debug.c b/src/debug.c index d566f716d..162274cce 100644 --- a/src/debug.c +++ b/src/debug.c @@ -252,6 +252,12 @@ void computeDatasetDigest(unsigned char *final) { } } +void inputCatSds(void *result, const char *str) { + /* result is actually a (sds *), so re-cast it here */ + sds *info = (sds *)result; + *info = sdscat(*info, str); +} + void debugCommand(redisClient *c) { if (!strcasecmp(c->argv[1]->ptr,"segfault")) { *((char*)-1) = 'x'; @@ -295,13 +301,46 @@ void debugCommand(redisClient *c) { val = dictGetVal(de); strenc = strEncoding(val->encoding); + char extra[128] = {0}; + if (val->encoding == REDIS_ENCODING_QUICKLIST) { + char *nextra = extra; + int remaining = sizeof(extra); + quicklist *ql = val->ptr; + /* Add number of quicklist nodes */ + int used = snprintf(nextra, remaining, " ql_nodes:%u", ql->len); + nextra += used; + remaining -= used; + /* Add average quicklist fill factor */ + double avg = (double)ql->count/ql->len; + used = snprintf(nextra, remaining, " ql_avg_node:%.2f", avg); + nextra += used; + remaining -= used; + /* Add quicklist fill level / max ziplist size */ + used = snprintf(nextra, remaining, " ql_ziplist_max:%d", ql->fill); + nextra += used; + remaining -= used; + /* Add isCompressed? 
*/ + int compressed = ql->compress != 0; + used = snprintf(nextra, remaining, " ql_compressed:%d", compressed); + nextra += used; + remaining -= used; + /* Add total uncompressed size */ + unsigned long sz = 0; + for (quicklistNode *node = ql->head; node; node = node->next) { + sz += node->sz; + } + used = snprintf(nextra, remaining, " ql_uncompressed_size:%lu", sz); + nextra += used; + remaining -= used; + } + addReplyStatusFormat(c, "Value at:%p refcount:%d " "encoding:%s serializedlength:%lld " - "lru:%d lru_seconds_idle:%llu", + "lru:%d lru_seconds_idle:%llu%s", (void*)val, val->refcount, strenc, (long long) rdbSavedObjectLen(val), - val->lru, estimateObjectIdleTime(val)); + val->lru, estimateObjectIdleTime(val)/1000, extra); } else if (!strcasecmp(c->argv[1]->ptr,"sdslen") && c->argc == 3) { dictEntry *de; robj *val; @@ -338,7 +377,7 @@ void debugCommand(redisClient *c) { snprintf(buf,sizeof(buf),"%s:%lu", (c->argc == 3) ? "key" : (char*)c->argv[3]->ptr, j); key = createStringObject(buf,strlen(buf)); - if (lookupKeyRead(c->db,key) != NULL) { + if (lookupKeyWrite(c->db,key) != NULL) { decrRefCount(key); continue; } @@ -379,6 +418,25 @@ void debugCommand(redisClient *c) { errstr = sdsmapchars(errstr,"\n\r"," ",2); /* no newlines in errors. 
*/ errstr = sdscatlen(errstr,"\r\n",2); addReplySds(c,errstr); + } else if (!strcasecmp(c->argv[1]->ptr,"structsize") && c->argc == 2) { + sds sizes = sdsempty(); + sizes = sdscatprintf(sizes,"bits:%d ", (sizeof(void*) == 8)?64:32); + sizes = sdscatprintf(sizes,"robj:%d ", (int)sizeof(robj)); + sizes = sdscatprintf(sizes,"dictentry:%d ", (int)sizeof(dictEntry)); + sizes = sdscatprintf(sizes,"sdshdr:%d", (int)sizeof(struct sdshdr)); + addReplyBulkSds(c,sizes); + } else if (!strcasecmp(c->argv[1]->ptr,"jemalloc") && c->argc == 3) { +#if defined(USE_JEMALLOC) + if (!strcasecmp(c->argv[2]->ptr, "info")) { + sds info = sdsempty(); + je_malloc_stats_print(inputCatSds, &info, NULL); + addReplyBulkSds(c, info); + } else { + addReplyErrorFormat(c, "Valid jemalloc debug fields: info"); + } +#else + addReplyErrorFormat(c, "jemalloc support not available"); +#endif } else { addReplyErrorFormat(c, "Unknown DEBUG subcommand or wrong number of arguments for '%s'", (char*)c->argv[1]->ptr); @@ -857,7 +915,7 @@ void sigsegvHandler(int sig, siginfo_t *info, void *secret) { " Suspect RAM error? Use redis-server --test-memory to verify it.\n\n" ); /* free(messages); Don't call free() with possibly corrupted memory. */ - if (server.daemonize) unlink(server.pidfile); + if (server.daemonize && server.supervised == 0) unlink(server.pidfile); /* Make sure we exit with the right signal at the end. So for instance * the core will be dumped if enabled. */ diff --git a/src/dict.c b/src/dict.c index 29d400099..7d8db3631 100644 --- a/src/dict.c +++ b/src/dict.c @@ -342,7 +342,10 @@ dictEntry *dictAddRaw(dict *d, void *key) if ((index = _dictKeyIndex(d, key)) == -1) return NULL; - /* Allocate the memory and store the new entry */ + /* Allocate the memory and store the new entry. + * Insert the element in top, with the assumption that in a database + * system it is more likely that recently added entries are accessed + * more frequently. */ ht = dictIsRehashing(d) ? 
&d->ht[1] : &d->ht[0]; entry = zmalloc(sizeof(*entry)); entry->next = ht->table[index]; diff --git a/src/endianconv.c b/src/endianconv.c index 9adf09c1f..f3b0b4730 100644 --- a/src/endianconv.c +++ b/src/endianconv.c @@ -101,12 +101,16 @@ uint64_t intrev64(uint64_t v) { return v; } -#ifdef TESTMAIN +#ifdef REDIS_TEST #include <stdio.h> -int main(void) { +#define UNUSED(x) (void)(x) +int endianconvTest(int argc, char *argv[]) { char buf[32]; + UNUSED(argc); + UNUSED(argv); + sprintf(buf,"ciaoroma"); memrev16(buf); printf("%s\n", buf); diff --git a/src/endianconv.h b/src/endianconv.h index d93cd99ba..08f553136 100644 --- a/src/endianconv.h +++ b/src/endianconv.h @@ -71,4 +71,8 @@ uint64_t intrev64(uint64_t v); #define ntohu64(v) intrev64(v) #endif +#ifdef REDIS_TEST +int endianconvTest(int argc, char *argv[]); +#endif + #endif diff --git a/src/fmacros.h b/src/fmacros.h index e49735ce5..6e56c759d 100644 --- a/src/fmacros.h +++ b/src/fmacros.h @@ -34,6 +34,7 @@ #if defined(__linux__) #define _GNU_SOURCE +#define _DEFAULT_SOURCE #endif #if defined(_AIX) diff --git a/src/help.h b/src/help.h index 8395c525b..9f4c979df 100644 --- a/src/help.h +++ b/src/help.h @@ -651,8 +651,8 @@ struct commandHelp { 0, "1.0.0" }, { "SPOP", - "key", - "Remove and return a random member from a set", + "key [count]", + "Remove and return one or multiple random members from a set", 3, "1.0.0" }, { "SRANDMEMBER", diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 005beb18f..b3542f997 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -1213,7 +1213,7 @@ void pfcountCommand(redisClient *c) { for (j = 1; j < c->argc; j++) { /* Check type and size. */ robj *o = lookupKeyRead(c->db,c->argv[j]); - if (o == NULL) continue; /* Assume empty HLL for non existing var. 
*/ + if (o == NULL) continue; /* Assume empty HLL for non existing var.*/ if (isHLLObjectOrReply(c,o) != REDIS_OK) return; /* Merge with this HLL with our 'max' HHL by setting max[i] @@ -1233,7 +1233,7 @@ void pfcountCommand(redisClient *c) { * * The user specified a single key. Either return the cached value * or compute one and update the cache. */ - o = lookupKeyRead(c->db,c->argv[1]); + o = lookupKeyWrite(c->db,c->argv[1]); if (o == NULL) { /* No key? Cardinality is zero since no element was added, otherwise * we would have a key as HLLADD creates it as a side effect. */ @@ -1458,7 +1458,7 @@ void pfdebugCommand(redisClient *c) { robj *o; int j; - o = lookupKeyRead(c->db,c->argv[2]); + o = lookupKeyWrite(c->db,c->argv[2]); if (o == NULL) { addReplyError(c,"The specified key does not exist"); return; diff --git a/src/intset.c b/src/intset.c index 5d894e3cd..762bd48c8 100644 --- a/src/intset.c +++ b/src/intset.c @@ -261,6 +261,90 @@ int64_t intsetRandom(intset *is) { return _intsetGet(is,rand()%intrev32ifbe(is->length)); } +/* How many times bigger should the set length be compared to the requested + * count of members for us to use the Floyd algorithm instead of + * the Knuth algorithm */ +#define RANDOMMEMBERS_ALGORITHM_SELECTION_RATIO (2) + +/* Copies 'count' random members from the set into the 'values' array. + * 'values' must be an array of int64_t values, of length 'count'. + * Returns the amount of items returned. If this amount is less than 'count', + * then the remaining 'values' are left uninitialized. */ +int intsetRandomMembers(intset *is, int64_t* values, int count) { + + /* We don't check that is and values are non-NULL - the caller must + * play nice. */ + + int length = intsetLen(is); + + if (count > length) { + /* Return everything in the set */ + count = length; + } + + /* Choose between the Knuth shuffle algorithm, O(1) space, O(length) time, + * and the Floyd algorithm, O(length) space, O(count) time. 
*/ + if ((RANDOMMEMBERS_ALGORITHM_SELECTION_RATIO * count) > length) { + + /* If the count of members requested is almost the length of the set, + * use the Knuth shuffle algorithm, O(1) space, O(length) time. */ + + /* First, fill the values array with unique random indexes inside + * the set. */ + int in, im, rn, rm; + im = 0; + for (in = 0; in < length && im < count; in++) { + + rn = length - in; + rm = count - im; + if (rand() % rn < rm) { + values[im++] = in; + } + } + + } else { + + /* If the length is considerably more than the count of members + * requested, use Robert Floyd's algorithm, O(length) space, + * O(count) time. + * Based on Jon Bentley's Programming Pearls */ + + int64_t *is_used = zcalloc(sizeof(int64_t) * length); + int in, im, r; + + r = 0; + im = 0; + + for (in = length - count; in < length && im < count; in++) { + + /* Generate a random number r */ + r = rand() % (in + 1); + + /* Do we already have the value in r? */ + if (is_used[r]) { + /* Use in instead of the generated number */ + r = in; + } + + values[im++] = r ; + + /* Mark it as used */ + is_used[r] = 1; + } + + zfree(is_used); + } + + /* Replace each random index with the value stored there in the intset */ + uint8_t encoding = intrev32ifbe(is->encoding); + for (int currentValue = 0; currentValue < count; currentValue++) { + values[currentValue] = + _intsetGetEncoded(is, values[currentValue], encoding); + } + + return count; +} + /* Sets the value to the value at the given position. When this position is * out of range the function returns 0, when in range it returns 1. 
*/ uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) { @@ -281,44 +365,46 @@ size_t intsetBlobLen(intset *is) { return sizeof(intset)+intrev32ifbe(is->length)*intrev32ifbe(is->encoding); } -#ifdef INTSET_TEST_MAIN +#ifdef REDIS_TEST #include <sys/time.h> +#include <time.h> -void intsetRepr(intset *is) { - int i; - for (i = 0; i < intrev32ifbe(is->length); i++) { +#if 0 +static void intsetRepr(intset *is) { + for (uint32_t i = 0; i < intrev32ifbe(is->length); i++) { printf("%lld\n", (uint64_t)_intsetGet(is,i)); } printf("\n"); } -void error(char *err) { +static void error(char *err) { printf("%s\n", err); exit(1); } +#endif -void ok(void) { +static void ok(void) { printf("OK\n"); } -long long usec(void) { +static long long usec(void) { struct timeval tv; gettimeofday(&tv,NULL); return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; } #define assert(_e) ((_e)?(void)0:(_assert(#_e,__FILE__,__LINE__),exit(1))) -void _assert(char *estr, char *file, int line) { +static void _assert(char *estr, char *file, int line) { printf("\n\n=== ASSERTION FAILED ===\n"); printf("==> %s:%d '%s' is not true\n",file,line,estr); } -intset *createSet(int bits, int size) { +static intset *createSet(int bits, int size) { uint64_t mask = (1<<bits)-1; - uint64_t i, value; + uint64_t value; intset *is = intsetNew(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { if (bits > 32) { value = (rand()*rand()) & mask; } else { @@ -329,10 +415,8 @@ intset *createSet(int bits, int size) { return is; } -void checkConsistency(intset *is) { - int i; - - for (i = 0; i < (intrev32ifbe(is->length)-1); i++) { +static void checkConsistency(intset *is) { + for (uint32_t i = 0; i < (intrev32ifbe(is->length)-1); i++) { uint32_t encoding = intrev32ifbe(is->encoding); if (encoding == INTSET_ENC_INT16) { @@ -348,11 +432,15 @@ void checkConsistency(intset *is) { } } -int main(int argc, char **argv) { +#define UNUSED(x) (void)(x) +int intsetTest(int argc, char **argv) { uint8_t success; int i; 
intset *is; - sranddev(); + srand(time(NULL)); + + UNUSED(argc); + UNUSED(argv); printf("Value encodings: "); { assert(_intsetValueEncoding(-32768) == INTSET_ENC_INT16); @@ -363,8 +451,10 @@ int main(int argc, char **argv) { assert(_intsetValueEncoding(+2147483647) == INTSET_ENC_INT32); assert(_intsetValueEncoding(-2147483649) == INTSET_ENC_INT64); assert(_intsetValueEncoding(+2147483648) == INTSET_ENC_INT64); - assert(_intsetValueEncoding(-9223372036854775808ull) == INTSET_ENC_INT64); - assert(_intsetValueEncoding(+9223372036854775807ull) == INTSET_ENC_INT64); + assert(_intsetValueEncoding(-9223372036854775808ull) == + INTSET_ENC_INT64); + assert(_intsetValueEncoding(+9223372036854775807ull) == + INTSET_ENC_INT64); ok(); } @@ -378,7 +468,7 @@ int main(int argc, char **argv) { } printf("Large number of random adds: "); { - int inserts = 0; + uint32_t inserts = 0; is = intsetNew(); for (i = 0; i < 1024; i++) { is = intsetAdd(is,rand()%0x800,&success); @@ -461,7 +551,8 @@ int main(int argc, char **argv) { start = usec(); for (i = 0; i < num; i++) intsetSearch(is,rand() % ((1<<bits)-1),NULL); - printf("%ld lookups, %ld element set, %lldusec\n",num,size,usec()-start); + printf("%ld lookups, %ld element set, %lldusec\n", + num,size,usec()-start); } printf("Stress add+delete: "); { @@ -479,5 +570,7 @@ int main(int argc, char **argv) { checkConsistency(is); ok(); } + + return 0; } #endif diff --git a/src/intset.h b/src/intset.h index bd01ff22f..7550df303 100644 --- a/src/intset.h +++ b/src/intset.h @@ -43,8 +43,13 @@ intset *intsetAdd(intset *is, int64_t value, uint8_t *success); intset *intsetRemove(intset *is, int64_t value, int *success); uint8_t intsetFind(intset *is, int64_t value); int64_t intsetRandom(intset *is); +int intsetRandomMembers(intset *is, int64_t* value, int count); uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value); uint32_t intsetLen(intset *is); size_t intsetBlobLen(intset *is); +#ifdef REDIS_TEST +int intsetTest(int argc, char *argv[]); 
+#endif + #endif // __INTSET_H diff --git a/src/latency.c b/src/latency.c index 9875aa164..cb116fb90 100644 --- a/src/latency.c +++ b/src/latency.c @@ -512,7 +512,6 @@ sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) { for (j = 0; j < LATENCY_TS_LEN; j++) { int i = (ts->idx + j) % LATENCY_TS_LEN; int elapsed; - char *label; char buf[64]; if (ts->samples[i].time == 0) continue; @@ -534,8 +533,7 @@ sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) { snprintf(buf,sizeof(buf),"%dh",elapsed/3600); else snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24)); - label = zstrdup(buf); - sparklineSequenceAddSample(seq,ts->samples[i].latency,label); + sparklineSequenceAddSample(seq,ts->samples[i].latency,buf); } graph = sdscatprintf(graph, diff --git a/src/lzfP.h b/src/lzfP.h index c9eae3f6a..c6d2e096c 100644 --- a/src/lzfP.h +++ b/src/lzfP.h @@ -49,7 +49,7 @@ * the difference between 15 and 14 is very small * for small blocks (and 14 is usually a bit faster). * For a low-memory/faster configuration, use HLOG == 13; - * For best compression, use 15 or 16 (or more, up to 23). + * For best compression, use 15 or 16 (or more, up to 22). */ #ifndef HLOG # define HLOG 16 @@ -94,7 +94,7 @@ /* * Avoid assigning values to errno variable? for some embedding purposes * (linux kernel for example), this is necessary. NOTE: this breaks - * the documentation in lzf.h. + * the documentation in lzf.h. Avoiding errno has no speed impact. */ #ifndef AVOID_ERRNO # define AVOID_ERRNO 0 @@ -121,16 +121,52 @@ # define CHECK_INPUT 1 #endif +/* + * Whether to store pointers or offsets inside the hash table. On + * 64 bit architectures, pointers take up twice as much space, + * and might also be slower. Default is to autodetect.
+ */ +/*#define LZF_USE_OFFSETS autodetect */ + /*****************************************************************************/ /* nothing should be changed below */ +#ifdef __cplusplus +# include <cstring> +# include <climits> +using namespace std; +#else +# include <string.h> +# include <limits.h> +#endif + +#ifndef LZF_USE_OFFSETS +# if defined (WIN32) +# define LZF_USE_OFFSETS defined(_M_X64) +# else +# if __cplusplus > 199711L +# include <cstdint> +# else +# include <stdint.h> +# endif +# define LZF_USE_OFFSETS (UINTPTR_MAX > 0xffffffffU) +# endif +#endif + typedef unsigned char u8; -typedef const u8 *LZF_STATE[1 << (HLOG)]; +#if LZF_USE_OFFSETS +# define LZF_HSLOT_BIAS ((const u8 *)in_data) + typedef unsigned int LZF_HSLOT; +#else +# define LZF_HSLOT_BIAS 0 + typedef const u8 *LZF_HSLOT; +#endif + +typedef LZF_HSLOT LZF_STATE[1 << (HLOG)]; #if !STRICT_ALIGN /* for unaligned accesses we need a 16 bit datatype. */ -# include <limits.h> # if USHRT_MAX == 65535 typedef unsigned short u16; # elif UINT_MAX == 65535 @@ -142,17 +178,7 @@ typedef const u8 *LZF_STATE[1 << (HLOG)]; #endif #if ULTRA_FAST -# if defined(VERY_FAST) -# undef VERY_FAST -# endif -#endif - -#if INIT_HTAB -# ifdef __cplusplus -# include <cstring> -# else -# include <string.h> -# endif +# undef VERY_FAST #endif #endif diff --git a/src/lzf_c.c b/src/lzf_c.c index 9e031ad0b..e9c69a0b8 100644 --- a/src/lzf_c.c +++ b/src/lzf_c.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de> + * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de> * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: @@ -40,8 +40,8 @@ /* * don't play with this unless you benchmark! - * decompression is not dependent on the hash function - * the hashing function might seem strange, just believe me + * the data format is not dependent on the hash function.
+ * the hash function might seem strange, just believe me, * it works ;) */ #ifndef FRST @@ -89,9 +89,9 @@ /* * compressed format * - * 000LLLLL <L+1> ; literal - * LLLooooo oooooooo ; backref L - * 111ooooo LLLLLLLL oooooooo ; backref L+7 + * 000LLLLL <L+1> ; literal, L+1=1..33 octets + * LLLooooo oooooooo ; backref L+1=1..7 octets, o+1=1..4096 offset + * 111ooooo LLLLLLLL oooooooo ; backref L+8 octets, o+1=1..4096 offset * */ @@ -106,7 +106,6 @@ lzf_compress (const void *const in_data, unsigned int in_len, #if !LZF_STATE_ARG LZF_STATE htab; #endif - const u8 **hslot; const u8 *ip = (const u8 *)in_data; u8 *op = (u8 *)out_data; const u8 *in_end = ip + in_len; @@ -133,10 +132,6 @@ lzf_compress (const void *const in_data, unsigned int in_len, #if INIT_HTAB memset (htab, 0, sizeof (htab)); -# if 0 - for (hslot = htab; hslot < htab + HSIZE; hslot++) - *hslot++ = ip; -# endif #endif lit = 0; op++; /* start run */ @@ -144,24 +139,23 @@ lzf_compress (const void *const in_data, unsigned int in_len, hval = FRST (ip); while (ip < in_end - 2) { + LZF_HSLOT *hslot; + hval = NEXT (hval, ip); hslot = htab + IDX (hval); - ref = *hslot; *hslot = ip; + ref = *hslot + LZF_HSLOT_BIAS; *hslot = ip - LZF_HSLOT_BIAS; if (1 #if INIT_HTAB && ref < ip /* the next test will actually take care of this, but this is faster */ #endif && (off = ip - ref - 1) < MAX_OFF - && ip + 4 < in_end && ref > (u8 *)in_data -#if STRICT_ALIGN - && ref[0] == ip[0] - && ref[1] == ip[1] && ref[2] == ip[2] +#if STRICT_ALIGN + && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0]) #else && *(u16 *)ref == *(u16 *)ip - && ref[2] == ip[2] #endif ) { @@ -170,12 +164,13 @@ lzf_compress (const void *const in_data, unsigned int in_len, unsigned int maxlen = in_end - ip - len; maxlen = maxlen > MAX_REF ? 
MAX_REF : maxlen; + if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */ + if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */ + return 0; + op [- lit - 1] = lit - 1; /* stop run */ op -= !lit; /* undo run if length is zero */ - if (expect_false (op + 3 + 1 >= out_end)) - return 0; - for (;;) { if (expect_true (maxlen > 16)) @@ -222,6 +217,7 @@ lzf_compress (const void *const in_data, unsigned int in_len, } *op++ = off; + lit = 0; op++; /* start run */ ip += len + 1; @@ -237,12 +233,12 @@ lzf_compress (const void *const in_data, unsigned int in_len, hval = FRST (ip); hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; + htab[IDX (hval)] = ip - LZF_HSLOT_BIAS; ip++; # if VERY_FAST && !ULTRA_FAST hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; + htab[IDX (hval)] = ip - LZF_HSLOT_BIAS; ip++; # endif #else @@ -251,7 +247,7 @@ lzf_compress (const void *const in_data, unsigned int in_len, do { hval = NEXT (hval, ip); - htab[IDX (hval)] = ip; + htab[IDX (hval)] = ip - LZF_HSLOT_BIAS; ip++; } while (len--); diff --git a/src/lzf_d.c b/src/lzf_d.c index 6c723f5e0..c32be8e87 100644 --- a/src/lzf_d.c +++ b/src/lzf_d.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de> + * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de> * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: @@ -43,14 +43,14 @@ # define SET_ERRNO(n) errno = (n) #endif -/* +#if USE_REP_MOVSB /* small win on amd, big loss on intel */ #if (__i386 || __amd64) && __GNUC__ >= 3 # define lzf_movsb(dst, src, len) \ asm ("rep movsb" \ : "=D" (dst), "=S" (src), "=c" (len) \ : "0" (dst), "1" (src), "2" (len)); #endif -*/ +#endif unsigned int lzf_decompress (const void *const in_data, unsigned int in_len, @@ -86,9 +86,17 @@ lzf_decompress (const void *const in_data, unsigned int in_len, #ifdef lzf_movsb lzf_movsb (op, ip, 
ctrl); #else - do - *op++ = *ip++; - while (--ctrl); + switch (ctrl) + { + case 32: *op++ = *ip++; case 31: *op++ = *ip++; case 30: *op++ = *ip++; case 29: *op++ = *ip++; + case 28: *op++ = *ip++; case 27: *op++ = *ip++; case 26: *op++ = *ip++; case 25: *op++ = *ip++; + case 24: *op++ = *ip++; case 23: *op++ = *ip++; case 22: *op++ = *ip++; case 21: *op++ = *ip++; + case 20: *op++ = *ip++; case 19: *op++ = *ip++; case 18: *op++ = *ip++; case 17: *op++ = *ip++; + case 16: *op++ = *ip++; case 15: *op++ = *ip++; case 14: *op++ = *ip++; case 13: *op++ = *ip++; + case 12: *op++ = *ip++; case 11: *op++ = *ip++; case 10: *op++ = *ip++; case 9: *op++ = *ip++; + case 8: *op++ = *ip++; case 7: *op++ = *ip++; case 6: *op++ = *ip++; case 5: *op++ = *ip++; + case 4: *op++ = *ip++; case 3: *op++ = *ip++; case 2: *op++ = *ip++; case 1: *op++ = *ip++; + } #endif } else /* back reference */ @@ -134,12 +142,39 @@ lzf_decompress (const void *const in_data, unsigned int in_len, len += 2; lzf_movsb (op, ref, len); #else - *op++ = *ref++; - *op++ = *ref++; - - do - *op++ = *ref++; - while (--len); + switch (len) + { + default: + len += 2; + + if (op >= ref + len) + { + /* disjunct areas */ + memcpy (op, ref, len); + op += len; + } + else + { + /* overlapping, use octet by octet copying */ + do + *op++ = *ref++; + while (--len); + } + + break; + + case 9: *op++ = *ref++; + case 8: *op++ = *ref++; + case 7: *op++ = *ref++; + case 6: *op++ = *ref++; + case 5: *op++ = *ref++; + case 4: *op++ = *ref++; + case 3: *op++ = *ref++; + case 2: *op++ = *ref++; + case 1: *op++ = *ref++; + case 0: *op++ = *ref++; /* two octets more */ + *op++ = *ref++; + } #endif } } diff --git a/src/memtest.c b/src/memtest.c index 18d821b10..39fc4fcaa 100644 --- a/src/memtest.c +++ b/src/memtest.c @@ -35,6 +35,9 @@ #include <errno.h> #include <termios.h> #include <sys/ioctl.h> +#if defined(__sun) +#include <stropts.h> +#endif #include "config.h" #if (ULONG_MAX == 4294967295UL) diff --git a/src/networking.c
b/src/networking.c index f10a1c5e2..607d225fd 100644 --- a/src/networking.c +++ b/src/networking.c @@ -525,6 +525,14 @@ void addReplyBulkCBuffer(redisClient *c, void *p, size_t len) { addReply(c,shared.crlf); } +/* Add sds to reply (takes ownership of sds and frees it) */ +void addReplyBulkSds(redisClient *c, sds s) { + addReplySds(c,sdscatfmt(sdsempty(),"$%u\r\n", + (unsigned long)sdslen(s))); + addReplySds(c,s); + addReply(c,shared.crlf); +} + /* Add a C nul term string as bulk reply */ void addReplyBulkCString(redisClient *c, char *s) { if (s == NULL) { @@ -839,6 +847,7 @@ void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) { * * However if we are over the maxmemory limit we ignore that and * just deliver as much data as it is possible to deliver. */ + server.stat_net_output_bytes += totwritten; if (totwritten > REDIS_MAX_WRITE_PER_EVENT && (server.maxmemory == 0 || zmalloc_used_memory() < server.maxmemory)) break; @@ -926,8 +935,10 @@ int processInlineBuffer(redisClient *c) { sdsrange(c->querybuf,querylen+2,-1); /* Setup argv array on client structure */ - if (c->argv) zfree(c->argv); - c->argv = zmalloc(sizeof(robj*)*argc); + if (argc) { + if (c->argv) zfree(c->argv); + c->argv = zmalloc(sizeof(robj*)*argc); + } /* Create redis objects for all arguments. */ for (c->argc = 0, j = 0; j < argc; j++) { @@ -1179,6 +1190,7 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) { sdsIncrLen(c->querybuf,nread); c->lastinteraction = server.unixtime; if (c->flags & REDIS_MASTER) c->reploff += nread; + server.stat_net_input_bytes += nread; } else { server.current_client = NULL; return; @@ -1215,17 +1227,6 @@ void getClientsMaxBuffers(unsigned long *longest_output_list, *biggest_input_buffer = bib; } -/* This is a helper function for genClientPeerId(). 
- * It writes the specified ip/port to "peerid" as a null termiated string - * in the form ip:port if ip does not contain ":" itself, otherwise - * [ip]:port format is used (for IPv6 addresses basically). */ -void formatPeerId(char *peerid, size_t peerid_len, char *ip, int port) { - if (strchr(ip,':')) - snprintf(peerid,peerid_len,"[%s]:%d",ip,port); - else - snprintf(peerid,peerid_len,"%s:%d",ip,port); -} - /* A Redis "Peer ID" is a colon separated ip:port pair. * For IPv4 it's in the form x.y.z.k:port, example: "127.0.0.1:1234". * For IPv6 addresses we use [] around the IP part, like in "[::1]:1234". @@ -1234,24 +1235,17 @@ void formatPeerId(char *peerid, size_t peerid_len, char *ip, int port) { * A Peer ID always fits inside a buffer of REDIS_PEER_ID_LEN bytes, including * the null term. * - * The function returns REDIS_OK on succcess, and REDIS_ERR on failure. - * * On failure the function still populates 'peerid' with the "?:0" string * in case you want to relax error checking or need to display something * anyway (see anetPeerToString implementation for more info). */ -int genClientPeerId(redisClient *client, char *peerid, size_t peerid_len) { - char ip[REDIS_IP_STR_LEN]; - int port; - +void genClientPeerId(redisClient *client, char *peerid, + size_t peerid_len) { if (client->flags & REDIS_UNIX_SOCKET) { /* Unix socket client. */ snprintf(peerid,peerid_len,"%s:0",server.unixsocket); - return REDIS_OK; } else { /* TCP client. */ - int retval = anetPeerToString(client->fd,ip,sizeof(ip),&port); - formatPeerId(peerid,peerid_len,ip,port); - return (retval == -1) ? 
REDIS_ERR : REDIS_OK; + anetFormatPeer(client->fd,peerid,peerid_len); } } diff --git a/src/object.c b/src/object.c index 6b8e42477..f75421ee8 100644 --- a/src/object.c +++ b/src/object.c @@ -109,26 +109,44 @@ robj *createStringObjectFromLongLong(long long value) { return o; } -/* Note: this function is defined into object.c since here it is where it - * belongs but it is actually designed to be used just for INCRBYFLOAT */ -robj *createStringObjectFromLongDouble(long double value) { +/* Create a string object from a long double. If humanfriendly is non-zero + * it does not use exponential format and trims trailing zeroes at the end, + * however this results in loss of precision. Otherwise exp format is used + * and the output of snprintf() is not modified. + * + * The 'humanfriendly' option is used for INCRBYFLOAT and HINCRBYFLOAT. */ +robj *createStringObjectFromLongDouble(long double value, int humanfriendly) { char buf[256]; int len; - /* We use 17 digits precision since with 128 bit floats that precision - * after rounding is able to represent most small decimal numbers in a way - * that is "non surprising" for the user (that is, most small decimal - * numbers will be represented in a way that when converted back into - * a string are exactly the same as what the user typed.) */ - len = snprintf(buf,sizeof(buf),"%.17Lf", value); - /* Now remove trailing zeroes after the '.' */ - if (strchr(buf,'.') != NULL) { - char *p = buf+len-1; - while(*p == '0') { - p--; - len--; + if (isinf(value)) { + /* Libc in odd systems (Hi Solaris!) will format infinite in a + * different way, so better to handle it in an explicit way. 
*/ + if (value > 0) { + memcpy(buf,"inf",3); + len = 3; + } else { + memcpy(buf,"-inf",4); + len = 4; } - if (*p == '.') len--; + } else if (humanfriendly) { + /* We use 17 digits precision since with 128 bit floats that precision + * after rounding is able to represent most small decimal numbers in a + * way that is "non surprising" for the user (that is, most small + * decimal numbers will be represented in a way that when converted + * back into a string are exactly the same as what the user typed.) */ + len = snprintf(buf,sizeof(buf),"%.17Lf", value); + /* Now remove trailing zeroes after the '.' */ + if (strchr(buf,'.') != NULL) { + char *p = buf+len-1; + while(*p == '0') { + p--; + len--; + } + if (*p == '.') len--; + } + } else { + len = snprintf(buf,sizeof(buf),"%.17Lg", value); } return createStringObject(buf,len); } @@ -162,11 +180,10 @@ robj *dupStringObject(robj *o) { } } -robj *createListObject(void) { - list *l = listCreate(); +robj *createQuicklistObject(void) { + quicklist *l = quicklistCreate(); robj *o = createObject(REDIS_LIST,l); - listSetFreeMethod(l,decrRefCountVoid); - o->encoding = REDIS_ENCODING_LINKEDLIST; + o->encoding = REDIS_ENCODING_QUICKLIST; return o; } @@ -224,11 +241,8 @@ void freeStringObject(robj *o) { void freeListObject(robj *o) { switch (o->encoding) { - case REDIS_ENCODING_LINKEDLIST: - listRelease((list*) o->ptr); - break; - case REDIS_ENCODING_ZIPLIST: - zfree(o->ptr); + case REDIS_ENCODING_QUICKLIST: + quicklistRelease(o->ptr); break; default: redisPanic("Unknown list encoding type"); @@ -660,7 +674,7 @@ char *strEncoding(int encoding) { case REDIS_ENCODING_RAW: return "raw"; case REDIS_ENCODING_INT: return "int"; case REDIS_ENCODING_HT: return "hashtable"; - case REDIS_ENCODING_LINKEDLIST: return "linkedlist"; + case REDIS_ENCODING_QUICKLIST: return "quicklist"; case REDIS_ENCODING_ZIPLIST: return "ziplist"; case REDIS_ENCODING_INTSET: return "intset"; case REDIS_ENCODING_SKIPLIST: return "skiplist"; diff --git 
a/src/quicklist.c b/src/quicklist.c new file mode 100644 index 000000000..6682b2087 --- /dev/null +++ b/src/quicklist.c @@ -0,0 +1,2639 @@ +/* quicklist.c - A doubly linked list of ziplists + * + * Copyright (c) 2014, Matt Stancliff <matt@genges.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must start the above copyright notice, + * this quicklist of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this quicklist of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <string.h> /* for memcpy */ +#include "quicklist.h" +#include "zmalloc.h" +#include "ziplist.h" +#include "util.h" /* for ll2string */ +#include "lzf.h" + +#if defined(REDIS_TEST) || defined(REDIS_TEST_VERBOSE) +#include <stdio.h> /* for printf (debug printing), snprintf (genstr) */ +#endif + +#ifndef REDIS_STATIC +#define REDIS_STATIC static +#endif + +/* Optimization levels for size-based filling */ +static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536}; + +/* Maximum size in bytes of any multi-element ziplist. + * Larger values will live in their own isolated ziplists. */ +#define SIZE_SAFETY_LIMIT 8192 + +/* Minimum ziplist size in bytes for attempting compression. */ +#define MIN_COMPRESS_BYTES 48 + +/* Minimum size reduction in bytes to store compressed quicklistNode data. + * This also prevents us from storing compression if the compression + * resulted in a larger size than the original data. */ +#define MIN_COMPRESS_IMPROVE 8 + +/* If not verbose testing, remove all debug printing. */ +#ifndef REDIS_TEST_VERBOSE +#define D(...) +#else +#define D(...) \ + do { \ + printf("%s:%s:%d:\t", __FILE__, __FUNCTION__, __LINE__); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + } while (0); +#endif + +/* Simple way to give quicklistEntry structs default values with one call. */ +#define initEntry(e) \ + do { \ + (e)->zi = (e)->value = NULL; \ + (e)->longval = -123456789; \ + (e)->quicklist = NULL; \ + (e)->node = NULL; \ + (e)->offset = 123456789; \ + (e)->sz = 0; \ + } while (0) + +#if __GNUC__ >= 3 +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +/* Create a new quicklist. + * Free with quicklistRelease(). 
*/ +quicklist *quicklistCreate(void) { + struct quicklist *quicklist; + + quicklist = zmalloc(sizeof(*quicklist)); + quicklist->head = quicklist->tail = NULL; + quicklist->len = 0; + quicklist->count = 0; + quicklist->compress = 0; + quicklist->fill = -2; + return quicklist; +} + +#define COMPRESS_MAX (1 << 16) +void quicklistSetCompressDepth(quicklist *quicklist, int compress) { + if (compress > COMPRESS_MAX) { + compress = COMPRESS_MAX; + } else if (compress < 0) { + compress = 0; + } + quicklist->compress = compress; +} + +#define FILL_MAX (1 << 15) +void quicklistSetFill(quicklist *quicklist, int fill) { + if (fill > FILL_MAX) { + fill = FILL_MAX; + } else if (fill < -5) { + fill = -5; + } + quicklist->fill = fill; +} + +void quicklistSetOptions(quicklist *quicklist, int fill, int depth) { + quicklistSetFill(quicklist, fill); + quicklistSetCompressDepth(quicklist, depth); +} + +/* Create a new quicklist with some default parameters. */ +quicklist *quicklistNew(int fill, int compress) { + quicklist *quicklist = quicklistCreate(); + quicklistSetOptions(quicklist, fill, compress); + return quicklist; +} + +REDIS_STATIC quicklistNode *quicklistCreateNode(void) { + quicklistNode *node; + node = zmalloc(sizeof(*node)); + node->zl = NULL; + node->count = 0; + node->sz = 0; + node->next = node->prev = NULL; + node->encoding = QUICKLIST_NODE_ENCODING_RAW; + node->container = QUICKLIST_NODE_CONTAINER_ZIPLIST; + node->recompress = 0; + return node; +} + +/* Return cached quicklist count */ +unsigned int quicklistCount(quicklist *ql) { return ql->count; } + +/* Free entire quicklist. 
*/ +void quicklistRelease(quicklist *quicklist) { + unsigned long len; + quicklistNode *current, *next; + + current = quicklist->head; + len = quicklist->len; + while (len--) { + next = current->next; + + zfree(current->zl); + quicklist->count -= current->count; + + zfree(current); + + quicklist->len--; + current = next; + } + zfree(quicklist); +} + +/* Compress the ziplist in 'node' and update encoding details. + * Returns 1 if ziplist compressed successfully. + * Returns 0 if compression failed or if ziplist too small to compress. */ +REDIS_STATIC int __quicklistCompressNode(quicklistNode *node) { +#ifdef REDIS_TEST + node->attempted_compress = 1; +#endif + + /* Don't bother compressing small values */ + if (node->sz < MIN_COMPRESS_BYTES) + return 0; + + quicklistLZF *lzf = zmalloc(sizeof(*lzf) + node->sz); + + /* Cancel if compression fails or doesn't compress small enough */ + if (((lzf->sz = lzf_compress(node->zl, node->sz, lzf->compressed, + node->sz)) == 0) || + lzf->sz + MIN_COMPRESS_IMPROVE >= node->sz) { + /* lzf_compress aborts/rejects compression if value not compressable. */ + zfree(lzf); + return 0; + } + lzf = zrealloc(lzf, sizeof(*lzf) + lzf->sz); + zfree(node->zl); + node->zl = (unsigned char *)lzf; + node->encoding = QUICKLIST_NODE_ENCODING_LZF; + node->recompress = 0; + return 1; +} + +/* Compress only uncompressed nodes. */ +#define quicklistCompressNode(_node) \ + do { \ + if ((_node) && (_node)->encoding == QUICKLIST_NODE_ENCODING_RAW) { \ + __quicklistCompressNode((_node)); \ + } \ + } while (0) + +/* Uncompress the ziplist in 'node' and update encoding details. + * Returns 1 on successful decode, 0 on failure to decode. 
*/ +REDIS_STATIC int __quicklistDecompressNode(quicklistNode *node) { +#ifdef REDIS_TEST + node->attempted_compress = 0; +#endif + + void *decompressed = zmalloc(node->sz); + quicklistLZF *lzf = (quicklistLZF *)node->zl; + if (lzf_decompress(lzf->compressed, lzf->sz, decompressed, node->sz) == 0) { + /* Someone requested decompress, but we can't decompress. Not good. */ + zfree(decompressed); + return 0; + } + zfree(lzf); + node->zl = decompressed; + node->encoding = QUICKLIST_NODE_ENCODING_RAW; + return 1; +} + +/* Decompress only compressed nodes. */ +#define quicklistDecompressNode(_node) \ + do { \ + if ((_node) && (_node)->encoding == QUICKLIST_NODE_ENCODING_LZF) { \ + __quicklistDecompressNode((_node)); \ + } \ + } while (0) + +/* Force node to not be immediately re-compressible */ +#define quicklistDecompressNodeForUse(_node) \ + do { \ + if ((_node) && (_node)->encoding == QUICKLIST_NODE_ENCODING_LZF) { \ + __quicklistDecompressNode((_node)); \ + (_node)->recompress = 1; \ + } \ + } while (0) + +/* Extract the raw LZF data from this quicklistNode. + * Pointer to LZF data is assigned to '*data'. + * Return value is the length of compressed LZF data. */ +size_t quicklistGetLzf(const quicklistNode *node, void **data) { + quicklistLZF *lzf = (quicklistLZF *)node->zl; + *data = lzf->compressed; + return lzf->sz; +} + +#define quicklistAllowsCompression(_ql) ((_ql)->compress != 0) + +/* Force 'quicklist' to meet compression guidelines set by compress depth. + * The only way to guarantee interior nodes get compressed is to iterate + * to our "interior" compress depth then compress the next node we find. + * If compress depth is larger than the entire list, we return immediately. */ +REDIS_STATIC void __quicklistCompress(const quicklist *quicklist, + quicklistNode *node) { + /* If length is less than our compress depth (from both sides), + * we can't compress anything.
*/ + if (!quicklistAllowsCompression(quicklist) || + quicklist->len < (unsigned int)(quicklist->compress * 2)) + return; + +#if 0 + /* Optimized cases for small depth counts */ + if (quicklist->compress == 1) { + quicklistNode *h = quicklist->head, *t = quicklist->tail; + quicklistDecompressNode(h); + quicklistDecompressNode(t); + if (h != node && t != node) + quicklistCompressNode(node); + return; + } else if (quicklist->compress == 2) { + quicklistNode *h = quicklist->head, *hn = h->next, *hnn = hn->next; + quicklistNode *t = quicklist->tail, *tp = t->prev, *tpp = tp->prev; + quicklistDecompressNode(h); + quicklistDecompressNode(hn); + quicklistDecompressNode(t); + quicklistDecompressNode(tp); + if (h != node && hn != node && t != node && tp != node) { + quicklistCompressNode(node); + } + if (hnn != t) { + quicklistCompressNode(hnn); + } + if (tpp != h) { + quicklistCompressNode(tpp); + } + return; + } +#endif + + /* Iterate until we reach compress depth for both sides of the list. + * Note: because we do length checks at the *top* of this function, + * we can skip explicit null checks below. Everything exists.
*/ + quicklistNode *forward = quicklist->head; + quicklistNode *reverse = quicklist->tail; + int depth = 0; + int in_depth = 0; + while (depth++ < quicklist->compress) { + quicklistDecompressNode(forward); + quicklistDecompressNode(reverse); + + if (forward == node || reverse == node) + in_depth = 1; + + if (forward == reverse) + return; + + forward = forward->next; + reverse = reverse->prev; + } + + if (!in_depth) + quicklistCompressNode(node); + + if (depth > 2) { + /* At this point, forward and reverse are one node beyond depth */ + quicklistCompressNode(forward); + quicklistCompressNode(reverse); + } +} + +#define quicklistCompress(_ql, _node) \ + do { \ + if ((_node)->recompress) \ + quicklistCompressNode((_node)); \ + else \ + __quicklistCompress((_ql), (_node)); \ + } while (0) + +/* If we previously used quicklistDecompressNodeForUse(), just recompress. */ +#define quicklistRecompressOnly(_ql, _node) \ + do { \ + if ((_node)->recompress) \ + quicklistCompressNode((_node)); \ + } while (0) + +/* Insert 'new_node' after 'old_node' if 'after' is 1. + * Insert 'new_node' before 'old_node' if 'after' is 0. + * Note: 'new_node' is *always* uncompressed, so if we assign it to + * head or tail, we do not need to uncompress it. */ +REDIS_STATIC void __quicklistInsertNode(quicklist *quicklist, + quicklistNode *old_node, + quicklistNode *new_node, int after) { + if (after) { + new_node->prev = old_node; + if (old_node) { + new_node->next = old_node->next; + if (old_node->next) + old_node->next->prev = new_node; + old_node->next = new_node; + } + if (quicklist->tail == old_node) + quicklist->tail = new_node; + } else { + new_node->next = old_node; + if (old_node) { + new_node->prev = old_node->prev; + if (old_node->prev) + old_node->prev->next = new_node; + old_node->prev = new_node; + } + if (quicklist->head == old_node) + quicklist->head = new_node; + } + /* If this insert creates the only element so far, initialize head/tail. 
*/ + if (quicklist->len == 0) { + quicklist->head = quicklist->tail = new_node; + } + + if (old_node) + quicklistCompress(quicklist, old_node); + + quicklist->len++; +} + +/* Wrappers for node inserting around existing node. */ +REDIS_STATIC void _quicklistInsertNodeBefore(quicklist *quicklist, + quicklistNode *old_node, + quicklistNode *new_node) { + __quicklistInsertNode(quicklist, old_node, new_node, 0); +} + +REDIS_STATIC void _quicklistInsertNodeAfter(quicklist *quicklist, + quicklistNode *old_node, + quicklistNode *new_node) { + __quicklistInsertNode(quicklist, old_node, new_node, 1); +} + +REDIS_STATIC int +_quicklistNodeSizeMeetsOptimizationRequirement(const size_t sz, + const int fill) { + if (fill >= 0) + return 0; + + size_t offset = (-fill) - 1; + if (offset < (sizeof(optimization_level) / sizeof(*optimization_level))) { + if (sz <= optimization_level[offset]) { + return 1; + } else { + return 0; + } + } else { + return 0; + } +} + +#define sizeMeetsSafetyLimit(sz) ((sz) <= SIZE_SAFETY_LIMIT) + +REDIS_STATIC int _quicklistNodeAllowInsert(const quicklistNode *node, + const int fill, const size_t sz) { + if (unlikely(!node)) + return 0; + + int ziplist_overhead; + /* size of previous offset */ + if (sz < 254) + ziplist_overhead = 1; + else + ziplist_overhead = 5; + + /* size of forward offset */ + if (sz < 64) + ziplist_overhead += 1; + else if (likely(sz < 16384)) + ziplist_overhead += 2; + else + ziplist_overhead += 5; + + /* new_sz overestimates if 'sz' encodes to an integer type */ + unsigned int new_sz = node->sz + sz + ziplist_overhead; + if (likely(_quicklistNodeSizeMeetsOptimizationRequirement(new_sz, fill))) + return 1; + else if (!sizeMeetsSafetyLimit(new_sz)) + return 0; + else if ((int)node->count < fill) + return 1; + else + return 0; +} + +REDIS_STATIC int _quicklistNodeAllowMerge(const quicklistNode *a, + const quicklistNode *b, + const int fill) { + if (!a || !b) + return 0; + + /* approximate merged ziplist size (- 11 to remove one 
ziplist + * header/trailer) */ + unsigned int merge_sz = a->sz + b->sz - 11; + if (likely(_quicklistNodeSizeMeetsOptimizationRequirement(merge_sz, fill))) + return 1; + else if (!sizeMeetsSafetyLimit(merge_sz)) + return 0; + else if ((int)(a->count + b->count) <= fill) + return 1; + else + return 0; +} + +#define quicklistNodeUpdateSz(node) \ + do { \ + (node)->sz = ziplistBlobLen((node)->zl); \ + } while (0) + +/* Add new entry to head node of quicklist. + * + * Returns 0 if used existing head. + * Returns 1 if new head created. */ +int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) { + quicklistNode *orig_head = quicklist->head; + if (likely( + _quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) { + quicklist->head->zl = + ziplistPush(quicklist->head->zl, value, sz, ZIPLIST_HEAD); + quicklistNodeUpdateSz(quicklist->head); + } else { + quicklistNode *node = quicklistCreateNode(); + node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD); + + quicklistNodeUpdateSz(node); + _quicklistInsertNodeBefore(quicklist, quicklist->head, node); + } + quicklist->count++; + quicklist->head->count++; + return (orig_head != quicklist->head); +} + +/* Add new entry to tail node of quicklist. + * + * Returns 0 if used existing tail. + * Returns 1 if new tail created. 
*/ +int quicklistPushTail(quicklist *quicklist, void *value, size_t sz) { + quicklistNode *orig_tail = quicklist->tail; + if (likely( + _quicklistNodeAllowInsert(quicklist->tail, quicklist->fill, sz))) { + quicklist->tail->zl = + ziplistPush(quicklist->tail->zl, value, sz, ZIPLIST_TAIL); + quicklistNodeUpdateSz(quicklist->tail); + } else { + quicklistNode *node = quicklistCreateNode(); + node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_TAIL); + + quicklistNodeUpdateSz(node); + _quicklistInsertNodeAfter(quicklist, quicklist->tail, node); + } + quicklist->count++; + quicklist->tail->count++; + return (orig_tail != quicklist->tail); +} + +/* Create new node consisting of a pre-formed ziplist. + * Used for loading RDBs where entire ziplists have been stored + * to be retrieved later. */ +void quicklistAppendZiplist(quicklist *quicklist, unsigned char *zl) { + quicklistNode *node = quicklistCreateNode(); + + node->zl = zl; + node->count = ziplistLen(node->zl); + node->sz = ziplistBlobLen(zl); + + _quicklistInsertNodeAfter(quicklist, quicklist->tail, node); + quicklist->count += node->count; +} + +/* Append all values of ziplist 'zl' individually into 'quicklist'. + * + * This allows us to restore old RDB ziplists into new quicklists + * with smaller ziplist sizes than the saved RDB ziplist. + * + * Returns 'quicklist' argument. 
Frees passed-in ziplist 'zl' */ +quicklist *quicklistAppendValuesFromZiplist(quicklist *quicklist, + unsigned char *zl) { + unsigned char *value; + unsigned int sz; + long long longval; + char longstr[32] = {0}; + + unsigned char *p = ziplistIndex(zl, 0); + while (ziplistGet(p, &value, &sz, &longval)) { + if (!value) { + /* Write the longval as a string so we can re-add it */ + sz = ll2string(longstr, sizeof(longstr), longval); + value = (unsigned char *)longstr; + } + quicklistPushTail(quicklist, value, sz); + p = ziplistNext(zl, p); + } + zfree(zl); + return quicklist; +} + +/* Create new (potentially multi-node) quicklist from a single existing ziplist. + * + * Returns new quicklist. Frees passed-in ziplist 'zl'. */ +quicklist *quicklistCreateFromZiplist(int fill, int compress, + unsigned char *zl) { + return quicklistAppendValuesFromZiplist(quicklistNew(fill, compress), zl); +} + +#define quicklistDeleteIfEmpty(ql, n) \ + do { \ + if ((n)->count == 0) { \ + __quicklistDelNode((ql), (n)); \ + (n) = NULL; \ + } \ + } while (0) + +REDIS_STATIC void __quicklistDelNode(quicklist *quicklist, + quicklistNode *node) { + if (node->next) + node->next->prev = node->prev; + if (node->prev) + node->prev->next = node->next; + + if (node == quicklist->tail) { + quicklist->tail = node->prev; + } + + if (node == quicklist->head) { + quicklist->head = node->next; + } + + /* If we deleted a node within our compress depth, we + * now have compressed nodes needing to be decompressed. */ + __quicklistCompress(quicklist, NULL); + + quicklist->count -= node->count; + + zfree(node->zl); + zfree(node); + quicklist->len--; +} + +/* Delete one entry from list given the node for the entry and a pointer + * to the entry in the node. + * + * Note: quicklistDelIndex() *requires* uncompressed nodes because you + * already had to get *p from an uncompressed node somewhere. + * + * Returns 1 if the entire node was deleted, 0 if node still exists. 
+ * Also updates in/out param 'p' with the next offset in the ziplist. */ +REDIS_STATIC int quicklistDelIndex(quicklist *quicklist, quicklistNode *node, + unsigned char **p) { + int gone = 0; + + node->zl = ziplistDelete(node->zl, p); + node->count--; + if (node->count == 0) { + gone = 1; + __quicklistDelNode(quicklist, node); + } else { + quicklistNodeUpdateSz(node); + } + quicklist->count--; + /* If we deleted the node, the original node is no longer valid */ + return gone ? 1 : 0; +} + +/* Delete one element represented by 'entry' + * + * 'entry' stores enough metadata to delete the proper position in + * the correct ziplist in the correct quicklist node. */ +void quicklistDelEntry(quicklistIter *iter, quicklistEntry *entry) { + quicklistNode *prev = entry->node->prev; + quicklistNode *next = entry->node->next; + int deleted_node = quicklistDelIndex((quicklist *)entry->quicklist, + entry->node, &entry->zi); + + /* after delete, the zi is now invalid for any future usage. */ + iter->zi = NULL; + + /* If current node is deleted, we must update iterator node and offset. */ + if (deleted_node) { + if (iter->direction == AL_START_HEAD) { + iter->current = next; + iter->offset = 0; + } else if (iter->direction == AL_START_TAIL) { + iter->current = prev; + iter->offset = -1; + } + } + /* else if (!deleted_node), no changes needed. + * we already reset iter->zi above, and the existing iter->offset + * doesn't move again because: + * - [1, 2, 3] => delete offset 1 => [1, 3]: next element still offset 1 + * - [1, 2, 3] => delete offset 0 => [2, 3]: next element still offset 0 + * if we deleted the last element at offet N and now + * length of this ziplist is N-1, the next call into + * quicklistNext() will jump to the next node. */ +} + +/* Replace quicklist entry at offset 'index' by 'data' with length 'sz'. + * + * Returns 1 if replace happened. + * Returns 0 if replace failed and no changes happened. 
*/ +int quicklistReplaceAtIndex(quicklist *quicklist, long index, void *data, + int sz) { + quicklistEntry entry; + if (likely(quicklistIndex(quicklist, index, &entry))) { + /* quicklistIndex provides an uncompressed node */ + entry.node->zl = ziplistDelete(entry.node->zl, &entry.zi); + entry.node->zl = ziplistInsert(entry.node->zl, entry.zi, data, sz); + quicklistCompress(quicklist, entry.node); + return 1; + } else { + return 0; + } +} + +/* Given two nodes, try to merge their ziplists. + * + * This helps us not have a quicklist with 3 element ziplists if + * our fill factor can handle much higher levels. + * + * Note: 'a' must be to the LEFT of 'b'. + * + * After calling this function, both 'a' and 'b' should be considered + * unusable. The return value from this function must be used + * instead of re-using any of the quicklistNode input arguments. + * + * Returns the input node picked to merge against or NULL if + * merging was not possible. */ +REDIS_STATIC quicklistNode *_quicklistZiplistMerge(quicklist *quicklist, + quicklistNode *a, + quicklistNode *b) { + D("Requested merge (a,b) (%u, %u)", a->count, b->count); + + quicklistDecompressNode(a); + quicklistDecompressNode(b); + if ((ziplistMerge(&a->zl, &b->zl))) { + /* We merged ziplists! Now remove the unused quicklistNode. */ + quicklistNode *keep = NULL, *nokeep = NULL; + if (!a->zl) { + nokeep = a; + keep = b; + } else if (!b->zl) { + nokeep = b; + keep = a; + } + keep->count = ziplistLen(keep->zl); + quicklistNodeUpdateSz(keep); + + nokeep->count = 0; + __quicklistDelNode(quicklist, nokeep); + quicklistCompress(quicklist, keep); + return keep; + } else { + /* else, the merge returned NULL and nothing changed. */ + return NULL; + } +} + +/* Attempt to merge ziplists within two nodes on either side of 'center'. 
+ * + * We attempt to merge: + * - (center->prev->prev, center->prev) + * - (center->next, center->next->next) + * - (center->prev, center) + * - (center, center->next) + */ +REDIS_STATIC void _quicklistMergeNodes(quicklist *quicklist, + quicklistNode *center) { + int fill = quicklist->fill; + quicklistNode *prev, *prev_prev, *next, *next_next, *target; + prev = prev_prev = next = next_next = target = NULL; + + if (center->prev) { + prev = center->prev; + if (center->prev->prev) + prev_prev = center->prev->prev; + } + + if (center->next) { + next = center->next; + if (center->next->next) + next_next = center->next->next; + } + + /* Try to merge prev_prev and prev */ + if (_quicklistNodeAllowMerge(prev, prev_prev, fill)) { + _quicklistZiplistMerge(quicklist, prev_prev, prev); + prev_prev = prev = NULL; /* they could have moved, invalidate them. */ + } + + /* Try to merge next and next_next */ + if (_quicklistNodeAllowMerge(next, next_next, fill)) { + _quicklistZiplistMerge(quicklist, next, next_next); + next = next_next = NULL; /* they could have moved, invalidate them. */ + } + + /* Try to merge center node and previous node */ + if (_quicklistNodeAllowMerge(center, center->prev, fill)) { + target = _quicklistZiplistMerge(quicklist, center->prev, center); + center = NULL; /* center could have been deleted, invalidate it. */ + } else { + /* else, we didn't merge here, but target needs to be valid below. */ + target = center; + } + + /* Use result of center merge (or original) to merge with next node. */ + if (_quicklistNodeAllowMerge(target, target->next, fill)) { + _quicklistZiplistMerge(quicklist, target, target->next); + } +} + +/* Split 'node' into two parts, parameterized by 'offset' and 'after'. + * + * The 'after' argument controls which quicklistNode gets returned. + * If 'after'==1, returned node has elements after 'offset'. + * input node keeps elements up to 'offset', including 'offset'. 
+ * If 'after'==0, returned node has elements up to 'offset', including 'offset'. + * input node keeps elements after 'offset'. + * + * If 'after'==1, returned node will have elements _after_ 'offset'. + * The returned node will have elements [OFFSET+1, END]. + * The input node keeps elements [0, OFFSET]. + * + * If 'after'==0, returned node will keep elements up to and including 'offset'. + * The returned node will have elements [0, OFFSET]. + * The input node keeps elements [OFFSET+1, END]. + * + * The input node keeps all elements not taken by the returned node. + * + * Returns newly created node or NULL if split not possible. */ +REDIS_STATIC quicklistNode *_quicklistSplitNode(quicklistNode *node, int offset, + int after) { + size_t zl_sz = node->sz; + + quicklistNode *new_node = quicklistCreateNode(); + new_node->zl = zmalloc(zl_sz); + + /* Copy original ziplist so we can split it */ + memcpy(new_node->zl, node->zl, zl_sz); + + /* -1 here means "continue deleting until the list ends" */ + int orig_start = after ? offset + 1 : 0; + int orig_extent = after ? -1 : offset; + int new_start = after ? 0 : offset; + int new_extent = after ? offset + 1 : -1; + + D("After %d (%d); ranges: [%d, %d], [%d, %d]", after, offset, orig_start, + orig_extent, new_start, new_extent); + + node->zl = ziplistDeleteRange(node->zl, orig_start, orig_extent); + node->count = ziplistLen(node->zl); + quicklistNodeUpdateSz(node); + + new_node->zl = ziplistDeleteRange(new_node->zl, new_start, new_extent); + new_node->count = ziplistLen(new_node->zl); + quicklistNodeUpdateSz(new_node); + + D("After split lengths: orig (%d), new (%d)", node->count, new_node->count); + return new_node; +} + +/* Insert a new entry before or after existing entry 'entry'. + * + * If after==1, the new value is inserted after 'entry', otherwise + * the new value is inserted before 'entry'. 
*/ +REDIS_STATIC void _quicklistInsert(quicklist *quicklist, quicklistEntry *entry, + void *value, const size_t sz, int after) { + int full = 0, at_tail = 0, at_head = 0, full_next = 0, full_prev = 0; + int fill = quicklist->fill; + quicklistNode *node = entry->node; + quicklistNode *new_node = NULL; + + if (!node) { + /* we have no reference node, so let's create only node in the list */ + D("No node given!"); + new_node = quicklistCreateNode(); + new_node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD); + __quicklistInsertNode(quicklist, NULL, new_node, after); + new_node->count++; + quicklist->count++; + return; + } + + /* Populate accounting flags for easier boolean checks later */ + if (!_quicklistNodeAllowInsert(node, fill, sz)) { + D("Current node is full with count %d with requested fill %lu", + node->count, fill); + full = 1; + } + + if (after && (entry->offset == node->count)) { + D("At Tail of current ziplist"); + at_tail = 1; + if (!_quicklistNodeAllowInsert(node->next, fill, sz)) { + D("Next node is full too."); + full_next = 1; + } + } + + if (!after && (entry->offset == 0)) { + D("At Head"); + at_head = 1; + if (!_quicklistNodeAllowInsert(node->prev, fill, sz)) { + D("Prev node is full too."); + full_prev = 1; + } + } + + /* Now determine where and how to insert the new element */ + if (!full && after) { + D("Not full, inserting after current position."); + quicklistDecompressNodeForUse(node); + unsigned char *next = ziplistNext(node->zl, entry->zi); + if (next == NULL) { + node->zl = ziplistPush(node->zl, value, sz, ZIPLIST_TAIL); + } else { + node->zl = ziplistInsert(node->zl, next, value, sz); + } + node->count++; + quicklistNodeUpdateSz(node); + quicklistRecompressOnly(quicklist, node); + } else if (!full && !after) { + D("Not full, inserting before current position."); + quicklistDecompressNodeForUse(node); + node->zl = ziplistInsert(node->zl, entry->zi, value, sz); + node->count++; + quicklistNodeUpdateSz(node); + 
quicklistRecompressOnly(quicklist, node); + } else if (full && at_tail && node->next && !full_next && after) { + /* If we are: at tail, next has free space, and inserting after: + * - insert entry at head of next node. */ + D("Full and tail, but next isn't full; inserting next node head"); + new_node = node->next; + quicklistDecompressNodeForUse(new_node); + new_node->zl = ziplistPush(new_node->zl, value, sz, ZIPLIST_HEAD); + new_node->count++; + quicklistNodeUpdateSz(new_node); + quicklistRecompressOnly(quicklist, new_node); + } else if (full && at_head && node->prev && !full_prev && !after) { + /* If we are: at head, previous has free space, and inserting before: + * - insert entry at tail of previous node. */ + D("Full and head, but prev isn't full, inserting prev node tail"); + new_node = node->prev; + quicklistDecompressNodeForUse(new_node); + new_node->zl = ziplistPush(new_node->zl, value, sz, ZIPLIST_TAIL); + new_node->count++; + quicklistNodeUpdateSz(new_node); + quicklistRecompressOnly(quicklist, new_node); + } else if (full && ((at_tail && node->next && full_next && after) || + (at_head && node->prev && full_prev && !after))) { + /* If we are: full, and our prev/next is full, then: + * - create new node and attach to quicklist */ + D("\tprovisioning new node..."); + new_node = quicklistCreateNode(); + new_node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD); + new_node->count++; + quicklistNodeUpdateSz(new_node); + __quicklistInsertNode(quicklist, node, new_node, after); + } else if (full) { + /* else, node is full we need to split it. */ + /* covers both after and !after cases */ + D("\tsplitting node..."); + quicklistDecompressNodeForUse(node); + new_node = _quicklistSplitNode(node, entry->offset, after); + new_node->zl = ziplistPush(new_node->zl, value, sz, + after ? 
ZIPLIST_HEAD : ZIPLIST_TAIL); + new_node->count++; + quicklistNodeUpdateSz(new_node); + __quicklistInsertNode(quicklist, node, new_node, after); + _quicklistMergeNodes(quicklist, node); + } + + quicklist->count++; +} + +void quicklistInsertBefore(quicklist *quicklist, quicklistEntry *entry, + void *value, const size_t sz) { + _quicklistInsert(quicklist, entry, value, sz, 0); +} + +void quicklistInsertAfter(quicklist *quicklist, quicklistEntry *entry, + void *value, const size_t sz) { + _quicklistInsert(quicklist, entry, value, sz, 1); +} + +/* Delete a range of elements from the quicklist. + * + * elements may span across multiple quicklistNodes, so we + * have to be careful about tracking where we start and end. + * + * Returns 1 if entries were deleted, 0 if nothing was deleted. */ +int quicklistDelRange(quicklist *quicklist, const long start, + const long count) { + if (count <= 0) + return 0; + + unsigned long extent = count; /* range is inclusive of start position */ + + if (start >= 0 && extent > (quicklist->count - start)) { + /* if requesting delete more elements than exist, limit to list size. */ + extent = quicklist->count - start; + } else if (start < 0 && extent > (unsigned long)(-start)) { + /* else, if at negative offset, limit max size to rest of list. */ + extent = -start; /* c.f. LREM -29 29; just delete until end. */ + } + + quicklistEntry entry; + if (!quicklistIndex(quicklist, start, &entry)) + return 0; + + D("Quicklist delete request for start %ld, count %ld, extent: %ld", start, + count, extent); + quicklistNode *node = entry.node; + + /* iterate over next nodes until everything is deleted. */ + while (extent) { + quicklistNode *next = node->next; + + unsigned long del; + int delete_entire_node = 0; + if (entry.offset == 0 && extent >= node->count) { + /* If we are deleting more than the count of this node, we + * can just delete the entire node without ziplist math. 
*/ + delete_entire_node = 1; + del = node->count; + } else if (entry.offset >= 0 && extent >= node->count) { + /* If deleting more nodes after this one, calculate delete based + * on size of current node. */ + del = node->count - entry.offset; + } else if (entry.offset < 0) { + /* If offset is negative, we are in the first run of this loop + * and we are deleting the entire range + * from this start offset to end of list. Since the Negative + * offset is the number of elements until the tail of the list, + * just use it directly as the deletion count. */ + del = -entry.offset; + + /* If the positive offset is greater than the remaining extent, + * we only delete the remaining extent, not the entire offset. + */ + if (del > extent) + del = extent; + } else { + /* else, we are deleting less than the extent of this node, so + * use extent directly. */ + del = extent; + } + + D("[%ld]: asking to del: %ld because offset: %d; (ENTIRE NODE: %d), " + "node count: %u", + extent, del, entry.offset, delete_entire_node, node->count); + + if (delete_entire_node) { + __quicklistDelNode(quicklist, node); + } else { + quicklistDecompressNodeForUse(node); + node->zl = ziplistDeleteRange(node->zl, entry.offset, del); + quicklistNodeUpdateSz(node); + node->count -= del; + quicklist->count -= del; + quicklistDeleteIfEmpty(quicklist, node); + if (node) + quicklistRecompressOnly(quicklist, node); + } + + extent -= del; + + node = next; + + entry.offset = 0; + } + return 1; +} + +/* Passthrough to ziplistCompare() */ +int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len) { + return ziplistCompare(p1, p2, p2_len); +} + +/* Returns a quicklist iterator 'iter'. After the initialization every + * call to quicklistNext() will return the next element of the quicklist. 
*/ +quicklistIter *quicklistGetIterator(const quicklist *quicklist, int direction) { + quicklistIter *iter; + + iter = zmalloc(sizeof(*iter)); + + if (direction == AL_START_HEAD) { + iter->current = quicklist->head; + iter->offset = 0; + } else if (direction == AL_START_TAIL) { + iter->current = quicklist->tail; + iter->offset = -1; + } + + iter->direction = direction; + iter->quicklist = quicklist; + + iter->zi = NULL; + + return iter; +} + +/* Initialize an iterator at a specific offset 'idx' and make the iterator + * return nodes in 'direction' direction. */ +quicklistIter *quicklistGetIteratorAtIdx(const quicklist *quicklist, + const int direction, + const long long idx) { + quicklistEntry entry; + + if (quicklistIndex(quicklist, idx, &entry)) { + quicklistIter *base = quicklistGetIterator(quicklist, direction); + base->zi = NULL; + base->current = entry.node; + base->offset = entry.offset; + return base; + } else { + return NULL; + } +} + +/* Release iterator. + * If we still have a valid current node, then re-encode current node. */ +void quicklistReleaseIterator(quicklistIter *iter) { + if (iter->current) + quicklistCompress(iter->quicklist, iter->current); + + zfree(iter); +} + +/* Get next element in iterator. + * + * Note: You must NOT insert into the list while iterating over it. + * You *may* delete from the list while iterating using the + * quicklistDelEntry() function. + * If you insert into the quicklist while iterating, you should + * re-create the iterator after your addition. + * + * iter = quicklistGetIterator(quicklist,<direction>); + * quicklistEntry entry; + * while (quicklistNext(iter, &entry)) { + * if (entry.value) + * [[ use entry.value with entry.sz ]] + * else + * [[ use entry.longval ]] + * } + * + * Populates 'entry' with values for this iteration. + * Returns 0 when iteration is complete or if iteration not possible. + * If return value is 0, the contents of 'entry' are not valid. 
+ */ +int quicklistNext(quicklistIter *iter, quicklistEntry *entry) { + initEntry(entry); + + if (!iter) { + D("Returning because no iter!"); + return 0; + } + + entry->quicklist = iter->quicklist; + entry->node = iter->current; + + if (!iter->current) { + D("Returning because current node is NULL") + return 0; + } + + unsigned char *(*nextFn)(unsigned char *, unsigned char *) = NULL; + int offset_update = 0; + + if (!iter->zi) { + /* If !zi, use current index. */ + quicklistDecompressNodeForUse(iter->current); + iter->zi = ziplistIndex(iter->current->zl, iter->offset); + } else { + /* else, use existing iterator offset and get prev/next as necessary. */ + if (iter->direction == AL_START_HEAD) { + nextFn = ziplistNext; + offset_update = 1; + } else if (iter->direction == AL_START_TAIL) { + nextFn = ziplistPrev; + offset_update = -1; + } + iter->zi = nextFn(iter->current->zl, iter->zi); + iter->offset += offset_update; + } + + entry->zi = iter->zi; + entry->offset = iter->offset; + + if (iter->zi) { + /* Populate value from existing ziplist position */ + ziplistGet(entry->zi, &entry->value, &entry->sz, &entry->longval); + return 1; + } else { + /* We ran out of ziplist entries. + * Pick next node, update offset, then re-run retrieval. */ + quicklistCompress(iter->quicklist, iter->current); + if (iter->direction == AL_START_HEAD) { + /* Forward traversal */ + D("Jumping to start of next node"); + iter->current = iter->current->next; + iter->offset = 0; + } else if (iter->direction == AL_START_TAIL) { + /* Reverse traversal */ + D("Jumping to end of previous node"); + iter->current = iter->current->prev; + iter->offset = -1; + } + iter->zi = NULL; + return quicklistNext(iter, entry); + } +} + +/* Duplicate the quicklist. + * On success a copy of the original quicklist is returned. + * + * The original quicklist both on success or error is never modified. + * + * Returns newly allocated quicklist. 
*/ +quicklist *quicklistDup(quicklist *orig) { + quicklist *copy; + + copy = quicklistNew(orig->fill, orig->compress); + + for (quicklistNode *current = orig->head; current; + current = current->next) { + quicklistNode *node = quicklistCreateNode(); + + if (node->encoding == QUICKLIST_NODE_ENCODING_LZF) { + quicklistLZF *lzf = (quicklistLZF *)node->zl; + size_t lzf_sz = sizeof(*lzf) + lzf->sz; + node->zl = zmalloc(lzf_sz); + memcpy(node->zl, current->zl, lzf_sz); + } else if (node->encoding == QUICKLIST_NODE_ENCODING_RAW) { + node->zl = zmalloc(current->sz); + memcpy(node->zl, current->zl, current->sz); + } + + node->count = current->count; + copy->count += node->count; + node->sz = current->sz; + node->encoding = current->encoding; + + _quicklistInsertNodeAfter(copy, copy->tail, node); + } + + /* copy->count must equal orig->count here */ + return copy; +} + +/* Populate 'entry' with the element at the specified zero-based index + * where 0 is the head, 1 is the element next to head + * and so on. Negative integers are used in order to count + * from the tail, -1 is the last element, -2 the penultimate + * and so on. If the index is out of range 0 is returned. + * + * Returns 1 if element found + * Returns 0 if element not found */ +int quicklistIndex(const quicklist *quicklist, const long long idx, + quicklistEntry *entry) { + quicklistNode *n; + unsigned long long accum = 0; + unsigned long long index; + int forward = idx < 0 ? 0 : 1; /* < 0 -> reverse, 0+ -> forward */ + + initEntry(entry); + entry->quicklist = quicklist; + + if (!forward) { + index = (-idx) - 1; + n = quicklist->tail; + } else { + index = idx; + n = quicklist->head; + } + + if (index >= quicklist->count) + return 0; + + while (likely(n)) { + if ((accum + n->count) > index) { + break; + } else { + D("Skipping over (%p) %u at accum %lld", (void *)n, n->count, + accum); + accum += n->count; + n = forward ? 
n->next : n->prev; + } + } + + if (!n) + return 0; + + D("Found node: %p at accum %llu, idx %llu, sub+ %llu, sub- %llu", (void *)n, + accum, index, index - accum, (-index) - 1 + accum); + + entry->node = n; + if (forward) { + /* forward = normal head-to-tail offset. */ + entry->offset = index - accum; + } else { + /* reverse = need negative offset for tail-to-head, so undo + * the result of the original if (index < 0) above. */ + entry->offset = (-index) - 1 + accum; + } + + quicklistDecompressNodeForUse(entry->node); + entry->zi = ziplistIndex(entry->node->zl, entry->offset); + ziplistGet(entry->zi, &entry->value, &entry->sz, &entry->longval); + /* The caller will use our result, so we don't re-compress here. + * The caller can recompress or delete the node as needed. */ + return 1; +} + +/* Rotate quicklist by moving the tail element to the head. */ +void quicklistRotate(quicklist *quicklist) { + if (quicklist->count <= 1) + return; + + /* First, get the tail entry */ + unsigned char *p = ziplistIndex(quicklist->tail->zl, -1); + unsigned char *value; + long long longval; + unsigned int sz; + char longstr[32] = {0}; + ziplistGet(p, &value, &sz, &longval); + + /* If value found is NULL, then ziplistGet populated longval instead */ + if (!value) { + /* Write the longval as a string so we can re-add it */ + sz = ll2string(longstr, sizeof(longstr), longval); + value = (unsigned char *)longstr; + } + + /* Add tail entry to head (must happen before tail is deleted). */ + quicklistPushHead(quicklist, value, sz); + + /* If quicklist has only one node, the head ziplist is also the + * tail ziplist and PushHead() could have reallocated our single ziplist, + * which would make our pre-existing 'p' unusable. */ + if (quicklist->len == 1) { + p = ziplistIndex(quicklist->tail->zl, -1); + } + + /* Remove tail entry. */ + quicklistDelIndex(quicklist, quicklist->tail, &p); +} + +/* pop from quicklist and return result in 'data' ptr. 
Value of 'data' + * is the return value of 'saver' function pointer if the data is NOT a number. + * + * If the quicklist element is a long long, then the return value is returned in + * 'sval'. + * + * Return value of 0 means no elements available. + * Return value of 1 means check 'data' and 'sval' for values. + * If 'data' is set, use 'data' and 'sz'. Otherwise, use 'sval'. */ +int quicklistPopCustom(quicklist *quicklist, int where, unsigned char **data, + unsigned int *sz, long long *sval, + void *(*saver)(unsigned char *data, unsigned int sz)) { + unsigned char *p; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + int pos = (where == QUICKLIST_HEAD) ? 0 : -1; + + if (quicklist->count == 0) + return 0; + + if (data) + *data = NULL; + if (sz) + *sz = 0; + if (sval) + *sval = -123456789; + + quicklistNode *node; + if (where == QUICKLIST_HEAD && quicklist->head) { + node = quicklist->head; + } else if (where == QUICKLIST_TAIL && quicklist->tail) { + node = quicklist->tail; + } else { + return 0; + } + + p = ziplistIndex(node->zl, pos); + if (ziplistGet(p, &vstr, &vlen, &vlong)) { + if (vstr) { + if (data) + *data = saver(vstr, vlen); + if (sz) + *sz = vlen; + } else { + if (data) + *data = NULL; + if (sval) + *sval = vlong; + } + quicklistDelIndex(quicklist, node, &p); + return 1; + } + return 0; +} + +/* Return a malloc'd copy of data passed in */ +REDIS_STATIC void *_quicklistSaver(unsigned char *data, unsigned int sz) { + unsigned char *vstr; + if (data) { + vstr = zmalloc(sz); + memcpy(data, vstr, sz); + return vstr; + } + return NULL; +} + +/* Default pop function + * + * Returns malloc'd value from quicklist */ +int quicklistPop(quicklist *quicklist, int where, unsigned char **data, + unsigned int *sz, long long *slong) { + unsigned char *vstr; + unsigned int vlen; + long long vlong; + if (quicklist->count == 0) + return 0; + int ret = quicklistPopCustom(quicklist, where, &vstr, &vlen, &vlong, + _quicklistSaver); + if (data) + *data = vstr; + 
if (slong) + *slong = vlong; + if (sz) + *sz = vlen; + return ret; +} + +/* Wrapper to allow argument-based switching between HEAD/TAIL pop */ +void quicklistPush(quicklist *quicklist, void *value, const size_t sz, + int where) { + if (where == QUICKLIST_HEAD) { + quicklistPushHead(quicklist, value, sz); + } else if (where == QUICKLIST_TAIL) { + quicklistPushTail(quicklist, value, sz); + } +} + +/* The rest of this file is test cases and test helpers. */ +#ifdef REDIS_TEST +#include <stdint.h> +#include <sys/time.h> + +#define assert(_e) \ + do { \ + if (!(_e)) { \ + printf("\n\n=== ASSERTION FAILED ===\n"); \ + printf("==> %s:%d '%s' is not true\n", __FILE__, __LINE__, #_e); \ + err++; \ + } \ + } while (0) + +#define yell(str, ...) printf("ERROR! " str "\n\n", __VA_ARGS__) + +#define OK printf("\tOK\n") + +#define ERROR \ + do { \ + printf("\tERROR!\n"); \ + err++; \ + } while (0) + +#define ERR(x, ...) \ + do { \ + printf("%s:%s:%d:\t", __FILE__, __FUNCTION__, __LINE__); \ + printf("ERROR! " x "\n", __VA_ARGS__); \ + err++; \ + } while (0) + +#define TEST(name) printf("test — %s\n", name); +#define TEST_DESC(name, ...) printf("test — " name "\n", __VA_ARGS__); + +#define QL_TEST_VERBOSE 0 + +#define UNUSED(x) (void)(x) +static void ql_info(quicklist *ql) { +#if QL_TEST_VERBOSE + printf("Container length: %lu\n", ql->len); + printf("Container size: %lu\n", ql->count); + if (ql->head) + printf("\t(zsize head: %d)\n", ziplistLen(ql->head->zl)); + if (ql->tail) + printf("\t(zsize tail: %d)\n", ziplistLen(ql->tail->zl)); + printf("\n"); +#else + UNUSED(ql); +#endif +} + +/* Return the UNIX time in microseconds */ +static long long ustime(void) { + struct timeval tv; + long long ust; + + gettimeofday(&tv, NULL); + ust = ((long long)tv.tv_sec) * 1000000; + ust += tv.tv_usec; + return ust; +} + +/* Return the UNIX time in milliseconds */ +static long long mstime(void) { return ustime() / 1000; } + +/* Iterate over an entire quicklist. + * Print the list if 'print' == 1. 
+ * + * Returns physical count of elements found by iterating over the list. */ +static int _itrprintr(quicklist *ql, int print, int forward) { + quicklistIter *iter = + quicklistGetIterator(ql, forward ? AL_START_HEAD : AL_START_TAIL); + quicklistEntry entry; + int i = 0; + int p = 0; + quicklistNode *prev = NULL; + while (quicklistNext(iter, &entry)) { + if (entry.node != prev) { + /* Count the number of list nodes too */ + p++; + prev = entry.node; + } + if (print) { + printf("[%3d (%2d)]: [%.*s] (%lld)\n", i, p, entry.sz, + (char *)entry.value, entry.longval); + } + i++; + } + quicklistReleaseIterator(iter); + return i; +} +static int itrprintr(quicklist *ql, int print) { + return _itrprintr(ql, print, 1); +} + +static int itrprintr_rev(quicklist *ql, int print) { + return _itrprintr(ql, print, 0); +} + +#define ql_verify(a, b, c, d, e) \ + do { \ + err += _ql_verify((a), (b), (c), (d), (e)); \ + } while (0) + +/* Verify list metadata matches physical list contents. */ +static int _ql_verify(quicklist *ql, uint32_t len, uint32_t count, + uint32_t head_count, uint32_t tail_count) { + int errors = 0; + + ql_info(ql); + if (len != ql->len) { + yell("quicklist length wrong: expected %d, got %u", len, ql->len); + errors++; + } + + if (count != ql->count) { + yell("quicklist count wrong: expected %d, got %lu", count, ql->count); + errors++; + } + + int loopr = itrprintr(ql, 0); + if (loopr != (int)ql->count) { + yell("quicklist cached count not match actual count: expected %lu, got " + "%d", + ql->count, loopr); + errors++; + } + + int rloopr = itrprintr_rev(ql, 0); + if (loopr != rloopr) { + yell("quicklist has different forward count than reverse count! " + "Forward count is %d, reverse count is %d.", + loopr, rloopr); + errors++; + } + + if (ql->len == 0 && !errors) { + OK; + return errors; + } + + if (ql->head && head_count != ql->head->count && + head_count != ziplistLen(ql->head->zl)) { + yell("quicklist head count wrong: expected %d, " + "got cached %d vs. 
actual %d", + head_count, ql->head->count, ziplistLen(ql->head->zl)); + errors++; + } + + if (ql->tail && tail_count != ql->tail->count && + tail_count != ziplistLen(ql->tail->zl)) { + yell("quicklist tail count wrong: expected %d, " + "got cached %u vs. actual %d", + tail_count, ql->tail->count, ziplistLen(ql->tail->zl)); + errors++; + } + + if (quicklistAllowsCompression(ql)) { + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + for (unsigned int at = 0; at < ql->len; at++, node = node->next) { + if (node && (at < low_raw || at >= high_raw)) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + yell("Incorrect compression: node %d is " + "compressed at depth %d ((%u, %u); total " + "nodes: %u; size: %u; recompress: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress); + errors++; + } + } else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && + !node->attempted_compress) { + yell("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %u; size: %u; recompress: %d; attempted: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress, node->attempted_compress); + errors++; + } + } + } + } + + if (!errors) + OK; + return errors; +} + +/* Generate new string concatenating integer i against string 'prefix' */ +static char *genstr(char *prefix, int i) { + static char result[64] = {0}; + snprintf(result, sizeof(result), "%s%d", prefix, i); + return result; +} + +/* main test, but callable from other files */ +int quicklistTest(int argc, char *argv[]) { + UNUSED(argc); + UNUSED(argv); + + unsigned int err = 0; + int optimize_start = + -(int)(sizeof(optimization_level) / sizeof(*optimization_level)); + + printf("Starting optimization offset at: %d\n", optimize_start); + + int options[] = {0, 1, 2, 3, 4, 5, 6, 10}; + size_t option_count = sizeof(options) / sizeof(*options); + long long 
runtime[option_count]; + + for (int _i = 0; _i < (int)option_count; _i++) { + printf("Testing Option %d\n", options[_i]); + long long start = mstime(); + + TEST("create list") { + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("add to tail of empty list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + + TEST("add to head of empty list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + + for (int f = optimize_start; f < 32; f++) { + TEST_DESC("add to tail 5x at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 5; i++) + quicklistPushTail(ql, genstr("hello", i), 32); + if (ql->count != 5) + ERROR; + if (f == 32) /* NOTE(review): never true here, the loop stops at f < 32 */ + ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 32; f++) { + TEST_DESC("add to head 5x at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 5; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 5) + ERROR; + if (f == 32) /* NOTE(review): never true here, the loop stops at f < 32 */ + ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 512; f++) { + TEST_DESC("add to tail 500x at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i), 64); + if (ql->count != 500) + ERROR; + if (f == 32) + ql_verify(ql, 16, 500, 32, 20); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 512; f++) { + TEST_DESC("add to head 500x at fill %d at compress %d", f, + options[_i]) { + 
quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 500) + ERROR; + if (f == 32) + ql_verify(ql, 16, 500, 20, 32); + quicklistRelease(ql); + } + } + + TEST("rotate empty") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistRotate(ql); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + for (int f = optimize_start; f < 32; f++) { + TEST("rotate one val once") { + quicklist *ql = quicklistNew(f, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistRotate(ql); + /* Ignore compression verify because ziplist is + * too small to compress. */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 3; f++) { + TEST_DESC("rotate 500 val 5000 times at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + quicklistPushHead(ql, "900", 3); + quicklistPushHead(ql, "7000", 4); + quicklistPushHead(ql, "-1200", 5); + quicklistPushHead(ql, "42", 2); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 64); + ql_info(ql); + for (int i = 0; i < 5000; i++) { + ql_info(ql); + quicklistRotate(ql); + } + if (f == 1) + ql_verify(ql, 504, 504, 1, 1); + else if (f == 2) + ql_verify(ql, 252, 504, 2, 2); + else if (f == 32) + ql_verify(ql, 16, 504, 32, 24); + quicklistRelease(ql); + } + } + + TEST("pop empty") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPop(ql, QUICKLIST_HEAD, NULL, NULL, NULL); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("pop 1 string from 1") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, genstr("hello", 331), 32); + unsigned char *data; + unsigned int sz; + long long lv; + ql_info(ql); + quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + assert(data != NULL); + assert(sz == 32); + zfree(data); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("pop head 1 number 
from 1") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "55513", 5); + unsigned char *data; + unsigned int sz; + long long lv; + ql_info(ql); + quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + assert(data == NULL); + assert(lv == 55513); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("pop head 500 from 500") { + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + ql_info(ql); + for (int i = 0; i < 500; i++) { + unsigned char *data; + unsigned int sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + assert(ret == 1); + assert(data != NULL); + assert(sz == 32); + zfree(data); + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("pop head 5000 from 500") { + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 5000; i++) { + unsigned char *data; + unsigned int sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + if (i < 500) { + assert(ret == 1); + assert(data != NULL); + assert(sz == 32); + zfree(data); + } else { + assert(ret == 0); + } + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("iterate forward over 500 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 499, count = 0; + while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) + ERR("value [%s] didn't match [%s] at position %d", + entry.value, h, i); + i--; + count++; + } + /* Report the number of elements visited, not the countdown index. */ + if (count != 500) + ERR("Didn't iterate over exactly 500 elements (%d)", count); + ql_verify(ql, 16, 500, 20, 32); + quicklistReleaseIterator(iter); + 
quicklistRelease(ql); + } + + TEST("iterate reverse over 500 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) + ERR("value [%s] didn't match [%s] at position %d", + entry.value, h, i); + i++; + } + if (i != 500) + ERR("Didn't iterate over exactly 500 elements (%d)", i); + ql_verify(ql, 16, 500, 20, 32); + quicklistReleaseIterator(iter); + quicklistRelease(ql); + } + + TEST("insert before with 0 elements") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + quicklistInsertBefore(ql, &entry, "abc", 4); + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + + TEST("insert after with 0 elements") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + quicklistInsertAfter(ql, &entry, "abc", 4); + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + + TEST("insert after 1 element") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + quicklistInsertAfter(ql, &entry, "abc", 4); + ql_verify(ql, 1, 2, 2, 2); + quicklistRelease(ql); + } + + TEST("insert before 1 element") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + /* Exercise the insert-before path; the insert-after path is covered + * by the previous test. */ + quicklistInsertBefore(ql, &entry, "abc", 4); + ql_verify(ql, 1, 2, 2, 2); + quicklistRelease(ql); + } + + for (int f = optimize_start; f < 12; f++) { + TEST_DESC("insert once in elements while iterating at fill %d at " + "compress %d\n", + f, options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + quicklistPushTail(ql, 
"abc", 3); + quicklistSetFill(ql, 1); + quicklistPushTail(ql, "def", 3); /* force to unique node */ + quicklistSetFill(ql, f); + quicklistPushTail(ql, "bob", 3); /* force to reset for +3 */ + quicklistPushTail(ql, "foo", 3); + quicklistPushTail(ql, "zoo", 3); + + itrprintr(ql, 0); + /* insert "bar" before "bob" while iterating over list. */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + while (quicklistNext(iter, &entry)) { + if (!strncmp((char *)entry.value, "bob", 3)) { + /* Insert as fill = 1 so it spills into new node. */ + quicklistInsertBefore(ql, &entry, "bar", 3); + break; /* didn't we fix insert-while-iterating? */ + } + } + itrprintr(ql, 0); + + /* verify results */ + quicklistIndex(ql, 0, &entry); + if (strncmp((char *)entry.value, "abc", 3)) + ERR("Value 0 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistIndex(ql, 1, &entry); + if (strncmp((char *)entry.value, "def", 3)) + ERR("Value 1 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistIndex(ql, 2, &entry); + if (strncmp((char *)entry.value, "bar", 3)) + ERR("Value 2 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistIndex(ql, 3, &entry); + if (strncmp((char *)entry.value, "bob", 3)) + ERR("Value 3 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistIndex(ql, 4, &entry); + if (strncmp((char *)entry.value, "foo", 3)) + ERR("Value 4 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistIndex(ql, 5, &entry); + if (strncmp((char *)entry.value, "zoo", 3)) + ERR("Value 5 didn't match, instead got: %.*s", entry.sz, + entry.value); + quicklistReleaseIterator(iter); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 1024; f++) { + TEST_DESC( + "insert [before] 250 new in middle of 500 elements at fill" + " %d at compress %d", + f, options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, 
genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + quicklistIndex(ql, 250, &entry); + quicklistInsertBefore(ql, &entry, genstr("abc", i), 32); + } + if (f == 32) + ql_verify(ql, 25, 750, 32, 20); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 1024; f++) { + TEST_DESC("insert [after] 250 new in middle of 500 elements at " + "fill %d at compress %d", + f, options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + quicklistIndex(ql, 250, &entry); + quicklistInsertAfter(ql, &entry, genstr("abc", i), 32); + } + + if (ql->count != 750) + ERR("List size not 750, but rather %ld", ql->count); + + if (f == 32) + ql_verify(ql, 26, 750, 20, 32); + quicklistRelease(ql); + } + } + + TEST("duplicate empty list") { + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 0, 0, 0, 0); + quicklistRelease(ql); + quicklistRelease(copy); + } + + TEST("duplicate list of 1 element") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, genstr("hello", 3), 32); + ql_verify(ql, 1, 1, 1, 1); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 1, 1, 1, 1); + quicklistRelease(ql); + quicklistRelease(copy); + } + + TEST("duplicate list of 500") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 16, 500, 20, 32); + + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 16, 500, 20, 32); + quicklistRelease(ql); + quicklistRelease(copy); + } + + for (int f = optimize_start; f < 512; f++) { + TEST_DESC("index 1,200 from 500 list at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + 
quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + quicklistIndex(ql, 1, &entry); + if (!strcmp((char *)entry.value, "hello2")) + OK; + else + ERR("Value: %s", entry.value); + quicklistIndex(ql, 200, &entry); + if (!strcmp((char *)entry.value, "hello201")) + OK; + else + ERR("Value: %s", entry.value); + quicklistRelease(ql); + } + + TEST_DESC("index -1,-2 from 500 list at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + quicklistIndex(ql, -1, &entry); + if (!strcmp((char *)entry.value, "hello500")) + OK; + else + ERR("Value: %s", entry.value); + quicklistIndex(ql, -2, &entry); + if (!strcmp((char *)entry.value, "hello499")) + OK; + else + ERR("Value: %s", entry.value); + quicklistRelease(ql); + } + + TEST_DESC("index -100 from 500 list at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + quicklistIndex(ql, -100, &entry); + if (!strcmp((char *)entry.value, "hello401")) + OK; + else + ERR("Value: %s", entry.value); + quicklistRelease(ql); + } + + TEST_DESC("index too big +1 from 50 list at fill %d at compress %d", + f, options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + for (int i = 0; i < 50; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + if (quicklistIndex(ql, 50, &entry)) + ERR("Index found at 50 with 50 list: %.*s", entry.sz, + entry.value); + else + OK; + quicklistRelease(ql); + } + } + + TEST("delete range empty list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistDelRange(ql, 5, 20); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("delete range of entire node in list of one node") { + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; 
i < 32; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 32); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("delete range of entire node with overflow counts") { + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 32; i++) + quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 128); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + + TEST("delete middle 100 of 500 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, 200, 100); + ql_verify(ql, 14, 400, 32, 20); + quicklistRelease(ql); + } + + TEST("delete negative 1 from 500 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 1); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + } + + TEST("delete negative 1 from 500 list with overflow counts") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 128); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + } + + TEST("delete negative 100 from 500 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) + quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistDelRange(ql, -100, 100); + ql_verify(ql, 13, 400, 32, 16); + quicklistRelease(ql); + } + + TEST("delete -10 count 5 from 50 list") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 50; i++) + 
quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 2, 50, 32, 18); + quicklistDelRange(ql, -10, 5); + ql_verify(ql, 2, 45, 32, 13); + quicklistRelease(ql); + } + + TEST("numbers only list read") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "1111", 4); + quicklistPushTail(ql, "2222", 4); + quicklistPushTail(ql, "3333", 4); + quicklistPushTail(ql, "4444", 4); + ql_verify(ql, 1, 4, 4, 4); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + if (entry.longval != 1111) + ERR("Not 1111, %lld", entry.longval); + quicklistIndex(ql, 1, &entry); + if (entry.longval != 2222) + ERR("Not 2222, %lld", entry.longval); + quicklistIndex(ql, 2, &entry); + if (entry.longval != 3333) + ERR("Not 3333, %lld", entry.longval); + quicklistIndex(ql, 3, &entry); + if (entry.longval != 4444) + ERR("Not 4444, %lld", entry.longval); + if (quicklistIndex(ql, 4, &entry)) + ERR("Index past elements: %lld", entry.longval); + quicklistIndex(ql, -1, &entry); + if (entry.longval != 4444) + ERR("Not 4444 (reverse), %lld", entry.longval); + quicklistIndex(ql, -2, &entry); + if (entry.longval != 3333) + ERR("Not 3333 (reverse), %lld", entry.longval); + quicklistIndex(ql, -3, &entry); + if (entry.longval != 2222) + ERR("Not 2222 (reverse), %lld", entry.longval); + quicklistIndex(ql, -4, &entry); + if (entry.longval != 1111) + ERR("Not 1111 (reverse), %lld", entry.longval); + if (quicklistIndex(ql, -5, &entry)) + ERR("Index past elements (reverse), %lld", entry.longval); + quicklistRelease(ql); + } + + TEST("numbers larger list read") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 5000; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistEntry entry; + for (int i = 0; i < 5000; i++) { + quicklistIndex(ql, i, &entry); + if 
(entry.longval != nums[i]) + ERR("[%d] Not longval %lld but rather %lld", i, nums[i], + entry.longval); + entry.longval = 0xdeadbeef; + } + quicklistIndex(ql, 5000, &entry); + if (strncmp((char *)entry.value, "xxxxxxxxxxxxxxxxxxxx", 20)) + ERR("String val not match: %s", entry.value); + ql_verify(ql, 157, 5001, 32, 9); + quicklistRelease(ql); + } + + TEST("numbers larger list read B") { + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "99", 2); + quicklistPushTail(ql, "98", 2); + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistPushTail(ql, "96", 2); + quicklistPushTail(ql, "95", 2); + quicklistReplaceAtIndex(ql, 1, "foo", 3); + quicklistReplaceAtIndex(ql, -1, "bar", 3); + quicklistRelease(ql); + OK; + } + + for (int f = optimize_start; f < 16; f++) { + TEST_DESC("lrem test at fill %d at compress %d", f, options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + char *words[] = {"abc", "foo", "bar", "foobar", "foobared", + "zap", "bar", "test", "foo"}; + char *result[] = {"abc", "foo", "foobar", "foobared", + "zap", "test", "foo"}; + char *resultB[] = {"abc", "foo", "foobar", + "foobared", "zap", "test"}; + for (int i = 0; i < 9; i++) + quicklistPushTail(ql, words[i], strlen(words[i])); + + /* lrem 0 bar */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(entry.zi, (unsigned char *)"bar", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + quicklistReleaseIterator(iter); + + /* check result of lrem 0 bar */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + int ok = 1; + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + if (strncmp((char *)entry.value, result[i], entry.sz)) { + ERR("No match at position %d, got %.*s instead of %s", + i, entry.sz, entry.value, result[i]); + ok = 0; + } + i++; + } + quicklistReleaseIterator(iter); + + 
quicklistPushTail(ql, "foo", 3); + + /* lrem -2 foo */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + int del = 2; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(entry.zi, (unsigned char *)"foo", 3)) { + quicklistDelEntry(iter, &entry); + del--; + } + if (!del) + break; + i++; + } + quicklistReleaseIterator(iter); + + /* check result of lrem -2 foo */ + /* (we're ignoring the '2' part and still deleting all foo + * because + * we only have two foo) */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + size_t resB = sizeof(resultB) / sizeof(*resultB); + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + if (strncmp((char *)entry.value, resultB[resB - 1 - i], + entry.sz)) { + ERR("No match at position %d, got %.*s instead of %s", + i, entry.sz, entry.value, resultB[resB - 1 - i]); + ok = 0; + } + i++; + } + + quicklistReleaseIterator(iter); + /* final result of all tests */ + if (ok) + OK; + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 16; f++) { + TEST_DESC("iterate reverse + delete at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + quicklistPushTail(ql, "abc", 3); + quicklistPushTail(ql, "def", 3); + quicklistPushTail(ql, "hij", 3); + quicklistPushTail(ql, "jkl", 3); + quicklistPushTail(ql, "oop", 3); + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(entry.zi, (unsigned char *)"hij", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + quicklistReleaseIterator(iter); + + if (i != 5) + ERR("Didn't iterate 5 times, iterated %d times.", i); + + /* Check results after deletion of "hij" */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + char *vals[] = {"abc", "def", "jkl", "oop"}; + while (quicklistNext(iter, &entry)) { + if (!quicklistCompare(entry.zi, (unsigned char 
*)vals[i], + 3)) { + ERR("Value at %d didn't match %s\n", i, vals[i]); + } + i++; + } + quicklistReleaseIterator(iter); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 800; f++) { + TEST_DESC("iterator at index test at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 760; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + + quicklistEntry entry; + quicklistIter *iter = + quicklistGetIteratorAtIdx(ql, AL_START_HEAD, 437); + int i = 437; + while (quicklistNext(iter, &entry)) { + /* nums[i] is the expected value, entry.longval the one read. */ + if (entry.longval != nums[i]) + ERR("Expected %lld, but got %lld", nums[i], + entry.longval); + i++; + } + quicklistReleaseIterator(iter); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 40; f++) { + TEST_DESC("ltrim test A at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 32; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (f == 32) + ql_verify(ql, 1, 32, 32, 32); + /* ltrim 25 53 (keep [25,32] inclusive = 7 remaining) */ + quicklistDelRange(ql, 0, 25); + quicklistDelRange(ql, 0, 0); + quicklistEntry entry; + for (int i = 0; i < 7; i++) { + quicklistIndex(ql, i, &entry); + /* nums[25 + i] is the expected value, entry.longval the one read. */ + if (entry.longval != nums[25 + i]) + ERR("Deleted invalid range! Expected %lld but got " + "%lld", + nums[25 + i], entry.longval); + } + if (f == 32) + ql_verify(ql, 1, 7, 7, 7); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 40; f++) { + TEST_DESC("ltrim test B at fill %d at compress %d", f, + options[_i]) { + /* Force-disable compression because our 33 sequential + * integers don't compress and the check always fails. 
*/ + quicklist *ql = quicklistNew(f, QUICKLIST_NOCOMPRESS); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (f == 32) + ql_verify(ql, 2, 33, 32, 1); + /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */ + quicklistDelRange(ql, 0, 5); + quicklistDelRange(ql, -16, 16); + if (f == 32) + ql_verify(ql, 1, 12, 12, 12); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + if (entry.longval != 5) + ERR("A: longval not 5, but %lld", entry.longval); + else + OK; + quicklistIndex(ql, -1, &entry); + if (entry.longval != 16) + ERR("B! got instead: %lld", entry.longval); + else + OK; + quicklistPushTail(ql, "bobobob", 7); + quicklistIndex(ql, -1, &entry); + if (strncmp((char *)entry.value, "bobobob", 7)) + ERR("Tail doesn't match bobobob, it's %.*s instead", + entry.sz, entry.value); + for (int i = 0; i < 12; i++) { + quicklistIndex(ql, i, &entry); + if (entry.longval != nums[5 + i]) + ERR("Deleted invalid range! 
Expected %lld but got " + "%lld", + entry.longval, nums[5 + i]); + } + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 40; f++) { + TEST_DESC("ltrim test C at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (f == 32) + ql_verify(ql, 2, 33, 32, 1); + /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */ + quicklistDelRange(ql, 0, 3); + quicklistDelRange(ql, -29, + 4000); /* make sure not loop forever */ + if (f == 32) + ql_verify(ql, 1, 1, 1, 1); + quicklistEntry entry; + quicklistIndex(ql, 0, &entry); + if (entry.longval != -5157318210846258173) + ERROR; + else + OK; + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 40; f++) { + TEST_DESC("ltrim test D at fill %d at compress %d", f, + options[_i]) { + quicklist *ql = quicklistNew(f, options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (f == 32) + ql_verify(ql, 2, 33, 32, 1); + quicklistDelRange(ql, -12, 3); + if (ql->count != 30) + ERR("Didn't delete exactly three elements! 
Count is: %lu", + ql->count); + quicklistRelease(ql); + } + } + + for (int f = optimize_start; f < 72; f++) { + TEST_DESC("create quicklist from ziplist at fill %d at compress %d", + f, options[_i]) { + unsigned char *zl = ziplistNew(); + long long nums[64]; + char num[64]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + zl = + ziplistPush(zl, (unsigned char *)num, sz, ZIPLIST_TAIL); + } + for (int i = 0; i < 33; i++) { + zl = ziplistPush(zl, (unsigned char *)genstr("hello", i), + 32, ZIPLIST_TAIL); + } + quicklist *ql = quicklistCreateFromZiplist(f, options[_i], zl); + if (f == 1) + ql_verify(ql, 66, 66, 1, 1); + else if (f == 32) + ql_verify(ql, 3, 66, 32, 2); + else if (f == 66) + ql_verify(ql, 1, 66, 66, 66); + quicklistRelease(ql); + } + } + + long long stop = mstime(); + runtime[_i] = stop - start; + } + + /* Run a longer test of compression depth outside of primary test loop. */ + int list_sizes[] = {250, 251, 500, 999, 1000}; + long long start = mstime(); + for (int list = 0; list < (int)(sizeof(list_sizes) / sizeof(*list_sizes)); + list++) { + for (int f = optimize_start; f < 128; f++) { + for (int depth = 1; depth < 40; depth++) { + /* skip over many redundant test cases */ + TEST_DESC("verify specific compression of interior nodes with " + "%d list " + "at fill %d at compress %d", + list_sizes[list], f, depth) { + quicklist *ql = quicklistNew(f, depth); + for (int i = 0; i < list_sizes[list]; i++) { + quicklistPushTail(ql, genstr("hello TAIL", i + 1), 64); + quicklistPushHead(ql, genstr("hello HEAD", i + 1), 64); + } + + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + for (unsigned int at = 0; at < ql->len; + at++, node = node->next) { + if (at < low_raw || at >= high_raw) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + ERR("Incorrect compression: node %d is " + "compressed at depth %d ((%u, 
%u); total " + "nodes: %u; size: %u)", + at, depth, low_raw, high_raw, ql->len, + node->sz); + } + } else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF) { + ERR("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %u; size: %u; attempted: %d)", + at, depth, low_raw, high_raw, ql->len, + node->sz, node->attempted_compress); + } + } + } + quicklistRelease(ql); + } + } + } + } + long long stop = mstime(); + + printf("\n"); + for (size_t i = 0; i < option_count; i++) + printf("Test Loop %02d: %0.2f seconds.\n", options[i], + (float)runtime[i] / 1000); + printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000); + printf("\n"); + + if (!err) + printf("ALL TESTS PASSED!\n"); + else + ERR("Sorry, not all tests passed! In fact, %d tests failed.", err); + + return err; +} +#endif diff --git a/src/quicklist.h b/src/quicklist.h new file mode 100644 index 000000000..5c9530ccd --- /dev/null +++ b/src/quicklist.h @@ -0,0 +1,169 @@ +/* quicklist.h - A generic doubly linked quicklist implementation + * + * Copyright (c) 2014, Matt Stancliff <matt@genges.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this quicklist of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this quicklist of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUICKLIST_H__ +#define __QUICKLIST_H__ + +/* Node, quicklist, and Iterator are the only data structures used currently. */ + +/* quicklistNode is a 32 byte struct describing a ziplist for a quicklist. + * We use bit fields to keep the quicklistNode at 32 bytes. + * count: 16 bits, max 65535 (max zl bytes is 65k, so max count actually < 32k). + * encoding: 2 bits, RAW=1, LZF=2. + * container: 2 bits, NONE=1, ZIPLIST=2. + * recompress: 1 bit, bool, true if node is temporarily decompressed for usage. + * attempted_compress: 1 bit, boolean, used for verifying during testing. + * extra: 10 bits, free for future use; pads out the remainder of 32 bits */ +typedef struct quicklistNode { + struct quicklistNode *prev; + struct quicklistNode *next; + unsigned char *zl; + unsigned int sz; /* ziplist size in bytes */ + unsigned int count : 16; /* count of items in ziplist */ + unsigned int encoding : 2; /* RAW==1 or LZF==2 */ + unsigned int container : 2; /* NONE==1 or ZIPLIST==2 */ + unsigned int recompress : 1; /* was this node previous compressed? 
*/ + unsigned int attempted_compress : 1; /* node can't compress; too small */ + unsigned int extra : 10; /* more bits to steal for future usage */ +} quicklistNode; + +/* quicklistLZF is a 4+N byte struct holding 'sz' followed by 'compressed'. + * 'sz' is byte length of 'compressed' field. + * 'compressed' is LZF data with total (compressed) length 'sz' + * NOTE: uncompressed length is stored in quicklistNode->sz. + * When quicklistNode->zl is compressed, node->zl points to a quicklistLZF */ +typedef struct quicklistLZF { + unsigned int sz; /* LZF size in bytes*/ + char compressed[]; +} quicklistLZF; + +/* quicklist is a 32 byte struct (on 64-bit systems) describing a quicklist. + * 'count' is the number of total entries. + * 'len' is the number of quicklist nodes. + * 'compress' is: 0 if compression disabled, otherwise it's the number + * of quicklistNodes to leave uncompressed at ends of quicklist. + * 'fill' is the user-requested (or default) fill factor. */ +typedef struct quicklist { + quicklistNode *head; + quicklistNode *tail; + unsigned long count; /* total count of all entries in all ziplists */ + unsigned int len; /* number of quicklistNodes */ + int fill : 16; /* fill factor for individual nodes */ + unsigned int compress : 16; /* depth of end nodes not to compress;0=off */ +} quicklist; + +typedef struct quicklistIter { + const quicklist *quicklist; + quicklistNode *current; + unsigned char *zi; + long offset; /* offset in current ziplist */ + int direction; +} quicklistIter; + +typedef struct quicklistEntry { + const quicklist *quicklist; + quicklistNode *node; + unsigned char *zi; + unsigned char *value; + unsigned int sz; + long long longval; + int offset; +} quicklistEntry; + +#define QUICKLIST_HEAD 0 +#define QUICKLIST_TAIL -1 + +/* quicklist node encodings */ +#define QUICKLIST_NODE_ENCODING_RAW 1 +#define QUICKLIST_NODE_ENCODING_LZF 2 + +/* quicklist compression disable */ +#define QUICKLIST_NOCOMPRESS 0 + +/* quicklist container formats */ 
+#define QUICKLIST_NODE_CONTAINER_NONE 1 +#define QUICKLIST_NODE_CONTAINER_ZIPLIST 2 + +#define quicklistNodeIsCompressed(node) \ + ((node)->encoding == QUICKLIST_NODE_ENCODING_LZF) + +/* Prototypes */ +quicklist *quicklistCreate(void); +quicklist *quicklistNew(int fill, int compress); +void quicklistSetCompressDepth(quicklist *quicklist, int depth); +void quicklistSetFill(quicklist *quicklist, int fill); +void quicklistSetOptions(quicklist *quicklist, int fill, int depth); +void quicklistRelease(quicklist *quicklist); +int quicklistPushHead(quicklist *quicklist, void *value, const size_t sz); +int quicklistPushTail(quicklist *quicklist, void *value, const size_t sz); +void quicklistPush(quicklist *quicklist, void *value, const size_t sz, + int where); +void quicklistAppendZiplist(quicklist *quicklist, unsigned char *zl); +quicklist *quicklistAppendValuesFromZiplist(quicklist *quicklist, + unsigned char *zl); +quicklist *quicklistCreateFromZiplist(int fill, int compress, + unsigned char *zl); +void quicklistInsertAfter(quicklist *quicklist, quicklistEntry *node, + void *value, const size_t sz); +void quicklistInsertBefore(quicklist *quicklist, quicklistEntry *node, + void *value, const size_t sz); +void quicklistDelEntry(quicklistIter *iter, quicklistEntry *entry); +int quicklistReplaceAtIndex(quicklist *quicklist, long index, void *data, + int sz); +int quicklistDelRange(quicklist *quicklist, const long start, const long stop); +quicklistIter *quicklistGetIterator(const quicklist *quicklist, int direction); +quicklistIter *quicklistGetIteratorAtIdx(const quicklist *quicklist, + int direction, const long long idx); +int quicklistNext(quicklistIter *iter, quicklistEntry *node); +void quicklistReleaseIterator(quicklistIter *iter); +quicklist *quicklistDup(quicklist *orig); +int quicklistIndex(const quicklist *quicklist, const long long index, + quicklistEntry *entry); +void quicklistRewind(quicklist *quicklist, quicklistIter *li); +void quicklistRewindTail(quicklist 
*quicklist, quicklistIter *li); +void quicklistRotate(quicklist *quicklist); +int quicklistPopCustom(quicklist *quicklist, int where, unsigned char **data, + unsigned int *sz, long long *sval, + void *(*saver)(unsigned char *data, unsigned int sz)); +int quicklistPop(quicklist *quicklist, int where, unsigned char **data, + unsigned int *sz, long long *slong); +unsigned int quicklistCount(quicklist *ql); +int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len); +size_t quicklistGetLzf(const quicklistNode *node, void **data); + +#ifdef REDIS_TEST +int quicklistTest(int argc, char *argv[]); +#endif + +/* Directions for iterators */ +#define AL_START_HEAD 0 +#define AL_START_TAIL 1 + +#endif /* __QUICKLIST_H__ */ @@ -40,6 +40,20 @@ #include <arpa/inet.h> #include <sys/stat.h> +#define RDB_LOAD_NONE 0 +#define RDB_LOAD_ENC (1<<0) +#define RDB_LOAD_PLAIN (1<<1) + +#define rdbExitReportCorruptRDB(reason) rdbCheckThenExit(reason, __LINE__); + +void rdbCheckThenExit(char *reason, int where) { + redisLog(REDIS_WARNING, "Corrupt RDB detected at rdb.c:%d (%s). " + "Running 'redis-check-rdb %s'", + where, reason, server.rdb_filename); + redis_check_rdb(server.rdb_filename); + exit(1); +} + static int rdbWriteRaw(rio *rdb, void *p, size_t len) { if (rdb && rioWrite(rdb,p,len) == 0) return -1; @@ -161,9 +175,11 @@ int rdbEncodeInteger(long long value, unsigned char *enc) { } /* Loads an integer-encoded object with the specified encoding type "enctype". - * If the "encode" argument is set the function may return an integer-encoded - * string object, otherwise it always returns a raw string object. */ -robj *rdbLoadIntegerObject(rio *rdb, int enctype, int encode) { + * The returned value changes according to the flags, see + * rdbGenericLoadStringObject() for more info. 
*/ +void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags) { + int plain = flags & RDB_LOAD_PLAIN; + int encode = flags & RDB_LOAD_ENC; unsigned char enc[4]; long long val; @@ -182,12 +198,19 @@ robj *rdbLoadIntegerObject(rio *rdb, int enctype, int encode) { val = (int32_t)v; } else { val = 0; /* anti-warning */ - redisPanic("Unknown RDB integer encoding type"); + rdbExitReportCorruptRDB("Unknown RDB integer encoding type"); } - if (encode) + if (plain) { + char buf[REDIS_LONGSTR_SIZE], *p; + int len = ll2string(buf,sizeof(buf),val); + p = zmalloc(len); + memcpy(p,buf,len); + return p; + } else if (encode) { return createStringObjectFromLongLong(val); - else + } else { return createObject(REDIS_STRING,sdsfromlonglong(val)); + } } /* String objects in the form "2391" "-100" without any space and with a @@ -209,44 +232,54 @@ int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) { return rdbEncodeInteger(value,enc); } -int rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { - size_t comprlen, outlen; +int rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, + size_t original_len) { unsigned char byte; int n, nwritten = 0; - void *out; - /* We require at least four bytes compression for this to be worth it */ - if (len <= 4) return 0; - outlen = len-4; - if ((out = zmalloc(outlen+1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { - zfree(out); - return 0; - } /* Data compressed! 
Let's save it on disk */ byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF; if ((n = rdbWriteRaw(rdb,&byte,1)) == -1) goto writeerr; nwritten += n; - if ((n = rdbSaveLen(rdb,comprlen)) == -1) goto writeerr; + if ((n = rdbSaveLen(rdb,compress_len)) == -1) goto writeerr; nwritten += n; - if ((n = rdbSaveLen(rdb,len)) == -1) goto writeerr; + if ((n = rdbSaveLen(rdb,original_len)) == -1) goto writeerr; nwritten += n; - if ((n = rdbWriteRaw(rdb,out,comprlen)) == -1) goto writeerr; + if ((n = rdbWriteRaw(rdb,data,compress_len)) == -1) goto writeerr; nwritten += n; - zfree(out); return nwritten; writeerr: - zfree(out); return -1; } -robj *rdbLoadLzfStringObject(rio *rdb) { +int rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { + size_t comprlen, outlen; + void *out; + + /* We require at least four bytes compression for this to be worth it */ + if (len <= 4) return 0; + outlen = len-4; + if ((out = zmalloc(outlen+1)) == NULL) return 0; + comprlen = lzf_compress(s, len, out, outlen); + if (comprlen == 0) { + zfree(out); + return 0; + } + size_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); + zfree(out); + return nwritten; +} + +/* Load an LZF compressed string in RDB format. The returned value + * changes according to 'flags'. For more info check the + * rdbGenericLoadStringObject() function. */ +void *rdbLoadLzfStringObject(rio *rdb, int flags) { + int plain = flags & RDB_LOAD_PLAIN; unsigned int len, clen; unsigned char *c = NULL; sds val = NULL; @@ -254,14 +287,29 @@ robj *rdbLoadLzfStringObject(rio *rdb) { if ((clen = rdbLoadLen(rdb,NULL)) == REDIS_RDB_LENERR) return NULL; if ((len = rdbLoadLen(rdb,NULL)) == REDIS_RDB_LENERR) return NULL; if ((c = zmalloc(clen)) == NULL) goto err; - if ((val = sdsnewlen(NULL,len)) == NULL) goto err; + + /* Allocate our target according to the uncompressed size. 
*/ + if (plain) { + val = zmalloc(len); + } else { + if ((val = sdsnewlen(NULL,len)) == NULL) goto err; + } + + /* Load the compressed representation and uncompress it to target. */ if (rioRead(rdb,c,clen) == 0) goto err; if (lzf_decompress(c,clen,val,len) == 0) goto err; zfree(c); - return createObject(REDIS_STRING,val); + + if (plain) + return val; + else + return createObject(REDIS_STRING,val); err: zfree(c); - sdsfree(val); + if (plain) + zfree(val); + else + sdsfree(val); return NULL; } @@ -330,10 +378,21 @@ int rdbSaveStringObject(rio *rdb, robj *obj) { } } -robj *rdbGenericLoadStringObject(rio *rdb, int encode) { +/* Load a string object from an RDB file according to flags: + * + * RDB_LOAD_NONE (no flags): load an RDB object, unencoded. + * RDB_LOAD_ENC: If the returned type is a Redis object, try to + * encode it in a special way to be more memory + * efficient. When this flag is passed the function + * no longer guarantees that obj->ptr is an SDS string. + * RDB_LOAD_PLAIN: Return a plain string allocated with zmalloc() + * instead of a Redis object. + */ +void *rdbGenericLoadStringObject(rio *rdb, int flags) { + int encode = flags & RDB_LOAD_ENC; + int plain = flags & RDB_LOAD_PLAIN; int isencoded; uint32_t len; - robj *o; len = rdbLoadLen(rdb,&isencoded); if (isencoded) { @@ -341,30 +400,39 @@ robj *rdbGenericLoadStringObject(rio *rdb, int encode) { case REDIS_RDB_ENC_INT8: case REDIS_RDB_ENC_INT16: case REDIS_RDB_ENC_INT32: - return rdbLoadIntegerObject(rdb,len,encode); + return rdbLoadIntegerObject(rdb,len,flags); case REDIS_RDB_ENC_LZF: - return rdbLoadLzfStringObject(rdb); + return rdbLoadLzfStringObject(rdb,flags); default: - redisPanic("Unknown RDB encoding type"); + rdbExitReportCorruptRDB("Unknown RDB encoding type"); } } if (len == REDIS_RDB_LENERR) return NULL; - o = encode ? 
createStringObject(NULL,len) : - createRawStringObject(NULL,len); - if (len && rioRead(rdb,o->ptr,len) == 0) { - decrRefCount(o); - return NULL; + if (!plain) { + robj *o = encode ? createStringObject(NULL,len) : + createRawStringObject(NULL,len); + if (len && rioRead(rdb,o->ptr,len) == 0) { + decrRefCount(o); + return NULL; + } + return o; + } else { + void *buf = zmalloc(len); + if (len && rioRead(rdb,buf,len) == 0) { + zfree(buf); + return NULL; + } + return buf; } - return o; } robj *rdbLoadStringObject(rio *rdb) { - return rdbGenericLoadStringObject(rdb,0); + return rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE); } robj *rdbLoadEncodedStringObject(rio *rdb) { - return rdbGenericLoadStringObject(rdb,1); + return rdbGenericLoadStringObject(rdb,RDB_LOAD_ENC); } /* Save a double value. Doubles are saved as strings prefixed by an unsigned @@ -433,10 +501,8 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case REDIS_STRING: return rdbSaveType(rdb,REDIS_RDB_TYPE_STRING); case REDIS_LIST: - if (o->encoding == REDIS_ENCODING_ZIPLIST) - return rdbSaveType(rdb,REDIS_RDB_TYPE_LIST_ZIPLIST); - else if (o->encoding == REDIS_ENCODING_LINKEDLIST) - return rdbSaveType(rdb,REDIS_RDB_TYPE_LIST); + if (o->encoding == REDIS_ENCODING_QUICKLIST) + return rdbSaveType(rdb,REDIS_RDB_TYPE_LIST_QUICKLIST); else redisPanic("Unknown list encoding"); case REDIS_SET: @@ -477,7 +543,7 @@ int rdbLoadObjectType(rio *rdb) { /* Save a Redis object. Returns -1 on error, number of bytes written on success. 
*/ int rdbSaveObject(rio *rdb, robj *o) { - int n, nwritten = 0; + int n = 0, nwritten = 0; if (o->type == REDIS_STRING) { /* Save a string value */ @@ -485,25 +551,24 @@ int rdbSaveObject(rio *rdb, robj *o) { nwritten += n; } else if (o->type == REDIS_LIST) { /* Save a list value */ - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - size_t l = ziplistBlobLen((unsigned char*)o->ptr); + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + quicklist *ql = o->ptr; + quicklistNode *node = ql->head; - if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1; + if ((n = rdbSaveLen(rdb,ql->len)) == -1) return -1; nwritten += n; - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = o->ptr; - listIter li; - listNode *ln; - if ((n = rdbSaveLen(rdb,listLength(list))) == -1) return -1; - nwritten += n; - - listRewind(list,&li); - while((ln = listNext(&li))) { - robj *eleobj = listNodeValue(ln); - if ((n = rdbSaveStringObject(rdb,eleobj)) == -1) return -1; - nwritten += n; - } + do { + if (quicklistNodeIsCompressed(node)) { + void *data; + size_t compress_len = quicklistGetLzf(node, &data); + if ((n = rdbSaveLzfBlob(rdb,data,compress_len,node->sz)) == -1) return -1; + nwritten += n; + } else { + if ((n = rdbSaveRawString(rdb,node->zl,node->sz)) == -1) return -1; + nwritten += n; + } + } while ((node = node->next)); } else { redisPanic("Unknown list encoding"); } @@ -627,6 +692,39 @@ int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, return 1; } +/* Save an AUX field. */ +int rdbSaveAuxField(rio *rdb, void *key, size_t keylen, void *val, size_t vallen) { + if (rdbSaveType(rdb,REDIS_RDB_OPCODE_AUX) == -1) return -1; + if (rdbSaveRawString(rdb,key,keylen) == -1) return -1; + if (rdbSaveRawString(rdb,val,vallen) == -1) return -1; + return 1; +} + +/* Wrapper for rdbSaveAuxField() used when key/val length can be obtained + * with strlen(). 
*/ +int rdbSaveAuxFieldStrStr(rio *rdb, char *key, char *val) { + return rdbSaveAuxField(rdb,key,strlen(key),val,strlen(val)); +} + +/* Wrapper for strlen(key) + integer type (up to long long range). */ +int rdbSaveAuxFieldStrInt(rio *rdb, char *key, long long val) { + char buf[REDIS_LONGSTR_SIZE]; + int vlen = ll2string(buf,sizeof(buf),val); + return rdbSaveAuxField(rdb,key,strlen(key),buf,vlen); +} + +/* Save a few default AUX fields with information about the RDB generated. */ +int rdbSaveInfoAuxFields(rio *rdb) { + int redis_bits = (sizeof(void*) == 8) ? 64 : 32; + + /* Add a few fields about the state when the RDB was created. */ + if (rdbSaveAuxFieldStrStr(rdb,"redis-ver",REDIS_VERSION) == -1) return -1; + if (rdbSaveAuxFieldStrInt(rdb,"redis-bits",redis_bits) == -1) return -1; + if (rdbSaveAuxFieldStrInt(rdb,"ctime",time(NULL)) == -1) return -1; + if (rdbSaveAuxFieldStrInt(rdb,"used-mem",zmalloc_used_memory()) == -1) return -1; + return 1; +} + /* Produces a dump of the database in RDB format sending it to the specified * Redis I/O channel. On success REDIS_OK is returned, otherwise REDIS_ERR * is returned and part of the output, or all the output, can be @@ -647,6 +745,7 @@ int rdbSaveRio(rio *rdb, int *error) { rdb->update_cksum = rioGenericUpdateChecksum; snprintf(magic,sizeof(magic),"REDIS%04d",REDIS_RDB_VERSION); if (rdbWriteRaw(rdb,magic,9) == -1) goto werr; + if (rdbSaveInfoAuxFields(rdb) == -1) goto werr; for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; @@ -659,6 +758,21 @@ int rdbSaveRio(rio *rdb, int *error) { if (rdbSaveType(rdb,REDIS_RDB_OPCODE_SELECTDB) == -1) goto werr; if (rdbSaveLen(rdb,j) == -1) goto werr; + /* Write the RESIZE DB opcode. We trim the size to UINT32_MAX, which + * is currently the largest type we are able to represent in RDB sizes. + * However this does not limit the actual size of the DB to load since + * these sizes are just hints to resize the hash tables. 
*/ + uint32_t db_size, expires_size; + db_size = (dictSize(db->dict) <= UINT32_MAX) ? + dictSize(db->dict) : + UINT32_MAX; + expires_size = (dictSize(db->dict) <= UINT32_MAX) ? + dictSize(db->expires) : + UINT32_MAX; + if (rdbSaveType(rdb,REDIS_RDB_OPCODE_RESIZEDB) == -1) goto werr; + if (rdbSaveLen(rdb,db_size) == -1) goto werr; + if (rdbSaveLen(rdb,expires_size) == -1) goto werr; + /* Iterate this DB writing every entry */ while((de = dictNext(di)) != NULL) { sds keystr = dictGetKey(de); @@ -720,7 +834,7 @@ int rdbSave(char *filename) { char tmpfile[256]; FILE *fp; rio rdb; - int error; + int error = 0; snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); fp = fopen(tmpfile,"w"); @@ -819,7 +933,7 @@ void rdbRemoveTempFile(pid_t childpid) { /* Load a Redis object of the specified type from the specified file. * On success a newly allocated object is returned, otherwise NULL. */ robj *rdbLoadObject(int rdbtype, rio *rdb) { - robj *o, *ele, *dec; + robj *o = NULL, *ele, *dec; size_t len; unsigned int i; @@ -831,33 +945,18 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* Read list value */ if ((len = rdbLoadLen(rdb,NULL)) == REDIS_RDB_LENERR) return NULL; - /* Use a real list when there are too many entries */ - if (len > server.list_max_ziplist_entries) { - o = createListObject(); - } else { - o = createZiplistObject(); - } + o = createQuicklistObject(); + quicklistSetOptions(o->ptr, server.list_max_ziplist_size, + server.list_compress_depth); /* Load every single element of the list */ while(len--) { if ((ele = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL; - - /* If we are using a ziplist and the value is too big, convert - * the object to a real list. 
*/ - if (o->encoding == REDIS_ENCODING_ZIPLIST && - sdsEncodedObject(ele) && - sdslen(ele->ptr) > server.list_max_ziplist_value) - listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); - - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - dec = getDecodedObject(ele); - o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL); - decrRefCount(dec); - decrRefCount(ele); - } else { - ele = tryObjectEncoding(ele); - listAddNodeTail(o->ptr,ele); - } + dec = getDecodedObject(ele); + size_t len = sdslen(dec->ptr); + quicklistPushTail(o->ptr, dec->ptr, len); + decrRefCount(dec); + decrRefCount(ele); } } else if (rdbtype == REDIS_RDB_TYPE_SET) { /* Read list/set value */ @@ -989,25 +1088,33 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* Add pair to hash table */ ret = dictAdd((dict*)o->ptr, field, value); - redisAssert(ret == DICT_OK); + if (ret == DICT_ERR) { + rdbExitReportCorruptRDB("Duplicate keys detected"); + } } /* All pairs should be read by now */ redisAssert(len == 0); - + } else if (rdbtype == REDIS_RDB_TYPE_LIST_QUICKLIST) { + if ((len = rdbLoadLen(rdb,NULL)) == REDIS_RDB_LENERR) return NULL; + o = createQuicklistObject(); + quicklistSetOptions(o->ptr, server.list_max_ziplist_size, + server.list_compress_depth); + + while (len--) { + unsigned char *zl = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN); + if (zl == NULL) return NULL; + quicklistAppendZiplist(o->ptr, zl); + } } else if (rdbtype == REDIS_RDB_TYPE_HASH_ZIPMAP || rdbtype == REDIS_RDB_TYPE_LIST_ZIPLIST || rdbtype == REDIS_RDB_TYPE_SET_INTSET || rdbtype == REDIS_RDB_TYPE_ZSET_ZIPLIST || rdbtype == REDIS_RDB_TYPE_HASH_ZIPLIST) { - robj *aux = rdbLoadStringObject(rdb); - - if (aux == NULL) return NULL; - o = createObject(REDIS_STRING,NULL); /* string is just placeholder */ - o->ptr = zmalloc(sdslen(aux->ptr)); - memcpy(o->ptr,aux->ptr,sdslen(aux->ptr)); - decrRefCount(aux); + unsigned char *encoded = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN); + if (encoded == NULL) return NULL; + o = 
createObject(REDIS_STRING,encoded); /* Obj type fixed below. */ /* Fix the object encoding, and make sure to convert the encoded * data type into the base type if accordingly to the current @@ -1048,8 +1155,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { case REDIS_RDB_TYPE_LIST_ZIPLIST: o->type = REDIS_LIST; o->encoding = REDIS_ENCODING_ZIPLIST; - if (ziplistLen(o->ptr) > server.list_max_ziplist_entries) - listTypeConvert(o,REDIS_ENCODING_LINKEDLIST); + listTypeConvert(o,REDIS_ENCODING_QUICKLIST); break; case REDIS_RDB_TYPE_SET_INTSET: o->type = REDIS_SET; @@ -1070,11 +1176,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { hashTypeConvert(o, REDIS_ENCODING_HT); break; default: - redisPanic("Unknown encoding"); + rdbExitReportCorruptRDB("Unknown encoding"); break; } } else { - redisPanic("Unknown object type"); + rdbExitReportCorruptRDB("Unknown object type"); } return o; } @@ -1087,8 +1193,9 @@ void startLoading(FILE *fp) { /* Load the DB */ server.loading = 1; server.loading_start_time = time(NULL); + server.loading_loaded_bytes = 0; if (fstat(fileno(fp), &sb) == -1) { - server.loading_total_bytes = 1; /* just to avoid division by zero */ + server.loading_total_bytes = 0; } else { server.loading_total_bytes = sb.st_size; } @@ -1162,7 +1269,12 @@ int rdbLoad(char *filename) { /* Read type. */ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr; + + /* Handle special types. */ if (type == REDIS_RDB_OPCODE_EXPIRETIME) { + /* EXPIRETIME: load an expire associated with the next key + * to load. Note that after loading an expire we need to + * load the actual type, and continue. */ if ((expiretime = rdbLoadTime(&rdb)) == -1) goto eoferr; /* We read the time so we need to read the object type again. */ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr; @@ -1170,27 +1282,67 @@ int rdbLoad(char *filename) { * into milliseconds. 
*/ expiretime *= 1000; } else if (type == REDIS_RDB_OPCODE_EXPIRETIME_MS) { - /* Milliseconds precision expire times introduced with RDB - * version 3. */ + /* EXPIRETIME_MS: milliseconds precision expire times introduced + * with RDB v3. Like EXPIRETIME but with more precision. */ if ((expiretime = rdbLoadMillisecondTime(&rdb)) == -1) goto eoferr; /* We read the time so we need to read the object type again. */ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr; - } - - if (type == REDIS_RDB_OPCODE_EOF) + } else if (type == REDIS_RDB_OPCODE_EOF) { + /* EOF: End of file, exit the main loop. */ break; - - /* Handle SELECT DB opcode as a special case */ - if (type == REDIS_RDB_OPCODE_SELECTDB) { + } else if (type == REDIS_RDB_OPCODE_SELECTDB) { + /* SELECTDB: Select the specified database. */ if ((dbid = rdbLoadLen(&rdb,NULL)) == REDIS_RDB_LENERR) goto eoferr; if (dbid >= (unsigned)server.dbnum) { - redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum); + redisLog(REDIS_WARNING, + "FATAL: Data file was created with a Redis " + "server configured to handle more than %d " + "databases. Exiting\n", server.dbnum); exit(1); } db = server.db+dbid; - continue; + continue; /* Read type again. */ + } else if (type == REDIS_RDB_OPCODE_RESIZEDB) { + /* RESIZEDB: Hint about the size of the keys in the currently + * selected database, in order to avoid useless rehashing. */ + uint32_t db_size, expires_size; + if ((db_size = rdbLoadLen(&rdb,NULL)) == REDIS_RDB_LENERR) + goto eoferr; + if ((expires_size = rdbLoadLen(&rdb,NULL)) == REDIS_RDB_LENERR) + goto eoferr; + dictExpand(db->dict,db_size); + dictExpand(db->expires,expires_size); + continue; /* Read type again. */ + } else if (type == REDIS_RDB_OPCODE_AUX) { + /* AUX: generic string-string fields. Used to add state to RDB + * which is backward compatible. 
Implementations of RDB loading + * are required to skip AUX fields they don't understand. + * + * An AUX field is composed of two strings: key and value. */ + robj *auxkey, *auxval; + if ((auxkey = rdbLoadStringObject(&rdb)) == NULL) goto eoferr; + if ((auxval = rdbLoadStringObject(&rdb)) == NULL) goto eoferr; + + if (((char*)auxkey->ptr)[0] == '%') { + /* All the fields with a name starting with '%' are considered + * information fields and are logged at startup with a log + * level of NOTICE. */ + redisLog(REDIS_NOTICE,"RDB '%s': %s", + (char*)auxkey->ptr, + (char*)auxval->ptr); + } else { + /* We ignore fields we don't understand, as by AUX field + * contract. */ + redisLog(REDIS_DEBUG,"Unrecognized RDB AUX field: '%s'", + (char*)auxkey->ptr); + } + + decrRefCount(auxkey); + decrRefCount(auxval); + continue; /* Read type again. */ } + /* Read key */ if ((key = rdbLoadStringObject(&rdb)) == NULL) goto eoferr; /* Read value */ @@ -1223,7 +1375,7 @@ int rdbLoad(char *filename) { redisLog(REDIS_WARNING,"RDB file was saved with checksum disabled: no check performed."); } else if (cksum != expected) { redisLog(REDIS_WARNING,"Wrong RDB checksum. Aborting now."); - exit(1); + rdbExitReportCorruptRDB("RDB CRC error"); } } @@ -1233,7 +1385,7 @@ int rdbLoad(char *filename) { eoferr: /* unexpected end of file is handled here with a fatal exit */ redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now."); - exit(1); + rdbExitReportCorruptRDB("Unexpected EOF reading RDB file"); return REDIS_ERR; /* Just to avoid warning */ } @@ -1491,7 +1643,9 @@ int rdbSaveToSlavesSockets(void) { { retval = REDIS_ERR; } + zfree(msg); } + zfree(clientids); exitFromChild((retval == REDIS_OK) ? 0 : 1); } else { /* Parent */ @@ -38,7 +38,7 @@ /* The current RDB version. When the format changes in a way that is no longer * backward compatible this number gets incremented. 
*/ -#define REDIS_RDB_VERSION 6 +#define REDIS_RDB_VERSION 7 /* Defines related to the dump file format. To store 32 bits lengths for short * keys requires a lot of space, so we check the most significant 2 bits of @@ -74,6 +74,7 @@ #define REDIS_RDB_TYPE_SET 2 #define REDIS_RDB_TYPE_ZSET 3 #define REDIS_RDB_TYPE_HASH 4 +/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */ /* Object types for encoded objects. */ #define REDIS_RDB_TYPE_HASH_ZIPMAP 9 @@ -81,11 +82,15 @@ #define REDIS_RDB_TYPE_SET_INTSET 11 #define REDIS_RDB_TYPE_ZSET_ZIPLIST 12 #define REDIS_RDB_TYPE_HASH_ZIPLIST 13 +#define REDIS_RDB_TYPE_LIST_QUICKLIST 14 +/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */ /* Test if a type is an object type. */ -#define rdbIsObjectType(t) ((t >= 0 && t <= 4) || (t >= 9 && t <= 13)) +#define rdbIsObjectType(t) ((t >= 0 && t <= 4) || (t >= 9 && t <= 14)) /* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */ +#define REDIS_RDB_OPCODE_AUX 250 +#define REDIS_RDB_OPCODE_RESIZEDB 251 #define REDIS_RDB_OPCODE_EXPIRETIME_MS 252 #define REDIS_RDB_OPCODE_EXPIRETIME 253 #define REDIS_RDB_OPCODE_SELECTDB 254 diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c index 2e67f1021..7567e0181 100644 --- a/src/redis-benchmark.c +++ b/src/redis-benchmark.c @@ -90,9 +90,10 @@ typedef struct _client { long long start; /* Start time of a request */ long long latency; /* Request latency */ int pending; /* Number of pending requests (replies to consume) */ - int selectlen; /* If non-zero, a SELECT of 'selectlen' bytes is currently - used as a prefix of the pipline of commands. This gets - discarded the first time it's sent. */ + int prefix_pending; /* If non-zero, number of pending prefix commands. Commands + such as auth and select are prefixed to the pipeline of + benchmark commands and discarded after the first send. 
*/ + int prefixlen; /* Size in bytes of the pending prefix commands */ } *client; /* Prototypes */ @@ -212,20 +213,21 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) { } freeReplyObject(reply); - - if (c->selectlen) { - size_t j; - - /* This is the OK from SELECT. Just discard the SELECT - * from the buffer. */ + /* This is an OK for prefix commands such as auth and select.*/ + if (c->prefix_pending > 0) { + c->prefix_pending--; c->pending--; - sdsrange(c->obuf,c->selectlen,-1); - /* We also need to fix the pointers to the strings - * we need to randomize. */ - for (j = 0; j < c->randlen; j++) - c->randptr[j] -= c->selectlen; - c->selectlen = 0; - continue; + /* Discard prefix commands on first response.*/ + if (c->prefixlen > 0) { + size_t j; + sdsrange(c->obuf, c->prefixlen, -1); + /* We also need to fix the pointers to the strings + * we need to randomize. */ + for (j = 0; j < c->randlen; j++) + c->randptr[j] -= c->prefixlen; + c->prefixlen = 0; + } + continue; } if (config.requests_finished < config.requests) @@ -299,8 +301,7 @@ static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * 2) The offsets of the __rand_int__ elements inside the command line, used * for arguments randomization. * - * Even when cloning another client, the SELECT command is automatically prefixed - * if needed. */ + * Even when cloning another client, prefix commands are applied if needed.*/ static client createClient(char *cmd, size_t len, client from) { int j; client c = zmalloc(sizeof(struct _client)); @@ -325,12 +326,16 @@ static client createClient(char *cmd, size_t len, client from) { * Queue N requests accordingly to the pipeline size, or simply clone * the example client buffer. */ c->obuf = sdsempty(); - + /* Prefix the request buffer with AUTH and/or SELECT commands, if applicable. + * These commands are discarded after the first response, so if the client is + * reused the commands will not be used again. 
*/ + c->prefix_pending = 0; if (config.auth) { char *buf = NULL; int len = redisFormatCommand(&buf, "AUTH %s", config.auth); c->obuf = sdscatlen(c->obuf, buf, len); free(buf); + c->prefix_pending++; } /* If a DB number different than zero is selected, prefix our request @@ -340,26 +345,23 @@ static client createClient(char *cmd, size_t len, client from) { if (config.dbnum != 0) { c->obuf = sdscatprintf(c->obuf,"*2\r\n$6\r\nSELECT\r\n$%d\r\n%s\r\n", (int)sdslen(config.dbnumstr),config.dbnumstr); - c->selectlen = sdslen(c->obuf); - } else { - c->selectlen = 0; + c->prefix_pending++; } - + c->prefixlen = sdslen(c->obuf); /* Append the request itself. */ if (from) { c->obuf = sdscatlen(c->obuf, - from->obuf+from->selectlen, - sdslen(from->obuf)-from->selectlen); + from->obuf+from->prefixlen, + sdslen(from->obuf)-from->prefixlen); } else { for (j = 0; j < config.pipeline; j++) c->obuf = sdscatlen(c->obuf,cmd,len); } c->written = 0; - c->pending = config.pipeline; + c->pending = config.pipeline+c->prefix_pending; c->randptr = NULL; c->randlen = 0; - if (c->selectlen) c->pending++; /* Find substrings in the output buffer that need to be randomized. */ if (config.randomkeys) { @@ -371,7 +373,7 @@ static client createClient(char *cmd, size_t len, client from) { for (j = 0; j < (int)c->randlen; j++) { c->randptr[j] = c->obuf + (from->randptr[j]-from->obuf); /* Adjust for the different select prefix length. 
*/ - c->randptr[j] += c->selectlen - from->selectlen; + c->randptr[j] += c->prefixlen - from->prefixlen; } } else { char *p = c->obuf; @@ -390,7 +392,8 @@ static client createClient(char *cmd, size_t len, client from) { } } } - aeCreateFileEvent(config.el,c->context->fd,AE_WRITABLE,writeHandler,c); + if (config.idlemode == 0) + aeCreateFileEvent(config.el,c->context->fd,AE_WRITABLE,writeHandler,c); listAddNodeTail(config.clients,c); config.liveclients++; return c; @@ -555,7 +558,7 @@ usage: " -s <socket> Server socket (overrides host and port)\n" " -a <password> Password for Redis Auth\n" " -c <clients> Number of parallel connections (default 50)\n" -" -n <requests> Total number of requests (default 10000)\n" +" -n <requests> Total number of requests (default 100000)\n" " -d <size> Data size of SET/GET value in bytes (default 2)\n" " -dbnum <db> SELECT the specified db number (default 0)\n" " -k <boolean> 1=keep alive 0=reconnect (default 1)\n" @@ -600,8 +603,12 @@ int showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData fprintf(stderr,"All clients disconnected... aborting.\n"); exit(1); } - if (config.csv) return 250; + if (config.idlemode == 1) { + printf("clients: %d\r", config.liveclients); + fflush(stdout); + return 250; + } float dt = (float)(mstime()-config.start)/1000.0; float rps = (float)config.requests_finished/dt; printf("%s: %.2f\r", config.title, rps); @@ -635,7 +642,7 @@ int main(int argc, const char **argv) { signal(SIGPIPE, SIG_IGN); config.numclients = 50; - config.requests = 10000; + config.requests = 100000; config.liveclients = 0; config.el = aeCreateEventLoop(1024*10); aeCreateTimeEvent(config.el,1,showThroughput,NULL,NULL); @@ -693,8 +700,8 @@ int main(int argc, const char **argv) { } /* Run default benchmark suite. 
*/ + data = zmalloc(config.datasize+1); do { - data = zmalloc(config.datasize+1); memset(data,'x',config.datasize); data[config.datasize] = '\0'; @@ -731,12 +738,24 @@ int main(int argc, const char **argv) { free(cmd); } + if (test_is_selected("rpush")) { + len = redisFormatCommand(&cmd,"RPUSH mylist %s",data); + benchmark("RPUSH",cmd,len); + free(cmd); + } + if (test_is_selected("lpop")) { len = redisFormatCommand(&cmd,"LPOP mylist"); benchmark("LPOP",cmd,len); free(cmd); } + if (test_is_selected("rpop")) { + len = redisFormatCommand(&cmd,"RPOP mylist"); + benchmark("RPOP",cmd,len); + free(cmd); + } + if (test_is_selected("sadd")) { len = redisFormatCommand(&cmd, "SADD myset element:__rand_int__"); diff --git a/src/redis-check-dump.c b/src/redis-check-rdb.c index 546462001..21f72c222 100644 --- a/src/redis-check-dump.c +++ b/src/redis-check-rdb.c @@ -29,74 +29,19 @@ */ +#include "redis.h" +#include "rdb.h" #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sys/stat.h> #include <sys/mman.h> -#include <string.h> -#include <arpa/inet.h> -#include <stdint.h> -#include <limits.h> #include "lzf.h" #include "crc64.h" -/* Object types */ -#define REDIS_STRING 0 -#define REDIS_LIST 1 -#define REDIS_SET 2 -#define REDIS_ZSET 3 -#define REDIS_HASH 4 -#define REDIS_HASH_ZIPMAP 9 -#define REDIS_LIST_ZIPLIST 10 -#define REDIS_SET_INTSET 11 -#define REDIS_ZSET_ZIPLIST 12 -#define REDIS_HASH_ZIPLIST 13 - -/* Objects encoding. Some kind of objects like Strings and Hashes can be - * internally represented in multiple ways. The 'encoding' field of the object - * is set to one of this fields for this object. 
*/ -#define REDIS_ENCODING_RAW 0 /* Raw representation */ -#define REDIS_ENCODING_INT 1 /* Encoded as integer */ -#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */ -#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */ - -/* Object types only used for dumping to disk */ -#define REDIS_EXPIRETIME_MS 252 -#define REDIS_EXPIRETIME 253 -#define REDIS_SELECTDB 254 -#define REDIS_EOF 255 - -/* Defines related to the dump file format. To store 32 bits lengths for short - * keys requires a lot of space, so we check the most significant 2 bits of - * the first byte to interpreter the length: - * - * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte - * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte - * 10|000000 [32 bit integer] => if it's 01, a full 32 bit len will follow - * 11|000000 this means: specially encoded object will follow. The six bits - * number specify the kind of object that follows. - * See the REDIS_RDB_ENC_* defines. - * - * Lengths up to 63 are stored using a single byte, most DB keys, and may - * values, will fit inside. */ -#define REDIS_RDB_6BITLEN 0 -#define REDIS_RDB_14BITLEN 1 -#define REDIS_RDB_32BITLEN 2 -#define REDIS_RDB_ENCVAL 3 -#define REDIS_RDB_LENERR UINT_MAX - -/* When a length of a string object stored on disk has the first two bits - * set, the remaining two bits specify a special encoding for the object - * accordingly to the following defines: */ -#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */ -#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */ -#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */ -#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */ - #define ERROR(...) { \ - printf(__VA_ARGS__); \ + redisLog(REDIS_WARNING, __VA_ARGS__); \ exit(1); \ } @@ -133,28 +78,23 @@ typedef struct { char success; } entry; -/* Global vars that are actually used as constants. 
The following double - * values are used for double on-disk serialization, and are initialized - * at runtime to avoid strange compiler optimizations. */ -static double R_Zero, R_PosInf, R_NegInf, R_Nan; - #define MAX_TYPES_NUM 256 #define MAX_TYPE_NAME_LEN 16 /* store string types for output */ static char types[MAX_TYPES_NUM][MAX_TYPE_NAME_LEN]; /* Return true if 't' is a valid object type. */ -int checkType(unsigned char t) { +static int rdbCheckType(unsigned char t) { /* In case a new object type is added, update the following * condition as necessary. */ return - (t >= REDIS_HASH_ZIPMAP && t <= REDIS_HASH_ZIPLIST) || - t <= REDIS_HASH || - t >= REDIS_EXPIRETIME_MS; + (t >= REDIS_RDB_TYPE_HASH_ZIPMAP && t <= REDIS_RDB_TYPE_HASH_ZIPLIST) || + t <= REDIS_RDB_TYPE_HASH || + t >= REDIS_RDB_OPCODE_EXPIRETIME_MS; } /* when number of bytes to read is negative, do a peek */ -int readBytes(void *target, long num) { +static int readBytes(void *target, long num) { char peek = (num < 0) ? 1 : 0; num = (num < 0) ? 
-num : num; @@ -173,28 +113,28 @@ int processHeader(void) { int dump_version; if (!readBytes(buf, 9)) { - ERROR("Cannot read header\n"); + ERROR("Cannot read header"); } /* expect the first 5 bytes to equal REDIS */ if (memcmp(buf,"REDIS",5) != 0) { - ERROR("Wrong signature in header\n"); + ERROR("Wrong signature in header"); } dump_version = (int)strtol(buf + 5, NULL, 10); if (dump_version < 1 || dump_version > 6) { - ERROR("Unknown RDB format version: %d\n", dump_version); + ERROR("Unknown RDB format version: %d", dump_version); } return dump_version; } -int loadType(entry *e) { +static int loadType(entry *e) { uint32_t offset = CURR_OFFSET; /* this byte needs to qualify as type */ unsigned char t; if (readBytes(&t, 1)) { - if (checkType(t)) { + if (rdbCheckType(t)) { e->type = t; return 1; } else { @@ -208,18 +148,18 @@ int loadType(entry *e) { return 0; } -int peekType() { +static int peekType() { unsigned char t; - if (readBytes(&t, -1) && (checkType(t))) + if (readBytes(&t, -1) && (rdbCheckType(t))) return t; return -1; } /* discard time, just consume the bytes */ -int processTime(int type) { +static int processTime(int type) { uint32_t offset = CURR_OFFSET; unsigned char t[8]; - int timelen = (type == REDIS_EXPIRETIME_MS) ? 8 : 4; + int timelen = (type == REDIS_RDB_OPCODE_EXPIRETIME_MS) ? 
8 : 4; if (readBytes(t,timelen)) { return 1; @@ -231,7 +171,7 @@ int processTime(int type) { return 0; } -uint32_t loadLength(int *isencoded) { +static uint32_t loadLength(int *isencoded) { unsigned char buf[2]; uint32_t len; int type; @@ -257,7 +197,7 @@ uint32_t loadLength(int *isencoded) { } } -char *loadIntegerObject(int enctype) { +static char *loadIntegerObject(int enctype) { uint32_t offset = CURR_OFFSET; unsigned char enc[4]; long long val; @@ -284,36 +224,36 @@ char *loadIntegerObject(int enctype) { /* convert val into string */ char *buf; - buf = malloc(sizeof(char) * 128); + buf = zmalloc(sizeof(char) * 128); sprintf(buf, "%lld", val); return buf; } -char* loadLzfStringObject() { +static char* loadLzfStringObject() { unsigned int slen, clen; char *c, *s; if ((clen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; if ((slen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL; - c = malloc(clen); + c = zmalloc(clen); if (!readBytes(c, clen)) { - free(c); + zfree(c); return NULL; } - s = malloc(slen+1); + s = zmalloc(slen+1); if (lzf_decompress(c,clen,s,slen) == 0) { - free(c); free(s); + zfree(c); zfree(s); return NULL; } - free(c); + zfree(c); return s; } /* returns NULL when not processable, char* when valid */ -char* loadStringObject() { +static char* loadStringObject() { uint32_t offset = CURR_OFFSET; int isencoded; uint32_t len; @@ -336,48 +276,48 @@ char* loadStringObject() { if (len == REDIS_RDB_LENERR) return NULL; - char *buf = malloc(sizeof(char) * (len+1)); + char *buf = zmalloc(sizeof(char) * (len+1)); if (buf == NULL) return NULL; buf[len] = '\0'; if (!readBytes(buf, len)) { - free(buf); + zfree(buf); return NULL; } return buf; } -int processStringObject(char** store) { +static int processStringObject(char** store) { unsigned long offset = CURR_OFFSET; char *key = loadStringObject(); if (key == NULL) { SHIFT_ERROR(offset, "Error reading string object"); - free(key); + zfree(key); return 0; } if (store != NULL) { *store = key; } else { - 
free(key); + zfree(key); } return 1; } -double* loadDoubleValue() { +static double* loadDoubleValue() { char buf[256]; unsigned char len; double* val; if (!readBytes(&len,1)) return NULL; - val = malloc(sizeof(double)); + val = zmalloc(sizeof(double)); switch(len) { case 255: *val = R_NegInf; return val; case 254: *val = R_PosInf; return val; case 253: *val = R_Nan; return val; default: if (!readBytes(buf, len)) { - free(val); + zfree(val); return NULL; } buf[len] = '\0'; @@ -386,24 +326,24 @@ double* loadDoubleValue() { } } -int processDoubleValue(double** store) { +static int processDoubleValue(double** store) { unsigned long offset = CURR_OFFSET; double *val = loadDoubleValue(); if (val == NULL) { SHIFT_ERROR(offset, "Error reading double value"); - free(val); + zfree(val); return 0; } if (store != NULL) { *store = val; } else { - free(val); + zfree(val); } return 1; } -int loadPair(entry *e) { +static int loadPair(entry *e) { uint32_t offset = CURR_OFFSET; uint32_t i; @@ -417,10 +357,10 @@ int loadPair(entry *e) { } uint32_t length = 0; - if (e->type == REDIS_LIST || - e->type == REDIS_SET || - e->type == REDIS_ZSET || - e->type == REDIS_HASH) { + if (e->type == REDIS_RDB_TYPE_LIST || + e->type == REDIS_RDB_TYPE_SET || + e->type == REDIS_RDB_TYPE_ZSET || + e->type == REDIS_RDB_TYPE_HASH) { if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { SHIFT_ERROR(offset, "Error reading %s length", types[e->type]); return 0; @@ -428,19 +368,19 @@ int loadPair(entry *e) { } switch(e->type) { - case REDIS_STRING: - case REDIS_HASH_ZIPMAP: - case REDIS_LIST_ZIPLIST: - case REDIS_SET_INTSET: - case REDIS_ZSET_ZIPLIST: - case REDIS_HASH_ZIPLIST: + case REDIS_RDB_TYPE_STRING: + case REDIS_RDB_TYPE_HASH_ZIPMAP: + case REDIS_RDB_TYPE_LIST_ZIPLIST: + case REDIS_RDB_TYPE_SET_INTSET: + case REDIS_RDB_TYPE_ZSET_ZIPLIST: + case REDIS_RDB_TYPE_HASH_ZIPLIST: if (!processStringObject(NULL)) { SHIFT_ERROR(offset, "Error reading entry value"); return 0; } break; - case REDIS_LIST: - case 
REDIS_SET: + case REDIS_RDB_TYPE_LIST: + case REDIS_RDB_TYPE_SET: for (i = 0; i < length; i++) { offset = CURR_OFFSET; if (!processStringObject(NULL)) { @@ -449,7 +389,7 @@ int loadPair(entry *e) { } } break; - case REDIS_ZSET: + case REDIS_RDB_TYPE_ZSET: for (i = 0; i < length; i++) { offset = CURR_OFFSET; if (!processStringObject(NULL)) { @@ -463,7 +403,7 @@ int loadPair(entry *e) { } } break; - case REDIS_HASH: + case REDIS_RDB_TYPE_HASH: for (i = 0; i < length; i++) { offset = CURR_OFFSET; if (!processStringObject(NULL)) { @@ -486,7 +426,7 @@ int loadPair(entry *e) { return 1; } -entry loadEntry() { +static entry loadEntry() { entry e = { NULL, -1, 0 }; uint32_t length, offset[4]; @@ -499,7 +439,7 @@ entry loadEntry() { } offset[1] = CURR_OFFSET; - if (e.type == REDIS_SELECTDB) { + if (e.type == REDIS_RDB_OPCODE_SELECTDB) { if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) { SHIFT_ERROR(offset[1], "Error reading database number"); return e; @@ -508,7 +448,7 @@ entry loadEntry() { SHIFT_ERROR(offset[1], "Database number out of range (%d)", length); return e; } - } else if (e.type == REDIS_EOF) { + } else if (e.type == REDIS_RDB_OPCODE_EOF) { if (positions[level].offset < positions[level].size) { SHIFT_ERROR(offset[0], "Unexpected EOF"); } else { @@ -517,8 +457,8 @@ entry loadEntry() { return e; } else { /* optionally consume expire */ - if (e.type == REDIS_EXPIRETIME || - e.type == REDIS_EXPIRETIME_MS) { + if (e.type == REDIS_RDB_OPCODE_EXPIRETIME || + e.type == REDIS_RDB_OPCODE_EXPIRETIME_MS) { if (!processTime(e.type)) return e; if (!loadType(&e)) return e; } @@ -544,31 +484,31 @@ entry loadEntry() { return e; } -void printCentered(int indent, int width, char* body) { +static void printCentered(int indent, int width, char* body) { char head[256], tail[256]; memset(head, '\0', 256); memset(tail, '\0', 256); memset(head, '=', indent); memset(tail, '=', width - 2 - indent - strlen(body)); - printf("%s %s %s\n", head, body, tail); + redisLog(REDIS_WARNING, "%s 
%s %s", head, body, tail); } -void printValid(uint64_t ops, uint64_t bytes) { +static void printValid(uint64_t ops, uint64_t bytes) { char body[80]; sprintf(body, "Processed %llu valid opcodes (in %llu bytes)", (unsigned long long) ops, (unsigned long long) bytes); printCentered(4, 80, body); } -void printSkipped(uint64_t bytes, uint64_t offset) { +static void printSkipped(uint64_t bytes, uint64_t offset) { char body[80]; sprintf(body, "Skipped %llu bytes (resuming at 0x%08llx)", (unsigned long long) bytes, (unsigned long long) offset); printCentered(4, 80, body); } -void printErrorStack(entry *e) { +static void printErrorStack(entry *e) { unsigned int i; char body[64]; @@ -598,20 +538,20 @@ void printErrorStack(entry *e) { /* display error stack */ for (i = 0; i < errors.level; i++) { - printf("0x%08lx - %s\n", + redisLog(REDIS_WARNING, "0x%08lx - %s", (unsigned long) errors.offset[i], errors.error[i]); } } void process(void) { uint64_t num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0; - entry entry; + entry entry = { NULL, -1, 0 }; int dump_version = processHeader(); /* Exclude the final checksum for RDB >= 5. Will be checked at the end. 
*/ if (dump_version >= 5) { if (positions[0].size < 8) { - printf("RDB version >= 5 but no room for checksum.\n"); + redisLog(REDIS_WARNING, "RDB version >= 5 but no room for checksum."); exit(1); } positions[0].size -= 8; @@ -660,7 +600,7 @@ void process(void) { /* advance position */ positions[0] = positions[1]; } - free(entry.key); + zfree(entry.key); } /* because there is another potential error, @@ -668,7 +608,7 @@ void process(void) { printValid(num_valid_ops, num_valid_bytes); /* expect an eof */ - if (entry.type != REDIS_EOF) { + if (entry.type != REDIS_RDB_OPCODE_EOF) { /* last byte should be EOF, add error */ errors.level = 0; SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s", types[entry.type]); @@ -696,47 +636,40 @@ void process(void) { if (crc != crc2) { SHIFT_ERROR(positions[0].offset, "RDB CRC64 does not match."); } else { - printf("CRC64 checksum is OK\n"); + redisLog(REDIS_WARNING, "CRC64 checksum is OK"); } } /* print summary on errors */ if (num_errors) { - printf("\n"); - printf("Total unprocessable opcodes: %llu\n", + redisLog(REDIS_WARNING, "Total unprocessable opcodes: %llu", (unsigned long long) num_errors); } } -int main(int argc, char **argv) { - /* expect the first argument to be the dump file */ - if (argc <= 1) { - printf("Usage: %s <dump.rdb>\n", argv[0]); - exit(0); - } - +int redis_check_rdb(char *rdbfilename) { int fd; off_t size; struct stat stat; void *data; - fd = open(argv[1], O_RDONLY); + fd = open(rdbfilename, O_RDONLY); if (fd < 1) { - ERROR("Cannot open file: %s\n", argv[1]); + ERROR("Cannot open file: %s", rdbfilename); } if (fstat(fd, &stat) == -1) { - ERROR("Cannot stat: %s\n", argv[1]); + ERROR("Cannot stat: %s", rdbfilename); } else { size = stat.st_size; } if (sizeof(size_t) == sizeof(int32_t) && size >= INT_MAX) { - ERROR("Cannot check dump files >2GB on a 32-bit platform\n"); + ERROR("Cannot check dump files >2GB on a 32-bit platform"); } data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); if (data == 
MAP_FAILED) { - ERROR("Cannot mmap: %s\n", argv[1]); + ERROR("Cannot mmap: %s", rdbfilename); } /* Initialize static vars */ @@ -746,22 +679,16 @@ int main(int argc, char **argv) { errors.level = 0; /* Object types */ - sprintf(types[REDIS_STRING], "STRING"); - sprintf(types[REDIS_LIST], "LIST"); - sprintf(types[REDIS_SET], "SET"); - sprintf(types[REDIS_ZSET], "ZSET"); - sprintf(types[REDIS_HASH], "HASH"); + sprintf(types[REDIS_RDB_TYPE_STRING], "STRING"); + sprintf(types[REDIS_RDB_TYPE_LIST], "LIST"); + sprintf(types[REDIS_RDB_TYPE_SET], "SET"); + sprintf(types[REDIS_RDB_TYPE_ZSET], "ZSET"); + sprintf(types[REDIS_RDB_TYPE_HASH], "HASH"); /* Object types only used for dumping to disk */ - sprintf(types[REDIS_EXPIRETIME], "EXPIRETIME"); - sprintf(types[REDIS_SELECTDB], "SELECTDB"); - sprintf(types[REDIS_EOF], "EOF"); - - /* Double constants initialization */ - R_Zero = 0.0; - R_PosInf = 1.0/R_Zero; - R_NegInf = -1.0/R_Zero; - R_Nan = R_Zero/R_Zero; + sprintf(types[REDIS_RDB_OPCODE_EXPIRETIME], "EXPIRETIME"); + sprintf(types[REDIS_RDB_OPCODE_SELECTDB], "SELECTDB"); + sprintf(types[REDIS_RDB_OPCODE_EOF], "EOF"); process(); @@ -769,3 +696,15 @@ int main(int argc, char **argv) { close(fd); return 0; } + +/* RDB check main: called form redis.c when Redis is executed with the + * redis-check-rdb alias. 
*/ +int redis_check_rdb_main(char **argv, int argc) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <rdb-file-name>\n", argv[0]); + exit(1); + } + redisLog(REDIS_WARNING, "Checking RDB file %s", argv[1]); + exit(redis_check_rdb(argv[1])); + return 0; +} diff --git a/src/redis-cli.c b/src/redis-cli.c index 34070ab80..e243db451 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -60,6 +60,8 @@ #define OUTPUT_CSV 2 #define REDIS_CLI_KEEPALIVE_INTERVAL 15 /* seconds */ #define REDIS_CLI_DEFAULT_PIPE_TIMEOUT 30 /* seconds */ +#define REDIS_CLI_HISTFILE_ENV "REDISCLI_HISTFILE" +#define REDIS_CLI_HISTFILE_DEFAULT ".rediscli_history" static redisContext *context; static struct config { @@ -128,9 +130,8 @@ static void cliRefreshPrompt(void) { len = snprintf(config.prompt,sizeof(config.prompt),"redis %s", config.hostsocket); else - len = snprintf(config.prompt,sizeof(config.prompt), - strchr(config.hostip,':') ? "[%s]:%d" : "%s:%d", - config.hostip, config.hostport); + len = anetFormatAddr(config.prompt, sizeof(config.prompt), + config.hostip, config.hostport); /* Add [dbnum] if needed */ if (config.dbnum != 0 && config.last_cmd_type != REDIS_REPLY_ERROR) len += snprintf(config.prompt+len,sizeof(config.prompt)-len,"[%d]", @@ -138,6 +139,30 @@ static void cliRefreshPrompt(void) { snprintf(config.prompt+len,sizeof(config.prompt)-len,"> "); } +static sds getHistoryPath() { + char *path = NULL; + sds historyPath = NULL; + + /* check the env for a histfile override */ + path = getenv(REDIS_CLI_HISTFILE_ENV); + if (path != NULL && *path != '\0') { + if (!strcmp("/dev/null", path)) { + return NULL; + } + + /* if the env is set, return it */ + historyPath = sdscatprintf(sdsempty(), "%s", path); + } else { + char *home = getenv("HOME"); + if (home != NULL && *home != '\0') { + /* otherwise, return the default */ + historyPath = sdscatprintf(sdsempty(), "%s/%s", home, REDIS_CLI_HISTFILE_DEFAULT); + } + } + + return historyPath; +} + 
/*------------------------------------------------------------------------------ * Help functions *--------------------------------------------------------------------------- */ @@ -302,7 +327,7 @@ static void completionCallback(const char *buf, linenoiseCompletions *lc) { *--------------------------------------------------------------------------- */ /* Send AUTH command to the server */ -static int cliAuth() { +static int cliAuth(void) { redisReply *reply; if (config.auth == NULL) return REDIS_OK; @@ -315,7 +340,7 @@ static int cliAuth() { } /* Send SELECT dbnum to the server */ -static int cliSelect() { +static int cliSelect(void) { redisReply *reply; if (config.dbnum == 0) return REDIS_OK; @@ -493,7 +518,7 @@ static sds cliFormatReplyCSV(redisReply *r) { out = sdscatrepr(out,r->str,r->len); break; case REDIS_REPLY_NIL: - out = sdscat(out,"NIL\n"); + out = sdscat(out,"NIL"); break; case REDIS_REPLY_ARRAY: for (i = 0; i < r->elements; i++) { @@ -604,6 +629,9 @@ static int cliSendCommand(int argc, char **argv, int repeat) { output_raw = 0; if (!strcasecmp(command,"info") || + (argc == 3 && !strcasecmp(command,"debug") && + (!strcasecmp(argv[1],"jemalloc") && + !strcasecmp(argv[2],"info"))) || (argc == 2 && !strcasecmp(command,"cluster") && (!strcasecmp(argv[1],"nodes") || !strcasecmp(argv[1],"info"))) || @@ -830,6 +858,7 @@ static void usage(void) { " not a tty).\n" " --no-raw Force formatted output even when STDOUT is not a tty.\n" " --csv Output in CSV format.\n" +" --stat Print rolling stats about server: mem, clients, ...\n" " --latency Enter a special mode continuously sampling latency.\n" " --latency-history Like --latency but tracking latency changes over time.\n" " Default time interval is 15 sec. 
Change it using -i.\n" @@ -877,6 +906,33 @@ static char **convertToSds(int count, char** args) { return sds; } +static int issueCommandRepeat(int argc, char **argv, long repeat) { + while (1) { + config.cluster_reissue_command = 0; + if (cliSendCommand(argc,argv,repeat) != REDIS_OK) { + cliConnect(1); + + /* If we still cannot send the command print error. + * We'll try to reconnect the next time. */ + if (cliSendCommand(argc,argv,repeat) != REDIS_OK) { + cliPrintContextError(); + return REDIS_ERR; + } + } + /* Issue the command again if we got redirected in cluster mode */ + if (config.cluster_mode && config.cluster_reissue_command) { + cliConnect(1); + } else { + break; + } + } + return REDIS_OK; +} + +static int issueCommand(int argc, char **argv) { + return issueCommandRepeat(argc, argv, config.repeat); +} + static void repl(void) { sds historyfile = NULL; int history = 0; @@ -890,10 +946,9 @@ static void repl(void) { /* Only use history when stdin is a tty. */ if (isatty(fileno(stdin))) { - history = 1; - - if (getenv("HOME") != NULL) { - historyfile = sdscatprintf(sdsempty(),"%s/.rediscli_history",getenv("HOME")); + historyfile = getHistoryPath(); + if (historyfile != NULL) { + history = 1; linenoiseHistoryLoad(historyfile); } } @@ -933,26 +988,8 @@ static void repl(void) { repeat = 1; } - while (1) { - config.cluster_reissue_command = 0; - if (cliSendCommand(argc-skipargs,argv+skipargs,repeat) - != REDIS_OK) - { - cliConnect(1); - - /* If we still cannot send the command print error. - * We'll try to reconnect the next time. 
*/ - if (cliSendCommand(argc-skipargs,argv+skipargs,repeat) - != REDIS_OK) - cliPrintContextError(); - } - /* Issue the command again if we got redirected in cluster mode */ - if (config.cluster_mode && config.cluster_reissue_command) { - cliConnect(1); - } else { - break; - } - } + issueCommandRepeat(argc-skipargs, argv+skipargs, repeat); + elapsed = mstime()-start_time; if (elapsed >= 500) { printf("(%.2fs)\n",(double)elapsed/1000); @@ -973,10 +1010,9 @@ static int noninteractive(int argc, char **argv) { if (config.stdinarg) { argv = zrealloc(argv, (argc+1)*sizeof(char*)); argv[argc] = readArgFromStdin(); - retval = cliSendCommand(argc+1, argv, config.repeat); + retval = issueCommand(argc+1, argv); } else { - /* stdin is probably a tty, can be tested with S_ISCHR(s.st_mode) */ - retval = cliSendCommand(argc, argv, config.repeat); + retval = issueCommand(argc, argv); } return retval; } @@ -1020,7 +1056,7 @@ static int evalMode(int argc, char **argv) { argv2[2] = sdscatprintf(sdsempty(),"%d",keys); /* Call it */ - return cliSendCommand(argc+3-got_comma, argv2, config.repeat); + return issueCommand(argc+3-got_comma, argv2); } /*------------------------------------------------------------------------------ @@ -1915,8 +1951,6 @@ int main(int argc, char **argv) { argc -= firstarg; argv += firstarg; - signal(SIGPIPE, SIG_IGN); - /* Latency mode */ if (config.latency_mode) { if (cliConnect(0) == REDIS_ERR) exit(1); @@ -1965,6 +1999,9 @@ int main(int argc, char **argv) { /* Start interactive mode when no command is provided */ if (argc == 0 && !config.eval) { + /* Ignore SIGPIPE in interactive mode to force a reconnect */ + signal(SIGPIPE, SIG_IGN); + /* Note that in repl mode we don't abort on connection error. * A new attempt will be performed for every command send. 
*/ cliConnect(0); diff --git a/src/redis-trib.rb b/src/redis-trib.rb index 466a81137..6002e4caa 100755 --- a/src/redis-trib.rb +++ b/src/redis-trib.rb @@ -72,7 +72,7 @@ class ClusterNode @friends end - def slots + def slots @info[:slots] end @@ -154,7 +154,7 @@ class ClusterNode end } if slots @dirty = false - @r.cluster("info").split("\n").each{|e| + @r.cluster("info").split("\n").each{|e| k,v=e.split(":") k = k.to_sym v.chop! @@ -213,7 +213,7 @@ class ClusterNode # # Note: this could be easily written without side effects, # we use 'slots' just to split the computation into steps. - + # First step: we want an increasing array of integers # for instance: [1,2,3,4,5,8,9,20,21,22,23,24,25,30] slots = @info[:slots].keys.sort @@ -273,7 +273,7 @@ class ClusterNode def info @info end - + def is_dirty? @dirty end @@ -540,7 +540,6 @@ class RedisTrib nodes_count = @nodes.length masters_count = @nodes.length / (@replicas+1) masters = [] - slaves = [] # The first step is to split instances by IP. This is useful as # we'll try to allocate master nodes in different physical machines @@ -558,16 +557,31 @@ class RedisTrib # Select master instances puts "Using #{masters_count} masters:" - while masters.length < masters_count - ips.each{|ip,nodes_list| - next if nodes_list.length == 0 - masters << nodes_list.shift - puts masters[-1] - nodes_count -= 1 - break if masters.length == masters_count - } + interleaved = [] + stop = false + while not stop do + # Take one node from each IP until we run out of nodes + # across every IP. + ips.each do |ip,nodes| + if nodes.empty? 
+ # if this IP has no remaining nodes, check for termination + if interleaved.length == nodes_count + # stop when 'interleaved' has accumulated all nodes + stop = true + next + end + else + # else, move one node from this IP to 'interleaved' + interleaved.push nodes.shift + end + end end + masters = interleaved.slice!(0, masters_count) + nodes_count -= masters.length + + masters.each{|m| puts m} + # Alloc slots on masters slots_per_node = ClusterHashSlots.to_f / masters_count first = 0 @@ -594,8 +608,8 @@ class RedisTrib # all nodes will be used. assignment_verbose = false - [:requested,:unused].each{|assign| - masters.each{|m| + [:requested,:unused].each do |assign| + masters.each do |m| assigned_replicas = 0 while assigned_replicas < @replicas break if nodes_count == 0 @@ -609,21 +623,33 @@ class RedisTrib "role too (#{nodes_count} remaining)." end end - ips.each{|ip,nodes_list| - next if nodes_list.length == 0 - # Skip instances with the same IP as the master if we - # have some more IPs available. - next if ip == m.info[:host] && nodes_count > nodes_list.length - slave = nodes_list.shift - slave.set_as_replica(m.info[:name]) - nodes_count -= 1 - assigned_replicas += 1 - puts "Adding replica #{slave} to #{m}" - break - } + + # Return the first node not matching our current master + node = interleaved.find{|n| n.info[:host] != m.info[:host]} + + # If we found a node, use it as a best-first match. + # Otherwise, we didn't find a node on a different IP, so we + # go ahead and use a same-IP replica. + if node + slave = node + interleaved.delete node + else + slave = interleaved.shift + end + slave.set_as_replica(m.info[:name]) + nodes_count -= 1 + assigned_replicas += 1 + puts "Adding replica #{slave} to #{m}" + + # If we are in the "assign extra nodes" loop, + # we want to assign one extra replica to each + # master before repeating masters. + # This break lets us assign extra replicas to masters + # in a round-robin way. 
+ break if assign == :unused end - } - } + end + end end def flush_nodes_config @@ -763,7 +789,7 @@ class RedisTrib # Move slots between source and target nodes using MIGRATE. # - # Options: + # Options: # :verbose -- Print a dot for every moved key. # :fix -- We are moving in the context of a fix. Use REPLACE. # :cold -- Move keys without opening / reconfiguring the nodes. @@ -1139,7 +1165,7 @@ class RedisTrib # right node as needed. cursor = nil while cursor != 0 - cursor,keys = source.scan(cursor,:count,1000) + cursor,keys = source.scan(cursor, :count => 1000) cursor = cursor.to_i keys.each{|k| # Migrate keys using the MIGRATE command. @@ -1206,7 +1232,7 @@ end ################################################################################# # Libraries -# +# # We try to don't depend on external libs since this is a critical part # of Redis Cluster. ################################################################################# diff --git a/src/redis.c b/src/redis.c index 83e0946ef..b2f9ffc68 100644 --- a/src/redis.c +++ b/src/redis.c @@ -46,12 +46,14 @@ #include <sys/time.h> #include <sys/resource.h> #include <sys/uio.h> +#include <sys/un.h> #include <limits.h> #include <float.h> #include <math.h> #include <sys/resource.h> #include <sys/utsname.h> #include <locale.h> +#include <sys/sysctl.h> /* Our shared "common" objects */ @@ -160,7 +162,7 @@ struct redisCommand redisCommandTable[] = { {"smove",smoveCommand,4,"wF",0,NULL,1,2,1,0,0}, {"sismember",sismemberCommand,3,"rF",0,NULL,1,1,1,0,0}, {"scard",scardCommand,2,"rF",0,NULL,1,1,1,0,0}, - {"spop",spopCommand,2,"wRsF",0,NULL,1,1,1,0,0}, + {"spop",spopCommand,-2,"wRsF",0,NULL,1,1,1,0,0}, {"srandmember",srandmemberCommand,-2,"rR",0,NULL,1,1,1,0,0}, {"sinter",sinterCommand,-2,"rS",0,NULL,1,-1,1,0,0}, {"sinterstore",sinterstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0}, @@ -247,7 +249,7 @@ struct redisCommand redisCommandTable[] = { {"pttl",pttlCommand,2,"rF",0,NULL,1,1,1,0,0}, 
{"persist",persistCommand,2,"wF",0,NULL,1,1,1,0,0}, {"slaveof",slaveofCommand,3,"ast",0,NULL,0,0,0,0,0}, - {"role",roleCommand,1,"last",0,NULL,0,0,0,0,0}, + {"role",roleCommand,1,"lst",0,NULL,0,0,0,0,0}, {"debug",debugCommand,-2,"as",0,NULL,0,0,0,0,0}, {"config",configCommand,-2,"art",0,NULL,0,0,0,0,0}, {"subscribe",subscribeCommand,-2,"rpslt",0,NULL,0,0,0,0,0}, @@ -259,19 +261,19 @@ struct redisCommand redisCommandTable[] = { {"watch",watchCommand,-2,"rsF",0,NULL,1,-1,1,0,0}, {"unwatch",unwatchCommand,1,"rsF",0,NULL,0,0,0,0,0}, {"cluster",clusterCommand,-2,"ar",0,NULL,0,0,0,0,0}, - {"restore",restoreCommand,-4,"awm",0,NULL,1,1,1,0,0}, - {"restore-asking",restoreCommand,-4,"awmk",0,NULL,1,1,1,0,0}, - {"migrate",migrateCommand,-6,"aw",0,NULL,0,0,0,0,0}, + {"restore",restoreCommand,-4,"wm",0,NULL,1,1,1,0,0}, + {"restore-asking",restoreCommand,-4,"wmk",0,NULL,1,1,1,0,0}, + {"migrate",migrateCommand,-6,"w",0,NULL,0,0,0,0,0}, {"asking",askingCommand,1,"r",0,NULL,0,0,0,0,0}, {"readonly",readonlyCommand,1,"rF",0,NULL,0,0,0,0,0}, {"readwrite",readwriteCommand,1,"rF",0,NULL,0,0,0,0,0}, - {"dump",dumpCommand,2,"ar",0,NULL,1,1,1,0,0}, + {"dump",dumpCommand,2,"r",0,NULL,1,1,1,0,0}, {"object",objectCommand,3,"r",0,NULL,2,2,2,0,0}, - {"client",clientCommand,-2,"ars",0,NULL,0,0,0,0,0}, + {"client",clientCommand,-2,"rs",0,NULL,0,0,0,0,0}, {"eval",evalCommand,-3,"s",0,evalGetKeys,0,0,0,0,0}, {"evalsha",evalShaCommand,-3,"s",0,evalGetKeys,0,0,0,0,0}, {"slowlog",slowlogCommand,-2,"r",0,NULL,0,0,0,0,0}, - {"script",scriptCommand,-2,"ras",0,NULL,0,0,0,0,0}, + {"script",scriptCommand,-2,"rs",0,NULL,0,0,0,0,0}, {"time",timeCommand,1,"rRF",0,NULL,0,0,0,0,0}, {"bitop",bitopCommand,-4,"wm",0,NULL,2,-1,1,0,0}, {"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0}, @@ -280,7 +282,7 @@ struct redisCommand redisCommandTable[] = { {"command",commandCommand,0,"rlt",0,NULL,0,0,0,0,0}, {"pfselftest",pfselftestCommand,1,"r",0,NULL,0,0,0,0,0}, {"pfadd",pfaddCommand,-2,"wmF",0,NULL,1,1,1,0,0}, - 
{"pfcount",pfcountCommand,-2,"w",0,NULL,1,1,1,0,0}, + {"pfcount",pfcountCommand,-2,"r",0,NULL,1,1,1,0,0}, {"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0}, {"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}, {"latency",latencyCommand,-2,"arslt",0,NULL,0,0,0,0,0} @@ -876,27 +878,30 @@ unsigned int getLRUClock(void) { } /* Add a sample to the operations per second array of samples. */ -void trackOperationsPerSecond(void) { - long long t = mstime() - server.ops_sec_last_sample_time; - long long ops = server.stat_numcommands - server.ops_sec_last_sample_ops; +void trackInstantaneousMetric(int metric, long long current_reading) { + long long t = mstime() - server.inst_metric[metric].last_sample_time; + long long ops = current_reading - + server.inst_metric[metric].last_sample_count; long long ops_sec; ops_sec = t > 0 ? (ops*1000/t) : 0; - server.ops_sec_samples[server.ops_sec_idx] = ops_sec; - server.ops_sec_idx = (server.ops_sec_idx+1) % REDIS_OPS_SEC_SAMPLES; - server.ops_sec_last_sample_time = mstime(); - server.ops_sec_last_sample_ops = server.stat_numcommands; + server.inst_metric[metric].samples[server.inst_metric[metric].idx] = + ops_sec; + server.inst_metric[metric].idx++; + server.inst_metric[metric].idx %= REDIS_METRIC_SAMPLES; + server.inst_metric[metric].last_sample_time = mstime(); + server.inst_metric[metric].last_sample_count = current_reading; } /* Return the mean of all the samples. */ -long long getOperationsPerSecond(void) { +long long getInstantaneousMetric(int metric) { int j; long long sum = 0; - for (j = 0; j < REDIS_OPS_SEC_SAMPLES; j++) - sum += server.ops_sec_samples[j]; - return sum / REDIS_OPS_SEC_SAMPLES; + for (j = 0; j < REDIS_METRIC_SAMPLES; j++) + sum += server.inst_metric[metric].samples[j]; + return sum / REDIS_METRIC_SAMPLES; } /* Check for timeouts. Returns non-zero if the client was terminated */ @@ -1068,7 +1073,13 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Update the time cache. 
*/ updateCachedTime(); - run_with_period(100) trackOperationsPerSecond(); + run_with_period(100) { + trackInstantaneousMetric(REDIS_METRIC_COMMAND,server.stat_numcommands); + trackInstantaneousMetric(REDIS_METRIC_NET_INPUT, + server.stat_net_input_bytes); + trackInstantaneousMetric(REDIS_METRIC_NET_OUTPUT, + server.stat_net_output_bytes); + } /* We have just REDIS_LRU_BITS bits per object for LRU information. * So we use an (eventually wrapping) LRU clock. @@ -1404,6 +1415,8 @@ void initServerConfig(void) { server.syslog_ident = zstrdup(REDIS_DEFAULT_SYSLOG_IDENT); server.syslog_facility = LOG_LOCAL0; server.daemonize = REDIS_DEFAULT_DAEMONIZE; + server.supervised = 0; + server.supervised_mode = REDIS_SUPERVISED_NONE; server.aof_state = REDIS_AOF_OFF; server.aof_fsync = REDIS_DEFAULT_AOF_FSYNC; server.aof_no_fsync_on_rewrite = REDIS_DEFAULT_AOF_NO_FSYNC_ON_REWRITE; @@ -1421,7 +1434,7 @@ void initServerConfig(void) { server.aof_flush_postponed_start = 0; server.aof_rewrite_incremental_fsync = REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC; server.aof_load_truncated = REDIS_DEFAULT_AOF_LOAD_TRUNCATED; - server.pidfile = zstrdup(REDIS_DEFAULT_PID_FILE); + server.pidfile = NULL; server.rdb_filename = zstrdup(REDIS_DEFAULT_RDB_FILENAME); server.aof_filename = zstrdup(REDIS_DEFAULT_AOF_FILENAME); server.requirepass = NULL; @@ -1437,8 +1450,8 @@ void initServerConfig(void) { server.maxmemory_samples = REDIS_DEFAULT_MAXMEMORY_SAMPLES; server.hash_max_ziplist_entries = REDIS_HASH_MAX_ZIPLIST_ENTRIES; server.hash_max_ziplist_value = REDIS_HASH_MAX_ZIPLIST_VALUE; - server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES; - server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE; + server.list_max_ziplist_size = REDIS_LIST_MAX_ZIPLIST_SIZE; + server.list_compress_depth = REDIS_LIST_COMPRESS_DEPTH; server.set_max_intset_entries = REDIS_SET_MAX_INTSET_ENTRIES; server.zset_max_ziplist_entries = REDIS_ZSET_MAX_ZIPLIST_ENTRIES; server.zset_max_ziplist_value = 
REDIS_ZSET_MAX_ZIPLIST_VALUE; @@ -1554,33 +1567,33 @@ void adjustOpenFilesLimit(void) { /* Set the max number of files if the current limit is not enough * for our needs. */ if (oldlimit < maxfiles) { - rlim_t f; + rlim_t bestlimit; int setrlimit_error = 0; /* Try to set the file limit to match 'maxfiles' or at least * to the higher value supported less than maxfiles. */ - f = maxfiles; - while(f > oldlimit) { + bestlimit = maxfiles; + while(bestlimit > oldlimit) { rlim_t decr_step = 16; - limit.rlim_cur = f; - limit.rlim_max = f; + limit.rlim_cur = bestlimit; + limit.rlim_max = bestlimit; if (setrlimit(RLIMIT_NOFILE,&limit) != -1) break; setrlimit_error = errno; - /* We failed to set file limit to 'f'. Try with a + /* We failed to set file limit to 'bestlimit'. Try with a * smaller limit decrementing by a few FDs per iteration. */ - if (f < decr_step) break; - f -= decr_step; + if (bestlimit < decr_step) break; + bestlimit -= decr_step; } /* Assume that the limit we get initially is still valid if * our last try was even lower. */ - if (f < oldlimit) f = oldlimit; + if (bestlimit < oldlimit) bestlimit = oldlimit; - if (f != maxfiles) { + if (bestlimit < maxfiles) { int old_maxclients = server.maxclients; - server.maxclients = f-REDIS_MIN_RESERVED_FDS; + server.maxclients = bestlimit-REDIS_MIN_RESERVED_FDS; if (server.maxclients < 1) { redisLog(REDIS_WARNING,"Your current 'ulimit -n' " "of %llu is not enough for Redis to start. " @@ -1601,7 +1614,7 @@ void adjustOpenFilesLimit(void) { "maxclients has been reduced to %d to compensate for " "low ulimit. 
" "If you need higher maxclients increase 'ulimit -n'.", - (unsigned long long) oldlimit, server.maxclients); + (unsigned long long) bestlimit, server.maxclients); } else { redisLog(REDIS_NOTICE,"Increased maximum number of open files " "to %llu (it was originally set to %llu).", @@ -1612,6 +1625,23 @@ void adjustOpenFilesLimit(void) { } } +/* Check that server.tcp_backlog can be actually enforced in Linux according + * to the value of /proc/sys/net/core/somaxconn, or warn about it. */ +void checkTcpBacklogSettings(void) { +#ifdef HAVE_PROC_SOMAXCONN + FILE *fp = fopen("/proc/sys/net/core/somaxconn","r"); + char buf[1024]; + if (!fp) return; + if (fgets(buf,sizeof(buf),fp) != NULL) { + int somaxconn = atoi(buf); + if (somaxconn > 0 && somaxconn < server.tcp_backlog) { + redisLog(REDIS_WARNING,"WARNING: The TCP backlog setting of %d cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of %d.", server.tcp_backlog, somaxconn); + } + } + fclose(fp); +#endif +} + /* Initialize a set of file descriptors to listen to the specified 'port' * binding the addresses specified in the Redis server configuration. * @@ -1682,6 +1712,8 @@ int listenToPort(int port, int *fds, int *count) { * to reset via CONFIG RESETSTAT. The function is also used in order to * initialize these fields in initServer() at server startup. 
*/ void resetServerStats(void) { + int j; + server.stat_numcommands = 0; server.stat_numconnections = 0; server.stat_expiredkeys = 0; @@ -1694,10 +1726,15 @@ void resetServerStats(void) { server.stat_sync_full = 0; server.stat_sync_partial_ok = 0; server.stat_sync_partial_err = 0; - memset(server.ops_sec_samples,0,sizeof(server.ops_sec_samples)); - server.ops_sec_idx = 0; - server.ops_sec_last_sample_time = mstime(); - server.ops_sec_last_sample_ops = 0; + for (j = 0; j < REDIS_METRIC_COUNT; j++) { + server.inst_metric[j].idx = 0; + server.inst_metric[j].last_sample_time = mstime(); + server.inst_metric[j].last_sample_count = 0; + memset(server.inst_metric[j].samples,0, + sizeof(server.inst_metric[j].samples)); + } + server.stat_net_input_bytes = 0; + server.stat_net_output_bytes = 0; } void initServer(void) { @@ -1724,6 +1761,7 @@ void initServer(void) { server.clients_waiting_acks = listCreate(); server.get_ack_from_slaves = 0; server.clients_paused = 0; + server.system_memory_size = zmalloc_get_memory_size(); createSharedObjects(); adjustOpenFilesLimit(); @@ -1998,7 +2036,7 @@ void call(redisClient *c, int flags) { * not generated from reading an AOF. */ if (listLength(server.monitors) && !server.loading && - !(c->cmd->flags & REDIS_CMD_SKIP_MONITOR)) + !(c->cmd->flags & (REDIS_CMD_SKIP_MONITOR|REDIS_CMD_ADMIN))) { replicationFeedMonitors(c,server.monitors,c->db->id,c->argv,c->argc); } @@ -2337,7 +2375,7 @@ int prepareForShutdown(int flags) { return REDIS_ERR; } } - if (server.daemonize) { + if (server.daemonize || server.pidfile) { redisLog(REDIS_NOTICE,"Removing the pid file."); unlink(server.pidfile); } @@ -2444,7 +2482,6 @@ void timeCommand(redisClient *c) { addReplyBulkLongLong(c,tv.tv_usec); } - /* Helper function for addReplyCommand() to output flags. 
*/ int addReplyCommandFlag(redisClient *c, struct redisCommand *cmd, int f, char *reply) { if (cmd->flags & f) { @@ -2663,7 +2700,10 @@ sds genRedisInfoString(char *section) { if (allsections || defsections || !strcasecmp(section,"memory")) { char hmem[64]; char peak_hmem[64]; + char total_system_hmem[64]; size_t zmalloc_used = zmalloc_used_memory(); + size_t total_system_mem = server.system_memory_size; + char *evict_policy = maxmemoryToString(); /* Peak memory is updated from time to time by serverCron() so it * may happen that the instantaneous value is slightly bigger than @@ -2674,6 +2714,8 @@ sds genRedisInfoString(char *section) { bytesToHuman(hmem,zmalloc_used); bytesToHuman(peak_hmem,server.stat_peak_memory); + bytesToHuman(total_system_hmem,total_system_mem); + if (sections++) info = sdscat(info,"\r\n"); info = sdscatprintf(info, "# Memory\r\n" @@ -2682,17 +2724,23 @@ sds genRedisInfoString(char *section) { "used_memory_rss:%zu\r\n" "used_memory_peak:%zu\r\n" "used_memory_peak_human:%s\r\n" + "total_system_memory:%lu\r\n" + "total_system_memory_human:%s\r\n" "used_memory_lua:%lld\r\n" "mem_fragmentation_ratio:%.2f\r\n" - "mem_allocator:%s\r\n", + "mem_allocator:%s\r\n" + "maxmemory_policy:%s\r\n", zmalloc_used, hmem, server.resident_set_size, server.stat_peak_memory, peak_hmem, + (unsigned long)total_system_mem, + total_system_hmem, ((long long)lua_gc(server.lua,LUA_GCCOUNT,0))*1024LL, zmalloc_get_fragmentation_ratio(server.resident_set_size), - ZMALLOC_LIB + ZMALLOC_LIB, + evict_policy ); } @@ -2757,14 +2805,14 @@ sds genRedisInfoString(char *section) { server.loading_loaded_bytes; perc = ((double)server.loading_loaded_bytes / - server.loading_total_bytes) * 100; + (server.loading_total_bytes+1)) * 100; - elapsed = server.unixtime-server.loading_start_time; + elapsed = time(NULL)-server.loading_start_time; if (elapsed == 0) { eta = 1; /* A fake 1 second figure if we don't have enough info */ } else { - eta = 
(elapsed*remaining_bytes)/server.loading_loaded_bytes; + eta = (elapsed*remaining_bytes)/(server.loading_loaded_bytes+1); } info = sdscatprintf(info, @@ -2790,6 +2838,10 @@ sds genRedisInfoString(char *section) { "total_connections_received:%lld\r\n" "total_commands_processed:%lld\r\n" "instantaneous_ops_per_sec:%lld\r\n" + "total_net_input_bytes:%lld\r\n" + "total_net_output_bytes:%lld\r\n" + "instantaneous_input_kbps:%.2f\r\n" + "instantaneous_output_kbps:%.2f\r\n" "rejected_connections:%lld\r\n" "sync_full:%lld\r\n" "sync_partial_ok:%lld\r\n" @@ -2804,7 +2856,11 @@ sds genRedisInfoString(char *section) { "migrate_cached_sockets:%ld\r\n", server.stat_numconnections, server.stat_numcommands, - getOperationsPerSecond(), + getInstantaneousMetric(REDIS_METRIC_COMMAND), + server.stat_net_input_bytes, + server.stat_net_output_bytes, + (float)getInstantaneousMetric(REDIS_METRIC_NET_INPUT)/1024, + (float)getInstantaneousMetric(REDIS_METRIC_NET_OUTPUT)/1024, server.stat_rejected_conn, server.stat_sync_full, server.stat_sync_partial_ok, @@ -3003,11 +3059,7 @@ void infoCommand(redisClient *c) { addReply(c,shared.syntaxerr); return; } - sds info = genRedisInfoString(section); - addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", - (unsigned long)sdslen(info))); - addReplySds(c,info); - addReply(c,shared.crlf); + addReplyBulkSds(c, genRedisInfoString(section)); } void monitorCommand(redisClient *c) { @@ -3342,6 +3394,10 @@ void linuxMemoryWarnings(void) { #endif /* __linux__ */ void createPidFile(void) { + /* If pidfile requested, but no pidfile defined, use + * default pidfile path */ + if (!server.pidfile) server.pidfile = zstrdup(REDIS_DEFAULT_PID_FILE); + /* Try to write the pid file in a best-effort way. 
*/ FILE *fp = fopen(server.pidfile,"w"); if (fp) { @@ -3404,15 +3460,27 @@ void redisAsciiArt(void) { else if (server.sentinel_mode) mode = "sentinel"; else mode = "standalone"; - snprintf(buf,1024*16,ascii_logo, - REDIS_VERSION, - redisGitSHA1(), - strtol(redisGitDirty(),NULL,10) > 0, - (sizeof(long) == 8) ? "64" : "32", - mode, server.port, - (long) getpid() - ); - redisLogRaw(REDIS_NOTICE|REDIS_LOG_RAW,buf); + if (server.syslog_enabled) { + redisLog(REDIS_NOTICE, + "Redis %s (%s/%d) %s bit, %s mode, port %d, pid %ld ready to start.", + REDIS_VERSION, + redisGitSHA1(), + strtol(redisGitDirty(),NULL,10) > 0, + (sizeof(long) == 8) ? "64" : "32", + mode, server.port, + (long) getpid() + ); + } else { + snprintf(buf,1024*16,ascii_logo, + REDIS_VERSION, + redisGitSHA1(), + strtol(redisGitDirty(),NULL,10) > 0, + (sizeof(long) == 8) ? "64" : "32", + mode, server.port, + (long) getpid() + ); + redisLogRaw(REDIS_NOTICE|REDIS_LOG_RAW,buf); + } zfree(buf); } @@ -3521,9 +3589,131 @@ void redisSetProcTitle(char *title) { #endif } +/* + * Check whether systemd or upstart have been used to start redis. 
+ */ + +int redisSupervisedUpstart(void) { + const char *upstart_job = getenv("UPSTART_JOB"); + + if (!upstart_job) { + redisLog(REDIS_WARNING, + "upstart supervision requested, but UPSTART_JOB not found"); + return 0; + } + + redisLog(REDIS_NOTICE, "supervised by upstart, will stop to signal readyness"); + raise(SIGSTOP); + unsetenv("UPSTART_JOB"); + return 1; +} + +int redisSupervisedSystemd(void) { + const char *notify_socket = getenv("NOTIFY_SOCKET"); + int fd = 1; + struct sockaddr_un su; + struct iovec iov; + struct msghdr hdr; + int sendto_flags = 0; + + if (!notify_socket) { + redisLog(REDIS_WARNING, + "systemd supervision requested, but NOTIFY_SOCKET not found"); + return 0; + } + + if ((strchr("@/", notify_socket[0])) == NULL || strlen(notify_socket) < 2) { + return 0; + } + + redisLog(REDIS_NOTICE, "supervised by systemd, will signal readyness"); + if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) == -1) { + redisLog(REDIS_WARNING, + "Can't connect to systemd socket %s", notify_socket); + return 0; + } + + memset(&su, 0, sizeof(su)); + su.sun_family = AF_UNIX; + strncpy (su.sun_path, notify_socket, sizeof(su.sun_path) -1); + su.sun_path[sizeof(su.sun_path) - 1] = '\0'; + + if (notify_socket[0] == '@') + su.sun_path[0] = '\0'; + + memset(&iov, 0, sizeof(iov)); + iov.iov_base = "READY=1"; + iov.iov_len = strlen("READY=1"); + + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_name = &su; + hdr.msg_namelen = offsetof(struct sockaddr_un, sun_path) + + strlen(notify_socket); + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + + unsetenv("NOTIFY_SOCKET"); +#ifdef HAVE_MSG_NOSIGNAL + sendto_flags |= MSG_NOSIGNAL; +#endif + if (sendmsg(fd, &hdr, sendto_flags) < 0) { + redisLog(REDIS_WARNING, "Can't send notification to systemd"); + close(fd); + return 0; + } + close(fd); + return 1; +} + +int redisIsSupervised(int mode) { + if (mode == REDIS_SUPERVISED_AUTODETECT) { + const char *upstart_job = getenv("UPSTART_JOB"); + const char *notify_socket = getenv("NOTIFY_SOCKET"); + + if 
(upstart_job) { + redisSupervisedUpstart(); + } else if (notify_socket) { + redisSupervisedSystemd(); + } + } else if (mode == REDIS_SUPERVISED_UPSTART) { + return redisSupervisedUpstart(); + } else if (mode == REDIS_SUPERVISED_SYSTEMD) { + return redisSupervisedSystemd(); + } + + return 0; +} + + int main(int argc, char **argv) { struct timeval tv; +#ifdef REDIS_TEST + if (argc == 3 && !strcasecmp(argv[1], "test")) { + if (!strcasecmp(argv[2], "ziplist")) { + return ziplistTest(argc, argv); + } else if (!strcasecmp(argv[2], "quicklist")) { + quicklistTest(argc, argv); + } else if (!strcasecmp(argv[2], "intset")) { + return intsetTest(argc, argv); + } else if (!strcasecmp(argv[2], "zipmap")) { + return zipmapTest(argc, argv); + } else if (!strcasecmp(argv[2], "sha1test")) { + return sha1Test(argc, argv); + } else if (!strcasecmp(argv[2], "util")) { + return utilTest(argc, argv); + } else if (!strcasecmp(argv[2], "sds")) { + return sdsTest(argc, argv); + } else if (!strcasecmp(argv[2], "endianconv")) { + return endianconvTest(argc, argv); + } else if (!strcasecmp(argv[2], "crc64")) { + return crc64Test(argc, argv); + } + + return -1; /* test not found */ + } +#endif + /* We need to initialize our libraries, and the server configuration. */ #ifdef INIT_SETPROCTITLE_REPLACEMENT spt_init(argc, argv); @@ -3545,6 +3735,12 @@ int main(int argc, char **argv) { initSentinel(); } + /* Check if we need to start in redis-check-rdb mode. We just execute + * the program main. However the program is part of the Redis executable + * so that we can easily execute an RDB check on loading errors. 
*/ + if (strstr(argv[0],"redis-check-rdb") != NULL) + exit(redis_check_rdb_main(argv,argc)); + if (argc >= 2) { int j = 1; /* First option to parse in argv[] */ sds options = sdsempty(); @@ -3576,6 +3772,11 @@ int main(int argc, char **argv) { while(j != argc) { if (argv[j][0] == '-' && argv[j][1] == '-') { /* Option name */ + if (!strcmp(argv[j], "--check-rdb")) { + /* Argument has no options, need to skip for parsing. */ + j++; + continue; + } if (sdslen(options)) options = sdscat(options,"\n"); options = sdscat(options,argv[j]+2); options = sdscat(options," "); @@ -3600,9 +3801,13 @@ int main(int argc, char **argv) { } else { redisLog(REDIS_WARNING, "Warning: no config file specified, using the default config. In order to specify a config file use %s /path/to/%s.conf", argv[0], server.sentinel_mode ? "sentinel" : "redis"); } - if (server.daemonize) daemonize(); + + server.supervised = redisIsSupervised(server.supervised_mode); + int background = server.daemonize && !server.supervised; + if (background) daemonize(); + initServer(); - if (server.daemonize) createPidFile(); + if (background || server.pidfile) createPidFile(); redisSetProcTitle(argv[0]); redisAsciiArt(); @@ -3612,6 +3817,7 @@ int main(int argc, char **argv) { #ifdef __linux__ linuxMemoryWarnings(); #endif + checkTcpBacklogSettings(); loadDataFromDisk(); if (server.cluster_enabled) { if (verifyClusterConfigWithData() == REDIS_ERR) { diff --git a/src/redis.h b/src/redis.h index 855ae5742..2170c5d29 100644 --- a/src/redis.h +++ b/src/redis.h @@ -32,10 +32,7 @@ #include "fmacros.h" #include "config.h" - -#if defined(__sun) #include "solarisfixes.h" -#endif #include <stdio.h> #include <stdlib.h> @@ -65,6 +62,13 @@ typedef long long mstime_t; /* millisecond time type. 
*/ #include "util.h" /* Misc functions useful in many places */ #include "latency.h" /* Latency monitor API */ #include "sparkline.h" /* ASII graphs API */ +#include "quicklist.h" + +/* Following includes allow test functions to be called from Redis main() */ +#include "zipmap.h" +#include "sha1.h" +#include "endianconv.h" +#include "crc64.h" /* Error codes */ #define REDIS_OK 0 @@ -97,7 +101,6 @@ typedef long long mstime_t; /* millisecond time type. */ #define REDIS_REPL_PING_SLAVE_PERIOD 10 #define REDIS_RUN_ID_SIZE 40 #define REDIS_EOF_MARK_SIZE 40 -#define REDIS_OPS_SEC_SAMPLES 16 #define REDIS_DEFAULT_REPL_BACKLOG_SIZE (1024*1024) /* 1mb */ #define REDIS_DEFAULT_REPL_BACKLOG_TIME_LIMIT (60*60) /* 1 hour */ #define REDIS_REPL_BACKLOG_MIN_SIZE (1024*16) /* 16k */ @@ -128,7 +131,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC 1 #define REDIS_DEFAULT_MIN_SLAVES_TO_WRITE 0 #define REDIS_DEFAULT_MIN_SLAVES_MAX_LAG 10 -#define REDIS_IP_STR_LEN INET6_ADDRSTRLEN +#define REDIS_IP_STR_LEN 46 /* INET6_ADDRSTRLEN is 46, but we need to be sure */ #define REDIS_PEER_ID_LEN (REDIS_IP_STR_LEN+32) /* Must be enough for ip:port */ #define REDIS_BINDADDR_MAX 16 #define REDIS_MIN_RESERVED_FDS 32 @@ -140,6 +143,13 @@ typedef long long mstime_t; /* millisecond time type. */ #define ACTIVE_EXPIRE_CYCLE_SLOW 0 #define ACTIVE_EXPIRE_CYCLE_FAST 1 +/* Instantaneous metrics tracking. */ +#define REDIS_METRIC_SAMPLES 16 /* Number of samples per metric. */ +#define REDIS_METRIC_COMMAND 0 /* Number of commands executed. */ +#define REDIS_METRIC_NET_INPUT 1 /* Bytes read to network .*/ +#define REDIS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ +#define REDIS_METRIC_COUNT 3 + /* Protocol and I/O related defines */ #define REDIS_MAX_QUERYBUF_LEN (1024*1024*1024) /* 1GB max query buffer. 
*/ #define REDIS_IOBUF_LEN (1024*16) /* Generic I/O buffer size */ @@ -192,6 +202,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define REDIS_ENCODING_INTSET 6 /* Encoded as intset */ #define REDIS_ENCODING_SKIPLIST 7 /* Encoded as skiplist */ #define REDIS_ENCODING_EMBSTR 8 /* Embedded sds string encoding */ +#define REDIS_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */ /* Defines related to the dump file format. To store 32 bits lengths for short * keys requires a lot of space, so we check the most significant 2 bits of @@ -302,6 +313,12 @@ typedef long long mstime_t; /* millisecond time type. */ #define REDIS_LOG_RAW (1<<10) /* Modifier to log without timestamp */ #define REDIS_DEFAULT_VERBOSITY REDIS_NOTICE +/* Supervision options */ +#define REDIS_SUPERVISED_NONE 0 +#define REDIS_SUPERVISED_AUTODETECT 1 +#define REDIS_SUPERVISED_SYSTEMD 2 +#define REDIS_SUPERVISED_UPSTART 3 + /* Anti-warning macro... */ #define REDIS_NOTUSED(V) ((void) V) @@ -317,12 +334,14 @@ typedef long long mstime_t; /* millisecond time type. */ /* Zip structure related defaults */ #define REDIS_HASH_MAX_ZIPLIST_ENTRIES 512 #define REDIS_HASH_MAX_ZIPLIST_VALUE 64 -#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 512 -#define REDIS_LIST_MAX_ZIPLIST_VALUE 64 #define REDIS_SET_MAX_INTSET_ENTRIES 512 #define REDIS_ZSET_MAX_ZIPLIST_ENTRIES 128 #define REDIS_ZSET_MAX_ZIPLIST_VALUE 64 +/* List defaults */ +#define REDIS_LIST_MAX_ZIPLIST_SIZE -2 +#define REDIS_LIST_COMPRESS_DEPTH 0 + /* HyperLogLog defines */ #define REDIS_DEFAULT_HLL_SPARSE_MAX_BYTES 3000 @@ -710,12 +729,16 @@ struct redisServer { long long slowlog_log_slower_than; /* SLOWLOG time limit (to get logged) */ unsigned long slowlog_max_len; /* SLOWLOG max number of items logged */ size_t resident_set_size; /* RSS sampled in serverCron(). */ - /* The following two are used to track instantaneous "load" in terms - * of operations per second. 
*/ - long long ops_sec_last_sample_time; /* Timestamp of last sample (in ms) */ - long long ops_sec_last_sample_ops; /* numcommands in last sample */ - long long ops_sec_samples[REDIS_OPS_SEC_SAMPLES]; - int ops_sec_idx; + long long stat_net_input_bytes; /* Bytes read from network. */ + long long stat_net_output_bytes; /* Bytes written to network. */ + /* The following two are used to track instantaneous metrics, like + * number of operations per second, network traffic. */ + struct { + long long last_sample_time; /* Timestamp of last sample in ms */ + long long last_sample_count;/* Count in last sample */ + long long samples[REDIS_METRIC_SAMPLES]; + int idx; + } inst_metric[REDIS_METRIC_COUNT]; /* Configuration */ int verbosity; /* Loglevel in redis.conf */ int maxidletime; /* Client timeout in seconds */ @@ -723,6 +746,8 @@ struct redisServer { int active_expire_enabled; /* Can be disabled for testing purposes. */ size_t client_max_querybuf_len; /* Limit for client query buffer length */ int dbnum; /* Total number of configured DBs */ + int supervised; /* 1 if supervised, 0 otherwise. */ + int supervised_mode; /* See REDIS_SUPERVISED_* */ int daemonize; /* True if running as a daemon */ clientBufferLimitsConfig client_obuf_limits[REDIS_CLIENT_TYPE_COUNT]; /* AOF persistence */ @@ -852,12 +877,14 @@ struct redisServer { /* Zip structure config, see redis.conf for more information */ size_t hash_max_ziplist_entries; size_t hash_max_ziplist_value; - size_t list_max_ziplist_entries; - size_t list_max_ziplist_value; size_t set_max_intset_entries; size_t zset_max_ziplist_entries; size_t zset_max_ziplist_value; size_t hll_sparse_max_bytes; + /* List parameters */ + int list_max_ziplist_size; + int list_compress_depth; + /* time cache */ time_t unixtime; /* Unix time sampled every cron cycle. */ long long mstime; /* Like 'unixtime' but with milliseconds resolution. 
*/ /* Pubsub */ @@ -897,6 +924,8 @@ struct redisServer { int assert_line; int bug_report_start; /* True if bug report header was already logged. */ int watchdog_period; /* Software watchdog period in ms. 0 = off */ + /* System hardware info */ + size_t system_memory_size; /* Total memory in system as reported by OS */ }; typedef struct pubsubPattern { @@ -945,15 +974,13 @@ typedef struct { robj *subject; unsigned char encoding; unsigned char direction; /* Iteration direction */ - unsigned char *zi; - listNode *ln; + quicklistIter *iter; } listTypeIterator; /* Structure for an entry while iterating over a list. */ typedef struct { listTypeIterator *li; - unsigned char *zi; /* Entry in ziplist */ - listNode *ln; /* Entry in linked list */ + quicklistEntry entry; /* Entry in quicklist */ } listTypeEntry; /* Structure to hold set iteration abstraction. */ @@ -1030,6 +1057,7 @@ void addReplyBulkCBuffer(redisClient *c, void *p, size_t len); void addReplyBulkLongLong(redisClient *c, long long ll); void addReply(redisClient *c, robj *obj); void addReplySds(redisClient *c, sds s); +void addReplyBulkSds(redisClient *c, sds s); void addReplyError(redisClient *c, char *err); void addReplyStatus(redisClient *c, char *status); void addReplyDouble(redisClient *c, double d); @@ -1079,7 +1107,7 @@ int listTypeNext(listTypeIterator *li, listTypeEntry *entry); robj *listTypeGet(listTypeEntry *entry); void listTypeInsert(listTypeEntry *entry, robj *value, int where); int listTypeEqual(listTypeEntry *entry, robj *o); -void listTypeDelete(listTypeEntry *entry); +void listTypeDelete(listTypeIterator *iter, listTypeEntry *entry); void listTypeConvert(robj *subject, int enc); void unblockClientWaitingData(redisClient *c); void handleClientsBlockedOnLists(void); @@ -1116,8 +1144,8 @@ robj *tryObjectEncoding(robj *o); robj *getDecodedObject(robj *o); size_t stringObjectLen(robj *o); robj *createStringObjectFromLongLong(long long value); -robj *createStringObjectFromLongDouble(long double 
value); -robj *createListObject(void); +robj *createStringObjectFromLongDouble(long double value, int humanfriendly); +robj *createQuicklistObject(void); robj *createZiplistObject(void); robj *createSetObject(void); robj *createIntsetObject(void); @@ -1244,6 +1272,7 @@ void closeListeningSockets(int unlink_unix_socket); void updateCachedTime(void); void resetServerStats(void); unsigned int getLRUClock(void); +char *maxmemoryToString(void); /* Set data type */ robj *setTypeCreate(robj *value); @@ -1255,6 +1284,7 @@ void setTypeReleaseIterator(setTypeIterator *si); int setTypeNext(setTypeIterator *si, robj **objele, int64_t *llele); robj *setTypeNextObject(setTypeIterator *si); int setTypeRandomElement(robj *setobj, robj **objele, int64_t *llele); +unsigned long setTypeRandomElements(robj *set, unsigned long count, robj *aux_set); unsigned long setTypeSize(robj *subject); void setTypeConvert(robj *subject, int enc); @@ -1350,6 +1380,10 @@ void sentinelTimer(void); char *sentinelHandleConfiguration(char **argv, int argc); void sentinelIsRunning(void); +/* redis-check-rdb */ +int redis_check_rdb(char *rdbfilename); +int redis_check_rdb_main(char **argv, int argc); + /* Scripting */ void scriptingInit(void); diff --git a/src/replication.c b/src/replication.c index c0e833263..7e36c3e99 100644 --- a/src/replication.c +++ b/src/replication.c @@ -56,7 +56,7 @@ char *replicationGetSlaveName(redisClient *c) { buf[0] = '\0'; if (anetPeerToString(c->fd,ip,sizeof(ip),NULL) != -1) { if (c->slave_listening_port) - snprintf(buf,sizeof(buf),"%s:%d",ip,c->slave_listening_port); + anetFormatAddr(buf,sizeof(buf),ip,c->slave_listening_port); else snprintf(buf,sizeof(buf),"%s:<unknown-slave-port>",ip); } else { @@ -690,6 +690,7 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { freeClient(slave); return; } + server.stat_net_output_bytes += nwritten; sdsrange(slave->replpreamble,nwritten,-1); if (sdslen(slave->replpreamble) == 0) { sdsfree(slave->replpreamble); 
@@ -718,6 +719,7 @@ void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { return; } slave->repldboff += nwritten; + server.stat_net_output_bytes += nwritten; if (slave->repldboff == slave->repldbsize) { close(slave->repldbfd); slave->repldbfd = -1; @@ -938,6 +940,7 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { replicationAbortSyncTransfer(); return; } + server.stat_net_input_bytes += nread; /* When a mark is used, we want to detect EOF asap in order to avoid * writing the EOF mark into the file... */ diff --git a/src/scripting.c b/src/scripting.c index 39bfe5fa7..c5dd4e718 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -30,6 +30,7 @@ #include "redis.h" #include "sha1.h" #include "rand.h" +#include "cluster.h" #include <lua.h> #include <lauxlib.h> @@ -213,11 +214,27 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) { static int argv_size = 0; static robj *cached_objects[LUA_CMD_OBJCACHE_SIZE]; static size_t cached_objects_len[LUA_CMD_OBJCACHE_SIZE]; + static int inuse = 0; /* Recursive calls detection. */ + + /* By using Lua debug hooks it is possible to trigger a recursive call + * to luaRedisGenericCommand(), which normally should never happen. + * To make this function reentrant is futile and makes it slower, but + * we should at least detect such a misuse, and abort. */ + if (inuse) { + char *recursion_warning = + "luaRedisGenericCommand() recursive call detected. 
" + "Are you doing funny stuff with Lua debug hooks?"; + redisLog(REDIS_WARNING,"%s",recursion_warning); + luaPushError(lua,recursion_warning); + return 1; + } + inuse++; /* Require at least one argument */ if (argc == 0) { luaPushError(lua, "Please specify at least one argument for redis.call()"); + inuse--; return 1; } @@ -272,6 +289,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) { } luaPushError(lua, "Lua redis() command arguments must be strings or integers"); + inuse--; return 1; } @@ -291,6 +309,7 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) { luaPushError(lua,"Unknown Redis command called from Lua script"); goto cleanup; } + c->cmd = cmd; /* There are commands that are not allowed inside scripts. */ if (cmd->flags & REDIS_CMD_NOSCRIPT) { @@ -337,8 +356,23 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) { if (cmd->flags & REDIS_CMD_RANDOM) server.lua_random_dirty = 1; if (cmd->flags & REDIS_CMD_WRITE) server.lua_write_dirty = 1; + /* If this is a Redis Cluster node, we need to make sure Lua is not + * trying to access non-local keys. */ + if (server.cluster_enabled) { + /* Duplicate relevant flags in the lua client. */ + c->flags &= ~(REDIS_READONLY|REDIS_ASKING); + c->flags |= server.lua_caller->flags & (REDIS_READONLY|REDIS_ASKING); + if (getNodeByQuery(c,c->cmd,c->argv,c->argc,NULL,NULL) != + server.cluster->myself) + { + luaPushError(lua, + "Lua script attempted to access a non local key in a " + "cluster node"); + goto cleanup; + } + } + /* Run the command */ - c->cmd = cmd; call(c,REDIS_CALL_SLOWLOG | REDIS_CALL_STATS); /* Convert the result of the Redis command into a suitable Lua type. @@ -409,8 +443,10 @@ cleanup: * return the plain error. */ lua_pushstring(lua,"err"); lua_gettable(lua,-2); + inuse--; return lua_error(lua); } + inuse--; return 1; } @@ -295,7 +295,7 @@ sds sdscpy(sds s, const char *t) { * conversion. 's' must point to a string with room for at least * SDS_LLSTR_SIZE bytes. 
* - * The function returns the lenght of the null-terminated string + * The function returns the length of the null-terminated string * representation stored at 's'. */ #define SDS_LLSTR_SIZE 21 int sdsll2str(char *s, long long value) { @@ -369,7 +369,7 @@ sds sdsfromlonglong(long long value) { return sdsnewlen(buf,len); } -/* Like sdscatpritf() but gets va_list instead of being variadic. */ +/* Like sdscatprintf() but gets va_list instead of being variadic. */ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { va_list cpy; char staticbuf[1024], *buf = staticbuf, *t; @@ -390,7 +390,7 @@ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { buf[buflen-2] = '\0'; va_copy(cpy,ap); vsnprintf(buf, buflen, fmt, cpy); - va_end(ap); + va_end(cpy); if (buf[buflen-2] != '\0') { if (buf != staticbuf) zfree(buf); buflen *= 2; @@ -415,7 +415,7 @@ sds sdscatvprintf(sds s, const char *fmt, va_list ap) { * * Example: * - * s = sdsempty("Sum is: "); + * s = sdsnew("Sum is: "); * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b). * * Often you need to create a string from scratch with the printf-alike @@ -570,7 +570,7 @@ sds sdstrim(sds s, const char *cset) { sp = start = s; ep = end = s+sdslen(s)-1; while(sp <= end && strchr(cset, *sp)) sp++; - while(ep > start && strchr(cset, *ep)) ep--; + while(ep > sp && strchr(cset, *ep)) ep--; len = (sp > ep) ? 0 : ((ep-sp)+1); if (sh->buf != sp) memmove(sh->buf, sp, len); sh->buf[len] = '\0'; @@ -643,8 +643,8 @@ void sdstoupper(sds s) { * * Return value: * - * 1 if s1 > s2. - * -1 if s1 < s2. + * positive if s1 > s2. + * negative if s1 < s2. * 0 if s1 and s2 are exactly the same binary string. 
* * If two strings share exactly the same prefix, but one of the two has @@ -962,12 +962,15 @@ sds sdsjoin(char **argv, int argc, char *sep) { return join; } -#ifdef SDS_TEST_MAIN +#if defined(REDIS_TEST) || defined(SDS_TEST_MAIN) #include <stdio.h> #include "testhelp.h" #include "limits.h" -int main(void) { +#define UNUSED(x) (void)(x) +int sdsTest(int argc, char *argv[]) { + UNUSED(argc); + UNUSED(argv); { struct sdshdr *sh; sds x = sdsnew("foo"), y; @@ -1014,6 +1017,18 @@ int main(void) { memcmp(x,"--4294967295,18446744073709551615--",35) == 0) sdsfree(x); + x = sdsnew(" x "); + sdstrim(x," x"); + test_cond("sdstrim() works when all chars match", + sdslen(x) == 0) + + sdsfree(x); + x = sdsnew(" x "); + sdstrim(x," "); + test_cond("sdstrim() works when a single char remains", + sdslen(x) == 1 && x[0] == 'x') + + sdsfree(x); x = sdsnew("xxciaoyyy"); sdstrim(x,"xy"); test_cond("sdstrim() correctly trims characters", @@ -1080,7 +1095,7 @@ int main(void) { memcmp(y,"\"\\a\\n\\x00foo\\r\"",15) == 0) { - int oldfree; + unsigned int oldfree; sdsfree(x); x = sdsnew("0"); @@ -1101,3 +1116,9 @@ int main(void) { return 0; } #endif + +#ifdef SDS_TEST_MAIN +int main(void) { + return sdsTest(); +} +#endif @@ -98,4 +98,8 @@ void sdsIncrLen(sds s, int incr); sds sdsRemoveFreeSpace(sds s); size_t sdsAllocSize(sds s); +#ifdef REDIS_TEST +int sdsTest(int argc, char *argv[]); +#endif + #endif diff --git a/src/sentinel.c b/src/sentinel.c index 8e78a2263..c693a5862 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -190,6 +190,7 @@ typedef struct sentinelRedisInstance { * are set to NULL no script is executed. */ char *notification_script; char *client_reconfig_script; + sds info; /* cached INFO output */ } sentinelRedisInstance; /* Main state. */ @@ -576,7 +577,7 @@ void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, if (level == REDIS_WARNING && ri != NULL) { sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? 
ri : ri->master; - if (master->notification_script) { + if (master && master->notification_script) { sentinelScheduleScriptExecution(master->notification_script, type,msg,NULL); } @@ -896,7 +897,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * sentinelRedisInstance *ri; sentinelAddr *addr; dict *table = NULL; - char slavename[128], *sdsname; + char slavename[REDIS_PEER_ID_LEN], *sdsname; redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL)); redisAssert((flags & SRI_MASTER) || master != NULL); @@ -907,9 +908,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* For slaves and sentinel we use ip:port as name. */ if (flags & (SRI_SLAVE|SRI_SENTINEL)) { - snprintf(slavename,sizeof(slavename), - strchr(hostname,':') ? "[%s]:%d" : "%s:%d", - hostname,port); + anetFormatAddr(slavename, sizeof(slavename), hostname, port); name = slavename; } @@ -983,6 +982,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->promoted_slave = NULL; ri->notification_script = NULL; ri->client_reconfig_script = NULL; + ri->info = NULL; /* Role */ ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE); @@ -1015,6 +1015,7 @@ void releaseSentinelRedisInstance(sentinelRedisInstance *ri) { sdsfree(ri->slave_master_host); sdsfree(ri->leader); sdsfree(ri->auth_pass); + sdsfree(ri->info); releaseSentinelAddr(ri->addr); /* Clear state into the master if needed. */ @@ -1030,11 +1031,11 @@ sentinelRedisInstance *sentinelRedisInstanceLookupSlave( { sds key; sentinelRedisInstance *slave; + char buf[REDIS_PEER_ID_LEN]; redisAssert(ri->flags & SRI_MASTER); - key = sdscatprintf(sdsempty(), - strchr(ip,':') ? 
"[%s]:%d" : "%s:%d", - ip,port); + anetFormatAddr(buf,sizeof(buf),ip,port); + key = sdsnew(buf); slave = dictFetchValue(ri->slaves,key); sdsfree(key); return slave; @@ -1785,6 +1786,10 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { int numlines, j; int role = 0; + /* cache full INFO output for instance */ + sdsfree(ri->info); + ri->info = sdsnew(info); + /* The following fields must be reset to a given value in the case they * are not found at all in the INFO output. */ ri->master_link_down_time = 0; @@ -2777,6 +2782,67 @@ void sentinelCommand(redisClient *c) { } else if (!strcasecmp(c->argv[1]->ptr,"set")) { if (c->argc < 3 || c->argc % 2 == 0) goto numargserr; sentinelSetCommand(c); + } else if (!strcasecmp(c->argv[1]->ptr,"info-cache")) { + if (c->argc < 2) goto numargserr; + mstime_t now = mstime(); + + /* Create an ad-hoc dictionary type so that we can iterate + * a dictionary composed of just the master groups the user + * requested. */ + dictType copy_keeper = instancesDictType; + copy_keeper.valDestructor = NULL; + dict *masters_local = sentinel.masters; + if (c->argc > 2) { + masters_local = dictCreate(&copy_keeper, NULL); + + for (int i = 2; i < c->argc; i++) { + sentinelRedisInstance *ri; + ri = sentinelGetMasterByName(c->argv[i]->ptr); + if (!ri) continue; /* ignore non-existing names */ + dictAdd(masters_local, ri->name, ri); + } + } + + /* Reply format: + * 1.) master name + * 2.) 1.) info from master + * 2.) info from replica + * ... + * 3.) other master name + * ... 
+ */ + addReplyMultiBulkLen(c,dictSize(masters_local) * 2); + + dictIterator *di; + dictEntry *de; + di = dictGetIterator(masters_local); + while ((de = dictNext(di)) != NULL) { + sentinelRedisInstance *ri = dictGetVal(de); + addReplyBulkCBuffer(c,ri->name,strlen(ri->name)); + addReplyMultiBulkLen(c,dictSize(ri->slaves) + 1); /* +1 for self */ + addReplyMultiBulkLen(c,2); + addReplyLongLong(c, now - ri->info_refresh); + if (ri->info) + addReplyBulkCBuffer(c,ri->info,sdslen(ri->info)); + else + addReply(c,shared.nullbulk); + + dictIterator *sdi; + dictEntry *sde; + sdi = dictGetIterator(ri->slaves); + while ((sde = dictNext(sdi)) != NULL) { + sentinelRedisInstance *sri = dictGetVal(sde); + addReplyMultiBulkLen(c,2); + addReplyLongLong(c, now - sri->info_refresh); + if (sri->info) + addReplyBulkCBuffer(c,sri->info,sdslen(sri->info)); + else + addReply(c,shared.nullbulk); + } + dictReleaseIterator(sdi); + } + dictReleaseIterator(di); + if (masters_local != sentinel.masters) dictRelease(masters_local); } else { addReplyErrorFormat(c,"Unknown sentinel subcommand '%s'", (char*)c->argv[1]->ptr); @@ -2842,10 +2908,7 @@ void sentinelInfoCommand(redisClient *c) { dictReleaseIterator(di); } - addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n", - (unsigned long)sdslen(info))); - addReplySds(c,info); - addReply(c,shared.crlf); + addReplyBulkSds(c, info); } /* Implements Sentinel verison of the ROLE command. 
The output is diff --git a/src/sha1.c b/src/sha1.c index 59e6f461d..7f73b40d3 100644 --- a/src/sha1.c +++ b/src/sha1.c @@ -24,9 +24,7 @@ A million repetitions of "a" #include <stdio.h> #include <string.h> #include <sys/types.h> /* for u_int*_t */ -#if defined(__sun) #include "solarisfixes.h" -#endif #include "sha1.h" #include "config.h" @@ -199,16 +197,19 @@ void SHA1Final(unsigned char digest[20], SHA1_CTX* context) } /* ================ end of sha1.c ================ */ -#if 0 +#ifdef REDIS_TEST #define BUFSIZE 4096 -int -main(int argc, char **argv) +#define UNUSED(x) (void)(x) +int sha1Test(int argc, char **argv) { SHA1_CTX ctx; unsigned char hash[20], buf[BUFSIZE]; int i; + UNUSED(argc); + UNUSED(argv); + for(i=0;i<BUFSIZE;i++) buf[i] = i; @@ -223,6 +224,4 @@ main(int argc, char **argv) printf("\n"); return 0; } - #endif - diff --git a/src/sha1.h b/src/sha1.h index 9d6f12965..4c76d19da 100644 --- a/src/sha1.h +++ b/src/sha1.h @@ -1,3 +1,5 @@ +#ifndef SHA1_H +#define SHA1_H /* ================ sha1.h ================ */ /* SHA-1 in C @@ -15,3 +17,8 @@ void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]); void SHA1Init(SHA1_CTX* context); void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len); void SHA1Final(unsigned char digest[20], SHA1_CTX* context); + +#ifdef REDIS_TEST +int sha1Test(int argc, char **argv); +#endif +#endif diff --git a/src/solarisfixes.h b/src/solarisfixes.h index 23025257a..3e53ba67c 100644 --- a/src/solarisfixes.h +++ b/src/solarisfixes.h @@ -28,6 +28,8 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#if defined(__sun) + #if defined(__GNUC__) #include <math.h> #undef isnan @@ -48,3 +50,5 @@ #define u_int uint #define u_int32_t uint32_t #endif /* __GNUC__ */ + +#endif /* __sun */ diff --git a/src/sort.c b/src/sort.c index fedf0cf3a..74b27cb67 100644 --- a/src/sort.c +++ b/src/sort.c @@ -220,7 +220,7 @@ void sortCommand(redisClient *c) { if (sortval) incrRefCount(sortval); else - sortval = createListObject(); + sortval = createQuicklistObject(); /* The SORT command has an SQL-alike syntax, parse it */ while(j < c->argc) { @@ -285,16 +285,15 @@ void sortCommand(redisClient *c) { return; } - /* For the STORE option, or when SORT is called from a Lua script, - * we want to force a specific ordering even when no explicit ordering - * was asked (SORT BY nosort). This guarantees that replication / AOF - * is deterministic. + /* When sorting a set with no sort specified, we must sort the output + * so the result is consistent across scripting and replication. * - * However in the case 'dontsort' is true, but the type to sort is a - * sorted set, we don't need to do anything as ordering is guaranteed - * in this special case. */ - if ((storekey || c->flags & REDIS_LUA_CLIENT) && - (dontsort && sortval->type != REDIS_ZSET)) + * The other types (list, sorted set) will retain their native order + * even if no sort order is requested, so they remain stable across + * scripting and replication. 
*/ + if (dontsort && + sortval->type == REDIS_SET && + (storekey || c->flags & REDIS_LUA_CLIENT)) { /* Force ALPHA sorting */ dontsort = 0; @@ -421,6 +420,7 @@ void sortCommand(redisClient *c) { } else { redisPanic("Unknown type"); } + printf("j: %d; vectorlen: %d\n", j, vectorlen); redisAssertWithInfo(c,sortval,j == vectorlen); /* Now it's time to load the right scores in the sorting vector */ @@ -510,7 +510,7 @@ void sortCommand(redisClient *c) { } } } else { - robj *sobj = createZiplistObject(); + robj *sobj = createQuicklistObject(); /* STORE option specified, set the sorting result as a List object */ for (j = start; j <= end; j++) { diff --git a/src/sparkline.c b/src/sparkline.c index 900f26ab7..8e2764aee 100644 --- a/src/sparkline.c +++ b/src/sparkline.c @@ -49,7 +49,7 @@ static int label_margin_top = 1; * sparklineSequenceAddSample(seq, 10, NULL); * sparklineSequenceAddSample(seq, 20, NULL); * sparklineSequenceAddSample(seq, 30, "last sample label"); - * sds output = sparklineRender(seq, 80, 4); + * sds output = sparklineRender(sdsempty(), seq, 80, 4, SPARKLINE_FILL); * freeSparklineSequence(seq); * ------------------------------------------------------------------------- */ @@ -63,6 +63,7 @@ struct sequence *createSparklineSequence(void) { /* Add a new sample into a sequence. */ void sparklineSequenceAddSample(struct sequence *seq, double value, char *label) { + label = (label == NULL || label[0] == '\0') ? 
NULL : zstrdup(label); if (seq->length == 0) { seq->min = seq->max = value; } else { diff --git a/src/t_hash.c b/src/t_hash.c index f5ceb36e9..7f33bba0c 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -565,7 +565,7 @@ void hincrbyfloatCommand(redisClient *c) { } value += incr; - new = createStringObjectFromLongDouble(value); + new = createStringObjectFromLongDouble(value,1); hashTypeTryObjectEncoding(o,&c->argv[2],NULL); hashTypeSet(o,c->argv[2],new); addReplyBulk(c,new); diff --git a/src/t_list.c b/src/t_list.c index 7c79185fd..232cb5c52 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -33,75 +33,37 @@ * List API *----------------------------------------------------------------------------*/ -/* Check the argument length to see if it requires us to convert the ziplist - * to a real list. Only check raw-encoded objects because integer encoded - * objects are never too long. */ -void listTypeTryConversion(robj *subject, robj *value) { - if (subject->encoding != REDIS_ENCODING_ZIPLIST) return; - if (sdsEncodedObject(value) && - sdslen(value->ptr) > server.list_max_ziplist_value) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); -} - /* The function pushes an element to the specified list object 'subject', * at head or tail position as specified by 'where'. * * There is no need for the caller to increment the refcount of 'value' as * the function takes care of it if needed. */ void listTypePush(robj *subject, robj *value, int where) { - /* Check if we need to convert the ziplist */ - listTypeTryConversion(subject,value); - if (subject->encoding == REDIS_ENCODING_ZIPLIST && - ziplistLen(subject->ptr) >= server.list_max_ziplist_entries) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); - - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL; + if (subject->encoding == REDIS_ENCODING_QUICKLIST) { + int pos = (where == REDIS_HEAD) ? 
QUICKLIST_HEAD : QUICKLIST_TAIL; value = getDecodedObject(value); - subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos); + size_t len = sdslen(value->ptr); + quicklistPush(subject->ptr, value->ptr, len, pos); decrRefCount(value); - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - if (where == REDIS_HEAD) { - listAddNodeHead(subject->ptr,value); - } else { - listAddNodeTail(subject->ptr,value); - } - incrRefCount(value); } else { redisPanic("Unknown list encoding"); } } +void *listPopSaver(unsigned char *data, unsigned int sz) { + return createStringObject((char*)data,sz); +} + robj *listTypePop(robj *subject, int where) { + long long vlong; robj *value = NULL; - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p; - unsigned char *vstr; - unsigned int vlen; - long long vlong; - int pos = (where == REDIS_HEAD) ? 0 : -1; - p = ziplistIndex(subject->ptr,pos); - if (ziplistGet(p,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); - } else { + + int ql_where = where == REDIS_HEAD ? 
QUICKLIST_HEAD : QUICKLIST_TAIL; + if (subject->encoding == REDIS_ENCODING_QUICKLIST) { + if (quicklistPopCustom(subject->ptr, ql_where, (unsigned char **)&value, + NULL, &vlong, listPopSaver)) { + if (!value) value = createStringObjectFromLongLong(vlong); - } - /* We only need to delete an element when it exists */ - subject->ptr = ziplistDelete(subject->ptr,&p); - } - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - list *list = subject->ptr; - listNode *ln; - if (where == REDIS_HEAD) { - ln = listFirst(list); - } else { - ln = listLast(list); - } - if (ln != NULL) { - value = listNodeValue(ln); - incrRefCount(value); - listDelNode(list,ln); } } else { redisPanic("Unknown list encoding"); @@ -110,25 +72,28 @@ robj *listTypePop(robj *subject, int where) { } unsigned long listTypeLength(robj *subject) { - if (subject->encoding == REDIS_ENCODING_ZIPLIST) { - return ziplistLen(subject->ptr); - } else if (subject->encoding == REDIS_ENCODING_LINKEDLIST) { - return listLength((list*)subject->ptr); + if (subject->encoding == REDIS_ENCODING_QUICKLIST) { + return quicklistCount(subject->ptr); } else { redisPanic("Unknown list encoding"); } } /* Initialize an iterator at the specified index. */ -listTypeIterator *listTypeInitIterator(robj *subject, long index, unsigned char direction) { +listTypeIterator *listTypeInitIterator(robj *subject, long index, + unsigned char direction) { listTypeIterator *li = zmalloc(sizeof(listTypeIterator)); li->subject = subject; li->encoding = subject->encoding; li->direction = direction; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - li->zi = ziplistIndex(subject->ptr,index); - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - li->ln = listIndex(subject->ptr,index); + li->iter = NULL; + /* REDIS_HEAD means start at TAIL and move *towards* head. + * REDIS_TAIL means start at HEAD and move *towards tail. */ + int iter_direction = + direction == REDIS_HEAD ? 
AL_START_TAIL : AL_START_HEAD; + if (li->encoding == REDIS_ENCODING_QUICKLIST) { + li->iter = quicklistGetIteratorAtIdx(li->subject->ptr, + iter_direction, index); } else { redisPanic("Unknown list encoding"); } @@ -137,6 +102,7 @@ listTypeIterator *listTypeInitIterator(robj *subject, long index, unsigned char /* Clean up the iterator. */ void listTypeReleaseIterator(listTypeIterator *li) { + zfree(li->iter); zfree(li); } @@ -148,24 +114,8 @@ int listTypeNext(listTypeIterator *li, listTypeEntry *entry) { redisAssert(li->subject->encoding == li->encoding); entry->li = li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - entry->zi = li->zi; - if (entry->zi != NULL) { - if (li->direction == REDIS_TAIL) - li->zi = ziplistNext(li->subject->ptr,li->zi); - else - li->zi = ziplistPrev(li->subject->ptr,li->zi); - return 1; - } - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - entry->ln = li->ln; - if (entry->ln != NULL) { - if (li->direction == REDIS_TAIL) - li->ln = li->ln->next; - else - li->ln = li->ln->prev; - return 1; - } + if (li->encoding == REDIS_ENCODING_QUICKLIST) { + return quicklistNext(li->iter, &entry->entry); } else { redisPanic("Unknown list encoding"); } @@ -174,24 +124,14 @@ int listTypeNext(listTypeIterator *li, listTypeEntry *entry) { /* Return entry or NULL at the current position of the iterator. 
*/ robj *listTypeGet(listTypeEntry *entry) { - listTypeIterator *li = entry->li; robj *value = NULL; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *vstr; - unsigned int vlen; - long long vlong; - redisAssert(entry->zi != NULL); - if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); - } else { - value = createStringObjectFromLongLong(vlong); - } + if (entry->li->encoding == REDIS_ENCODING_QUICKLIST) { + if (entry->entry.value) { + value = createStringObject((char *)entry->entry.value, + entry->entry.sz); + } else { + value = createStringObjectFromLongLong(entry->entry.longval); } - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - redisAssert(entry->ln != NULL); - value = listNodeValue(entry->ln); - incrRefCount(value); } else { redisPanic("Unknown list encoding"); } @@ -199,30 +139,18 @@ robj *listTypeGet(listTypeEntry *entry) { } void listTypeInsert(listTypeEntry *entry, robj *value, int where) { - robj *subject = entry->li->subject; - if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) { + if (entry->li->encoding == REDIS_ENCODING_QUICKLIST) { value = getDecodedObject(value); + sds str = value->ptr; + size_t len = sdslen(str); if (where == REDIS_TAIL) { - unsigned char *next = ziplistNext(subject->ptr,entry->zi); - - /* When we insert after the current element, but the current element - * is the tail of the list, we need to do a push. 
*/ - if (next == NULL) { - subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),REDIS_TAIL); - } else { - subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr)); - } - } else { - subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr)); + quicklistInsertAfter((quicklist *)entry->entry.quicklist, + &entry->entry, str, len); + } else if (where == REDIS_HEAD) { + quicklistInsertBefore((quicklist *)entry->entry.quicklist, + &entry->entry, str, len); } decrRefCount(value); - } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) { - if (where == REDIS_TAIL) { - listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL); - } else { - listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD); - } - incrRefCount(value); } else { redisPanic("Unknown list encoding"); } @@ -230,59 +158,33 @@ void listTypeInsert(listTypeEntry *entry, robj *value, int where) { /* Compare the given object with the entry at the current position. */ int listTypeEqual(listTypeEntry *entry, robj *o) { - listTypeIterator *li = entry->li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { + if (entry->li->encoding == REDIS_ENCODING_QUICKLIST) { redisAssertWithInfo(NULL,o,sdsEncodedObject(o)); - return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr)); - } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) { - return equalStringObjects(o,listNodeValue(entry->ln)); + return quicklistCompare(entry->entry.zi,o->ptr,sdslen(o->ptr)); } else { redisPanic("Unknown list encoding"); } } /* Delete the element pointed to. 
*/ -void listTypeDelete(listTypeEntry *entry) { - listTypeIterator *li = entry->li; - if (li->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p = entry->zi; - li->subject->ptr = ziplistDelete(li->subject->ptr,&p); - - /* Update position of the iterator depending on the direction */ - if (li->direction == REDIS_TAIL) - li->zi = p; - else - li->zi = ziplistPrev(li->subject->ptr,p); - } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *next; - if (li->direction == REDIS_TAIL) - next = entry->ln->next; - else - next = entry->ln->prev; - listDelNode(li->subject->ptr,entry->ln); - li->ln = next; +void listTypeDelete(listTypeIterator *iter, listTypeEntry *entry) { + if (entry->li->encoding == REDIS_ENCODING_QUICKLIST) { + quicklistDelEntry(iter->iter, &entry->entry); } else { redisPanic("Unknown list encoding"); } } +/* Create a quicklist from a single ziplist */ void listTypeConvert(robj *subject, int enc) { - listTypeIterator *li; - listTypeEntry entry; - redisAssertWithInfo(NULL,subject,subject->type == REDIS_LIST); - - if (enc == REDIS_ENCODING_LINKEDLIST) { - list *l = listCreate(); - listSetFreeMethod(l,decrRefCountVoid); - - /* listTypeGet returns a robj with incremented refcount */ - li = listTypeInitIterator(subject,0,REDIS_TAIL); - while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry)); - listTypeReleaseIterator(li); - - subject->encoding = REDIS_ENCODING_LINKEDLIST; - zfree(subject->ptr); - subject->ptr = l; + redisAssertWithInfo(NULL,subject,subject->type==REDIS_LIST); + redisAssertWithInfo(NULL,subject,subject->encoding==REDIS_ENCODING_ZIPLIST); + + if (enc == REDIS_ENCODING_QUICKLIST) { + size_t zlen = server.list_max_ziplist_size; + int depth = server.list_compress_depth; + subject->ptr = quicklistCreateFromZiplist(zlen, depth, subject->ptr); + subject->encoding = REDIS_ENCODING_QUICKLIST; } else { redisPanic("Unsupported list conversion"); } @@ -304,7 +206,9 @@ void pushGenericCommand(redisClient *c, int where) 
{ for (j = 2; j < c->argc; j++) { c->argv[j] = tryObjectEncoding(c->argv[j]); if (!lobj) { - lobj = createZiplistObject(); + lobj = createQuicklistObject(); + quicklistSetOptions(lobj->ptr, server.list_max_ziplist_size, + server.list_compress_depth); dbAdd(c->db,c->argv[1],lobj); } listTypePush(lobj,c->argv[j],where); @@ -334,17 +238,10 @@ void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) { listTypeEntry entry; int inserted = 0; - if ((subject = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL || + if ((subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL || checkType(c,subject,REDIS_LIST)) return; if (refval != NULL) { - /* We're not sure if this value can be inserted yet, but we cannot - * convert the list inside the iterator. We don't want to loop over - * the list twice (once to see if the value can be inserted and once - * to do the actual insert), so we assume this value can be inserted - * and convert the ziplist to a regular list if necessary. */ - listTypeTryConversion(subject,val); - /* Seek refval from head to tail */ iter = listTypeInitIterator(subject,0,REDIS_TAIL); while (listTypeNext(iter,&entry)) { @@ -357,10 +254,6 @@ void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) { listTypeReleaseIterator(iter); if (inserted) { - /* Check if the length exceeds the ziplist length threshold. 
*/ - if (subject->encoding == REDIS_ENCODING_ZIPLIST && - ziplistLen(subject->ptr) > server.list_max_ziplist_entries) - listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST); signalModifiedKey(c->db,c->argv[1]); notifyKeyspaceEvent(REDIS_NOTIFY_LIST,"linsert", c->argv[1],c->db->id); @@ -418,31 +311,19 @@ void lindexCommand(redisClient *c) { if ((getLongFromObjectOrReply(c, c->argv[2], &index, NULL) != REDIS_OK)) return; - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p; - unsigned char *vstr; - unsigned int vlen; - long long vlong; - p = ziplistIndex(o->ptr,index); - if (ziplistGet(p,&vstr,&vlen,&vlong)) { - if (vstr) { - value = createStringObject((char*)vstr,vlen); + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + quicklistEntry entry; + if (quicklistIndex(o->ptr, index, &entry)) { + if (entry.value) { + value = createStringObject((char*)entry.value,entry.sz); } else { - value = createStringObjectFromLongLong(vlong); + value = createStringObjectFromLongLong(entry.longval); } addReplyBulk(c,value); decrRefCount(value); } else { addReply(c,shared.nullbulk); } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *ln = listIndex(o->ptr,index); - if (ln != NULL) { - value = listNodeValue(ln); - addReplyBulk(c,value); - } else { - addReply(c,shared.nullbulk); - } } else { redisPanic("Unknown list encoding"); } @@ -452,35 +333,18 @@ void lsetCommand(redisClient *c) { robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr); if (o == NULL || checkType(c,o,REDIS_LIST)) return; long index; - robj *value = (c->argv[3] = tryObjectEncoding(c->argv[3])); + robj *value = c->argv[3]; if ((getLongFromObjectOrReply(c, c->argv[2], &index, NULL) != REDIS_OK)) return; - listTypeTryConversion(o,value); - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p, *zl = o->ptr; - p = ziplistIndex(zl,index); - if (p == NULL) { + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + quicklist *ql = o->ptr; + int replaced = quicklistReplaceAtIndex(ql, 
index, + value->ptr, sdslen(value->ptr)); + if (!replaced) { addReply(c,shared.outofrangeerr); } else { - o->ptr = ziplistDelete(o->ptr,&p); - value = getDecodedObject(value); - o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr)); - decrRefCount(value); - addReply(c,shared.ok); - signalModifiedKey(c->db,c->argv[1]); - notifyKeyspaceEvent(REDIS_NOTIFY_LIST,"lset",c->argv[1],c->db->id); - server.dirty++; - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *ln = listIndex(o->ptr,index); - if (ln == NULL) { - addReply(c,shared.outofrangeerr); - } else { - decrRefCount((robj*)listNodeValue(ln)); - listNodeValue(ln) = value; - incrRefCount(value); addReply(c,shared.ok); signalModifiedKey(c->db,c->argv[1]); notifyKeyspaceEvent(REDIS_NOTIFY_LIST,"lset",c->argv[1],c->db->id); @@ -549,43 +413,28 @@ void lrangeCommand(redisClient *c) { /* Return the result in form of a multi-bulk reply */ addReplyMultiBulkLen(c,rangelen); - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - unsigned char *p = ziplistIndex(o->ptr,start); - unsigned char *vstr; - unsigned int vlen; - long long vlong; + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + listTypeIterator *iter = listTypeInitIterator(o, start, REDIS_TAIL); while(rangelen--) { - ziplistGet(p,&vstr,&vlen,&vlong); - if (vstr) { - addReplyBulkCBuffer(c,vstr,vlen); + listTypeEntry entry; + listTypeNext(iter, &entry); + quicklistEntry *qe = &entry.entry; + if (qe->value) { + addReplyBulkCBuffer(c,qe->value,qe->sz); } else { - addReplyBulkLongLong(c,vlong); + addReplyBulkLongLong(c,qe->longval); } - p = ziplistNext(o->ptr,p); - } - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - listNode *ln; - - /* If we are nearest to the end of the list, reach the element - * starting from tail and going backward, as it is faster. 
*/ - if (start > llen/2) start -= llen; - ln = listIndex(o->ptr,start); - - while(rangelen--) { - addReplyBulk(c,ln->value); - ln = ln->next; } + listTypeReleaseIterator(iter); } else { - redisPanic("List encoding is not LINKEDLIST nor ZIPLIST!"); + redisPanic("List encoding is not QUICKLIST!"); } } void ltrimCommand(redisClient *c) { robj *o; - long start, end, llen, j, ltrim, rtrim; - list *list; - listNode *ln; + long start, end, llen, ltrim, rtrim; if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) || (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return; @@ -612,19 +461,9 @@ void ltrimCommand(redisClient *c) { } /* Remove list elements to perform the trim */ - if (o->encoding == REDIS_ENCODING_ZIPLIST) { - o->ptr = ziplistDeleteRange(o->ptr,0,ltrim); - o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim); - } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) { - list = o->ptr; - for (j = 0; j < ltrim; j++) { - ln = listFirst(list); - listDelNode(list,ln); - } - for (j = 0; j < rtrim; j++) { - ln = listLast(list); - listDelNode(list,ln); - } + if (o->encoding == REDIS_ENCODING_QUICKLIST) { + quicklistDelRange(o->ptr,0,ltrim); + quicklistDelRange(o->ptr,-rtrim,rtrim); } else { redisPanic("Unknown list encoding"); } @@ -641,10 +480,9 @@ void ltrimCommand(redisClient *c) { void lremCommand(redisClient *c) { robj *subject, *obj; - obj = c->argv[3] = tryObjectEncoding(c->argv[3]); + obj = c->argv[3]; long toremove; long removed = 0; - listTypeEntry entry; if ((getLongFromObjectOrReply(c, c->argv[2], &toremove, NULL) != REDIS_OK)) return; @@ -652,10 +490,6 @@ void lremCommand(redisClient *c) { subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero); if (subject == NULL || checkType(c,subject,REDIS_LIST)) return; - /* Make sure obj is raw when we're dealing with a ziplist */ - if (subject->encoding == REDIS_ENCODING_ZIPLIST) - obj = getDecodedObject(obj); - listTypeIterator *li; if (toremove < 0) { toremove = -toremove; @@ 
-664,9 +498,10 @@ void lremCommand(redisClient *c) { li = listTypeInitIterator(subject,0,REDIS_TAIL); } + listTypeEntry entry; while (listTypeNext(li,&entry)) { if (listTypeEqual(&entry,obj)) { - listTypeDelete(&entry); + listTypeDelete(li, &entry); server.dirty++; removed++; if (toremove && removed == toremove) break; @@ -674,11 +509,10 @@ void lremCommand(redisClient *c) { } listTypeReleaseIterator(li); - /* Clean up raw encoded object */ - if (subject->encoding == REDIS_ENCODING_ZIPLIST) - decrRefCount(obj); + if (listTypeLength(subject) == 0) { + dbDelete(c->db,c->argv[1]); + } - if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]); addReplyLongLong(c,removed); if (removed) signalModifiedKey(c->db,c->argv[1]); } @@ -702,7 +536,9 @@ void lremCommand(redisClient *c) { void rpoplpushHandlePush(redisClient *c, robj *dstkey, robj *dstobj, robj *value) { /* Create the list if the key does not exist */ if (!dstobj) { - dstobj = createZiplistObject(); + dstobj = createQuicklistObject(); + quicklistSetOptions(dstobj->ptr, server.list_max_ziplist_size, + server.list_compress_depth); dbAdd(c->db,dstkey,dstobj); } signalModifiedKey(c->db,dstkey); @@ -1010,7 +846,9 @@ void handleClientsBlockedOnLists(void) { } } - if (listTypeLength(o) == 0) dbDelete(rl->db,rl->key); + if (listTypeLength(o) == 0) { + dbDelete(rl->db,rl->key); + } /* We don't call signalModifiedKey() as it was already called * when an element was pushed on the list. */ } diff --git a/src/t_set.c b/src/t_set.c index c530d6923..f3f8bbaca 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -33,7 +33,8 @@ * Set Commands *----------------------------------------------------------------------------*/ -void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op); +void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, + robj *dstkey, int op); /* Factory method to return a set that *can* hold "value". 
When the object has * an integer-encodable value, an intset will be returned. Otherwise a regular @@ -68,7 +69,8 @@ int setTypeAdd(robj *subject, robj *value) { /* The set *was* an intset and this value is not integer * encodable, so dictAdd should always work. */ - redisAssertWithInfo(NULL,value,dictAdd(subject->ptr,value,NULL) == DICT_OK); + redisAssertWithInfo(NULL,value, + dictAdd(subject->ptr,value,NULL) == DICT_OK); incrRefCount(value); return 1; } @@ -205,6 +207,106 @@ int setTypeRandomElement(robj *setobj, robj **objele, int64_t *llele) { return setobj->encoding; } +/* Return a number of random elements from a non empty set. + * + * This is a version of setTypeRandomElement() that is modified in order to + * return multiple entries, using dictGetRandomKeys() and intsetRandomMembers(). + * + * The elements are stored into 'aux_set' which should be of a set type. + * + * The function returns the number of items stored into 'aux_set', that may + * be less than 'count' if the hash table has less than 'count' elements + * inside. + * + * Note that this function is not suitable when you need a good distribution + * of the returned items, but only when you need to "sample" a given number + * of continuous elements to run some kind of algorithm or to produce + * statistics. However the function is much faster than setTypeRandomElement() + * at producing N elements, and the elements are guaranteed to be non + * repeating. + */ +unsigned long setTypeRandomElements(robj *set, unsigned long count, + robj *aux_set) { + unsigned long set_size; + unsigned long elements_to_return = count; + unsigned long elements_copied = 0; + unsigned long current_element = 0; + + /* Like all setType* functions, we assume good behavior on part of the + * caller, so no extra parameter checks are made. */ + + /* If the number of elements in the the set is less than the count + * requested, just return all of them. 
*/ + set_size = setTypeSize(set); + if (set_size < count) { + elements_to_return = set_size; + } + + /* TODO: It is definitely faster adding items to the set by directly + * handling the Dict or intset inside it, avoiding the constant encoding + * checks inside setTypeAdd(). However, We don't want to touch the set + * internals in non setType* functions. So, we just call setTypeAdd() + * multiple times, but this isn't an optimal solution. + * Another option would be to create a bulk-add function: + * setTypeAddBulk(). */ + if (set->encoding == REDIS_ENCODING_HT) { + /* Allocate result array */ + dictEntry **random_elements = + zmalloc(sizeof(dictEntry*) * elements_to_return); + + /* Get the random elements */ + elements_copied = + dictGetRandomKeys(set->ptr, random_elements, elements_to_return); + redisAssert(elements_copied == elements_to_return); + + /* Put them into the set */ + for (current_element = 0; current_element < elements_copied; + current_element++) { + + /* We get the key and duplicate it, as we know it is a string */ + setTypeAdd(aux_set, + dictGetKey(random_elements[current_element])); + } + + zfree(random_elements); + + } else if (set->encoding == REDIS_ENCODING_INTSET) { + /* Allocate result array */ + int64_t *random_elements = + zmalloc(sizeof(int64_t) * elements_to_return); + robj* element_as_str = NULL; + + elements_copied = + intsetRandomMembers((intset*) set->ptr, + random_elements, + elements_to_return); + + redisAssert(elements_copied == elements_to_return); + + /* Put them into the set */ + for (current_element = 0; current_element < elements_copied; + current_element++) { + + element_as_str = createStringObjectFromLongLong( + random_elements[current_element]); + + /* Put the values in the set */ + setTypeAdd(aux_set, + element_as_str); + + decrRefCount(element_as_str); + } + + zfree(random_elements); + } else { + redisPanic("Unknown set encoding"); + } + + /* We have a set with random elements. 
Return the actual elements in + the aux_set. */ + return elements_copied; +} + unsigned long setTypeSize(robj *subject) { if (subject->encoding == REDIS_ENCODING_HT) { return dictSize((dict*)subject->ptr); @@ -235,7 +337,8 @@ void setTypeConvert(robj *setobj, int enc) { si = setTypeInitIterator(setobj); while (setTypeNext(si,NULL,&intele) != -1) { element = createStringObjectFromLongLong(intele); - redisAssertWithInfo(NULL,element,dictAdd(d,element,NULL) == DICT_OK); + redisAssertWithInfo(NULL,element, + dictAdd(d,element,NULL) == DICT_OK); } setTypeReleaseIterator(si); @@ -377,15 +480,147 @@ void scardCommand(redisClient *c) { addReplyLongLong(c,setTypeSize(o)); } +/* handle the "SPOP key <count>" variant. The normal version of the + * command is handled by the spopCommand() function itself. */ + +void spopWithCountCommand(redisClient *c) { + long l; + unsigned long count, size; + unsigned long elements_returned; + robj *set, *aux, *aux_set; + int64_t llele; + + /* Get the count argument */ + if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != REDIS_OK) return; + if (l >= 0) { + count = (unsigned) l; + } else { + addReply(c,shared.outofrangeerr); + return; + } + + /* Make sure a key with the name inputted exists, and that it's type is + * indeed a set. Otherwise, return nil */ + if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) + == NULL || checkType(c,set,REDIS_SET)) return; + + /* If count is zero, serve an empty multibulk ASAP to avoid special + * cases later. */ + if (count == 0) { + addReply(c,shared.emptymultibulk); + return; + } + + /* Get the size of the set. It is always > 0, as empty sets get + * deleted. */ + size = setTypeSize(set); + + /* Generate an SPOP keyspace notification */ + notifyKeyspaceEvent(REDIS_NOTIFY_SET,"spop",c->argv[1],c->db->id); + + /* CASE 1: + * The number of requested elements is greater than or equal to + * the number of elements inside the set: simply return the whole set. 
*/ + if (count >= size) { + + /* We just return the entire set */ + sunionDiffGenericCommand(c,c->argv+1,1,NULL,REDIS_OP_UNION); + + /* Delete the set as it is now empty */ + dbDelete(c->db,c->argv[1]); + notifyKeyspaceEvent(REDIS_NOTIFY_GENERIC,"del",c->argv[1],c->db->id); + + /* Replicate/AOF this command as an SREM operation */ + aux = createStringObject("DEL",3); + rewriteClientCommandVector(c,2,aux,c->argv[1]); + decrRefCount(aux); + + return; + } + + /* CASE 2: + * The number of requested elements is less than the number + * of elements inside the set. */ + + /* We need an auxiliary set. Optimistically, we create a set using an + * Intset internally. */ + aux = createStringObjectFromLongLong(0); + aux_set = setTypeCreate(aux); + decrRefCount(aux); + + /* Get the count requested of random elements from the set into our + * auxiliary set. */ + elements_returned = setTypeRandomElements(set, count, aux_set); + redisAssert(elements_returned == count); + + { + setTypeIterator *si; + robj *objele; + int element_encoding; + + addReplyMultiBulkLen(c, elements_returned); + + /* Replicate/AOF this command as an SREM operation */ + aux = createStringObject("SREM",4); + + si = setTypeInitIterator(aux_set); + while ((element_encoding = setTypeNext(si, &objele, &llele)) != -1) { + if (element_encoding == REDIS_ENCODING_HT) { + + addReplyBulk(c, objele); + + /* Replicate/AOF this command as an SREM commands */ + rewriteClientCommandVector(c, 3, aux, c->argv[1], objele); + setTypeRemove(set, objele); + } + else if (element_encoding == REDIS_ENCODING_INTSET) { + /* TODO: setTypeRemove() forces us to convert all of the ints + * to string... isn't there a nicer way to do this? */ + objele = createStringObjectFromLongLong(llele); + addReplyBulk(c, objele); + + /* Replicate/AOF this command as an SREM commands */ + rewriteClientCommandVector(c, 3, aux, c->argv[1], objele); + setTypeRemove(set, objele); + + /* We created it, we kill it. 
*/ + decrRefCount(objele); + } + else { + redisPanic("Unknown set encoding"); + } + } + setTypeReleaseIterator(si); + + decrRefCount(aux); + } + + /* Free the auxiliary set - we need it no more. */ + decrRefCount(aux_set); +} + void spopCommand(redisClient *c) { robj *set, *ele, *aux; int64_t llele; int encoding; + if (c->argc == 3) { + spopWithCountCommand(c); + return; + } else if (c->argc > 3) { + addReply(c,shared.syntaxerr); + return; + } + + /* Make sure a key with the name inputted exists, and that it's type is + * indeed a set */ if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL || checkType(c,set,REDIS_SET)) return; + /* Get a random element from the set */ encoding = setTypeRandomElement(set,&ele,&llele); + + /* Remove the element from the set */ if (encoding == REDIS_ENCODING_INTSET) { ele = createStringObjectFromLongLong(llele); set->ptr = intsetRemove(set->ptr,llele,NULL); @@ -393,6 +628,7 @@ void spopCommand(redisClient *c) { incrRefCount(ele); setTypeRemove(set,ele); } + notifyKeyspaceEvent(REDIS_NOTIFY_SET,"spop",c->argv[1],c->db->id); /* Replicate/AOF this command as an SREM operation */ @@ -401,11 +637,16 @@ void spopCommand(redisClient *c) { decrRefCount(ele); decrRefCount(aux); + /* Add the element to the reply */ addReplyBulk(c,ele); + + /* Delete the set if it's empty */ if (setTypeSize(set) == 0) { dbDelete(c->db,c->argv[1]); notifyKeyspaceEvent(REDIS_NOTIFY_GENERIC,"del",c->argv[1],c->db->id); } + + /* Set has been modified */ signalModifiedKey(c->db,c->argv[1]); server.dirty++; } @@ -587,7 +828,8 @@ int qsortCompareSetsByRevCardinality(const void *s1, const void *s2) { return (o2 ? setTypeSize(o2) : 0) - (o1 ? 
setTypeSize(o1) : 0); } -void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) { +void sinterGenericCommand(redisClient *c, robj **setkeys, + unsigned long setnum, robj *dstkey) { robj **sets = zmalloc(sizeof(robj*)*setnum); setTypeIterator *si; robj *eleobj, *dstset = NULL; @@ -734,7 +976,8 @@ void sinterstoreCommand(redisClient *c) { #define REDIS_OP_DIFF 1 #define REDIS_OP_INTER 2 -void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) { +void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, + robj *dstkey, int op) { robj **sets = zmalloc(sizeof(robj*)*setnum); setTypeIterator *si; robj *ele, *dstset = NULL; diff --git a/src/t_string.c b/src/t_string.c index 067aa10e3..34ab11b51 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -405,7 +405,7 @@ void incrbyfloatCommand(redisClient *c) { addReplyError(c,"increment would produce NaN or Infinity"); return; } - new = createStringObjectFromLongDouble(value); + new = createStringObjectFromLongDouble(value,1); if (o) dbOverwrite(c->db,c->argv[1],new); else diff --git a/src/t_zset.c b/src/t_zset.c index d3c7214bd..64418c9b4 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1382,7 +1382,7 @@ void zremrangeGenericCommand(redisClient *c, int rangetype) { robj *key = c->argv[1]; robj *zobj; int keyremoved = 0; - unsigned long deleted; + unsigned long deleted = 0; zrangespec range; zlexrangespec lexrange; long start, end, llen; diff --git a/src/util.c b/src/util.c index 80242ff71..543de112b 100644 --- a/src/util.c +++ b/src/util.c @@ -40,6 +40,7 @@ #include <stdint.h> #include "util.h" +#include "sha1.h" /* Glob-style pattern matching. */ int stringmatchlen(const char *pattern, int patternLen, @@ -428,11 +429,44 @@ int d2string(char *buf, size_t len, double value) { * having run_id == A, and you reconnect and it has run_id == B, you can be * sure that it is either a different instance or it was restarted. 
*/ void getRandomHexChars(char *p, unsigned int len) { - FILE *fp = fopen("/dev/urandom","r"); char *charset = "0123456789abcdef"; unsigned int j; - if (fp == NULL || fread(p,len,1,fp) == 0) { + /* Global state. */ + static int seed_initialized = 0; + static unsigned char seed[20]; /* The SHA1 seed, from /dev/urandom. */ + static uint64_t counter = 0; /* The counter we hash with the seed. */ + + if (!seed_initialized) { + /* Initialize a seed and use SHA1 in counter mode, where we hash + * the same seed with a progressive counter. For the goals of this + * function we just need non-colliding strings, there are no + * cryptographic security needs. */ + FILE *fp = fopen("/dev/urandom","r"); + if (fp && fread(seed,sizeof(seed),1,fp) == 1) + seed_initialized = 1; + if (fp) fclose(fp); + } + + if (seed_initialized) { + while(len) { + unsigned char digest[20]; + SHA1_CTX ctx; + unsigned int copylen = len > 20 ? 20 : len; + + SHA1Init(&ctx); + SHA1Update(&ctx, seed, sizeof(seed)); + SHA1Update(&ctx, (unsigned char*)&counter,sizeof(counter)); + SHA1Final(digest, &ctx); + counter++; + + memcpy(p,digest,copylen); + /* Convert to hex digits. */ + for (j = 0; j < copylen; j++) p[j] = charset[p[j] & 0x0F]; + len -= copylen; + p += copylen; + } + } else { /* If we can't read from /dev/urandom, do some reasonable effort * in order to create some entropy, since this function is used to * generate run_id and cluster instance IDs */ @@ -459,14 +493,12 @@ void getRandomHexChars(char *p, unsigned int len) { x += sizeof(pid); } /* Finally xor it with rand() output, that was already seeded with - * time() at startup. */ - for (j = 0; j < len; j++) + * time() at startup, and convert to hex digits. */ + for (j = 0; j < len; j++) { p[j] ^= rand(); + p[j] = charset[p[j] & 0x0F]; + } } - /* Turn it into hex digits taking just 4 bits out of 8 for every byte. 
*/ - for (j = 0; j < len; j++) - p[j] = charset[p[j] & 0x0F]; - if (fp) fclose(fp); } /* Given the filename, return the absolute path as an SDS string, or NULL @@ -529,10 +561,10 @@ int pathIsBaseName(char *path) { return strchr(path,'/') == NULL && strchr(path,'\\') == NULL; } -#ifdef UTIL_TEST_MAIN +#ifdef REDIS_TEST #include <assert.h> -void test_string2ll(void) { +static void test_string2ll(void) { char buf[32]; long long v; @@ -587,7 +619,7 @@ void test_string2ll(void) { assert(string2ll(buf,strlen(buf),&v) == 0); } -void test_string2l(void) { +static void test_string2l(void) { char buf[32]; long v; @@ -636,9 +668,55 @@ void test_string2l(void) { #endif } -int main(int argc, char **argv) { +static void test_ll2string(void) { + char buf[32]; + long long v; + int sz; + + v = 0; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 1); + assert(!strcmp(buf, "0")); + + v = -1; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 2); + assert(!strcmp(buf, "-1")); + + v = 99; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 2); + assert(!strcmp(buf, "99")); + + v = -99; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 3); + assert(!strcmp(buf, "-99")); + + v = -2147483648; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 11); + assert(!strcmp(buf, "-2147483648")); + + v = LLONG_MIN; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 20); + assert(!strcmp(buf, "-9223372036854775808")); + + v = LLONG_MAX; + sz = ll2string(buf, sizeof buf, v); + assert(sz == 19); + assert(!strcmp(buf, "9223372036854775807")); +} + +#define UNUSED(x) (void)(x) +int utilTest(int argc, char **argv) { + UNUSED(argc); + UNUSED(argv); + test_string2ll(); test_string2l(); + test_ll2string(); return 0; } #endif diff --git a/src/util.h b/src/util.h index b3667cd6f..666042c9b 100644 --- a/src/util.h +++ b/src/util.h @@ -42,4 +42,8 @@ int d2string(char *buf, size_t len, double value); sds getAbsolutePath(char *filename); int pathIsBaseName(char *path); +#ifdef REDIS_TEST +int 
utilTest(int argc, char **argv); +#endif + #endif diff --git a/src/ziplist.c b/src/ziplist.c index 64a22adfc..7428d30e9 100644 --- a/src/ziplist.c +++ b/src/ziplist.c @@ -143,6 +143,7 @@ #define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t)))) #define ZIPLIST_LENGTH(zl) (*((uint16_t*)((zl)+sizeof(uint32_t)*2))) #define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t)) +#define ZIPLIST_END_SIZE (sizeof(uint8_t)) #define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE) #define ZIPLIST_ENTRY_TAIL(zl) ((zl)+intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))) #define ZIPLIST_ENTRY_END(zl) ((zl)+intrev32ifbe(ZIPLIST_BYTES(zl))-1) @@ -162,6 +163,13 @@ typedef struct zlentry { unsigned char *p; } zlentry; +#define ZIPLIST_ENTRY_ZERO(zle) { \ + (zle)->prevrawlensize = (zle)->prevrawlen = 0; \ + (zle)->lensize = (zle)->len = (zle)->headersize = 0; \ + (zle)->encoding = 0; \ + (zle)->p = NULL; \ +} + /* Extract the encoding from the byte pointed by 'ptr' and set it into * 'encoding'. */ #define ZIP_ENTRY_ENCODING(ptr, encoding) do { \ @@ -169,6 +177,8 @@ typedef struct zlentry { if ((encoding) < ZIP_STR_MASK) (encoding) &= ZIP_STR_MASK; \ } while(0) +void ziplistRepr(unsigned char *zl); + /* Return bytes needed to store integer encoded by 'encoding' */ static unsigned int zipIntSize(unsigned char encoding) { switch(encoding) { @@ -404,14 +414,12 @@ static int64_t zipLoadInteger(unsigned char *p, unsigned char encoding) { } /* Return a struct with all information about an entry. 
*/ -static zlentry zipEntry(unsigned char *p) { - zlentry e; - - ZIP_DECODE_PREVLEN(p, e.prevrawlensize, e.prevrawlen); - ZIP_DECODE_LENGTH(p + e.prevrawlensize, e.encoding, e.lensize, e.len); - e.headersize = e.prevrawlensize + e.lensize; - e.p = p; - return e; +static void zipEntry(unsigned char *p, zlentry *e) { + + ZIP_DECODE_PREVLEN(p, e->prevrawlensize, e->prevrawlen); + ZIP_DECODE_LENGTH(p + e->prevrawlensize, e->encoding, e->lensize, e->len); + e->headersize = e->prevrawlensize + e->lensize; + e->p = p; } /* Create a new empty ziplist. */ @@ -460,13 +468,13 @@ static unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p zlentry cur, next; while (p[0] != ZIP_END) { - cur = zipEntry(p); + zipEntry(p, &cur); rawlen = cur.headersize + cur.len; rawlensize = zipPrevEncodeLength(NULL,rawlen); /* Abort if there is no next entry. */ if (p[rawlen] == ZIP_END) break; - next = zipEntry(p+rawlen); + zipEntry(p+rawlen, &next); /* Abort when "prevlen" has not changed. */ if (next.prevrawlen == rawlen) break; @@ -521,7 +529,7 @@ static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsig int nextdiff = 0; zlentry first, tail; - first = zipEntry(p); + zipEntry(p, &first); for (i = 0; p[0] != ZIP_END && i < num; i++) { p += zipRawEntryLength(p); deleted++; @@ -545,7 +553,7 @@ static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsig /* When the tail contains more than one entry, we need to take * "nextdiff" in account as well. Otherwise, a change in the * size of prevlen doesn't have an effect on the *tail* offset. */ - tail = zipEntry(p); + zipEntry(p, &tail); if (p[tail.headersize+tail.len] != ZIP_END) { ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff); @@ -635,7 +643,7 @@ static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsig /* When the tail contains more than one entry, we need to take * "nextdiff" in account as well. 
Otherwise, a change in the * size of prevlen doesn't have an effect on the *tail* offset. */ - tail = zipEntry(p+reqlen); + zipEntry(p+reqlen, &tail); if (p[reqlen+tail.headersize+tail.len] != ZIP_END) { ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff); @@ -665,6 +673,121 @@ static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsig return zl; } +/* Merge ziplists 'first' and 'second' by appending 'second' to 'first'. + * + * NOTE: The larger ziplist is reallocated to contain the new merged ziplist. + * Either 'first' or 'second' can be used for the result. The parameter not + * used will be free'd and set to NULL. + * + * After calling this function, the input parameters are no longer valid since + * they are changed and free'd in-place. + * + * The result ziplist is the contents of 'first' followed by 'second'. + * + * On failure: returns NULL if the merge is impossible. + * On success: returns the merged ziplist (which is expanded version of either + * 'first' or 'second', also frees the other unused input ziplist, and sets the + * input ziplist argument equal to newly reallocated ziplist return value. */ +unsigned char *ziplistMerge(unsigned char **first, unsigned char **second) { + /* If any params are null, we can't merge, so NULL. */ + if (first == NULL || *first == NULL || second == NULL || *second == NULL) + return NULL; + + /* Can't merge same list into itself. */ + if (*first == *second) + return NULL; + + size_t first_bytes = intrev32ifbe(ZIPLIST_BYTES(*first)); + size_t first_len = intrev16ifbe(ZIPLIST_LENGTH(*first)); + + size_t second_bytes = intrev32ifbe(ZIPLIST_BYTES(*second)); + size_t second_len = intrev16ifbe(ZIPLIST_LENGTH(*second)); + + int append; + unsigned char *source, *target; + size_t target_bytes, source_bytes; + /* Pick the largest ziplist so we can resize easily in-place. + * We must also track if we are now appending or prepending to + * the target ziplist. 
*/ + if (first_len >= second_len) { + /* retain first, append second to first. */ + target = *first; + target_bytes = first_bytes; + source = *second; + source_bytes = second_bytes; + append = 1; + } else { + /* else, retain second, prepend first to second. */ + target = *second; + target_bytes = second_bytes; + source = *first; + source_bytes = first_bytes; + append = 0; + } + + /* Calculate final bytes (subtract one pair of metadata) */ + size_t zlbytes = first_bytes + second_bytes - + ZIPLIST_HEADER_SIZE - ZIPLIST_END_SIZE; + size_t zllength = first_len + second_len; + + /* Combined zl length should be limited within UINT16_MAX */ + zllength = zllength < UINT16_MAX ? zllength : UINT16_MAX; + + /* Save offset positions before we start ripping memory apart. */ + size_t first_offset = intrev32ifbe(ZIPLIST_TAIL_OFFSET(*first)); + size_t second_offset = intrev32ifbe(ZIPLIST_TAIL_OFFSET(*second)); + + /* Extend target to new zlbytes then append or prepend source. */ + target = zrealloc(target, zlbytes); + if (append) { + /* append == appending to target */ + /* Copy source after target (copying over original [END]): + * [TARGET - END, SOURCE - HEADER] */ + memcpy(target + target_bytes - ZIPLIST_END_SIZE, + source + ZIPLIST_HEADER_SIZE, + source_bytes - ZIPLIST_HEADER_SIZE); + } else { + /* !append == prepending to target */ + /* Move target *contents* exactly size of (source - [END]), + * then copy source into vacataed space (source - [END]): + * [SOURCE - END, TARGET - HEADER] */ + memmove(target + source_bytes - ZIPLIST_END_SIZE, + target + ZIPLIST_HEADER_SIZE, + target_bytes - ZIPLIST_HEADER_SIZE); + memcpy(target, source, source_bytes - ZIPLIST_END_SIZE); + } + + /* Update header metadata. 
*/ + ZIPLIST_BYTES(target) = intrev32ifbe(zlbytes); + ZIPLIST_LENGTH(target) = intrev16ifbe(zllength); + /* New tail offset is: + * + N bytes of first ziplist + * - 1 byte for [END] of first ziplist + * + M bytes for the offset of the original tail of the second ziplist + * - J bytes for HEADER because second_offset keeps no header. */ + ZIPLIST_TAIL_OFFSET(target) = intrev32ifbe( + (first_bytes - ZIPLIST_END_SIZE) + + (second_offset - ZIPLIST_HEADER_SIZE)); + + /* __ziplistCascadeUpdate just fixes the prev length values until it finds a + * correct prev length value (then it assumes the rest of the list is okay). + * We tell CascadeUpdate to start at the first ziplist's tail element to fix + * the merge seam. */ + target = __ziplistCascadeUpdate(target, target+first_offset); + + /* Now free and NULL out what we didn't realloc */ + if (append) { + zfree(*second); + *second = NULL; + *first = target; + } else { + zfree(*first); + *first = NULL; + *second = target; + } + return target; +} + unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) { unsigned char *p; p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl); @@ -748,7 +871,7 @@ unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *sl if (p == NULL || p[0] == ZIP_END) return 0; if (sstr) *sstr = NULL; - entry = zipEntry(p); + zipEntry(p, &entry); if (ZIP_IS_STR(entry.encoding)) { if (sstr) { *slen = entry.len; @@ -783,7 +906,7 @@ unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) { } /* Delete a range of entries from the ziplist. */ -unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num) { +unsigned char *ziplistDeleteRange(unsigned char *zl, int index, unsigned int num) { unsigned char *p = ziplistIndex(zl,index); return (p == NULL) ? 
zl : __ziplistDelete(zl,p,num); } @@ -796,7 +919,7 @@ unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int long long zval, sval; if (p[0] == ZIP_END) return 0; - entry = zipEntry(p); + zipEntry(p, &entry); if (ZIP_IS_STR(entry.encoding)) { /* Raw compare */ if (entry.len == slen) { @@ -913,7 +1036,7 @@ void ziplistRepr(unsigned char *zl) { intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))); p = ZIPLIST_ENTRY_HEAD(zl); while(*p != ZIP_END) { - entry = zipEntry(p); + zipEntry(p, &entry); printf( "{" "addr 0x%08lx, " @@ -952,14 +1075,14 @@ void ziplistRepr(unsigned char *zl) { printf("{end}\n\n"); } -#ifdef ZIPLIST_TEST_MAIN +#ifdef REDIS_TEST #include <sys/time.h> #include "adlist.h" #include "sds.h" #define debug(f, ...) { if (DEBUG) printf(f, __VA_ARGS__); } -unsigned char *createList() { +static unsigned char *createList() { unsigned char *zl = ziplistNew(); zl = ziplistPush(zl, (unsigned char*)"foo", 3, ZIPLIST_TAIL); zl = ziplistPush(zl, (unsigned char*)"quux", 4, ZIPLIST_TAIL); @@ -968,7 +1091,7 @@ unsigned char *createList() { return zl; } -unsigned char *createIntList() { +static unsigned char *createIntList() { unsigned char *zl = ziplistNew(); char buf[32]; @@ -987,13 +1110,13 @@ unsigned char *createIntList() { return zl; } -long long usec(void) { +static long long usec(void) { struct timeval tv; gettimeofday(&tv,NULL); return (((long long)tv.tv_sec)*1000000)+tv.tv_usec; } -void stress(int pos, int num, int maxsize, int dnum) { +static void stress(int pos, int num, int maxsize, int dnum) { int i,j,k; unsigned char *zl; char posstr[2][5] = { "HEAD", "TAIL" }; @@ -1016,7 +1139,7 @@ void stress(int pos, int num, int maxsize, int dnum) { } } -void pop(unsigned char *zl, int where) { +static unsigned char *pop(unsigned char *zl, int where) { unsigned char *p, *vstr; unsigned int vlen; long long vlong; @@ -1028,20 +1151,22 @@ void pop(unsigned char *zl, int where) { else printf("Pop tail: "); - if (vstr) + if (vstr) { if (vlen && 
fwrite(vstr,vlen,1,stdout) == 0) perror("fwrite"); - else + } + else { printf("%lld", vlong); + } printf("\n"); - ziplistDeleteRange(zl,-1,1); + return ziplistDelete(zl,&p); } else { printf("ERROR: Could not pop\n"); exit(1); } } -int randstring(char *target, unsigned int min, unsigned int max) { +static int randstring(char *target, unsigned int min, unsigned int max) { int p = 0; int len = min+rand()%(max-min+1); int minval, maxval; @@ -1067,23 +1192,24 @@ int randstring(char *target, unsigned int min, unsigned int max) { return len; } -void verify(unsigned char *zl, zlentry *e) { - int i; +static void verify(unsigned char *zl, zlentry *e) { int len = ziplistLen(zl); zlentry _e; - for (i = 0; i < len; i++) { + ZIPLIST_ENTRY_ZERO(&_e); + + for (int i = 0; i < len; i++) { memset(&e[i], 0, sizeof(zlentry)); - e[i] = zipEntry(ziplistIndex(zl, i)); + zipEntry(ziplistIndex(zl, i), &e[i]); memset(&_e, 0, sizeof(zlentry)); - _e = zipEntry(ziplistIndex(zl, -len+i)); + zipEntry(ziplistIndex(zl, -len+i), &_e); assert(memcmp(&e[i], &_e, sizeof(zlentry)) == 0); } } -int main(int argc, char **argv) { +int ziplistTest(int argc, char **argv) { unsigned char *zl, *p; unsigned char *entry; unsigned int elen; @@ -1096,21 +1222,25 @@ int main(int argc, char **argv) { zl = createIntList(); ziplistRepr(zl); + zfree(zl); + zl = createList(); ziplistRepr(zl); - pop(zl,ZIPLIST_TAIL); + zl = pop(zl,ZIPLIST_TAIL); ziplistRepr(zl); - pop(zl,ZIPLIST_HEAD); + zl = pop(zl,ZIPLIST_HEAD); ziplistRepr(zl); - pop(zl,ZIPLIST_TAIL); + zl = pop(zl,ZIPLIST_TAIL); ziplistRepr(zl); - pop(zl,ZIPLIST_TAIL); + zl = pop(zl,ZIPLIST_TAIL); ziplistRepr(zl); + zfree(zl); + printf("Get element at index 3:\n"); { zl = createList(); @@ -1126,6 +1256,7 @@ int main(int argc, char **argv) { printf("%lld\n", value); } printf("\n"); + zfree(zl); } printf("Get element at index 4 (out of range):\n"); @@ -1139,6 +1270,7 @@ int main(int argc, char **argv) { return 1; } printf("\n"); + zfree(zl); } printf("Get element at 
index -1 (last element):\n"); @@ -1156,6 +1288,7 @@ int main(int argc, char **argv) { printf("%lld\n", value); } printf("\n"); + zfree(zl); } printf("Get element at index -4 (first element):\n"); @@ -1173,6 +1306,7 @@ int main(int argc, char **argv) { printf("%lld\n", value); } printf("\n"); + zfree(zl); } printf("Get element at index -5 (reverse out of range):\n"); @@ -1186,6 +1320,7 @@ int main(int argc, char **argv) { return 1; } printf("\n"); + zfree(zl); } printf("Iterate list from 0 to end:\n"); @@ -1203,6 +1338,7 @@ int main(int argc, char **argv) { printf("\n"); } printf("\n"); + zfree(zl); } printf("Iterate list from 1 to end:\n"); @@ -1220,6 +1356,7 @@ int main(int argc, char **argv) { printf("\n"); } printf("\n"); + zfree(zl); } printf("Iterate list from 2 to end:\n"); @@ -1237,6 +1374,7 @@ int main(int argc, char **argv) { printf("\n"); } printf("\n"); + zfree(zl); } printf("Iterate starting out of range:\n"); @@ -1249,6 +1387,7 @@ int main(int argc, char **argv) { printf("ERROR\n"); } printf("\n"); + zfree(zl); } printf("Iterate from back to front:\n"); @@ -1266,6 +1405,7 @@ int main(int argc, char **argv) { printf("\n"); } printf("\n"); + zfree(zl); } printf("Iterate from back to front, deleting all items:\n"); @@ -1284,6 +1424,7 @@ int main(int argc, char **argv) { printf("\n"); } printf("\n"); + zfree(zl); } printf("Delete inclusive range 0,0:\n"); @@ -1291,6 +1432,7 @@ int main(int argc, char **argv) { zl = createList(); zl = ziplistDeleteRange(zl, 0, 1); ziplistRepr(zl); + zfree(zl); } printf("Delete inclusive range 0,1:\n"); @@ -1298,6 +1440,7 @@ int main(int argc, char **argv) { zl = createList(); zl = ziplistDeleteRange(zl, 0, 2); ziplistRepr(zl); + zfree(zl); } printf("Delete inclusive range 1,2:\n"); @@ -1305,6 +1448,7 @@ int main(int argc, char **argv) { zl = createList(); zl = ziplistDeleteRange(zl, 1, 2); ziplistRepr(zl); + zfree(zl); } printf("Delete with start index out of range:\n"); @@ -1312,6 +1456,7 @@ int main(int argc, char **argv) 
{ zl = createList(); zl = ziplistDeleteRange(zl, 5, 1); ziplistRepr(zl); + zfree(zl); } printf("Delete with num overflow:\n"); @@ -1319,6 +1464,7 @@ int main(int argc, char **argv) { zl = createList(); zl = ziplistDeleteRange(zl, 1, 5); ziplistRepr(zl); + zfree(zl); } printf("Delete foo while iterating:\n"); @@ -1343,11 +1489,12 @@ int main(int argc, char **argv) { } printf("\n"); ziplistRepr(zl); + zfree(zl); } printf("Regression test for >255 byte strings:\n"); { - char v1[257],v2[257]; + char v1[257] = {0}, v2[257] = {0}; memset(v1,'x',256); memset(v2,'y',256); zl = ziplistNew(); @@ -1362,13 +1509,15 @@ int main(int argc, char **argv) { assert(ziplistGet(p,&entry,&elen,&value)); assert(strncmp(v2,(char*)entry,elen) == 0); printf("SUCCESS\n\n"); + zfree(zl); } printf("Regression test deleting next to last entries:\n"); { - char v[3][257]; - zlentry e[3]; - int i; + char v[3][257] = {{0}}; + zlentry e[3] = {{.prevrawlensize = 0, .prevrawlen = 0, .lensize = 0, + .len = 0, .headersize = 0, .encoding = 0, .p = NULL}}; + size_t i; for (i = 0; i < (sizeof(v)/sizeof(v[0])); i++) { memset(v[i], 'a' + i, sizeof(v[0])); @@ -1399,6 +1548,7 @@ int main(int argc, char **argv) { assert(e[1].prevrawlensize == 5); printf("SUCCESS\n\n"); + zfree(zl); } printf("Create long list and check indices:\n"); @@ -1420,6 +1570,7 @@ int main(int argc, char **argv) { assert(999-i == value); } printf("SUCCESS\n\n"); + zfree(zl); } printf("Compare strings with ziplist entries:\n"); @@ -1445,6 +1596,82 @@ int main(int argc, char **argv) { return 1; } printf("SUCCESS\n\n"); + zfree(zl); + } + + printf("Merge test:\n"); + { + /* create list gives us: [hello, foo, quux, 1024] */ + zl = createList(); + unsigned char *zl2 = createList(); + + unsigned char *zl3 = ziplistNew(); + unsigned char *zl4 = ziplistNew(); + + if (ziplistMerge(&zl4, &zl4)) { + printf("ERROR: Allowed merging of one ziplist into itself.\n"); + return 1; + } + + /* Merge two empty ziplists, get empty result back. 
*/ + zl4 = ziplistMerge(&zl3, &zl4); + ziplistRepr(zl4); + if (ziplistLen(zl4)) { + printf("ERROR: Merging two empty ziplists created entries.\n"); + return 1; + } + zfree(zl4); + + zl2 = ziplistMerge(&zl, &zl2); + /* merge gives us: [hello, foo, quux, 1024, hello, foo, quux, 1024] */ + ziplistRepr(zl2); + + if (ziplistLen(zl2) != 8) { + printf("ERROR: Merged length not 8, but: %u\n", ziplistLen(zl2)); + return 1; + } + + p = ziplistIndex(zl2,0); + if (!ziplistCompare(p,(unsigned char*)"hello",5)) { + printf("ERROR: not \"hello\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"hella",5)) { + printf("ERROR: \"hella\"\n"); + return 1; + } + + p = ziplistIndex(zl2,3); + if (!ziplistCompare(p,(unsigned char*)"1024",4)) { + printf("ERROR: not \"1024\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"1025",4)) { + printf("ERROR: \"1025\"\n"); + return 1; + } + + p = ziplistIndex(zl2,4); + if (!ziplistCompare(p,(unsigned char*)"hello",5)) { + printf("ERROR: not \"hello\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"hella",5)) { + printf("ERROR: \"hella\"\n"); + return 1; + } + + p = ziplistIndex(zl2,7); + if (!ziplistCompare(p,(unsigned char*)"1024",4)) { + printf("ERROR: not \"1024\"\n"); + return 1; + } + if (ziplistCompare(p,(unsigned char*)"1025",4)) { + printf("ERROR: \"1025\"\n"); + return 1; + } + printf("SUCCESS\n\n"); + zfree(zl); } printf("Stress with random payloads of different encoding:\n"); @@ -1464,7 +1691,7 @@ int main(int argc, char **argv) { for (i = 0; i < 20000; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref,sdsfree); + listSetFreeMethod(ref,(void (*)(void*))sdsfree); len = rand() % 256; /* Create lists */ @@ -1532,5 +1759,4 @@ int main(int argc, char **argv) { return 0; } - #endif diff --git a/src/ziplist.h b/src/ziplist.h index b29c34167..e92b5e783 100644 --- a/src/ziplist.h +++ b/src/ziplist.h @@ -32,6 +32,7 @@ #define ZIPLIST_TAIL 1 unsigned char *ziplistNew(void); +unsigned char 
*ziplistMerge(unsigned char **first, unsigned char **second); unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where); unsigned char *ziplistIndex(unsigned char *zl, int index); unsigned char *ziplistNext(unsigned char *zl, unsigned char *p); @@ -39,8 +40,12 @@ unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p); unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval); unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen); unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p); -unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num); +unsigned char *ziplistDeleteRange(unsigned char *zl, int index, unsigned int num); unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen); unsigned char *ziplistFind(unsigned char *p, unsigned char *vstr, unsigned int vlen, unsigned int skip); unsigned int ziplistLen(unsigned char *zl); size_t ziplistBlobLen(unsigned char *zl); + +#ifdef REDIS_TEST +int ziplistTest(int argc, char *argv[]); +#endif diff --git a/src/zipmap.c b/src/zipmap.c index 803fedeec..22bfa1a46 100644 --- a/src/zipmap.c +++ b/src/zipmap.c @@ -51,10 +51,9 @@ * <len> is the length of the following string (key or value). * <len> lengths are encoded in a single value or in a 5 bytes value. * If the first byte value (as an unsigned 8 bit value) is between 0 and - * 252, it's a single-byte length. If it is 253 then a four bytes unsigned + * 253, it's a single-byte length. If it is 254 then a four bytes unsigned * integer follows (in the host byte ordering). A value of 255 is used to - * signal the end of the hash. The special value 254 is used to mark - * empty space that can be used to add new key/value pairs. + * signal the end of the hash. 
* * <free> is the number of free unused bytes after the string, resulting * from modification of values associated to a key. For instance if "foo" @@ -371,8 +370,8 @@ size_t zipmapBlobLen(unsigned char *zm) { return totlen; } -#ifdef ZIPMAP_TEST_MAIN -void zipmapRepr(unsigned char *p) { +#ifdef REDIS_TEST +static void zipmapRepr(unsigned char *p) { unsigned int l; printf("{status %u}",*p++); @@ -405,9 +404,13 @@ void zipmapRepr(unsigned char *p) { printf("\n"); } -int main(void) { +#define UNUSED(x) (void)(x) +int zipmapTest(int argc, char *argv[]) { unsigned char *zm; + UNUSED(argc); + UNUSED(argv); + zm = zipmapNew(); zm = zipmapSet(zm,(unsigned char*) "name",4, (unsigned char*) "foo",3,NULL); diff --git a/src/zipmap.h b/src/zipmap.h index 9cf1b2484..ac588f05a 100644 --- a/src/zipmap.h +++ b/src/zipmap.h @@ -46,4 +46,8 @@ unsigned int zipmapLen(unsigned char *zm); size_t zipmapBlobLen(unsigned char *zm); void zipmapRepr(unsigned char *p); +#ifdef REDIS_TEST +int zipmapTest(int argc, char *argv[]); +#endif + #endif diff --git a/src/zmalloc.c b/src/zmalloc.c index 6df51a80f..640ee19e2 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -364,3 +364,60 @@ size_t zmalloc_get_smap_bytes_by_field(char *field) { size_t zmalloc_get_private_dirty(void) { return zmalloc_get_smap_bytes_by_field("Private_Dirty:"); } + +/* Returns the size of physical memory (RAM) in bytes. + * It looks ugly, but this is the cleanest way to achieve cross platform results. + * Cleaned up from: + * + * http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system + * + * Note that this function: + * 1) Was released under the following CC attribution license: + * http://creativecommons.org/licenses/by/3.0/deed.en_US. + * 2) Was originally implemented by David Robert Nadeau. + * 3) Was modified for Redis by Matt Stancliff. + * 4) This note exists in order to comply with the original license. 
+ */ +size_t zmalloc_get_memory_size(void) { +#if defined(__unix__) || defined(__unix) || defined(unix) || \ + (defined(__APPLE__) && defined(__MACH__)) +#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_MEMSIZE) + mib[1] = HW_MEMSIZE; /* OSX. --------------------- */ +#elif defined(HW_PHYSMEM64) + mib[1] = HW_PHYSMEM64; /* NetBSD, OpenBSD. --------- */ +#endif + int64_t size = 0; /* 64-bit */ + size_t len = sizeof(size); + if (sysctl( mib, 2, &size, &len, NULL, 0) == 0) + return (size_t)size; + return 0L; /* Failed? */ + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */ + return (size_t)sysconf(_SC_PHYS_PAGES) * (size_t)sysconf(_SC_PAGESIZE); + +#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) + /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */ + int mib[2]; + mib[0] = CTL_HW; +#if defined(HW_REALMEM) + mib[1] = HW_REALMEM; /* FreeBSD. ----------------- */ +#elif defined(HW_PHYSMEM) + mib[1] = HW_PHYSMEM; /* Others. ------------------ */ +#endif + unsigned int size = 0; /* 32-bit */ + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) + return (size_t)size; + return 0L; /* Failed? */ +#endif /* sysctl and sysconf variants */ + +#else + return 0L; /* Unknown OS. 
*/ +#endif +} + + diff --git a/src/zmalloc.h b/src/zmalloc.h index 4de2cffea..a47ea6ccf 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -77,6 +77,7 @@ float zmalloc_get_fragmentation_ratio(size_t rss); size_t zmalloc_get_rss(void); size_t zmalloc_get_private_dirty(void); size_t zmalloc_get_smap_bytes_by_field(char *field); +size_t zmalloc_get_memory_size(void); void zlibc_free(void *ptr); #ifndef HAVE_MALLOC_SIZE diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl index 69a160c4f..f764cea0a 100644 --- a/tests/cluster/run.tcl +++ b/tests/cluster/run.tcl @@ -21,6 +21,7 @@ proc main {} { if {[catch main e]} { puts $::errorInfo + if {$::pause_on_error} pause_on_error cleanup exit 1 } diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl index b9e772351..8811762c6 100644 --- a/tests/cluster/tests/04-resharding.tcl +++ b/tests/cluster/tests/04-resharding.tcl @@ -66,9 +66,18 @@ test "Cluster consistency during live resharding" { } # Write random data to random list. - set key "key:[randomInt $numkeys]" + set listid [randomInt $numkeys] + set key "key:$listid" set ele [randomValue] - $cluster rpush $key $ele + # We write both with Lua scripts and with plain commands. + # This way we are able to stress Lua -> Redis command invocation + # as well, that has tests to prevent Lua to write into wrong + # hash slots. 
+ if {$listid % 2} { + $cluster rpush $key $ele + } else { + $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + } lappend content($key) $ele if {($j % 1000) == 0} { diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl index 65fc806e1..466ab8f25 100644 --- a/tests/cluster/tests/includes/init-tests.tcl +++ b/tests/cluster/tests/includes/init-tests.tcl @@ -27,10 +27,17 @@ test "Cluster nodes are reachable" { test "Cluster nodes hard reset" { foreach_redis_id id { + if {$::valgrind} { + set node_timeout 10000 + } else { + set node_timeout 3000 + } catch {R $id flushall} ; # May fail for readonly slaves. + R $id MULTI R $id cluster reset hard R $id cluster set-config-epoch [expr {$id+1}] - R $id config set cluster-node-timeout 3000 + R $id EXEC + R $id config set cluster-node-timeout $node_timeout R $id config set cluster-slave-validity-factor 10 R $id config rewrite } diff --git a/tests/instances.tcl b/tests/instances.tcl index 426508f33..353d9b2d2 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -16,6 +16,7 @@ source ../support/server.tcl source ../support/test.tcl set ::verbose 0 +set ::valgrind 0 set ::pause_on_error 0 set ::simulate_error 0 set ::sentinel_instances {} @@ -32,6 +33,25 @@ if {[catch {cd tmp}]} { exit 1 } +# Execute the specified instance of the server specified by 'type', using +# the provided configuration file. Returns the PID of the process. +proc exec_instance {type cfgfile} { + if {$type eq "redis"} { + set prgname redis-server + } elseif {$type eq "sentinel"} { + set prgname redis-sentinel + } else { + error "Unknown instance type." 
+ } + + if {$::valgrind} { + set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &] + } else { + set pid [exec ../../../src/${prgname} $cfgfile &] + } + return $pid +} + # Spawn a redis or sentinel instance, depending on 'type'. proc spawn_instance {type base_port count {conf {}}} { for {set j 0} {$j < $count} {incr j} { @@ -58,14 +78,7 @@ proc spawn_instance {type base_port count {conf {}}} { close $cfg # Finally exec it and remember the pid for later cleanup. - if {$type eq "redis"} { - set prgname redis-server - } elseif {$type eq "sentinel"} { - set prgname redis-sentinel - } else { - error "Unknown instance type." - } - set pid [exec ../../../src/${prgname} $cfgfile &] + set pid [exec_instance $type $cfgfile] lappend ::pids $pid # Check availability @@ -98,6 +111,7 @@ proc cleanup {} { proc abort_sentinel_test msg { puts "WARNING: Aborting the test." puts ">>>>>>>> $msg" + if {$::pause_on_error} pause_on_error cleanup exit 1 } @@ -113,6 +127,8 @@ proc parse_options {} { set ::pause_on_error 1 } elseif {$opt eq "--fail"} { set ::simulate_error 1 + } elseif {$opt eq {--valgrind}} { + set ::valgrind 1 } elseif {$opt eq "--help"} { puts "Hello, I'm sentinel.tcl and I run Sentinel unit tests." puts "\nOptions:" @@ -360,15 +376,31 @@ proc get_instance_id_by_port {type port} { # The instance can be restarted with restart-instance. proc kill_instance {type id} { set pid [get_instance_attrib $type $id pid] + set port [get_instance_attrib $type $id port] + if {$pid == -1} { error "You tried to kill $type $id twice." } + exec kill -9 $pid set_instance_attrib $type $id pid -1 set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance # Remove the PID from the list of pids to kill at exit. 
set ::pids [lsearch -all -inline -not -exact $::pids $pid] + + # Wait for the port it was using to be available again, so that's not + # an issue to start a new server ASAP with the same port. + set retry 10 + while {[incr retry -1]} { + set port_is_free [catch {set s [socket 127.0.01 $port]}] + if {$port_is_free} break + catch {close $s} + after 1000 + } + if {$retry == 0} { + error "Port $port does not return available after killing instance." + } } # Return true of the instance of the specified type/id is killed. @@ -385,12 +417,7 @@ proc restart_instance {type id} { # Execute the instance with its old setup and append the new pid # file for cleanup. - if {$type eq "redis"} { - set prgname redis-server - } else { - set prgname redis-sentinel - } - set pid [exec ../../../src/${prgname} $cfgfile &] + set pid [exec_instance $type $cfgfile] set_instance_attrib $type $id pid $pid lappend ::pids $pid diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 7ea70943c..01b928bb5 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -204,6 +204,30 @@ tags {"aof"} { } } + ## Test that SPOP with <count> (that modifies the client's argc/argv) is correctly free'd + create_aof { + append_to_aof [formatCommand sadd set foo] + append_to_aof [formatCommand sadd set bar] + append_to_aof [formatCommand sadd set gah] + append_to_aof [formatCommand spop set 2] + } + + start_server_aof [list dir $server_path] { + test "AOF+SPOP: Server should have been started" { + assert_equal 1 [is_alive $srv] + } + + test "AOF+SPOP: Set should have 1 member" { + set client [redis [dict get $srv host] [dict get $srv port]] + wait_for_condition 50 100 { + [catch {$client ping} e] == 0 + } else { + fail "Loading DB is taking too much time." 
+ } + assert_equal 1 [$client scard set] + } + } + ## Test that EXPIREAT is loaded correctly create_aof { append_to_aof [formatCommand rpush list foo] diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index d2668d736..71a7ec60a 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -118,7 +118,8 @@ foreach dl {no yes} { [lindex $slaves 1] slaveof $master_host $master_port [lindex $slaves 2] slaveof $master_host $master_port - # Wait for all the three slaves to reach the "online" state + # Wait for all the three slaves to reach the "online" + # state from the POV of the master. set retry 500 while {$retry} { set info [r -3 info] @@ -133,6 +134,17 @@ foreach dl {no yes} { error "assertion:Slaves not correctly synchronized" } + # Wait that slaves acknowledge they are online so + # we are sure that DBSIZE and DEBUG DIGEST will not + # fail because of timing issues. + wait_for_condition 500 100 { + [lindex [[lindex $slaves 0] role] 3] eq {connected} && + [lindex [[lindex $slaves 1] role] 3] eq {connected} && + [lindex [[lindex $slaves 2] role] 3] eq {connected} + } else { + fail "Slaves still not connected after some time" + } + # Stop the write load stop_write_load $load_handle0 stop_write_load $load_handle1 @@ -140,16 +152,8 @@ foreach dl {no yes} { stop_write_load $load_handle3 stop_write_load $load_handle4 - # Wait that slaves exit the "loading" state - wait_for_condition 500 100 { - ![string match {*loading:1*} [[lindex $slaves 0] info]] && - ![string match {*loading:1*} [[lindex $slaves 1] info]] && - ![string match {*loading:1*} [[lindex $slaves 2] info]] - } else { - fail "Slaves still loading data after too much time" - } - - # Make sure that slaves and master have same number of keys + # Make sure that slaves and master have same + # number of keys wait_for_condition 500 100 { [$master dbsize] == [[lindex $slaves 0] dbsize] && [$master dbsize] == [[lindex $slaves 1] dbsize] && diff --git 
a/tests/support/cluster.tcl b/tests/support/cluster.tcl index b007e3b05..d4e7d2e5d 100644 --- a/tests/support/cluster.tcl +++ b/tests/support/cluster.tcl @@ -226,6 +226,8 @@ proc ::redis_cluster::get_keys_from_command {cmd argv} { # Special handling for other commands switch -exact $cmd { mget {return $argv} + eval {return [lrange $argv 2 1+[lindex $argv 1]]} + evalsha {return [lrange $argv 2 1+[lindex $argv 1]]} } # All the remaining commands are not handled. diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 0e2e2982a..317b40a84 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -70,6 +70,9 @@ proc kill_server config { if {$::valgrind} { check_valgrind_errors [dict get $config stderr] } + + # Remove this pid from the set of active pids in the test server. + send_data_packet $::test_server_fd server-killed $pid } proc is_alive config { @@ -204,11 +207,14 @@ proc start_server {options {code undefined}} { set stderr [format "%s/%s" [dict get $config "dir"] "stderr"] if {$::valgrind} { - exec valgrind --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr & + set pid [exec valgrind --track-origins=yes --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr &] } else { - exec src/redis-server $config_file > $stdout 2> $stderr & + set pid [exec src/redis-server $config_file > $stdout 2> $stderr &] } + # Tell the test server about this new instance. + send_data_packet $::test_server_fd server-spawned $pid + # check that the server actually started # ugly but tries to be as fast as possible... 
if {$::valgrind} {set retrynum 1000} else {set retrynum 100} @@ -234,9 +240,9 @@ proc start_server {options {code undefined}} { return } - # find out the pid - while {![info exists pid]} { - regexp {PID:\s(\d+)} [exec cat $stdout] _ pid + # Wait for actual startup + while {![info exists _pid]} { + regexp {PID:\s(\d+)} [exec cat $stdout] _ _pid after 100 } diff --git a/tests/support/test.tcl b/tests/support/test.tcl index 7d390cc47..31371c567 100644 --- a/tests/support/test.tcl +++ b/tests/support/test.tcl @@ -19,9 +19,12 @@ proc assert_match {pattern value} { } } -proc assert_equal {expected value} { +proc assert_equal {expected value {detail ""}} { if {$expected ne $value} { - error "assertion:Expected '$value' to be equal to '$expected'" + if {$detail ne ""} { + set detail " (detail: $detail)" + } + error "assertion:Expected '$value' to be equal to '$expected'$detail" } } diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 9d975cfb7..212c95b4f 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -65,6 +65,9 @@ set ::file ""; # If set, runs only the tests in this comma separated list set ::curfile ""; # Hold the filename of the current suite set ::accurate 0; # If true runs fuzz tests with more iterations set ::force_failure 0 +set ::timeout 600; # 10 minutes without progresses will quit the test. +set ::last_progress [clock seconds] +set ::active_servers {} ; # Pids of active Redis instances. # Set to 1 when we are running in client mode. The Redis test uses a # server-client model to run tests simultaneously. The server instance @@ -200,11 +203,19 @@ proc test_server_main {} { vwait forever } -# This function gets called 10 times per second, for now does nothing but -# may be used in the future in order to detect test clients taking too much -# time to execute the task. +# This function gets called 10 times per second. proc test_server_cron {} { - # Do some work here. 
+ set elapsed [expr {[clock seconds]-$::last_progress}] + + if {$elapsed > $::timeout} { + set err "\[[colorstr red TIMEOUT]\]: clients state report follows." + puts $err + show_clients_state + kill_clients + force_kill_all_servers + the_end + } + after 100 test_server_cron } @@ -230,6 +241,8 @@ proc read_from_test_client fd { set bytes [gets $fd] set payload [read $fd $bytes] foreach {status data} $payload break + set ::last_progress [clock seconds] + if {$status eq {ready}} { if {!$::quiet} { puts "\[$status\]: $data" @@ -256,12 +269,15 @@ proc read_from_test_client fd { set ::active_clients_task($fd) "(ERR) $data" } elseif {$status eq {exception}} { puts "\[[colorstr red $status]\]: $data" - foreach p $::clients_pids { - catch {exec kill -9 $p} - } + kill_clients + force_kill_all_servers exit 1 } elseif {$status eq {testing}} { set ::active_clients_task($fd) "(IN PROGRESS) $data" + } elseif {$status eq {server-spawned}} { + lappend ::active_servers $data + } elseif {$status eq {server-killed}} { + set ::active_servers [lsearch -all -inline -not -exact $::active_servers $data] } else { if {!$::quiet} { puts "\[$status\]: $data" @@ -269,6 +285,31 @@ proc read_from_test_client fd { } } +proc show_clients_state {} { + # The following loop is only useful for debugging tests that may + # enter an infinite loop. Commented out normally. + foreach x $::active_clients { + if {[info exist ::active_clients_task($x)]} { + puts "$x => $::active_clients_task($x)" + } else { + puts "$x => ???" + } + } +} + +proc kill_clients {} { + foreach p $::clients_pids { + catch {exec kill $p} + } +} + +proc force_kill_all_servers {} { + foreach p $::active_servers { + puts "Killing still running Redis server $p" + catch {exec kill -9 $p} + } +} + # A new client is idle. Remove it from the list of active clients and # if there are still test units to run, launch them. 
proc signal_idle_client fd { @@ -276,17 +317,7 @@ proc signal_idle_client fd { set ::active_clients \ [lsearch -all -inline -not -exact $::active_clients $fd] - if 0 { - # The following loop is only useful for debugging tests that may - # enter an infinite loop. Commented out normally. - foreach x $::active_clients { - if {[info exist ::active_clients_task($x)]} { - puts "$x => $::active_clients_task($x)" - } else { - puts "$x => ???" - } - } - } + if 0 {show_clients_state} # New unit to process? if {$::next_test != [llength $::all_tests]} { @@ -361,7 +392,8 @@ proc print_help_screen {} { "--quiet Don't show individual tests." "--single <unit> Just execute the specified unit (see next option)." "--list-tests List all the available test units." - "--clients <num> Number of test clients (16)." + "--clients <num> Number of test clients (default 16)." + "--timeout <sec> Test timeout in seconds (default 10 min)." "--force-failure Force the execution of a test that always fails." "--help Print this help screen." 
} "\n"] @@ -410,6 +442,9 @@ for {set j 0} {$j < [llength $argv]} {incr j} { } elseif {$opt eq {--clients}} { set ::numclients $arg incr j + } elseif {$opt eq {--timeout}} { + set ::timeout $arg + incr j } elseif {$opt eq {--help}} { print_help_screen exit 0 diff --git a/tests/unit/aofrw.tcl b/tests/unit/aofrw.tcl index a2d74168f..4fdbdc6c6 100644 --- a/tests/unit/aofrw.tcl +++ b/tests/unit/aofrw.tcl @@ -77,10 +77,10 @@ start_server {tags {"aofrw"}} { } foreach d {string int} { - foreach e {ziplist linkedlist} { + foreach e {quicklist} { test "AOF rewrite of list with $e encoding, $d data" { r flushall - if {$e eq {ziplist}} {set len 10} else {set len 1000} + set len 1000 for {set j 0} {$j < $len} {incr j} { if {$d eq {string}} { set data [randstring 0 16 alpha] diff --git a/tests/unit/dump.tcl b/tests/unit/dump.tcl index d39204f9f..5af53db8d 100644 --- a/tests/unit/dump.tcl +++ b/tests/unit/dump.tcl @@ -157,7 +157,7 @@ start_server {tags {"dump"}} { test {MIGRATE can correctly transfer large values} { set first [srv 0 client] r del key - for {set j 0} {$j < 5000} {incr j} { + for {set j 0} {$j < 40000} {incr j} { r rpush key 1 2 3 4 5 6 7 8 9 10 r rpush key "item 1" "item 2" "item 3" "item 4" "item 5" \ "item 6" "item 7" "item 8" "item 9" "item 10" @@ -175,7 +175,7 @@ start_server {tags {"dump"}} { assert {[$first exists key] == 0} assert {[$second exists key] == 1} assert {[$second ttl key] == -1} - assert {[$second llen key] == 5000*20} + assert {[$second llen key] == 40000*20} } } diff --git a/tests/unit/scan.tcl b/tests/unit/scan.tcl index 2b1033e39..1d84f128d 100644 --- a/tests/unit/scan.tcl +++ b/tests/unit/scan.tcl @@ -226,4 +226,14 @@ start_server {tags {"scan"}} { set res [r zscan mykey 0 MATCH foo* COUNT 10000] lsort -unique [lindex $res 1] } + + test "ZSCAN scores: regression test for issue #2175" { + r del mykey + for {set j 0} {$j < 500} {incr j} { + r zadd mykey 9.8813129168249309e-323 $j + } + set res [lindex [r zscan mykey 0] 1] + set first_score 
[lindex $res 1] + assert {$first_score != 0} + } } diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index e1cd2174b..921382e34 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -413,7 +413,7 @@ start_server {tags {"scripting"}} { r sadd myset a b c r mset a 1 b 2 c 3 d 4 assert {[r spop myset] ne {}} - assert {[r spop myset] ne {}} + assert {[r spop myset 1] ne {}} assert {[r spop myset] ne {}} assert {[r mget a b c d] eq {1 2 3 4}} assert {[r spop myset] eq {}} diff --git a/tests/unit/sort.tcl b/tests/unit/sort.tcl index 490158f14..083c4540d 100644 --- a/tests/unit/sort.tcl +++ b/tests/unit/sort.tcl @@ -1,8 +1,7 @@ start_server { tags {"sort"} overrides { - "list-max-ziplist-value" 16 - "list-max-ziplist-entries" 32 + "list-max-ziplist-size" 32 "set-max-intset-entries" 32 } } { @@ -36,9 +35,9 @@ start_server { } foreach {num cmd enc title} { - 16 lpush ziplist "Ziplist" - 1000 lpush linkedlist "Linked list" - 10000 lpush linkedlist "Big Linked list" + 16 lpush quicklist "Old Ziplist" + 1000 lpush quicklist "Old Linked list" + 10000 lpush quicklist "Old Big Linked list" 16 sadd intset "Intset" 1000 sadd hashtable "Hash table" 10000 sadd hashtable "Big Hash table" @@ -85,14 +84,14 @@ start_server { r sort tosort BY weight_* store sort-res assert_equal $result [r lrange sort-res 0 -1] assert_equal 16 [r llen sort-res] - assert_encoding ziplist sort-res + assert_encoding quicklist sort-res } test "SORT BY hash field STORE" { r sort tosort BY wobj_*->weight store sort-res assert_equal $result [r lrange sort-res 0 -1] assert_equal 16 [r llen sort-res] - assert_encoding ziplist sort-res + assert_encoding quicklist sort-res } test "SORT extracts STORE correctly" { @@ -246,6 +245,24 @@ start_server { r sort mylist by num get x:*-> } {100} + test "SORT by nosort retains native order for lists" { + r del testa + r lpush testa 2 1 4 3 5 + r sort testa by nosort + } {5 3 4 1 2} + + test "SORT by nosort plus store retains native order for 
lists" { + r del testa + r lpush testa 2 1 4 3 5 + r sort testa by nosort store testb + r lrange testb 0 -1 + } {5 3 4 1 2} + + test "SORT by nosort with limit returns based on original list order" { + r sort testa by nosort limit 0 3 store testb + r lrange testb 0 -1 + } {5 3 4} + tags {"slow"} { set num 100 set res [create_random_dataset $num lpush] diff --git a/tests/unit/type/list-2.tcl b/tests/unit/type/list-2.tcl index bf6a055eb..4c7d6d91c 100644 --- a/tests/unit/type/list-2.tcl +++ b/tests/unit/type/list-2.tcl @@ -1,8 +1,7 @@ start_server { tags {"list"} overrides { - "list-max-ziplist-value" 16 - "list-max-ziplist-entries" 256 + "list-max-ziplist-size" 4 } } { source "tests/unit/type/list-common.tcl" @@ -28,14 +27,18 @@ start_server { for {set i 0} {$i < 1000} {incr i} { set min [expr {int(rand()*$startlen)}] set max [expr {$min+int(rand()*$startlen)}] + set before_len [llength $mylist] + set before_len_r [r llen mylist] set mylist [lrange $mylist $min $max] r ltrim mylist $min $max - assert_equal $mylist [r lrange mylist 0 -1] + assert_equal $mylist [r lrange mylist 0 -1] "failed trim" + set starting [r llen mylist] for {set j [r llen mylist]} {$j < $startlen} {incr j} { set str [randomInt 9223372036854775807] r rpush mylist $str lappend mylist $str + assert_equal $mylist [r lrange mylist 0 -1] "failed append match" } } } diff --git a/tests/unit/type/list-3.tcl b/tests/unit/type/list-3.tcl index 94f9a0b79..ece6ea2d5 100644 --- a/tests/unit/type/list-3.tcl +++ b/tests/unit/type/list-3.tcl @@ -1,8 +1,7 @@ start_server { tags {list ziplist} overrides { - "list-max-ziplist-value" 200000 - "list-max-ziplist-entries" 256 + "list-max-ziplist-size" 16 } } { test {Explicit regression for a list bug} { diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl index c8e26602b..e4d568cf1 100644 --- a/tests/unit/type/list.tcl +++ b/tests/unit/type/list.tcl @@ -1,25 +1,24 @@ start_server { tags {"list"} overrides { - "list-max-ziplist-value" 16 - 
"list-max-ziplist-entries" 256 + "list-max-ziplist-size" 5 } } { source "tests/unit/type/list-common.tcl" test {LPUSH, RPUSH, LLENGTH, LINDEX, LPOP - ziplist} { # first lpush then rpush - assert_equal 1 [r lpush myziplist1 a] - assert_equal 2 [r rpush myziplist1 b] - assert_equal 3 [r rpush myziplist1 c] + assert_equal 1 [r lpush myziplist1 aa] + assert_equal 2 [r rpush myziplist1 bb] + assert_equal 3 [r rpush myziplist1 cc] assert_equal 3 [r llen myziplist1] - assert_equal a [r lindex myziplist1 0] - assert_equal b [r lindex myziplist1 1] - assert_equal c [r lindex myziplist1 2] + assert_equal aa [r lindex myziplist1 0] + assert_equal bb [r lindex myziplist1 1] + assert_equal cc [r lindex myziplist1 2] assert_equal {} [r lindex myziplist2 3] - assert_equal c [r rpop myziplist1] - assert_equal a [r lpop myziplist1] - assert_encoding ziplist myziplist1 + assert_equal cc [r rpop myziplist1] + assert_equal aa [r lpop myziplist1] + assert_encoding quicklist myziplist1 # first rpush then lpush assert_equal 1 [r rpush myziplist2 a] @@ -32,13 +31,13 @@ start_server { assert_equal {} [r lindex myziplist2 3] assert_equal a [r rpop myziplist2] assert_equal c [r lpop myziplist2] - assert_encoding ziplist myziplist2 + assert_encoding quicklist myziplist2 } test {LPUSH, RPUSH, LLENGTH, LINDEX, LPOP - regular list} { # first lpush then rpush assert_equal 1 [r lpush mylist1 $largevalue(linkedlist)] - assert_encoding linkedlist mylist1 + assert_encoding quicklist mylist1 assert_equal 2 [r rpush mylist1 b] assert_equal 3 [r rpush mylist1 c] assert_equal 3 [r llen mylist1] @@ -51,7 +50,7 @@ start_server { # first rpush then lpush assert_equal 1 [r rpush mylist2 $largevalue(linkedlist)] - assert_encoding linkedlist mylist2 + assert_encoding quicklist mylist2 assert_equal 2 [r lpush mylist2 b] assert_equal 3 [r lpush mylist2 c] assert_equal 3 [r llen mylist2] @@ -74,34 +73,22 @@ start_server { assert_equal {d c b a 0 1 2 3} [r lrange mylist 0 -1] } - test {DEL a list - ziplist} { - 
assert_equal 1 [r del myziplist2] - assert_equal 0 [r exists myziplist2] - assert_equal 0 [r llen myziplist2] - } - - test {DEL a list - regular list} { + test {DEL a list} { assert_equal 1 [r del mylist2] assert_equal 0 [r exists mylist2] assert_equal 0 [r llen mylist2] } - proc create_ziplist {key entries} { - r del $key - foreach entry $entries { r rpush $key $entry } - assert_encoding ziplist $key - } - - proc create_linkedlist {key entries} { + proc create_list {key entries} { r del $key foreach entry $entries { r rpush $key $entry } - assert_encoding linkedlist $key + assert_encoding quicklist $key } foreach {type large} [array get largevalue] { test "BLPOP, BRPOP: single existing list - $type" { set rd [redis_deferring_client] - create_$type blist "a b $large c d" + create_list blist "a b $large c d" $rd blpop blist 1 assert_equal {blist a} [$rd read] @@ -116,8 +103,8 @@ start_server { test "BLPOP, BRPOP: multiple existing lists - $type" { set rd [redis_deferring_client] - create_$type blist1 "a $large c" - create_$type blist2 "d $large f" + create_list blist1 "a $large c" + create_list blist2 "d $large f" $rd blpop blist1 blist2 1 assert_equal {blist1 a} [$rd read] @@ -137,7 +124,7 @@ start_server { test "BLPOP, BRPOP: second list has an entry - $type" { set rd [redis_deferring_client] r del blist1 - create_$type blist2 "d $large f" + create_list blist2 "d $large f" $rd blpop blist1 blist2 1 assert_equal {blist2 d} [$rd read] @@ -151,7 +138,7 @@ start_server { r del target set rd [redis_deferring_client] - create_$type blist "a b $large c d" + create_list blist "a b $large c d" $rd brpoplpush blist target 1 assert_equal d [$rd read] @@ -517,28 +504,28 @@ start_server { foreach {type large} [array get largevalue] { test "LPUSHX, RPUSHX - $type" { - create_$type xlist "$large c" + create_list xlist "$large c" assert_equal 3 [r rpushx xlist d] assert_equal 4 [r lpushx xlist a] assert_equal "a $large c d" [r lrange xlist 0 -1] } test "LINSERT - $type" { - 
create_$type xlist "a $large c d" - assert_equal 5 [r linsert xlist before c zz] - assert_equal "a $large zz c d" [r lrange xlist 0 10] - assert_equal 6 [r linsert xlist after c yy] - assert_equal "a $large zz c yy d" [r lrange xlist 0 10] - assert_equal 7 [r linsert xlist after d dd] - assert_equal -1 [r linsert xlist after bad ddd] - assert_equal "a $large zz c yy d dd" [r lrange xlist 0 10] - assert_equal 8 [r linsert xlist before a aa] - assert_equal -1 [r linsert xlist before bad aaa] - assert_equal "aa a $large zz c yy d dd" [r lrange xlist 0 10] + create_list xlist "a $large c d" + assert_equal 5 [r linsert xlist before c zz] "before c" + assert_equal "a $large zz c d" [r lrange xlist 0 10] "lrangeA" + assert_equal 6 [r linsert xlist after c yy] "after c" + assert_equal "a $large zz c yy d" [r lrange xlist 0 10] "lrangeB" + assert_equal 7 [r linsert xlist after d dd] "after d" + assert_equal -1 [r linsert xlist after bad ddd] "after bad" + assert_equal "a $large zz c yy d dd" [r lrange xlist 0 10] "lrangeC" + assert_equal 8 [r linsert xlist before a aa] "before a" + assert_equal -1 [r linsert xlist before bad aaa] "before bad" + assert_equal "aa a $large zz c yy d dd" [r lrange xlist 0 10] "lrangeD" # check inserting integer encoded value - assert_equal 9 [r linsert xlist before aa 42] - assert_equal 42 [r lrange xlist 0 0] + assert_equal 9 [r linsert xlist before aa 42] "before aa" + assert_equal 42 [r lrange xlist 0 0] "lrangeE" } } @@ -547,55 +534,7 @@ start_server { set e } {*ERR*syntax*error*} - test {LPUSHX, RPUSHX convert from ziplist to list} { - set large $largevalue(linkedlist) - - # convert when a large value is pushed - create_ziplist xlist a - assert_equal 2 [r rpushx xlist $large] - assert_encoding linkedlist xlist - create_ziplist xlist a - assert_equal 2 [r lpushx xlist $large] - assert_encoding linkedlist xlist - - # convert when the length threshold is exceeded - create_ziplist xlist [lrepeat 256 a] - assert_equal 257 [r rpushx xlist b] - 
assert_encoding linkedlist xlist - create_ziplist xlist [lrepeat 256 a] - assert_equal 257 [r lpushx xlist b] - assert_encoding linkedlist xlist - } - - test {LINSERT convert from ziplist to list} { - set large $largevalue(linkedlist) - - # convert when a large value is inserted - create_ziplist xlist a - assert_equal 2 [r linsert xlist before a $large] - assert_encoding linkedlist xlist - create_ziplist xlist a - assert_equal 2 [r linsert xlist after a $large] - assert_encoding linkedlist xlist - - # convert when the length threshold is exceeded - create_ziplist xlist [lrepeat 256 a] - assert_equal 257 [r linsert xlist before a a] - assert_encoding linkedlist xlist - create_ziplist xlist [lrepeat 256 a] - assert_equal 257 [r linsert xlist after a a] - assert_encoding linkedlist xlist - - # don't convert when the value could not be inserted - create_ziplist xlist [lrepeat 256 a] - assert_equal -1 [r linsert xlist before foo a] - assert_encoding ziplist xlist - create_ziplist xlist [lrepeat 256 a] - assert_equal -1 [r linsert xlist after foo a] - assert_encoding ziplist xlist - } - - foreach {type num} {ziplist 250 linkedlist 500} { + foreach {type num} {quicklist 250 quicklist 500} { proc check_numbered_list_consistency {key} { set len [r llen $key] for {set i 0} {$i < $len} {incr i} { @@ -664,16 +603,16 @@ start_server { foreach {type large} [array get largevalue] { test "RPOPLPUSH base case - $type" { r del mylist1 mylist2 - create_$type mylist1 "a $large c d" + create_list mylist1 "a $large c d" assert_equal d [r rpoplpush mylist1 mylist2] assert_equal c [r rpoplpush mylist1 mylist2] assert_equal "a $large" [r lrange mylist1 0 -1] assert_equal "c d" [r lrange mylist2 0 -1] - assert_encoding ziplist mylist2 + assert_encoding quicklist mylist2 } test "RPOPLPUSH with the same list as src and dst - $type" { - create_$type mylist "a $large c" + create_list mylist "a $large c" assert_equal "a $large c" [r lrange mylist 0 -1] assert_equal c [r rpoplpush mylist mylist] 
assert_equal "c a $large" [r lrange mylist 0 -1] @@ -681,8 +620,8 @@ start_server { foreach {othertype otherlarge} [array get largevalue] { test "RPOPLPUSH with $type source and existing target $othertype" { - create_$type srclist "a b c $large" - create_$othertype dstlist "$otherlarge" + create_list srclist "a b c $large" + create_list dstlist "$otherlarge" assert_equal $large [r rpoplpush srclist dstlist] assert_equal c [r rpoplpush srclist dstlist] assert_equal "a b" [r lrange srclist 0 -1] @@ -691,7 +630,7 @@ start_server { # When we rpoplpush'ed a large value, dstlist should be # converted to the same encoding as srclist. if {$type eq "linkedlist"} { - assert_encoding linkedlist dstlist + assert_encoding quicklist dstlist } } } @@ -713,7 +652,7 @@ start_server { } test {RPOPLPUSH against non list dst key} { - create_ziplist srclist {a b c d} + create_list srclist {a b c d} r set dstlist x assert_error WRONGTYPE* {r rpoplpush srclist dstlist} assert_type string dstlist @@ -727,7 +666,7 @@ start_server { foreach {type large} [array get largevalue] { test "Basic LPOP/RPOP - $type" { - create_$type mylist "$large 1 2" + create_list mylist "$large 1 2" assert_equal $large [r lpop mylist] assert_equal 2 [r rpop mylist] assert_equal 1 [r lpop mylist] @@ -745,7 +684,7 @@ start_server { assert_error WRONGTYPE* {r rpop notalist} } - foreach {type num} {ziplist 250 linkedlist 500} { + foreach {type num} {quicklist 250 quicklist 500} { test "Mass RPOP/LPOP - $type" { r del mylist set sum1 0 @@ -765,24 +704,24 @@ start_server { foreach {type large} [array get largevalue] { test "LRANGE basics - $type" { - create_$type mylist "$large 1 2 3 4 5 6 7 8 9" + create_list mylist "$large 1 2 3 4 5 6 7 8 9" assert_equal {1 2 3 4 5 6 7 8} [r lrange mylist 1 -2] assert_equal {7 8 9} [r lrange mylist -3 -1] assert_equal {4} [r lrange mylist 4 4] } test "LRANGE inverted indexes - $type" { - create_$type mylist "$large 1 2 3 4 5 6 7 8 9" + create_list mylist "$large 1 2 3 4 5 6 7 8 9" 
assert_equal {} [r lrange mylist 6 2] } test "LRANGE out of range indexes including the full list - $type" { - create_$type mylist "$large 1 2 3" + create_list mylist "$large 1 2 3" assert_equal "$large 1 2 3" [r lrange mylist -1000 1000] } test "LRANGE out of range negative end index - $type" { - create_$type mylist "$large 1 2 3" + create_list mylist "$large 1 2 3" assert_equal $large [r lrange mylist 0 -4] assert_equal {} [r lrange mylist 0 -5] } @@ -796,7 +735,7 @@ start_server { proc trim_list {type min max} { upvar 1 large large r del mylist - create_$type mylist "1 2 3 4 $large" + create_list mylist "1 2 3 4 $large" r ltrim mylist $min $max r lrange mylist 0 -1 } @@ -825,7 +764,7 @@ start_server { foreach {type large} [array get largevalue] { test "LSET - $type" { - create_$type mylist "99 98 $large 96 95" + create_list mylist "99 98 $large 96 95" r lset mylist 1 foo r lset mylist -1 bar assert_equal "99 foo $large 96 bar" [r lrange mylist 0 -1] @@ -847,7 +786,7 @@ start_server { foreach {type e} [array get largevalue] { test "LREM remove all the occurrences - $type" { - create_$type mylist "$e foo bar foobar foobared zap bar test foo" + create_list mylist "$e foo bar foobar foobared zap bar test foo" assert_equal 2 [r lrem mylist 0 bar] assert_equal "$e foo foobar foobared zap test foo" [r lrange mylist 0 -1] } @@ -863,7 +802,7 @@ start_server { } test "LREM starting from tail with negative count - $type" { - create_$type mylist "$e foo bar foobar foobared zap bar test foo foo" + create_list mylist "$e foo bar foobar foobared zap bar test foo foo" assert_equal 1 [r lrem mylist -1 bar] assert_equal "$e foo bar foobar foobared zap test foo foo" [r lrange mylist 0 -1] } @@ -874,7 +813,7 @@ start_server { } test "LREM deleting objects that may be int encoded - $type" { - create_$type myotherlist "$e 1 2 3" + create_list myotherlist "$e 1 2 3" assert_equal 1 [r lrem myotherlist 1 2] assert_equal 3 [r llen myotherlist] } diff --git a/tests/unit/type/set.tcl 
b/tests/unit/type/set.tcl index 162de0af7..74a8fb318 100644 --- a/tests/unit/type/set.tcl +++ b/tests/unit/type/set.tcl @@ -293,6 +293,13 @@ start_server { assert_equal 0 [r scard myset] } + test "SPOP with <count>=1 - $type" { + create_set myset $contents + assert_encoding $type myset + assert_equal $contents [lsort [list [r spop myset 1] [r spop myset 1] [r spop myset 1]]] + assert_equal 0 [r scard myset] + } + test "SRANDMEMBER - $type" { create_set myset $contents unset -nocomplain myset @@ -304,6 +311,41 @@ start_server { } } + foreach {type contents} { + hashtable {a b c d e f g h i j k l m n o p q r s t u v w x y z} + intset {1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 3 4 5 6 7 8 9} + } { + test "SPOP with <count>" { + create_set myset $contents + assert_encoding $type myset + assert_equal $contents [lsort [concat [r spop myset 11] [r spop myset 9] [r spop myset 0] [r spop myset 4] [r spop myset 1] [r spop myset 0] [r spop myset 1] [r spop myset 0]]] + assert_equal 0 [r scard myset] + } + } + + # As seen in intsetRandomMembers + test "SPOP using integers, testing Knuth's and Floyd's algorithm" { + create_set myset {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20} + assert_encoding intset myset + assert_equal 20 [r scard myset] + r spop myset 1 + assert_equal 19 [r scard myset] + r spop myset 2 + assert_equal 17 [r scard myset] + r spop myset 3 + assert_equal 14 [r scard myset] + r spop myset 10 + assert_equal 4 [r scard myset] + r spop myset 10 + assert_equal 0 [r scard myset] + r spop myset 1 + assert_equal 0 [r scard myset] + } {} + + test "SPOP using integers with Knuth's algorithm" { + r spop nonexisting_key 100 + } {} + test "SRANDMEMBER with <count> against non existing key" { r srandmember nonexisting_key 100 } {} diff --git a/utils/cluster_fail_time.tcl b/utils/cluster_fail_time.tcl new file mode 100644 index 000000000..87399495f --- /dev/null +++ b/utils/cluster_fail_time.tcl @@ -0,0 +1,50 @@ +# This simple script is used in order to 
estimate the average PFAIL->FAIL +# state switch after a failure. + +set ::sleep_time 10 ; # How much to sleep to trigger PFAIL. +set ::fail_port 30016 ; # Node to put in sleep. +set ::other_port 30001 ; # Node to use to monitor the flag switch. + +proc avg vector { + set sum 0.0 + foreach x $vector { + set sum [expr {$sum+$x}] + } + expr {$sum/[llength $vector]} +} + +set samples {} +while 1 { + exec redis-cli -p $::fail_port debug sleep $::sleep_time > /dev/null & + + # Wait for fail? to appear. + while 1 { + set output [exec redis-cli -p $::other_port cluster nodes] + if {[string match {*fail\?*} $output]} break + after 100 + } + + puts "FAIL?" + set start [clock milliseconds] + + # Wait for fail? to disappear. + while 1 { + set output [exec redis-cli -p $::other_port cluster nodes] + if {![string match {*fail\?*} $output]} break + after 100 + } + + puts "FAIL" + set now [clock milliseconds] + set elapsed [expr {$now-$start}] + puts $elapsed + lappend samples $elapsed + + puts "AVG([llength $samples]): [avg $samples]" + + # Wait for the instance to be available again. + exec redis-cli -p $::fail_port ping + + # Wait for the fail flag to be cleared. + after 2000 +} diff --git a/utils/create-cluster/.gitignore b/utils/create-cluster/.gitignore new file mode 100644 index 000000000..cdd7c19c8 --- /dev/null +++ b/utils/create-cluster/.gitignore @@ -0,0 +1 @@ +config.sh diff --git a/utils/create-cluster/README b/utils/create-cluster/README new file mode 100644 index 000000000..1f43748ee --- /dev/null +++ b/utils/create-cluster/README @@ -0,0 +1,27 @@ +Create-cluster is a small script used to easily start a big number of Redis +instances configured to run in cluster mode. Its main goal is to allow manual +testing in a condition which is not easy to replicate with the Redis cluster +unit tests, for example when a lot of instances are needed in order to trigger +a given bug. 
+ +The tool can also be used just to easily create a number of instances in a +Redis Cluster in order to experiment a bit with the system. + +USAGE +--- + +To create a cluster, follow these steps: + +1. Edit create-cluster and change the start / end port, depending on the +number of instances you want to create. +2. Use "./create-cluster start" in order to run the instances. +3. Use "./create-cluster create" in order to execute redis-trib create, so that +an actual Redis cluster will be created. +4. Now you are ready to play with the cluster. AOF files and logs for each instance are created in the current directory. + +In order to stop a cluster: + +1. Use "./create-cluster stop" to stop all the instances. After you have stopped the instances you can use "./create-cluster start" to restart them if you change your mind. +2. Use "./create-cluster clean" to remove all the AOF / log files to restart with a clean environment. + +Use the command "./create-cluster help" to get the full list of features. diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster new file mode 100755 index 000000000..efb3135d4 --- /dev/null +++ b/utils/create-cluster/create-cluster @@ -0,0 +1,95 @@ +#!/bin/bash + +# Settings +PORT=30000 +TIMEOUT=2000 +NODES=6 +REPLICAS=1 + +# You may want to put the above config parameters into config.sh in order to +# override the defaults without modifying this script. 
+ +if [ -a config.sh ] +then + source "config.sh" +fi + +# Computed vars +ENDPORT=$((PORT+NODES)) + +if [ "$1" == "start" ] +then + while [ $((PORT < ENDPORT)) != "0" ]; do + PORT=$((PORT+1)) + echo "Starting $PORT" + ../../src/redis-server --port $PORT --cluster-enabled yes --cluster-config-file nodes-${PORT}.conf --cluster-node-timeout $TIMEOUT --appendonly yes --appendfilename appendonly-${PORT}.aof --dbfilename dump-${PORT}.rdb --logfile ${PORT}.log --daemonize yes + done + exit 0 +fi + +if [ "$1" == "create" ] +then + HOSTS="" + while [ $((PORT < ENDPORT)) != "0" ]; do + PORT=$((PORT+1)) + HOSTS="$HOSTS 127.0.0.1:$PORT" + done + ../../src/redis-trib.rb create --replicas $REPLICAS $HOSTS + exit 0 +fi + +if [ "$1" == "stop" ] +then + while [ $((PORT < ENDPORT)) != "0" ]; do + PORT=$((PORT+1)) + echo "Stopping $PORT" + redis-cli -p $PORT shutdown nosave + done + exit 0 +fi + +if [ "$1" == "watch" ] +then + PORT=$((PORT+1)) + while [ 1 ]; do + clear + date + redis-cli -p $PORT cluster nodes | head -30 + sleep 1 + done + exit 0 +fi + +if [ "$1" == "tail" ] +then + INSTANCE=$2 + PORT=$((PORT+INSTANCE)) + tail -f ${PORT}.log + exit 0 +fi + +if [ "$1" == "call" ] +then + while [ $((PORT < ENDPORT)) != "0" ]; do + PORT=$((PORT+1)) + ../../src/redis-cli -p $PORT $2 $3 $4 $5 $6 $7 $8 $9 + done + exit 0 +fi + +if [ "$1" == "clean" ] +then + rm -rf *.log + rm -rf appendonly*.aof + rm -rf dump*.rdb + rm -rf nodes*.conf + exit 0 +fi + +echo "Usage: $0 [start|create|stop|watch|tail|clean]" +echo "start -- Launch Redis Cluster instances." +echo "create -- Create a cluster using redis-trib create." +echo "stop -- Stop Redis Cluster instances." +echo "watch -- Show CLUSTER NODES output (first 30 lines) of first node." +echo "tail <id> -- Run tail -f of instance at base port + ID." +echo "clean -- Remove all instances data, logs, configs." 
diff --git a/utils/redis_init_script.tpl b/utils/redis_init_script.tpl index d65086312..2e5b61301 100755 --- a/utils/redis_init_script.tpl +++ b/utils/redis_init_script.tpl @@ -26,11 +26,12 @@ case "$1" in fi ;; status) - if [ ! -f $PIDFILE ] + PID=$(cat $PIDFILE) + if [ ! -x /proc/${PID} ] then echo 'Redis is not running' else - echo "Redis is running ($(<$PIDFILE))" + echo "Redis is running ($PID)" fi ;; restart) diff --git a/utils/whatisdoing.sh b/utils/whatisdoing.sh index 8f441cfc0..e4059caed 100755 --- a/utils/whatisdoing.sh +++ b/utils/whatisdoing.sh @@ -1,9 +1,15 @@ # This script is from http://poormansprofiler.org/ +# +# NOTE: Instead of using this script, you should use the Redis +# Software Watchdog, which provides a similar functionality but in +# a more reliable / easy to use way. +# +# Check http://redis.io/topics/latency for more information. #!/bin/bash nsamples=1 sleeptime=0 -pid=$(pidof redis-server) +pid=$(ps auxww | grep '[r]edis-server' | awk '{print $2}') for x in $(seq 1 $nsamples) do |