-rw-r--r--  .gitignore  1
-rw-r--r--  CONTRIBUTING  4
-rw-r--r--  README.md  55
-rw-r--r--  deps/Makefile  9
-rw-r--r--  deps/geohash-int/Makefile  23
-rw-r--r--  deps/linenoise/linenoise.c  7
-rw-r--r--  redis.conf  146
-rw-r--r--  sentinel.conf  17
-rw-r--r--  src/Makefile  50
-rw-r--r--  src/Makefile.dep  186
-rw-r--r--  src/ae_select.c  1
-rw-r--r--  src/anet.c  2
-rw-r--r--  src/aof.c  179
-rw-r--r--  src/atomicvar.h  2
-rw-r--r--  src/bitops.c  123
-rw-r--r--  src/blocked.c  7
-rw-r--r--  src/childinfo.c  85
-rw-r--r--  src/cluster.c  14
-rw-r--r--  src/config.c  106
-rw-r--r--  src/db.c  302
-rw-r--r--  src/debug.c  142
-rw-r--r--  src/debugmacro.h  41
-rw-r--r--  src/dict.c  250
-rw-r--r--  src/dict.h  21
-rw-r--r--  src/evict.c  524
-rw-r--r--  src/expire.c  502
-rw-r--r--  src/geo.c  50
-rw-r--r--  src/geohash.c (renamed from deps/geohash-int/geohash.c)  10
-rw-r--r--  src/geohash.h (renamed from deps/geohash-int/geohash.h)  2
-rw-r--r--  src/geohash_helper.c (renamed from deps/geohash-int/geohash_helper.c)  102
-rw-r--r--  src/geohash_helper.h (renamed from deps/geohash-int/geohash_helper.h)  1
-rw-r--r--  src/help.h  21
-rw-r--r--  src/hyperloglog.c  41
-rw-r--r--  src/intset.c  2
-rw-r--r--  src/intset.h  2
-rw-r--r--  src/latency.c  2
-rw-r--r--  src/lazyfree.c  5
-rw-r--r--  src/module.c  1300
-rw-r--r--  src/modules/API.md  460
-rw-r--r--  src/modules/BLOCK.md  265
-rw-r--r--  src/modules/INTRO.md  83
-rw-r--r--  src/modules/Makefile  21
-rw-r--r--  src/modules/TYPES.md  379
-rw-r--r--  src/modules/helloblock.c  122
-rw-r--r--  src/modules/hellotype.c  272
-rw-r--r--  src/modules/helloworld.c  52
-rw-r--r--  src/modules/testmodule.c  237
-rw-r--r--  src/networking.c  82
-rw-r--r--  src/object.c  510
-rw-r--r--  src/quicklist.c  9
-rw-r--r--  src/quicklist.h  2
-rw-r--r--  src/rdb.c  446
-rw-r--r--  src/rdb.h  52
-rw-r--r--  src/redis-benchmark.c  20
-rw-r--r--  src/redis-check-rdb.c  920
-rw-r--r--  src/redis-cli.c  36
-rwxr-xr-x  src/redis-trib.rb  8
-rw-r--r--  src/redismodule.h  107
-rw-r--r--  src/replication.c  587
-rw-r--r--  src/rio.h  3
-rw-r--r--  src/sds.c  11
-rw-r--r--  src/sentinel.c  40
-rw-r--r--  src/server.c  694
-rw-r--r--  src/server.h  379
-rw-r--r--  src/t_hash.c  4
-rw-r--r--  src/t_list.c  93
-rw-r--r--  src/t_set.c  17
-rw-r--r--  src/t_string.c  6
-rw-r--r--  src/t_zset.c  37
-rw-r--r--  src/ziplist.c  62
-rw-r--r--  src/ziplist.h  1
-rw-r--r--  src/zmalloc.c  25
-rw-r--r--  src/zmalloc.h  4
-rw-r--r--  tests/assets/default.conf  1
-rw-r--r--  tests/integration/psync2.tcl  182
-rw-r--r--  tests/integration/rdb.tcl  2
-rw-r--r--  tests/integration/replication-3.tcl  12
-rw-r--r--  tests/sentinel/tests/05-manual.tcl  3
-rw-r--r--  tests/sentinel/tests/07-down-conditions.tcl  68
-rw-r--r--  tests/support/server.tcl  2
-rw-r--r--  tests/test_helper.tcl  4
-rw-r--r--  tests/unit/aofrw.tcl  101
-rw-r--r--  tests/unit/bitfield.tcl  14
-rw-r--r--  tests/unit/bitops.tcl  10
-rw-r--r--  tests/unit/expire.tcl  6
-rw-r--r--  tests/unit/geo.tcl  98
-rw-r--r--  tests/unit/introspection-2.tcl  23
-rw-r--r--  tests/unit/other.tcl  2
-rw-r--r--  tests/unit/scripting.tcl  2
-rw-r--r--  tests/unit/type/list-3.tcl  44
-rw-r--r--  tests/unit/type/list.tcl  4
-rw-r--r--  tests/unit/wait.tcl  42
-rw-r--r--  utils/corrupt_rdb.c  44
-rw-r--r--  utils/hyperloglog/hll-gnuplot-graph.rb  2
-rwxr-xr-x  utils/install_server.sh  95
-rw-r--r--  utils/lru/README  10
-rw-r--r--  utils/lru/lfu-simulation.c  158
-rw-r--r--  utils/lru/test-lru.rb  301
-rwxr-xr-x  utils/releasetools/changelog.tcl  6
99 files changed, 8797 insertions, 2822 deletions
diff --git a/.gitignore b/.gitignore
index 3d346fbcf..a188cfc82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ deps/lua/src/liblua.a
.make-*
.prerequisites
*.dSYM
+Makefile.dep
diff --git a/CONTRIBUTING b/CONTRIBUTING
index b33aacb3e..f57de3fd9 100644
--- a/CONTRIBUTING
+++ b/CONTRIBUTING
@@ -12,7 +12,7 @@ each source file that you contribute.
PLEASE DO NOT POST GENERAL QUESTIONS that are not about bugs or suspected
bugs in the Github issues system. We'll be very happy to help you and provide
- all the support Reddit sub:
+ all the support at the Reddit sub:
http://reddit.com/r/redis
@@ -24,7 +24,7 @@ each source file that you contribute.
1. If it is a major feature or a semantical change, please post it as a new submission in r/redis on Reddit at http://reddit.com/r/redis. Try to be passionate about why the feature is needed, make users upvote your proposal to gain traction and so forth. Read the feedback from the community. But in this first step **please don't write code yet**.
-2. If in step 1 you get an acknowledge from the project leaders, use the
+2. If in step 1 you get an acknowledgment from the project leaders, use the
following procedure to submit a patch:
a. Fork Redis on github ( http://help.github.com/fork-a-repo/ )
diff --git a/README.md b/README.md
index c6d46e6e2..70a15790f 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ You can run a 32 bit Redis binary using:
% make 32bit
-After building Redis is a good idea to test it, using:
+After building Redis, it is a good idea to test it using:
% make test
@@ -47,8 +47,8 @@ Fixing build problems with dependencies or cached build options
---------
Redis has some dependencies which are included into the `deps` directory.
-`make` does not rebuild dependencies automatically, even if something in the
-source code of dependencies is changed.
+`make` does not automatically rebuild dependencies even if something in
+the source code of dependencies changes.
When you update the source code with `git pull` or when code inside the
dependencies tree is modified in any other way, make sure to use the following
@@ -109,14 +109,14 @@ To run Redis with the default configuration just type:
% cd src
% ./redis-server
-
+
If you want to provide your redis.conf, you have to run it using an additional
parameter (the path of the configuration file):
% cd src
% ./redis-server /path/to/redis.conf
-It is possible to alter the Redis configuration passing parameters directly
+It is possible to alter the Redis configuration by passing parameters directly
as options using the command line. Examples:
% ./redis-server --port 9999 --slaveof 127.0.0.1 6379
@@ -174,7 +174,7 @@ You'll be able to stop and start Redis using the script named
`/etc/init.d/redis_<portnumber>`, for instance `/etc/init.d/redis_6379`.
Code contributions
----
+-----------------
Note: by contributing code to the Redis project in any form, including sending
a pull request via Github, a code fragment or patch via private email or
@@ -196,8 +196,8 @@ or you just untarred the Redis distribution tar ball. In both the cases
you are basically one step away from the source code, so here we explain
the Redis source code layout, what is in each file as a general idea, the
most important functions and structures inside the Redis server and so forth.
-We keep all the discussion at an high level without digging into the details
-since this document would be huge otherwise, and our code base changes
+We keep all the discussion at a high level without digging into the details
+since this document would be huge otherwise and our code base changes
continuously, but a general idea should be a good starting point to
understand more. Moreover most of the code is heavily commented and easy
to follow.
@@ -206,17 +206,17 @@ Source code layout
---
The Redis root directory just contains this README, the Makefile which
-actually calls the real Makefile inside the `src` directory, an example
-configuration for Redis and Sentinel. Finally you can find a few shell
+calls the real Makefile inside the `src` directory and an example
+configuration for Redis and Sentinel. You can find a few shell
scripts that are used in order to execute the Redis, Redis Cluster and
Redis Sentinel unit tests, which are implemented inside the `tests`
directory.
-Inside the root directory the are the following important directories:
+Inside the root are the following important directories:
* `src`: contains the Redis implementation, written in C.
* `tests`: contains the unit tests, implemented in Tcl.
-* `deps`: contains libraries Redis uses. Everything needed to compile Redis is inside this directory, your system needs to provide just the `libc`, a POSIX compatible interface, and a C compiler. Notably `deps` contains a copy of `jemalloc`, which is the default allocator of Redis under Linux. Note that under `deps` there are also things which started with the Redis project, but for which the main repository is not `antirez/redis`. an exception to this rule is `deps/geohash-int` which is the low level geocoding library used by Redis: it originated from a different project, but at this point it diverged so much that it is developed as a separated entity directly inside the Redis repository.
+* `deps`: contains libraries Redis uses. Everything needed to compile Redis is inside this directory; your system just needs to provide `libc`, a POSIX compatible interface and a C compiler. Notably `deps` contains a copy of `jemalloc`, which is the default allocator of Redis under Linux. Note that under `deps` there are also things which started with the Redis project, but for which the main repository is not `antirez/redis`. An exception to this rule is `deps/geohash-int` which is the low level geocoding library used by Redis: it originated from a different project, but at this point it diverged so much that it is developed as a separated entity directly inside the Redis repository.
There are a few more directories but they are not very important for our goals
here. We'll focus mostly on `src`, where the Redis implementation is contained,
@@ -225,34 +225,34 @@ exposed is the logical one to follow in order to disclose different layers
of complexity incrementally.
Note: lately Redis was refactored quite a bit. Function names and file
-names changed, so you may find that this documentation reflects the
+names have been changed, so you may find that this documentation reflects the
`unstable` branch more closely. For instance in Redis 3.0 the `server.c`
-and `server.h` files were renamed `redis.c` and `redis.h`. However the overall
+and `server.h` files were named `redis.c` and `redis.h`. However the overall
structure is the same. Keep in mind that all the new developments and pull
requests should be performed against the `unstable` branch.
server.h
---
-The simplest way to understand how a program works, is to understand the
+The simplest way to understand how a program works is to understand the
data structures it uses. So we'll start from the main header file of
Redis, which is `server.h`.
All the server configuration and in general all the shared state is
defined in a global structure called `server`, of type `struct redisServer`.
-A few important fields in this structure:
+A few important fields in this structure are:
* `server.db` is an array of Redis databases, where data is stored.
* `server.commands` is the command table.
* `server.clients` is a linked list of clients connected to the server.
* `server.master` is a special client, the master, if the instance is a slave.
-There are tons of other fields, most fields are commented directly inside
+There are tons of other fields. Most fields are commented directly inside
the structure definition.
Another important Redis data structure is the one defining a client.
In the past it was called `redisClient`, now just `client`. The structure
-has many fields, here we'll show just the main ones:
+has many fields, here we'll just show the main ones:
struct client {
int fd;
@@ -270,7 +270,7 @@ The client structure defines a *connected client*:
* The `fd` field is the client socket file descriptor.
* `argc` and `argv` are populated with the command the client is executing, so that functions implementing a given Redis command can read the arguments.
-* `querybuf` accumulates the requests from the client, which are parsed by the Redis server according to the Redis protocol, and executed calling the implementations of the commands the client is executing.
+* `querybuf` accumulates the requests from the client, which are parsed by the Redis server according to the Redis protocol and executed by calling the implementations of the commands the client is executing.
* `reply` and `buf` are dynamic and static buffers that accumulate the replies the server sends to the client. These buffers are incrementally written to the socket as soon as the file descriptor is writable.
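A minimal sketch of the same idea may help readers following along; it uses plain C types in place of the Redis-internal `sds`, `robj` and `list` types, and the real `struct client` in `server.h` has many more members:

    #include <stddef.h>

    /* Illustrative only: a trimmed-down version of the client structure
     * described above, with plain C types standing in for the Redis
     * internal sds, robj and list types. */
    struct client_sketch {
        int fd;               /* Client socket file descriptor. */
        char *querybuf;       /* Accumulated, still-unparsed request bytes. */
        size_t querybuf_used; /* Bytes currently stored in querybuf. */
        int argc;             /* Number of arguments of the parsed command. */
        char **argv;          /* Arguments of the parsed command. */
        char buf[16*1024];    /* Static buffer for small replies. */
        size_t bufpos;        /* Bytes used in buf so far. */
    };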
As you can see in the client structure above, arguments in a command
@@ -288,16 +288,16 @@ structure, which defines a *Redis object*:
Basically this structure can represent all the basic Redis data types like
strings, lists, sets, sorted sets and so forth. The interesting thing is that
it has a `type` field, so that it is possible to know what type a given
-object is, and a `refcount`, so that the same object can be referenced
+object has, and a `refcount`, so that the same object can be referenced
in multiple places without allocating it multiple times. Finally the `ptr`
-field points to the actual representation of the object, that may vary
+field points to the actual representation of the object, which might vary
even for the same type, depending on the `encoding` used.
Redis objects are used extensively in the Redis internals, however in order
to avoid the overhead of indirect accesses, recently in many places
we just use plain dynamic strings not wrapped inside a Redis object.
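To make the description above concrete, a simplified sketch of such an object header could look like the following (illustrative only; the real `robj` in `server.h` also packs an LRU/LFU field and uses exact bit widths for `type` and `encoding`):

    /* Illustrative sketch of the object header described above: a type,
     * an encoding, a reference count and a pointer to the underlying
     * representation. */
    typedef struct object_sketch {
        unsigned type;      /* e.g. string, list, set, sorted set, hash. */
        unsigned encoding;  /* How ptr is actually represented. */
        int refcount;       /* How many places reference this object. */
        void *ptr;          /* Underlying, encoding-specific representation. */
    } object_sketch;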
-sever.c
+server.c
---
This is the entry point of the Redis server, where the `main()` function
@@ -306,7 +306,7 @@ the Redis server.
* `initServerConfig()` setups the default values of the `server` structure.
* `initServer()` allocates the data structures needed to operate, setup the listening socket, and so forth.
-* `aeMain()` enters the event loop listening for new connections.
+* `aeMain()` starts the event loop which listens for new connections.
There are two special functions called periodically by the event loop:
@@ -328,7 +328,7 @@ This file defines all the I/O functions with clients, masters and slaves
* `createClient()` allocates and initializes a new client.
* the `addReply*()` family of functions are used by commands implementations in order to append data to the client structure, that will be transmitted to the client as a reply for a given command executed.
-* `writeToClient()` transmits the data pending in the output buffers to the client, and is called by the *writable event handler* `sendReplyToClient()`.
+* `writeToClient()` transmits the data pending in the output buffers to the client and is called by the *writable event handler* `sendReplyToClient()`.
* `readQueryFromClient()` is the *readable event handler* and accumulates data read from the client into the query buffer.
* `processInputBuffer()` is the entry point in order to parse the client query buffer according to the Redis protocol. Once commands are ready to be processed, it calls `processCommand()` which is defined inside `server.c` in order to actually execute the command.
* `freeClient()` deallocates, disconnects and removes a client.
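As a hedged illustration of how the `addReply*()` family is used from a command implementation, a hypothetical toy command (name and behavior invented for this example, assuming the internal `server.h` declarations and a matching entry in the command table) could be as small as:

    /* Hypothetical toy command, for illustration only: it replies with
     * the number of arguments it was called with. */
    void argcountCommand(client *c) {
        addReplyLongLong(c, c->argc);
    }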
@@ -439,9 +439,8 @@ There are tons of commands implementations inside the Redis source code
that can serve as examples of actual commands implementations. To write
a few toy commands can be a good exercise to familiarize with the code base.
-There are also many other files not described here, but it is useless to
-cover everything, we want just to help you with the first steps,
-eventually you'll find your way inside the Redis code base :-)
+There are also many other files not described here, but it is useless to
+cover everything. We want to just help you with the first steps.
+Eventually you'll find your way inside the Redis code base :-)
Enjoy!
-
diff --git a/deps/Makefile b/deps/Makefile
index 10ae6e790..e148a331c 100644
--- a/deps/Makefile
+++ b/deps/Makefile
@@ -36,7 +36,6 @@ distclean:
-(cd hiredis && $(MAKE) clean) > /dev/null || true
-(cd linenoise && $(MAKE) clean) > /dev/null || true
-(cd lua && $(MAKE) clean) > /dev/null || true
- -(cd geohash-int && $(MAKE) clean) > /dev/null || true
-(cd jemalloc && [ -f Makefile ] && $(MAKE) distclean) > /dev/null || true
-(rm -f .make-*)
@@ -78,13 +77,7 @@ JEMALLOC_LDFLAGS= $(LDFLAGS)
jemalloc: .make-prerequisites
@printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR)
- cd jemalloc && ./configure --with-jemalloc-prefix=je_ --enable-cc-silence CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)"
+ cd jemalloc && ./configure --with-lg-quantum=3 --with-jemalloc-prefix=je_ --enable-cc-silence CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)"
cd jemalloc && $(MAKE) CFLAGS="$(JEMALLOC_CFLAGS)" LDFLAGS="$(JEMALLOC_LDFLAGS)" lib/libjemalloc.a
.PHONY: jemalloc
-
-geohash-int: .make-prerequisites
- @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR)
- cd geohash-int && $(MAKE)
-
-.PHONY: geohash-int
diff --git a/deps/geohash-int/Makefile b/deps/geohash-int/Makefile
deleted file mode 100644
index b7c259577..000000000
--- a/deps/geohash-int/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-STD=
-WARN= -Wall
-OPT= -O2
-
-R_CFLAGS= $(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS)
-R_LDFLAGS= $(LDFLAGS)
-DEBUG= -g
-
-R_CC=$(CC) $(R_CFLAGS)
-R_LD=$(CC) $(R_LDFLAGS)
-
-all: geohash.o geohash_helper.o
-
-.PHONY: all
-
-geohash.o: geohash.h geohash.c
-geohash_helper.o: geohash.h geohash_helper.h geohash_helper.c
-
-.c.o:
- $(R_CC) -c $<
-
-clean:
- rm -f *.o
diff --git a/deps/linenoise/linenoise.c b/deps/linenoise/linenoise.c
index a807d9b8a..fce14a7c5 100644
--- a/deps/linenoise/linenoise.c
+++ b/deps/linenoise/linenoise.c
@@ -111,6 +111,7 @@
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
+#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <unistd.h>
@@ -1160,10 +1161,14 @@ int linenoiseHistorySetMaxLen(int len) {
/* Save the history in the specified file. On success 0 is returned
* otherwise -1 is returned. */
int linenoiseHistorySave(const char *filename) {
- FILE *fp = fopen(filename,"w");
+ mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO);
+ FILE *fp;
int j;
+ fp = fopen(filename,"w");
+ umask(old_umask);
if (fp == NULL) return -1;
+ chmod(filename,S_IRUSR|S_IWUSR);
for (j = 0; j < history_len; j++)
fprintf(fp,"%s\n",history[j]);
fclose(fp);
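The hunk above exists so the history file is never created with permissions looser than owner read/write; the same pattern, in a standalone sketch with a hypothetical helper name, looks like this:

    #include <stdio.h>
    #include <sys/stat.h>

    /* Sketch of the pattern used above: create (or truncate) a file that
     * is readable and writable only by its owner, even if the file
     * already existed with looser permissions. */
    int save_private_file(const char *filename, const char *text) {
        mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO); /* No group/other bits at creation. */
        FILE *fp = fopen(filename, "w");
        umask(old_umask);                                  /* Restore the previous umask. */
        if (fp == NULL) return -1;
        chmod(filename, S_IRUSR|S_IWUSR);                  /* Tighten a pre-existing file too. */
        fprintf(fp, "%s\n", text);
        fclose(fp);
        return 0;
    }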
diff --git a/redis.conf b/redis.conf
index a39a643ab..14b5ca979 100644
--- a/redis.conf
+++ b/redis.conf
@@ -125,8 +125,9 @@ timeout 0
# Note that to close the connection the double of the time is needed.
# On other kernels the period depends on the kernel configuration.
#
-# A reasonable value for this option is 60 seconds.
-tcp-keepalive 0
+# A reasonable value for this option is 300 seconds, which is the new
+# Redis default starting with Redis 3.2.1.
+tcp-keepalive 300
################################# GENERAL #####################################
@@ -154,7 +155,7 @@ supervised no
#
# Creating a pid file is best effort: if Redis is not able to create it
# nothing bad happens, the server will start and run normally.
-pidfile /var/run/redis.pid
+pidfile /var/run/redis_6379.pid
# Specify the server verbosity level.
# This can be one of:
@@ -184,6 +185,14 @@ logfile ""
# dbid is a number between 0 and 'databases'-1
databases 16
+# By default Redis shows an ASCII art logo only when started to log to the
+# standard output and if the standard output is a TTY. Basically this means
+# that normally a logo is displayed only in interactive sessions.
+#
+# However it is possible to force the pre-4.0 behavior and always show an
+# ASCII art logo in startup logs by setting the following option to yes.
+always-show-logo yes
+
################################ SNAPSHOTTING ################################
#
# Save the DB on disk:
@@ -401,6 +410,10 @@ repl-disable-tcp-nodelay no
# need to elapse, starting from the time the last slave disconnected, for
# the backlog buffer to be freed.
#
+# Note that slaves never free the backlog for timeout, since they may be
+# promoted to masters later, and should be able to correctly "partially
+# resynchronize" with the slaves: hence they should always accumulate backlog.
+#
# A value of 0 means to never release the backlog.
#
# repl-backlog-ttl 3600
@@ -442,6 +455,35 @@ slave-priority 100
# By default min-slaves-to-write is set to 0 (feature disabled) and
# min-slaves-max-lag is set to 10.
+# A Redis master is able to list the address and port of the attached
+# slaves in different ways. For example the "INFO replication" section
+# offers this information, which is used, among other tools, by
+# Redis Sentinel in order to discover slave instances.
+# Another place where this info is available is in the output of the
+# "ROLE" command of a master.
+#
+# The listed IP address and port normally reported by a slave are obtained
+# in the following way:
+#
+# IP: The address is auto detected by checking the peer address
+# of the socket used by the slave to connect with the master.
+#
+# Port: The port is communicated by the slave during the replication
+# handshake, and is normally the port that the slave is using to
+# listen for connections.
+#
+# However when port forwarding or Network Address Translation (NAT) is
+# used, the slave may be actually reachable via different IP and port
+# pairs. The following two options can be used by a slave in order to
+# report to its master a specific set of IP and port, so that both INFO
+# and ROLE will report those values.
+#
+# There is no need to use both the options if you need to override just
+# the port or the IP address.
+#
+# slave-announce-ip 5.5.5.5
+# slave-announce-port 1234
+
################################## SECURITY ###################################
# Require clients to issue AUTH <PASSWORD> before processing any other
@@ -491,7 +533,7 @@ slave-priority 100
############################## MEMORY MANAGEMENT ################################
-# Don't use more memory than the specified amount of bytes.
+# Set a memory usage limit to the specified amount of bytes.
# When the memory limit is reached Redis will try to remove keys
# according to the eviction policy selected (see maxmemory-policy).
#
@@ -500,8 +542,8 @@ slave-priority 100
# that would use more memory, like SET, LPUSH, and so on, and will continue
# to reply to read-only commands like GET.
#
-# This option is usually useful when using Redis as an LRU cache, or to set
-# a hard memory limit for an instance (using the 'noeviction' policy).
+# This option is usually useful when using Redis as an LRU or LFU cache, or to
+# set a hard memory limit for an instance (using the 'noeviction' policy).
#
# WARNING: If you have slaves attached to an instance with maxmemory on,
# the size of the output buffers needed to feed the slaves are subtracted
@@ -519,12 +561,20 @@ slave-priority 100
# MAXMEMORY POLICY: how Redis will select what to remove when maxmemory
# is reached. You can select among five behaviors:
#
-# volatile-lru -> remove the key with an expire set using an LRU algorithm
-# allkeys-lru -> remove any key according to the LRU algorithm
-# volatile-random -> remove a random key with an expire set
-# allkeys-random -> remove a random key, any key
-# volatile-ttl -> remove the key with the nearest expire time (minor TTL)
-# noeviction -> don't expire at all, just return an error on write operations
+# volatile-lru -> Evict using approximated LRU among the keys with an expire set.
+# allkeys-lru -> Evict any key using approximated LRU.
+# volatile-lfu -> Evict using approximated LFU among the keys with an expire set.
+# allkeys-lfu -> Evict any key using approximated LFU.
+# volatile-random -> Remove a random key among the ones with an expire set.
+# allkeys-random -> Remove a random key, any key.
+# volatile-ttl -> Remove the key with the nearest expire time (minor TTL)
+# noeviction -> Don't evict anything, just return an error on write operations.
+#
+# LRU means Least Recently Used
+# LFU means Least Frequently Used
+#
+# Both LRU, LFU and volatile-ttl are implemented using approximated
+# randomized algorithms.
#
# Note: with any of the above policies, Redis will return an error on write
# operations, when there are no suitable keys for eviction.
@@ -539,14 +589,14 @@ slave-priority 100
#
# maxmemory-policy noeviction
-# LRU and minimal TTL algorithms are not precise algorithms but approximated
+# LRU, LFU and minimal TTL algorithms are not precise algorithms but approximated
# algorithms (in order to save memory), so you can tune it for speed or
# accuracy. For default Redis will check five keys and pick the one that was
# used less recently, you can change the sample size using the following
# configuration directive.
#
# The default of 5 produces good enough results. 10 Approximates very closely
-# true LRU but costs a bit more CPU. 3 is very fast but not very accurate.
+# true LRU but costs more CPU. 3 is faster but not very accurate.
#
# maxmemory-samples 5
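To make the sampling idea concrete, here is a small self-contained sketch (not Redis code; names and data are hypothetical) of evicting the best candidate out of a few randomly sampled keys:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Approximated LRU as described above: instead of scanning every
     * key, sample a few entries at random and evict the one that has
     * been idle the longest. */
    struct entry { const char *key; long last_access; };

    size_t lru_sample_victim(const struct entry *tab, size_t len, int samples) {
        size_t victim = rand() % len;
        for (int i = 1; i < samples; i++) {
            size_t j = rand() % len;
            if (tab[j].last_access < tab[victim].last_access) victim = j;
        }
        return victim;
    }

    int main(void) {
        struct entry tab[] = {
            {"a", 100}, {"b", 42}, {"c", 77}, {"d", 90},
            {"e", 55}, {"f", 61}, {"g", 12}, {"h", 99},
        };
        srand((unsigned)time(NULL));
        size_t v = lru_sample_victim(tab, sizeof(tab)/sizeof(tab[0]), 5);
        printf("evict %s (last access %ld)\n", tab[v].key, tab[v].last_access);
        return 0;
    }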
@@ -717,6 +767,20 @@ auto-aof-rewrite-min-size 64mb
# will be found.
aof-load-truncated yes
+# When rewriting the AOF file, Redis is able to use an RDB preamble in the
+# AOF file for faster rewrites and recoveries. When this option is turned
+# on the rewritten AOF file is composed of two different stanzas:
+#
+# [RDB file][AOF tail]
+#
+# When loading Redis recognizes that the AOF file starts with the "REDIS"
+# string and loads the prefixed RDB file, and continues loading the AOF
+# tail.
+#
+# This is currently turned off by default in order to avoid the surprise
+# of a format change, but will at some point be used as the default.
+aof-use-rdb-preamble no
+
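The loading-time detection described above boils down to checking whether the file begins with the same "REDIS" magic used by RDB files (the aof.c hunk later in this diff does exactly that); a standalone sketch:

    #include <stdio.h>
    #include <string.h>

    /* Sketch of the check described above: an AOF rewritten with the RDB
     * preamble starts with the literal bytes "REDIS", exactly like an
     * RDB file. Returns 1 if the preamble is present, 0 if not, -1 on
     * I/O error. */
    int aof_has_rdb_preamble(const char *filename) {
        char sig[5];
        FILE *fp = fopen(filename, "rb");
        if (fp == NULL) return -1;
        size_t n = fread(sig, 1, sizeof(sig), fp);
        fclose(fp);
        return (n == sizeof(sig) && memcmp(sig, "REDIS", 5) == 0) ? 1 : 0;
    }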
################################ LUA SCRIPTING ###############################
# Max execution time of a Lua script in milliseconds.
@@ -766,7 +830,7 @@ lua-time-limit 5000
# A slave of a failing master will avoid to start a failover if its data
# looks too old.
#
-# There is no simple way for a slave to actually have a exact measure of
+# There is no simple way for a slave to actually have an exact measure of
# its "data age", so the following two checks are performed:
#
# 1) If there are multiple slaves able to failover, they exchange messages
@@ -1112,3 +1176,55 @@ hz 10
# in order to commit the file to the disk more incrementally and avoid
# big latency spikes.
aof-rewrite-incremental-fsync yes
+
+# Redis LFU eviction (see maxmemory setting) can be tuned. However it is a good
+# idea to start with the default settings and only change them after investigating
+# how to improve the performance and how the keys' LFU values change over time, which
+# is possible to inspect via the OBJECT FREQ command.
+#
+# There are two tunable parameters in the Redis LFU implementation: the
+# counter logarithm factor and the counter decay time. It is important to
+# understand what the two parameters mean before changing them.
+#
+# The LFU counter is just 8 bits per key, its maximum value is 255, so Redis
+# uses a probabilistic increment with logarithmic behavior. Given the value
+# of the old counter, when a key is accessed, the counter is incremented in
+# this way:
+#
+# 1. A random number R between 0 and 1 is extracted.
+# 2. A probability P is calculated as 1/(old_value*lfu_log_factor+1).
+# 3. The counter is incremented only if R < P.
+#
+# The default lfu-log-factor is 10. This is a table of how the frequency
+# counter changes with a different number of accesses with different
+# logarithmic factors:
+#
+# +--------+------------+------------+------------+------------+------------+
+# | factor | 100 hits | 1000 hits | 100K hits | 1M hits | 10M hits |
+# +--------+------------+------------+------------+------------+------------+
+# | 0 | 104 | 255 | 255 | 255 | 255 |
+# +--------+------------+------------+------------+------------+------------+
+# | 1 | 18 | 49 | 255 | 255 | 255 |
+# +--------+------------+------------+------------+------------+------------+
+# | 10 | 10 | 18 | 142 | 255 | 255 |
+# +--------+------------+------------+------------+------------+------------+
+# | 100 | 8 | 11 | 49 | 143 | 255 |
+# +--------+------------+------------+------------+------------+------------+
+#
+# NOTE: The above table was obtained by running the following commands:
+#
+# redis-benchmark -n 1000000 incr foo
+# redis-cli object freq foo
+#
+# NOTE 2: The counter initial value is 5 in order to give new objects a chance
+# to accumulate hits.
+#
+# The counter decay time is the time, in minutes, that must elapse in order
+# for the key counter to be divided by two (or decremented if it has a value
+# less than or equal to 10).
+#
+# The default value for the lfu-decay-time is 1. A special value of 0 means to
+# decay the counter every time it happens to be scanned.
+#
+# lfu-log-factor 10
+# lfu-decay-time 1
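The two tunables are easier to reason about with the rules above turned into code; the following self-contained sketch mirrors only the math described in the comments and is not the Redis implementation (which lives in the eviction code):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Probabilistic logarithmic increment: R < 1/(old_value*lfu_log_factor+1). */
    uint8_t lfu_log_incr(uint8_t counter, int lfu_log_factor) {
        if (counter == 255) return 255;                   /* 8-bit counter saturates. */
        double r = (double)rand() / RAND_MAX;             /* R between 0 and 1. */
        double p = 1.0 / (counter * lfu_log_factor + 1.0);
        if (r < p) counter++;
        return counter;
    }

    /* Decay: halve the counter once per elapsed decay period, or just
     * decrement it while its value is <= 10. */
    uint8_t lfu_decay(uint8_t counter, long periods_elapsed) {
        while (periods_elapsed-- > 0 && counter)
            counter = (counter > 10) ? counter / 2 : counter - 1;
        return counter;
    }

    int main(void) {
        uint8_t c = 5;                                    /* New keys start at 5. */
        for (long hits = 0; hits < 100000; hits++) c = lfu_log_incr(c, 10);
        printf("counter after 100K hits with factor 10: %u\n", c);
        return 0;
    }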
diff --git a/sentinel.conf b/sentinel.conf
index 39d1044e2..0e1b266ed 100644
--- a/sentinel.conf
+++ b/sentinel.conf
@@ -1,5 +1,21 @@
# Example sentinel.conf
+# *** IMPORTANT ***
+#
+# By default Sentinel will not be reachable from interfaces different than
+# localhost, either use the 'bind' directive to bind to a list of network
+# interfaces, or disable protected mode with "protected-mode no" by
+# adding it to this configuration file.
+#
+# Before doing that MAKE SURE the instance is protected from the outside
+# world via firewalling or other means.
+#
+# For example you may use one of the following:
+#
+# bind 127.0.0.1 192.168.1.1
+#
+# protected-mode no
+
# port <sentinel-port>
# The port that this sentinel instance will run on
port 26379
@@ -178,4 +194,3 @@ sentinel failover-timeout mymaster 180000
#
# sentinel client-reconfig-script mymaster /var/redis/reconfig.sh
-
diff --git a/src/Makefile b/src/Makefile
index c390d3f2e..2bf3c9347 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -15,11 +15,12 @@
release_hdr := $(shell sh -c './mkreleasehdr.sh')
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
OPTIMIZATION?=-O2
-DEPENDENCY_TARGETS=hiredis linenoise lua geohash-int
+DEPENDENCY_TARGETS=hiredis linenoise lua
+NODEPS:=clean distclean
# Default settings
STD=-std=c99 -pedantic -DREDIS_STATIC=''
-WARN=-Wall -W
+WARN=-Wall -W -Wno-missing-field-initializers
OPT=$(OPTIMIZATION)
PREFIX?=/usr/local
@@ -53,7 +54,7 @@ endif
# Override default settings if possible
-include .make-settings
-FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(REDIS_CFLAGS) -I../deps/geohash-int
+FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(REDIS_CFLAGS)
FINAL_LDFLAGS=$(LDFLAGS) $(REDIS_LDFLAGS) $(DEBUG)
FINAL_LIBS=-lm
DEBUG=-g -ggdb
@@ -65,17 +66,27 @@ ifeq ($(uname_S),SunOS)
FINAL_LIBS+= -ldl -lnsl -lsocket -lresolv -lpthread -lrt
else
ifeq ($(uname_S),Darwin)
- # Darwin (nothing to do)
+ # Darwin
+ FINAL_LIBS+= -ldl
else
ifeq ($(uname_S),AIX)
# AIX
FINAL_LDFLAGS+= -Wl,-bexpall
- FINAL_LIBS+= -pthread -lcrypt -lbsd
-
+ FINAL_LIBS+=-ldl -pthread -lcrypt -lbsd
+else
+ifeq ($(uname_S),OpenBSD)
+ # OpenBSD
+ FINAL_LIBS+= -lpthread
+else
+ifeq ($(uname_S),FreeBSD)
+ # FreeBSD
+ FINAL_LIBS+= -lpthread
else
# All the other OSes (notably Linux)
FINAL_LDFLAGS+= -rdynamic
- FINAL_LIBS+= -pthread
+ FINAL_LIBS+=-ldl -pthread
+endif
+endif
endif
endif
endif
@@ -95,7 +106,7 @@ endif
ifeq ($(MALLOC),jemalloc)
DEPENDENCY_TARGETS+= jemalloc
FINAL_CFLAGS+= -DUSE_JEMALLOC -I../deps/jemalloc/include
- FINAL_LIBS+= ../deps/jemalloc/lib/libjemalloc.a -ldl
+ FINAL_LIBS+= ../deps/jemalloc/lib/libjemalloc.a
endif
REDIS_CC=$(QUIET_CC)$(CC) $(FINAL_CFLAGS)
@@ -117,8 +128,7 @@ endif
REDIS_SERVER_NAME=redis-server
REDIS_SENTINEL_NAME=redis-sentinel
-REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o geo.o lazyfree.o module.o
-REDIS_GEOHASH_OBJ=../deps/geohash-int/geohash.o ../deps/geohash-int/geohash_helper.o
+REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o
REDIS_CLI_NAME=redis-cli
REDIS_CLI_OBJ=anet.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o
REDIS_BENCHMARK_NAME=redis-benchmark
@@ -132,15 +142,14 @@ all: $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCH
@echo "Hint: It's a good idea to run 'make test' ;)"
@echo ""
-.PHONY: all
-
-# Deps (use make dep to generate this)
-include Makefile.dep
+Makefile.dep:
+ -$(REDIS_CC) -MM *.c > Makefile.dep 2> /dev/null || true
-dep:
- $(REDIS_CC) -MM *.c > Makefile.dep
+ifeq (0, $(words $(findstring $(MAKECMDGOALS), $(NODEPS))))
+-include Makefile.dep
+endif
-.PHONY: dep
+.PHONY: all
persist-settings: distclean
echo STD=$(STD) >> .make-settings
@@ -172,7 +181,7 @@ endif
# redis-server
$(REDIS_SERVER_NAME): $(REDIS_SERVER_OBJ)
- $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a $(REDIS_GEOHASH_OBJ) $(FINAL_LIBS)
+ $(REDIS_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a $(FINAL_LIBS)
# redis-sentinel
$(REDIS_SENTINEL_NAME): $(REDIS_SERVER_NAME)
@@ -194,6 +203,9 @@ $(REDIS_BENCHMARK_NAME): $(REDIS_BENCHMARK_OBJ)
$(REDIS_CHECK_AOF_NAME): $(REDIS_CHECK_AOF_OBJ)
$(REDIS_LD) -o $@ $^ $(FINAL_LIBS)
+dict-benchmark: dict.c zmalloc.c sds.c
+ $(REDIS_CC) $(FINAL_CFLAGS) dict.c zmalloc.c sds.c -D DICT_BENCHMARK_MAIN -o dict-benchmark
+
# Because the jemalloc.h header is generated as a part of the jemalloc build,
# building it should complete before building any other object. Instead of
# depending on a single artifact, build all dependencies first.
@@ -201,7 +213,7 @@ $(REDIS_CHECK_AOF_NAME): $(REDIS_CHECK_AOF_OBJ)
$(REDIS_CC) -c $<
clean:
- rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html
+ rm -rf $(REDIS_SERVER_NAME) $(REDIS_SENTINEL_NAME) $(REDIS_CLI_NAME) $(REDIS_BENCHMARK_NAME) $(REDIS_CHECK_RDB_NAME) $(REDIS_CHECK_AOF_NAME) *.o *.gcda *.gcno *.gcov redis.info lcov-html Makefile.dep dict-benchmark
.PHONY: clean
diff --git a/src/Makefile.dep b/src/Makefile.dep
deleted file mode 100644
index 90784a2e7..000000000
--- a/src/Makefile.dep
+++ /dev/null
@@ -1,186 +0,0 @@
-adlist.o: adlist.c adlist.h zmalloc.h
-ae.o: ae.c ae.h zmalloc.h config.h ae_kqueue.c ae_epoll.c ae_select.c ae_evport.c
-ae_epoll.o: ae_epoll.c
-ae_evport.o: ae_evport.c
-ae_kqueue.o: ae_kqueue.c
-ae_select.o: ae_select.c
-anet.o: anet.c fmacros.h anet.h
-aof.o: aof.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- bio.h
-bio.o: bio.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- bio.h
-bitops.o: bitops.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-blocked.o: blocked.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-cluster.o: cluster.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- cluster.h
-config.o: config.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- cluster.h
-crc16.o: crc16.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-crc64.o: crc64.c
-db.o: db.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- cluster.h atomicvar.h
-debug.o: debug.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- bio.h
-dict.o: dict.c fmacros.h dict.h zmalloc.h redisassert.h
-endianconv.o: endianconv.c
-geo.o: geo.c geo.h server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- ../deps/geohash-int/geohash_helper.h ../deps/geohash-int/geohash.h
-hyperloglog.o: hyperloglog.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-intset.o: intset.c intset.h zmalloc.h endianconv.h config.h
-latency.o: latency.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-lazyfree.o: lazyfree.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- bio.h atomicvar.h cluster.h
-lzf_c.o: lzf_c.c lzfP.h
-lzf_d.o: lzf_d.c lzfP.h
-memtest.o: memtest.c config.h
-multi.o: multi.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-networking.o: networking.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-notify.o: notify.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-object.o: object.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-pqsort.o: pqsort.c
-pubsub.o: pubsub.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-quicklist.o: quicklist.c quicklist.h zmalloc.h ziplist.h util.h sds.h \
- lzf.h
-rand.o: rand.c
-rdb.o: rdb.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- lzf.h
-redis-benchmark.o: redis-benchmark.c fmacros.h ../deps/hiredis/sds.h ae.h \
- ../deps/hiredis/hiredis.h adlist.h zmalloc.h
-redis-check-aof.o: redis-check-aof.c fmacros.h config.h
-redis-check-rdb.o: redis-check-rdb.c server.h fmacros.h config.h \
- solarisfixes.h ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h \
- sds.h dict.h adlist.h zmalloc.h anet.h ziplist.h intset.h version.h \
- util.h latency.h sparkline.h quicklist.h zipmap.h sha1.h endianconv.h \
- crc64.h rdb.h rio.h lzf.h
-redis-cli.o: redis-cli.c fmacros.h version.h ../deps/hiredis/hiredis.h \
- ../deps/hiredis/sds.h zmalloc.h ../deps/linenoise/linenoise.h help.h \
- anet.h ae.h
-release.o: release.c release.h version.h crc64.h
-replication.o: replication.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-rio.o: rio.c fmacros.h rio.h sds.h util.h crc64.h config.h server.h \
- solarisfixes.h ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h \
- dict.h adlist.h zmalloc.h anet.h ziplist.h intset.h version.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h rdb.h
-scripting.o: scripting.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- rand.h cluster.h ../deps/lua/src/lauxlib.h ../deps/lua/src/lua.h \
- ../deps/lua/src/lualib.h
-sds.o: sds.c sds.h sdsalloc.h zmalloc.h
-sentinel.o: sentinel.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- ../deps/hiredis/hiredis.h ../deps/hiredis/async.h \
- ../deps/hiredis/hiredis.h
-server.o: server.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- cluster.h slowlog.h bio.h asciilogo.h
-setproctitle.o: setproctitle.c
-sha1.o: sha1.c solarisfixes.h sha1.h config.h
-slowlog.o: slowlog.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- slowlog.h
-sort.o: sort.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h \
- pqsort.h
-sparkline.o: sparkline.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-syncio.o: syncio.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-t_hash.o: t_hash.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-t_list.o: t_list.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-t_set.o: t_set.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-t_string.o: t_string.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-t_zset.o: t_zset.c server.h fmacros.h config.h solarisfixes.h \
- ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \
- adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h latency.h \
- sparkline.h quicklist.h zipmap.h sha1.h endianconv.h crc64.h rdb.h rio.h
-util.o: util.c fmacros.h util.h sds.h sha1.h
-ziplist.o: ziplist.c zmalloc.h util.h sds.h ziplist.h endianconv.h \
- config.h redisassert.h
-zipmap.o: zipmap.c zmalloc.h endianconv.h config.h
-zmalloc.o: zmalloc.c config.h zmalloc.h atomicvar.h
diff --git a/src/ae_select.c b/src/ae_select.c
index e2b7a9e8a..c039a8ea3 100644
--- a/src/ae_select.c
+++ b/src/ae_select.c
@@ -29,6 +29,7 @@
*/
+#include <sys/select.h>
#include <string.h>
typedef struct aeApiState {
diff --git a/src/anet.c b/src/anet.c
index 1728f3eb9..ef1711d06 100644
--- a/src/anet.c
+++ b/src/anet.c
@@ -486,7 +486,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backl
goto end;
}
if (p == NULL) {
- anetSetError(err, "unable to bind socket");
+ anetSetError(err, "unable to bind socket, errno: %d", errno);
goto error;
}
diff --git a/src/aof.c b/src/aof.c
index 9df1e9b9e..07d8561da 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -251,7 +251,10 @@ int startAppendOnly(void) {
strerror(errno));
return C_ERR;
}
- if (rewriteAppendOnlyFileBackground() == C_ERR) {
+ if (server.rdb_child_pid != -1) {
+ server.aof_rewrite_scheduled = 1;
+ serverLog(LL_WARNING,"AOF was enabled but there is already a child process saving an RDB file on disk. An AOF background was scheduled to start when possible.");
+ } else if (rewriteAppendOnlyFileBackground() == C_ERR) {
close(server.aof_fd);
serverLog(LL_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
return C_ERR;
@@ -613,19 +616,23 @@ int loadAppendOnlyFile(char *filename) {
struct redis_stat sb;
int old_aof_state = server.aof_state;
long loops = 0;
- off_t valid_up_to = 0; /* Offset of the latest well-formed command loaded. */
+ off_t valid_up_to = 0; /* Offset of latest well-formed command loaded. */
+
+ if (fp == NULL) {
+ serverLog(LL_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
+ exit(1);
+ }
+ /* Handle a zero-length AOF file as a special case. An empty AOF file
+ * is a valid AOF because an empty server with AOF enabled will create
+ * a zero length file at startup, that will remain like that if no write
+ * operation is received. */
if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
server.aof_current_size = 0;
fclose(fp);
return C_ERR;
}
- if (fp == NULL) {
- serverLog(LL_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
- exit(1);
- }
-
/* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
* to the same file we're about to read. */
server.aof_state = AOF_OFF;
@@ -633,6 +640,28 @@ int loadAppendOnlyFile(char *filename) {
fakeClient = createFakeClient();
startLoading(fp);
+ /* Check if this AOF file has an RDB preamble. In that case we need to
+ * load the RDB file and later continue loading the AOF tail. */
+ char sig[5]; /* "REDIS" */
+ if (fread(sig,1,5,fp) != 5 || memcmp(sig,"REDIS",5) != 0) {
+ /* No RDB preamble, seek back at 0 offset. */
+ if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
+ } else {
+ /* RDB preamble. Load it using the RDB loading functions. */
+ rio rdb;
+
+ serverLog(LL_NOTICE,"Reading RDB preamble from AOF file...");
+ if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
+ rioInitWithFile(&rdb,fp);
+ if (rdbLoadRio(&rdb,NULL) != C_OK) {
+ serverLog(LL_WARNING,"Error reading the RDB preamble of the AOF file, AOF loading aborted");
+ goto readerr;
+ } else {
+ serverLog(LL_NOTICE,"Reading the remaining AOF tail...");
+ }
+ }
+
+ /* Read the actual AOF file, in REPL format, command by command. */
while(1) {
int argc, j;
unsigned long len;
@@ -693,6 +722,7 @@ int loadAppendOnlyFile(char *filename) {
}
/* Run the command in the context of a fake client */
+ fakeClient->cmd = cmd;
cmd->proc(fakeClient);
/* The fake client should not have a reply */
@@ -703,6 +733,7 @@ int loadAppendOnlyFile(char *filename) {
/* Clean up. Command code may have changed argv/argc so we use the
* argv/argc of the client instead of the local variables. */
freeFakeClientArgv(fakeClient);
+ fakeClient->cmd = NULL;
if (server.aof_load_truncated) valid_up_to = ftello(fp);
}
@@ -983,6 +1014,22 @@ int rewriteHashObject(rio *r, robj *key, robj *o) {
return 1;
}
+/* Call the module type callback in order to rewrite a data type
+ * that is exported by a module and is not handled by Redis itself.
+ * The function returns 0 on error, 1 on success. */
+int rewriteModuleObject(rio *r, robj *key, robj *o) {
+ RedisModuleIO io;
+ moduleValue *mv = o->ptr;
+ moduleType *mt = mv->type;
+ moduleInitIOContext(io,mt,r);
+ mt->aof_rewrite(&io,key,mv->value);
+ if (io.ctx) {
+ moduleFreeContext(io.ctx);
+ zfree(io.ctx);
+ }
+ return io.error ? 0 : 1;
+}
+
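On the module side, the `aof_rewrite` callback invoked here is expected to emit commands able to rebuild the value; a sketch modeled on the `hellotype.c` example added by this commit (it assumes `redismodule.h` and the structures defined in that example, which are not a general requirement):

    /* Module-side counterpart, modeled on src/modules/hellotype.c: emit
     * one HELLOTYPE.INSERT per list node so that replaying the AOF
     * rebuilds the value. */
    void HelloTypeAofRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value) {
        struct HelloTypeObject *hto = value;
        struct HelloTypeNode *node = hto->head;
        while (node) {
            RedisModule_EmitAOF(aof, "HELLOTYPE.INSERT", "sl", key, node->value);
            node = node->next;
        }
    }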
/* This function is called by the child rewriting the AOF file to read
* the difference accumulated from the parent into a buffer, that is
* concatenated at the end of the rewrite. */
@@ -998,51 +1045,23 @@ ssize_t aofReadDiffFromParent(void) {
return total;
}
-/* Write a sequence of commands able to fully rebuild the dataset into
- * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
- *
- * In order to minimize the number of commands needed in the rewritten
- * log Redis uses variadic commands when possible, such as RPUSH, SADD
- * and ZADD. However at max AOF_REWRITE_ITEMS_PER_CMD items per time
- * are inserted using a single command. */
-int rewriteAppendOnlyFile(char *filename) {
+int rewriteAppendOnlyFileRio(rio *aof) {
dictIterator *di = NULL;
dictEntry *de;
- rio aof;
- FILE *fp;
- char tmpfile[256];
- int j;
- long long now = mstime();
- char byte;
size_t processed = 0;
+ long long now = mstime();
+ int j;
- /* Note that we have to use a different temp name here compared to the
- * one used by rewriteAppendOnlyFileBackground() function. */
- snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
- fp = fopen(tmpfile,"w");
- if (!fp) {
- serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
- return C_ERR;
- }
-
- server.aof_child_diff = sdsempty();
- rioInitWithFile(&aof,fp);
- if (server.aof_rewrite_incremental_fsync)
- rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
for (j = 0; j < server.dbnum; j++) {
char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
redisDb *db = server.db+j;
dict *d = db->dict;
if (dictSize(d) == 0) continue;
di = dictGetSafeIterator(d);
- if (!di) {
- fclose(fp);
- return C_ERR;
- }
/* SELECT the new DB */
- if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
- if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;
+ if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
+ if (rioWriteBulkLongLong(aof,j) == 0) goto werr;
/* Iterate this DB writing every entry */
while((de = dictNext(di)) != NULL) {
@@ -1063,37 +1082,83 @@ int rewriteAppendOnlyFile(char *filename) {
if (o->type == OBJ_STRING) {
/* Emit a SET command */
char cmd[]="*3\r\n$3\r\nSET\r\n";
- if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
+ if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
/* Key and value */
- if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
- if (rioWriteBulkObject(&aof,o) == 0) goto werr;
+ if (rioWriteBulkObject(aof,&key) == 0) goto werr;
+ if (rioWriteBulkObject(aof,o) == 0) goto werr;
} else if (o->type == OBJ_LIST) {
- if (rewriteListObject(&aof,&key,o) == 0) goto werr;
+ if (rewriteListObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_SET) {
- if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
+ if (rewriteSetObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_ZSET) {
- if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
+ if (rewriteSortedSetObject(aof,&key,o) == 0) goto werr;
} else if (o->type == OBJ_HASH) {
- if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
+ if (rewriteHashObject(aof,&key,o) == 0) goto werr;
+ } else if (o->type == OBJ_MODULE) {
+ if (rewriteModuleObject(aof,&key,o) == 0) goto werr;
} else {
serverPanic("Unknown object type");
}
/* Save the expire time */
if (expiretime != -1) {
char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
- if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
- if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
- if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
+ if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
+ if (rioWriteBulkObject(aof,&key) == 0) goto werr;
+ if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr;
}
/* Read some diff from the parent process from time to time. */
- if (aof.processed_bytes > processed+1024*10) {
- processed = aof.processed_bytes;
+ if (aof->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) {
+ processed = aof->processed_bytes;
aofReadDiffFromParent();
}
}
dictReleaseIterator(di);
di = NULL;
}
+ return C_OK;
+
+werr:
+ if (di) dictReleaseIterator(di);
+ return C_ERR;
+}
+
+/* Write a sequence of commands able to fully rebuild the dataset into
+ * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
+ *
+ * In order to minimize the number of commands needed in the rewritten
+ * log Redis uses variadic commands when possible, such as RPUSH, SADD
+ * and ZADD. However at max AOF_REWRITE_ITEMS_PER_CMD items per time
+ * are inserted using a single command. */
+int rewriteAppendOnlyFile(char *filename) {
+ rio aof;
+ FILE *fp;
+ char tmpfile[256];
+ char byte;
+
+ /* Note that we have to use a different temp name here compared to the
+ * one used by rewriteAppendOnlyFileBackground() function. */
+ snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
+ fp = fopen(tmpfile,"w");
+ if (!fp) {
+ serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
+ return C_ERR;
+ }
+
+ server.aof_child_diff = sdsempty();
+ rioInitWithFile(&aof,fp);
+
+ if (server.aof_rewrite_incremental_fsync)
+ rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
+
+ if (server.aof_use_rdb_preamble) {
+ int error;
+ if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
+ errno = error;
+ goto werr;
+ }
+ } else {
+ if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
+ }
/* Do an initial slow fsync here while the parent is still sending
* data, in order to make the next final fsync faster. */
@@ -1159,7 +1224,6 @@ werr:
serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
fclose(fp);
unlink(tmpfile);
- if (di) dictReleaseIterator(di);
return C_ERR;
}
@@ -1257,8 +1321,9 @@ int rewriteAppendOnlyFileBackground(void) {
pid_t childpid;
long long start;
- if (server.aof_child_pid != -1) return C_ERR;
+ if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
if (aofCreatePipes() != C_OK) return C_ERR;
+ openChildInfoPipe();
start = ustime();
if ((childpid = fork()) == 0) {
char tmpfile[256];
@@ -1268,13 +1333,16 @@ int rewriteAppendOnlyFileBackground(void) {
redisSetProcTitle("redis-aof-rewrite");
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
- size_t private_dirty = zmalloc_get_private_dirty();
+ size_t private_dirty = zmalloc_get_private_dirty(-1);
if (private_dirty) {
serverLog(LL_NOTICE,
"AOF rewrite: %zu MB of memory used by copy-on-write",
private_dirty/(1024*1024));
}
+
+ server.child_info_data.cow_size = private_dirty;
+ sendChildInfo(CHILD_INFO_TYPE_AOF);
exitFromChild(0);
} else {
exitFromChild(1);
@@ -1285,6 +1353,7 @@ int rewriteAppendOnlyFileBackground(void) {
server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
if (childpid == -1) {
+ closeChildInfoPipe();
serverLog(LL_WARNING,
"Can't rewrite append only file in background: fork: %s",
strerror(errno));
diff --git a/src/atomicvar.h b/src/atomicvar.h
index 3489972d2..4aa8fa173 100644
--- a/src/atomicvar.h
+++ b/src/atomicvar.h
@@ -51,7 +51,7 @@
#ifndef __ATOMIC_VAR_H
#define __ATOMIC_VAR_H
-#if defined(__ATOMIC_RELAXED)
+#if defined(__ATOMIC_RELAXED) && (!defined(__clang__) || !defined(__APPLE__) || __apple_build_version__ > 4210057)
/* Implementation using __atomic macros. */
#define atomicIncr(var,count,mutex) __atomic_add_fetch(&var,(count),__ATOMIC_RELAXED)
diff --git a/src/bitops.c b/src/bitops.c
index ed7e384a0..46eee22c3 100644
--- a/src/bitops.c
+++ b/src/bitops.c
@@ -215,12 +215,7 @@ void setUnsignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits, uint6
}
void setSignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits, int64_t value) {
- uint64_t uv;
-
- if (value >= 0)
- uv = value;
- else
- uv = UINT64_MAX + value + 1;
+ uint64_t uv = value; /* Casting will add UINT64_MAX + 1 if v is negative. */
setUnsignedBitfield(p,offset,bits,uv);
}
@@ -239,9 +234,21 @@ uint64_t getUnsignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits) {
}
int64_t getSignedBitfield(unsigned char *p, uint64_t offset, uint64_t bits) {
- int64_t value = getUnsignedBitfield(p,offset,bits);
+ int64_t value;
+ union {uint64_t u; int64_t i;} conv;
+
+ /* Converting from unsigned to signed is undefined when the value does
+ * not fit, however here we assume two's complement and the original value
+ * was obtained from signed -> unsigned conversion, so we'll find the
+ * most significant bit set if the original value was negative.
+ *
+ * Note that two's complement is mandatory for exact-width types
+ * according to the C99 standard. */
+ conv.u = getUnsignedBitfield(p,offset,bits);
+ value = conv.i;
+
/* If the top significant bit is 1, propagate it to all the
- * higher bits for two complement representation of signed
+ * higher bits for two's complement representation of signed
* integers. */
if (value & ((uint64_t)1 << (bits-1)))
value |= ((uint64_t)-1) << bits;
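
The rewritten getSignedBitfield() converts the raw unsigned value through a union (assuming two's complement, as its comment explains) and then sign-extends fields narrower than 64 bits by propagating the top bit. A standalone sketch of the same two steps, in plain C rather than the Redis helpers:

    #include <stdint.h>
    #include <stdio.h>

    /* Sign-extend the low 'bits' bits of 'raw' to a full int64_t. */
    static int64_t sign_extend(uint64_t raw, unsigned bits) {
        union { uint64_t u; int64_t i; } conv;
        conv.u = raw;                              /* assumes two's complement */
        int64_t value = conv.i;
        if (value & ((uint64_t)1 << (bits - 1)))   /* top bit set: negative */
            value |= ((uint64_t)-1) << bits;       /* propagate it upward */
        return value;
    }

    int main(void) {
        /* 0x1F in a 5-bit signed field is -1; 0x0F is +15. */
        printf("%lld\n", (long long) sign_extend(0x1F, 5));  /* -1 */
        printf("%lld\n", (long long) sign_extend(0x0F, 5));  /* 15 */
        return 0;
    }
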
@@ -299,7 +306,7 @@ int checkUnsignedBitfieldOverflow(uint64_t value, int64_t incr, uint64_t bits, i
handle_wrap:
{
- uint64_t mask = ((int64_t)-1) << bits;
+ uint64_t mask = ((uint64_t)-1) << bits;
uint64_t res = value+incr;
res &= ~mask;
@@ -342,7 +349,7 @@ int checkSignedBitfieldOverflow(int64_t value, int64_t incr, uint64_t bits, int
handle_wrap:
{
- uint64_t mask = ((int64_t)-1) << bits;
+ uint64_t mask = ((uint64_t)-1) << bits;
uint64_t msb = (uint64_t)1 << (bits-1);
uint64_t a = value, b = incr, c;
c = a+b; /* Perform addition as unsigned so that's defined. */
@@ -476,6 +483,37 @@ robj *lookupStringForBitCommand(client *c, size_t maxbit) {
return o;
}
+/* Return a pointer to the string object content, and store its length
+ * in 'len'. The caller is required to pass a (likely stack allocated)
+ * buffer 'llbuf' of at least LONG_STR_SIZE bytes. This buffer is used when
+ * the object is integer encoded, in order to provide the representation
+ * without using heap allocation.
+ *
+ * The returned pointer addresses the array of bytes representing the
+ * string the object contains, and may point either to 'llbuf' or to the
+ * internal object representation. As a side effect 'len' is filled with
+ * the length of that buffer.
+ *
+ * If the source object is NULL the function is guaranteed to return NULL
+ * and set 'len' to 0. */
+unsigned char *getObjectReadOnlyString(robj *o, long *len, char *llbuf) {
+ serverAssert(o == NULL || o->type == OBJ_STRING);
+ unsigned char *p = NULL;
+
+ /* Set the 'p' pointer to the string, that can be just a stack allocated
+ * array if our string was integer encoded. */
+ if (o && o->encoding == OBJ_ENCODING_INT) {
+ p = (unsigned char*) llbuf;
+ if (len) *len = ll2string(llbuf,LONG_STR_SIZE,(long)o->ptr);
+ } else if (o) {
+ p = (unsigned char*) o->ptr;
+ if (len) *len = sdslen(o->ptr);
+ } else {
+ if (len) *len = 0;
+ }
+ return p;
+}
+
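
The intended calling pattern is that the caller supplies a stack buffer which must outlive the returned pointer, since integer-encoded values are rendered into it. A standalone toy with the same shape follows; the buffer size and function name are illustrative stand-ins, not the Redis API. The bitcountCommand() and bitposCommand() hunks below use exactly this pattern with an llbuf of LONG_STR_SIZE bytes on the stack.

    #include <stdio.h>
    #include <string.h>

    #define LONG_STR_SIZE 21   /* enough for a 64-bit integer, sign and NUL */

    /* Toy version of the pattern: return a read-only view of a value that is
     * either already a string or an integer rendered into the caller's buffer. */
    static const char *readonly_view(long long *as_int, const char *as_str,
                                     long *len, char *llbuf) {
        if (as_int) {                       /* "integer encoded" case */
            *len = snprintf(llbuf, LONG_STR_SIZE, "%lld", *as_int);
            return llbuf;
        } else if (as_str) {                /* "raw string" case */
            *len = (long) strlen(as_str);
            return as_str;
        }
        *len = 0;
        return NULL;
    }

    int main(void) {
        char llbuf[LONG_STR_SIZE];
        long len;
        long long v = 12345;
        const char *p = readonly_view(&v, NULL, &len, llbuf);
        printf("%.*s (%ld bytes)\n", (int) len, p, len);
        return 0;
    }
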
/* SETBIT key offset bitvalue */
void setbitCommand(client *c) {
robj *o;
@@ -721,21 +759,12 @@ void bitcountCommand(client *c) {
robj *o;
long start, end, strlen;
unsigned char *p;
- char llbuf[32];
+ char llbuf[LONG_STR_SIZE];
/* Lookup, check for type, and return 0 for non existing keys. */
if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
checkType(c,o,OBJ_STRING)) return;
-
- /* Set the 'p' pointer to the string, that can be just a stack allocated
- * array if our string was integer encoded. */
- if (o->encoding == OBJ_ENCODING_INT) {
- p = (unsigned char*) llbuf;
- strlen = ll2string(llbuf,sizeof(llbuf),(long)o->ptr);
- } else {
- p = (unsigned char*) o->ptr;
- strlen = sdslen(o->ptr);
- }
+ p = getObjectReadOnlyString(o,&strlen,llbuf);
/* Parse start/end range if any. */
if (c->argc == 4) {
@@ -744,6 +773,10 @@ void bitcountCommand(client *c) {
if (getLongFromObjectOrReply(c,c->argv[3],&end,NULL) != C_OK)
return;
/* Convert negative indexes */
+ if (start < 0 && end < 0 && start > end) {
+ addReply(c,shared.czero);
+ return;
+ }
if (start < 0) start = strlen+start;
if (end < 0) end = strlen+end;
if (start < 0) start = 0;
@@ -775,7 +808,7 @@ void bitposCommand(client *c) {
robj *o;
long bit, start, end, strlen;
unsigned char *p;
- char llbuf[32];
+ char llbuf[LONG_STR_SIZE];
int end_given = 0;
/* Parse the bit argument to understand what we are looking for, set
@@ -795,16 +828,7 @@ void bitposCommand(client *c) {
return;
}
if (checkType(c,o,OBJ_STRING)) return;
-
- /* Set the 'p' pointer to the string, that can be just a stack allocated
- * array if our string was integer encoded. */
- if (o->encoding == OBJ_ENCODING_INT) {
- p = (unsigned char*) llbuf;
- strlen = ll2string(llbuf,sizeof(llbuf),(long)o->ptr);
- } else {
- p = (unsigned char*) o->ptr;
- strlen = sdslen(o->ptr);
- }
+ p = getObjectReadOnlyString(o,&strlen,llbuf);
/* Parse start/end range if any. */
if (c->argc == 4 || c->argc == 5) {
@@ -882,6 +906,8 @@ void bitfieldCommand(client *c) {
int j, numops = 0, changes = 0;
struct bitfieldOp *ops = NULL; /* Array of ops to execute at end. */
int owtype = BFOVERFLOW_WRAP; /* Overflow type. */
+ int readonly = 1;
+ size_t higest_write_offset = 0;
for (j = 2; j < c->argc; j++) {
int remargs = c->argc-j-1; /* Remaining args other than current. */
@@ -929,8 +955,11 @@ void bitfieldCommand(client *c) {
return;
}
- /* INCRBY and SET require another argument. */
if (opcode != BITFIELDOP_GET) {
+ readonly = 0;
+ if (higest_write_offset < bitoffset + bits - 1)
+ higest_write_offset = bitoffset + bits - 1;
+ /* INCRBY and SET require another argument. */
if (getLongLongFromObjectOrReply(c,c->argv[j+3],&i64,NULL) != C_OK){
zfree(ops);
return;
@@ -950,6 +979,18 @@ void bitfieldCommand(client *c) {
j += 3 - (opcode == BITFIELDOP_GET);
}
+ if (readonly) {
+ /* A lookup for read is fine if the key doesn't exist, but it is an
+ * error if the value is not a string. */
+ o = lookupKeyRead(c->db,c->argv[1]);
+ if (o != NULL && checkType(c,o,OBJ_STRING)) return;
+ } else {
+ /* Lookup by making room up to the farthest bit reached by
+ * this operation. */
+ if ((o = lookupStringForBitCommand(c,
+ higest_write_offset)) == NULL) return;
+ }
+
addReplyMultiBulkLen(c,numops);
/* Actually process the operations. */
@@ -964,11 +1005,6 @@ void bitfieldCommand(client *c) {
* for simplicity. SET return value is the previous value so
* we need fetch & store as well. */
- /* Lookup by making room up to the farest bit reached by
- * this operation. */
- if ((o = lookupStringForBitCommand(c,
- thisop->offset + (thisop->bits-1))) == NULL) return;
-
/* We need two different but very similar code paths for signed
* and unsigned operations, since the set of functions to get/set
* the integers and the used variables types are different. */
@@ -1035,20 +1071,23 @@ void bitfieldCommand(client *c) {
changes++;
} else {
/* GET */
- o = lookupKeyRead(c->db,c->argv[1]);
- size_t olen = (o == NULL) ? 0 : sdslen(o->ptr);
unsigned char buf[9];
+ long strlen = 0;
+ unsigned char *src = NULL;
+ char llbuf[LONG_STR_SIZE];
+
+ if (o != NULL)
+ src = getObjectReadOnlyString(o,&strlen,llbuf);
/* For GET we use a trick: before executing the operation
* copy up to 9 bytes to a local buffer, so that we can easily
* execute up to 64 bit operations that are at actual string
* object boundaries. */
memset(buf,0,9);
- unsigned char *src = o ? o->ptr : NULL;
int i;
size_t byte = thisop->offset >> 3;
for (i = 0; i < 9; i++) {
- if (src == NULL || i+byte >= olen) break;
+ if (src == NULL || i+byte >= (size_t)strlen) break;
buf[i] = src[i+byte];
}
diff --git a/src/blocked.c b/src/blocked.c
index d22872548..54b26b713 100644
--- a/src/blocked.c
+++ b/src/blocked.c
@@ -136,6 +136,8 @@ void unblockClient(client *c) {
unblockClientWaitingData(c);
} else if (c->btype == BLOCKED_WAIT) {
unblockClientWaitingReplicas(c);
+ } else if (c->btype == BLOCKED_MODULE) {
+ unblockClientFromModule(c);
} else {
serverPanic("Unknown btype in unblockClient().");
}
@@ -153,12 +155,15 @@ void unblockClient(client *c) {
}
/* This function gets called when a blocked client timed out in order to
- * send it a reply of some kind. */
+ * send it a reply of some kind. After this function is called,
+ * unblockClient() will be called with the same client as argument. */
void replyToBlockedClientTimedOut(client *c) {
if (c->btype == BLOCKED_LIST) {
addReply(c,shared.nullmultibulk);
} else if (c->btype == BLOCKED_WAIT) {
addReplyLongLong(c,replicationCountAcksByOffset(c->bpop.reploffset));
+ } else if (c->btype == BLOCKED_MODULE) {
+ moduleBlockedClientTimedOut(c);
} else {
serverPanic("Unknown btype in replyToBlockedClientTimedOut().");
}
diff --git a/src/childinfo.c b/src/childinfo.c
new file mode 100644
index 000000000..719025e8c
--- /dev/null
+++ b/src/childinfo.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+#include <unistd.h>
+
+/* Open a child-parent channel used in order to move information about the
+ * RDB / AOF saving process from the child to the parent (for instance
+ * the amount of copy on write memory used) */
+void openChildInfoPipe(void) {
+ if (pipe(server.child_info_pipe) == -1) {
+ /* On error our two file descriptors should still be set to -1,
+ * but we call closeChildInfoPipe() anyway since it can't hurt. */
+ closeChildInfoPipe();
+ } else if (anetNonBlock(NULL,server.child_info_pipe[0]) != ANET_OK) {
+ closeChildInfoPipe();
+ } else {
+ memset(&server.child_info_data,0,sizeof(server.child_info_data));
+ }
+}
+
+/* Close the pipes opened with openChildInfoPipe(). */
+void closeChildInfoPipe(void) {
+ if (server.child_info_pipe[0] != -1 ||
+ server.child_info_pipe[1] != -1)
+ {
+ close(server.child_info_pipe[0]);
+ close(server.child_info_pipe[1]);
+ server.child_info_pipe[0] = -1;
+ server.child_info_pipe[1] = -1;
+ }
+}
+
+/* Send COW data to the parent. The child should call this function after
+ * populating the fields it wants to send (according to the process type). */
+void sendChildInfo(int ptype) {
+ if (server.child_info_pipe[1] == -1) return;
+ server.child_info_data.magic = CHILD_INFO_MAGIC;
+ server.child_info_data.process_type = ptype;
+ ssize_t wlen = sizeof(server.child_info_data);
+ if (write(server.child_info_pipe[1],&server.child_info_data,wlen) != wlen) {
+ /* Nothing to do on error, this will be detected by the other side. */
+ }
+}
+
+/* Receive COW data sent by the child. */
+void receiveChildInfo(void) {
+ if (server.child_info_pipe[0] == -1) return;
+ ssize_t wlen = sizeof(server.child_info_data);
+ if (read(server.child_info_pipe[0],&server.child_info_data,wlen) == wlen &&
+ server.child_info_data.magic == CHILD_INFO_MAGIC)
+ {
+ if (server.child_info_data.process_type == CHILD_INFO_TYPE_RDB) {
+ server.stat_rdb_cow_bytes = server.child_info_data.cow_size;
+ } else if (server.child_info_data.process_type == CHILD_INFO_TYPE_AOF) {
+ server.stat_aof_cow_bytes = server.child_info_data.cow_size;
+ }
+ }
+}
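
Read together with the aof.c hunks above, the intended flow is: the parent calls openChildInfoPipe() before fork(), the child fills server.child_info_data and calls sendChildInfo() just before exiting, and the parent later calls receiveChildInfo() and closeChildInfoPipe(). A standalone toy of that sequence; the struct, magic value and COW size below are stand-ins, not the Redis definitions.

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    struct child_info { unsigned magic; size_t cow_size; };  /* stand-in payload */
    #define INFO_MAGIC 0xC0FFEEu                              /* stand-in magic */

    int main(void) {
        int pipefd[2];
        if (pipe(pipefd) == -1) return 1;                 /* like openChildInfoPipe() */

        pid_t pid = fork();
        if (pid == 0) {                                   /* child: report COW usage */
            struct child_info info = { INFO_MAGIC, 4u * 1024 * 1024 };
            if (write(pipefd[1], &info, sizeof(info)) != (ssize_t) sizeof(info))
                _exit(1);                                 /* like sendChildInfo() */
            _exit(0);
        }
        waitpid(pid, NULL, 0);

        struct child_info info;                           /* parent side */
        if (read(pipefd[0], &info, sizeof(info)) == (ssize_t) sizeof(info) &&
            info.magic == INFO_MAGIC)                     /* like receiveChildInfo() */
            printf("child reported %zu bytes of COW\n", info.cow_size);
        close(pipefd[0]);                                 /* like closeChildInfoPipe() */
        close(pipefd[1]);
        return 0;
    }
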
diff --git a/src/cluster.c b/src/cluster.c
index 1f19db3e4..6e3fc8b00 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -129,7 +129,7 @@ int clusterLoadConfig(char *filename) {
/* Skip blank lines, they can be created either by users manually
* editing nodes.conf or by the config writing process if stopped
* before the truncate() call. */
- if (line[0] == '\n') continue;
+ if (line[0] == '\n' || line[0] == '\0') continue;
/* Split the line into arguments for processing. */
argv = sdssplitargs(line,&argc);
@@ -2774,7 +2774,7 @@ void clusterHandleSlaveFailover(void) {
* and wait for replies), and the failover retry time (the time to wait
* before trying to get voted again).
*
- * Timeout is MIN(NODE_TIMEOUT*2,2000) milliseconds.
+ * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds.
* Retry is two times the Timeout.
*/
auth_timeout = server.cluster_node_timeout*2;
@@ -4535,7 +4535,7 @@ int verifyDumpPayload(unsigned char *p, size_t len) {
/* Verify RDB version */
rdbver = (footer[1] << 8) | footer[0];
- if (rdbver != RDB_VERSION) return C_ERR;
+ if (rdbver > RDB_VERSION) return C_ERR;
/* Verify CRC64 */
crc = crc64(0,p,len-8);
@@ -4617,7 +4617,7 @@ void restoreCommand(client *c) {
/* Create the key and set the TTL if any */
dbAdd(c->db,c->argv[1],obj);
- if (ttl) setExpire(c->db,c->argv[1],mstime()+ttl);
+ if (ttl) setExpire(c,c->db,c->argv[1],mstime()+ttl);
signalModifiedKey(c->db,c->argv[1]);
addReply(c,shared.ok);
server.dirty++;
@@ -4750,7 +4750,6 @@ void migrateCommand(client *c) {
int copy, replace, j;
long timeout;
long dbid;
- long long ttl, expireat;
robj **ov = NULL; /* Objects to migrate. */
robj **kv = NULL; /* Key names. */
robj **newargv = NULL; /* Used to rewrite the command as DEL ... keys ... */
@@ -4765,7 +4764,6 @@ void migrateCommand(client *c) {
/* Initialization */
copy = 0;
replace = 0;
- ttl = 0;
/* Parse additional options */
for (j = 6; j < c->argc; j++) {
@@ -4841,7 +4839,9 @@ try_again:
/* Create RESTORE payload and generate the protocol to call the command. */
for (j = 0; j < num_keys; j++) {
- expireat = getExpire(c->db,kv[j]);
+ long long ttl = 0;
+ long long expireat = getExpire(c->db,kv[j]);
+
if (expireat != -1) {
ttl = expireat-mstime();
if (ttl < 1) ttl = 1;
diff --git a/src/config.c b/src/config.c
index c72f0aeb2..54af5bfe0 100644
--- a/src/config.c
+++ b/src/config.c
@@ -45,9 +45,11 @@ typedef struct configEnum {
configEnum maxmemory_policy_enum[] = {
{"volatile-lru", MAXMEMORY_VOLATILE_LRU},
+ {"volatile-lfu", MAXMEMORY_VOLATILE_LFU},
{"volatile-random",MAXMEMORY_VOLATILE_RANDOM},
{"volatile-ttl",MAXMEMORY_VOLATILE_TTL},
{"allkeys-lru",MAXMEMORY_ALLKEYS_LRU},
+ {"allkeys-lfu",MAXMEMORY_ALLKEYS_LFU},
{"allkeys-random",MAXMEMORY_ALLKEYS_RANDOM},
{"noeviction",MAXMEMORY_NO_EVICTION},
{NULL, 0}
@@ -153,6 +155,20 @@ void resetServerSaveParams(void) {
server.saveparamslen = 0;
}
+void queueLoadModule(sds path, sds *argv, int argc) {
+ int i;
+ struct moduleLoadQueueEntry *loadmod;
+
+ loadmod = zmalloc(sizeof(struct moduleLoadQueueEntry));
+ loadmod->argv = zmalloc(sizeof(robj*)*argc);
+ loadmod->path = sdsnew(path);
+ loadmod->argc = argc;
+ for (i = 0; i < argc; i++) {
+ loadmod->argv[i] = createRawStringObject(argv[i],sdslen(argv[i]));
+ }
+ listAddNodeTail(server.loadmodule_queue,loadmod);
+}
+
void loadServerConfigFromString(char *config) {
char *err = NULL;
int linenum = 0, totlines, i;
@@ -267,6 +283,10 @@ void loadServerConfigFromString(char *config) {
}
fclose(logfp);
}
+ } else if (!strcasecmp(argv[0],"always-show-logo") && argc == 2) {
+ if ((server.always_show_logo = yesnotoi(argv[1])) == -1) {
+ err = "argument must be 'yes' or 'no'"; goto loaderr;
+ }
} else if (!strcasecmp(argv[0],"syslog-enabled") && argc == 2) {
if ((server.syslog_enabled = yesnotoi(argv[1])) == -1) {
err = "argument must be 'yes' or 'no'"; goto loaderr;
@@ -308,6 +328,18 @@ void loadServerConfigFromString(char *config) {
err = "maxmemory-samples must be 1 or greater";
goto loaderr;
}
+ } else if (!strcasecmp(argv[0],"lfu-log-factor") && argc == 2) {
+ server.lfu_log_factor = atoi(argv[1]);
+ if (server.lfu_log_factor < 0) {
+ err = "lfu-log-factor must be 0 or greater";
+ goto loaderr;
+ }
+ } else if (!strcasecmp(argv[0],"lfu-decay-time") && argc == 2) {
+ server.lfu_decay_time = atoi(argv[1]);
+ if (server.lfu_decay_time < 0) {
+ err = "lfu-decay-time must be 0 or greater";
+ goto loaderr;
+ }
} else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
slaveof_linenum = linenum;
server.masterhost = sdsnew(argv[1]);
@@ -447,6 +479,10 @@ void loadServerConfigFromString(char *config) {
if ((server.aof_load_truncated = yesnotoi(argv[1])) == -1) {
err = "argument must be 'yes' or 'no'"; goto loaderr;
}
+ } else if (!strcasecmp(argv[0],"aof-use-rdb-preamble") && argc == 2) {
+ if ((server.aof_use_rdb_preamble = yesnotoi(argv[1])) == -1) {
+ err = "argument must be 'yes' or 'no'"; goto loaderr;
+ }
} else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
if (strlen(argv[1]) > CONFIG_AUTHPASS_MAX_LEN) {
err = "Password is longer than CONFIG_AUTHPASS_MAX_LEN";
@@ -584,8 +620,9 @@ void loadServerConfigFromString(char *config) {
unsigned long long hard, soft;
int soft_seconds;
- if (class == -1) {
- err = "Unrecognized client limit class";
+ if (class == -1 || class == CLIENT_TYPE_MASTER) {
+ err = "Unrecognized client limit class: the user specified "
+ "an invalid one, or 'master' which has no buffer limits.";
goto loaderr;
}
hard = memtoll(argv[2],NULL);
@@ -605,6 +642,16 @@ void loadServerConfigFromString(char *config) {
}
} else if (!strcasecmp(argv[0],"slave-priority") && argc == 2) {
server.slave_priority = atoi(argv[1]);
+ } else if (!strcasecmp(argv[0],"slave-announce-ip") && argc == 2) {
+ zfree(server.slave_announce_ip);
+ server.slave_announce_ip = zstrdup(argv[1]);
+ } else if (!strcasecmp(argv[0],"slave-announce-port") && argc == 2) {
+ server.slave_announce_port = atoi(argv[1]);
+ if (server.slave_announce_port < 0 ||
+ server.slave_announce_port > 65535)
+ {
+ err = "Invalid port"; goto loaderr;
+ }
} else if (!strcasecmp(argv[0],"min-slaves-to-write") && argc == 2) {
server.repl_min_slaves_to_write = atoi(argv[1]);
if (server.repl_min_slaves_to_write < 0) {
@@ -632,8 +679,8 @@ void loadServerConfigFromString(char *config) {
"Allowed values: 'upstart', 'systemd', 'auto', or 'no'";
goto loaderr;
}
- } else if (!strcasecmp(argv[0],"loadmodule") && argc == 2) {
- listAddNodeTail(server.loadmodule_queue,sdsnew(argv[1]));
+ } else if (!strcasecmp(argv[0],"loadmodule") && argc >= 2) {
+ queueLoadModule(argv[1],&argv[2],argc-2);
} else if (!strcasecmp(argv[0],"sentinel")) {
/* argc == 1 is handled by main() as we need to enter the sentinel
* mode ASAP. */
@@ -719,7 +766,7 @@ void loadServerConfig(char *filename, char *options) {
#define config_set_numerical_field(_name,_var,min,max) \
} else if (!strcasecmp(c->argv[2]->ptr,_name)) { \
- if (getLongLongFromObject(o,&ll) == C_ERR || ll < 0) goto badfmt; \
+ if (getLongLongFromObject(o,&ll) == C_ERR) goto badfmt; \
if (min != LLONG_MIN && ll < min) goto badfmt; \
if (max != LLONG_MAX && ll > max) goto badfmt; \
_var = ll;
@@ -864,7 +911,8 @@ void configSetCommand(client *c) {
long val;
if ((j % 4) == 0) {
- if (getClientTypeByName(v[j]) == -1) {
+ int class = getClientTypeByName(v[j]);
+ if (class == -1 || class == CLIENT_TYPE_MASTER) {
sdsfreesplitres(v,vlen);
goto badfmt;
}
@@ -897,6 +945,9 @@ void configSetCommand(client *c) {
if (flags == -1) goto badfmt;
server.notify_keyspace_events = flags;
+ } config_set_special_field("slave-announce-ip") {
+ zfree(server.slave_announce_ip);
+ server.slave_announce_ip = ((char*)o->ptr)[0] ? zstrdup(o->ptr) : NULL;
/* Boolean fields.
* config_set_bool_field(name,var). */
@@ -913,6 +964,8 @@ void configSetCommand(client *c) {
} config_set_bool_field(
"aof-load-truncated",server.aof_load_truncated) {
} config_set_bool_field(
+ "aof-use-rdb-preamble",server.aof_use_rdb_preamble) {
+ } config_set_bool_field(
"slave-serve-stale-data",server.repl_serve_stale_data) {
} config_set_bool_field(
"slave-read-only",server.repl_slave_ro) {
@@ -940,6 +993,10 @@ void configSetCommand(client *c) {
} config_set_numerical_field(
"maxmemory-samples",server.maxmemory_samples,1,LLONG_MAX) {
} config_set_numerical_field(
+ "lfu-log-factor",server.lfu_log_factor,0,LLONG_MAX) {
+ } config_set_numerical_field(
+ "lfu-decay-time",server.lfu_decay_time,0,LLONG_MAX) {
+ } config_set_numerical_field(
"timeout",server.maxidletime,0,LONG_MAX) {
} config_set_numerical_field(
"auto-aof-rewrite-percentage",server.aof_rewrite_perc,0,LLONG_MAX){
@@ -950,9 +1007,9 @@ void configSetCommand(client *c) {
} config_set_numerical_field(
"hash-max-ziplist-value",server.hash_max_ziplist_value,0,LLONG_MAX) {
} config_set_numerical_field(
- "list-max-ziplist-size",server.list_max_ziplist_size,0,LLONG_MAX) {
+ "list-max-ziplist-size",server.list_max_ziplist_size,INT_MIN,INT_MAX) {
} config_set_numerical_field(
- "list-compress-depth",server.list_compress_depth,0,LLONG_MAX) {
+ "list-compress-depth",server.list_compress_depth,0,INT_MAX) {
} config_set_numerical_field(
"set-max-intset-entries",server.set_max_intset_entries,0,LLONG_MAX) {
} config_set_numerical_field(
@@ -982,6 +1039,8 @@ void configSetCommand(client *c) {
} config_set_numerical_field(
"slave-priority",server.slave_priority,0,LLONG_MAX) {
} config_set_numerical_field(
+ "slave-announce-port",server.slave_announce_port,0,65535) {
+ } config_set_numerical_field(
"min-slaves-to-write",server.repl_min_slaves_to_write,0,LLONG_MAX) {
refreshGoodSlavesCount();
} config_set_numerical_field(
@@ -1053,7 +1112,7 @@ badfmt: /* Bad format errors */
*----------------------------------------------------------------------------*/
#define config_get_string_field(_name,_var) do { \
- if (stringmatch(pattern,_name,0)) { \
+ if (stringmatch(pattern,_name,1)) { \
addReplyBulkCString(c,_name); \
addReplyBulkCString(c,_var ? _var : ""); \
matches++; \
@@ -1061,7 +1120,7 @@ badfmt: /* Bad format errors */
} while(0);
#define config_get_bool_field(_name,_var) do { \
- if (stringmatch(pattern,_name,0)) { \
+ if (stringmatch(pattern,_name,1)) { \
addReplyBulkCString(c,_name); \
addReplyBulkCString(c,_var ? "yes" : "no"); \
matches++; \
@@ -1069,7 +1128,7 @@ badfmt: /* Bad format errors */
} while(0);
#define config_get_numerical_field(_name,_var) do { \
- if (stringmatch(pattern,_name,0)) { \
+ if (stringmatch(pattern,_name,1)) { \
ll2string(buf,sizeof(buf),_var); \
addReplyBulkCString(c,_name); \
addReplyBulkCString(c,buf); \
@@ -1078,7 +1137,7 @@ badfmt: /* Bad format errors */
} while(0);
#define config_get_enum_field(_name,_var,_enumvar) do { \
- if (stringmatch(pattern,_name,0)) { \
+ if (stringmatch(pattern,_name,1)) { \
addReplyBulkCString(c,_name); \
addReplyBulkCString(c,configEnumGetNameOrUnknown(_enumvar,_var)); \
matches++; \
@@ -1101,6 +1160,7 @@ void configGetCommand(client *c) {
config_get_string_field("unixsocket",server.unixsocket);
config_get_string_field("logfile",server.logfile);
config_get_string_field("pidfile",server.pidfile);
+ config_get_string_field("slave-announce-ip",server.slave_announce_ip);
/* Numerical values */
config_get_numerical_field("maxmemory",server.maxmemory);
@@ -1145,6 +1205,7 @@ void configGetCommand(client *c) {
config_get_numerical_field("maxclients",server.maxclients);
config_get_numerical_field("watchdog-period",server.watchdog_period);
config_get_numerical_field("slave-priority",server.slave_priority);
+ config_get_numerical_field("slave-announce-port",server.slave_announce_port);
config_get_numerical_field("min-slaves-to-write",server.repl_min_slaves_to_write);
config_get_numerical_field("min-slaves-max-lag",server.repl_min_slaves_max_lag);
config_get_numerical_field("hz",server.hz);
@@ -1178,6 +1239,8 @@ void configGetCommand(client *c) {
server.aof_rewrite_incremental_fsync);
config_get_bool_field("aof-load-truncated",
server.aof_load_truncated);
+ config_get_bool_field("aof-use-rdb-preamble",
+ server.aof_use_rdb_preamble);
config_get_bool_field("lazyfree-lazy-eviction",
server.lazyfree_lazy_eviction);
config_get_bool_field("lazyfree-lazy-expire",
@@ -1201,12 +1264,12 @@ void configGetCommand(client *c) {
/* Everything we can't handle with macros follows. */
- if (stringmatch(pattern,"appendonly",0)) {
+ if (stringmatch(pattern,"appendonly",1)) {
addReplyBulkCString(c,"appendonly");
addReplyBulkCString(c,server.aof_state == AOF_OFF ? "no" : "yes");
matches++;
}
- if (stringmatch(pattern,"dir",0)) {
+ if (stringmatch(pattern,"dir",1)) {
char buf[1024];
if (getcwd(buf,sizeof(buf)) == NULL)
@@ -1216,7 +1279,7 @@ void configGetCommand(client *c) {
addReplyBulkCString(c,buf);
matches++;
}
- if (stringmatch(pattern,"save",0)) {
+ if (stringmatch(pattern,"save",1)) {
sds buf = sdsempty();
int j;
@@ -1232,7 +1295,7 @@ void configGetCommand(client *c) {
sdsfree(buf);
matches++;
}
- if (stringmatch(pattern,"client-output-buffer-limit",0)) {
+ if (stringmatch(pattern,"client-output-buffer-limit",1)) {
sds buf = sdsempty();
int j;
@@ -1250,14 +1313,14 @@ void configGetCommand(client *c) {
sdsfree(buf);
matches++;
}
- if (stringmatch(pattern,"unixsocketperm",0)) {
+ if (stringmatch(pattern,"unixsocketperm",1)) {
char buf[32];
snprintf(buf,sizeof(buf),"%o",server.unixsocketperm);
addReplyBulkCString(c,"unixsocketperm");
addReplyBulkCString(c,buf);
matches++;
}
- if (stringmatch(pattern,"slaveof",0)) {
+ if (stringmatch(pattern,"slaveof",1)) {
char buf[256];
addReplyBulkCString(c,"slaveof");
@@ -1269,7 +1332,7 @@ void configGetCommand(client *c) {
addReplyBulkCString(c,buf);
matches++;
}
- if (stringmatch(pattern,"notify-keyspace-events",0)) {
+ if (stringmatch(pattern,"notify-keyspace-events",1)) {
robj *flagsobj = createObject(OBJ_STRING,
keyspaceEventsFlagsToString(server.notify_keyspace_events));
@@ -1278,7 +1341,7 @@ void configGetCommand(client *c) {
decrRefCount(flagsobj);
matches++;
}
- if (stringmatch(pattern,"bind",0)) {
+ if (stringmatch(pattern,"bind",1)) {
sds aux = sdsjoin(server.bindaddr,server.bindaddr_count," ");
addReplyBulkCString(c,"bind");
@@ -1833,6 +1896,7 @@ int rewriteConfig(char *path) {
rewriteConfigOctalOption(state,"unixsocketperm",server.unixsocketperm,CONFIG_DEFAULT_UNIX_SOCKET_PERM);
rewriteConfigNumericalOption(state,"timeout",server.maxidletime,CONFIG_DEFAULT_CLIENT_TIMEOUT);
rewriteConfigNumericalOption(state,"tcp-keepalive",server.tcpkeepalive,CONFIG_DEFAULT_TCP_KEEPALIVE);
+ rewriteConfigNumericalOption(state,"slave-announce-port",server.slave_announce_port,CONFIG_DEFAULT_SLAVE_ANNOUNCE_PORT);
rewriteConfigEnumOption(state,"loglevel",server.verbosity,loglevel_enum,CONFIG_DEFAULT_VERBOSITY);
rewriteConfigStringOption(state,"logfile",server.logfile,CONFIG_DEFAULT_LOGFILE);
rewriteConfigYesNoOption(state,"syslog-enabled",server.syslog_enabled,CONFIG_DEFAULT_SYSLOG_ENABLED);
@@ -1846,6 +1910,7 @@ int rewriteConfig(char *path) {
rewriteConfigStringOption(state,"dbfilename",server.rdb_filename,CONFIG_DEFAULT_RDB_FILENAME);
rewriteConfigDirOption(state);
rewriteConfigSlaveofOption(state);
+ rewriteConfigStringOption(state,"slave-announce-ip",server.slave_announce_ip,CONFIG_DEFAULT_SLAVE_ANNOUNCE_IP);
rewriteConfigStringOption(state,"masterauth",server.masterauth,NULL);
rewriteConfigStringOption(state,"cluster-announce-ip",server.cluster_announce_ip,NULL);
rewriteConfigYesNoOption(state,"slave-serve-stale-data",server.repl_serve_stale_data,CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA);
@@ -1896,6 +1961,7 @@ int rewriteConfig(char *path) {
rewriteConfigNumericalOption(state,"hz",server.hz,CONFIG_DEFAULT_HZ);
rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync,CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC);
rewriteConfigYesNoOption(state,"aof-load-truncated",server.aof_load_truncated,CONFIG_DEFAULT_AOF_LOAD_TRUNCATED);
+ rewriteConfigYesNoOption(state,"aof-use-rdb-preamble",server.aof_use_rdb_preamble,CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE);
rewriteConfigEnumOption(state,"supervised",server.supervised_mode,supervised_mode_enum,SUPERVISED_NONE);
rewriteConfigYesNoOption(state,"lazyfree-lazy-eviction",server.lazyfree_lazy_eviction,CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION);
rewriteConfigYesNoOption(state,"lazyfree-lazy-expire",server.lazyfree_lazy_expire,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE);
diff --git a/src/db.c b/src/db.c
index 6f70a5383..90a75fcfe 100644
--- a/src/db.c
+++ b/src/db.c
@@ -38,7 +38,10 @@
* C-level DB API
*----------------------------------------------------------------------------*/
-robj *lookupKey(redisDb *db, robj *key) {
+/* Low level key lookup API, not actually called directly from commands
+ * implementations that should instead rely on lookupKeyRead(),
+ * lookupKeyWrite() and lookupKeyReadWithFlags(). */
+robj *lookupKey(redisDb *db, robj *key, int flags) {
dictEntry *de = dictFind(db->dict,key->ptr);
if (de) {
robj *val = dictGetVal(de);
@@ -46,15 +49,46 @@ robj *lookupKey(redisDb *db, robj *key) {
/* Update the access time for the ageing algorithm.
* Don't do it if we have a saving child, as this will trigger
* a copy on write madness. */
- if (server.rdb_child_pid == -1 && server.aof_child_pid == -1)
- val->lru = LRU_CLOCK();
+ if (server.rdb_child_pid == -1 &&
+ server.aof_child_pid == -1 &&
+ !(flags & LOOKUP_NOTOUCH))
+ {
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ unsigned long ldt = val->lru >> 8;
+ unsigned long counter = LFULogIncr(val->lru & 255);
+ val->lru = (ldt << 8) | counter;
+ } else {
+ val->lru = LRU_CLOCK();
+ }
+ }
return val;
} else {
return NULL;
}
}
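
In the LFU branch above, the object's 24-bit lru field is treated as a composite: the upper bits hold a decay timestamp ('ldt') and the low 8 bits hold a logarithmic access counter, packed as (ldt << 8) | counter. A standalone sketch of that packing and unpacking; the probabilistic LFULogIncr() is replaced here by a plain saturating increment, purely for illustration, and a 16-bit timestamp is assumed.

    #include <stdint.h>
    #include <stdio.h>

    /* Pack a 16-bit "last decrement time" and an 8-bit access counter into
     * one 24-bit clock value, as (ldt << 8) | counter. */
    static uint32_t lfu_pack(uint32_t ldt, uint8_t counter) {
        return ((ldt & 0xFFFF) << 8) | counter;
    }

    static void lfu_touch(uint32_t *lru) {
        uint32_t ldt = *lru >> 8;
        uint8_t counter = *lru & 255;
        if (counter < 255) counter++;        /* stand-in for LFULogIncr() */
        *lru = lfu_pack(ldt, counter);
    }

    int main(void) {
        uint32_t lru = lfu_pack(12345, 5);
        lfu_touch(&lru);
        printf("ldt=%u counter=%u\n", lru >> 8, lru & 255);  /* ldt=12345 counter=6 */
        return 0;
    }
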
-robj *lookupKeyRead(redisDb *db, robj *key) {
+/* Lookup a key for read operations, or return NULL if the key is not found
+ * in the specified DB.
+ *
+ * As a side effect of calling this function:
+ * 1. A key gets expired if it reached its TTL.
+ * 2. The key last access time is updated.
+ * 3. The global keys hits/misses stats are updated (reported in INFO).
+ *
+ * This API should not be used when we write to the key after obtaining
+ * the object linked to the key, but only for read only operations.
+ *
+ * Flags change the behavior of this command:
+ *
+ * LOOKUP_NONE (or zero): no special flags are passed.
+ * LOOKUP_NOTOUCH: don't alter the last access time of the key.
+ *
+ * Note: this function also returns NULL if the key is logically expired
+ * but still existing, in case this is a slave, since this API is called only
+ * for read operations. Even if the key expiry is master-driven, we can
+ * correctly report that a key is expired on slaves even if the master is
+ * lagging in expiring our key via DELs in the replication link. */
+robj *lookupKeyReadWithFlags(redisDb *db, robj *key, int flags) {
robj *val;
if (expireIfNeeded(db,key) == 1) {
@@ -83,7 +117,7 @@ robj *lookupKeyRead(redisDb *db, robj *key) {
return NULL;
}
}
- val = lookupKey(db,key);
+ val = lookupKey(db,key,flags);
if (val == NULL)
server.stat_keyspace_misses++;
else
@@ -91,9 +125,20 @@ robj *lookupKeyRead(redisDb *db, robj *key) {
return val;
}
+/* Like lookupKeyReadWithFlags(), but does not use any flag, which is the
+ * common case. */
+robj *lookupKeyRead(redisDb *db, robj *key) {
+ return lookupKeyReadWithFlags(db,key,LOOKUP_NONE);
+}
+
+/* Lookup a key for write operations, and as a side effect, if needed, expires
+ * the key if its TTL is reached.
+ *
+ * Returns the linked value object if the key exists or NULL if the key
+ * does not exist in the specified DB. */
robj *lookupKeyWrite(redisDb *db, robj *key) {
expireIfNeeded(db,key);
- return lookupKey(db,key);
+ return lookupKey(db,key,LOOKUP_NONE);
}
robj *lookupKeyReadOrReply(client *c, robj *key, robj *reply) {
@@ -130,7 +175,14 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) {
dictEntry *de = dictFind(db->dict,key->ptr);
serverAssertWithInfo(NULL,key,de != NULL);
- dictReplace(db->dict, key->ptr, val);
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ robj *old = dictGetVal(de);
+ int saved_lru = old->lru;
+ dictReplace(db->dict, key->ptr, val);
+ val->lru = saved_lru;
+ } else {
+ dictReplace(db->dict, key->ptr, val);
+ }
}
/* High level Set operation. This function can be used in order to set
@@ -138,7 +190,9 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) {
*
* 1) The ref count of the value object is incremented.
* 2) clients WATCHing for the destination key notified.
- * 3) The expire time of the key is reset (the key is made persistent). */
+ * 3) The expire time of the key is reset (the key is made persistent).
+ *
+ * All the new keys in the database should be created via this interface. */
void setKey(redisDb *db, robj *key, robj *val) {
if (lookupKeyWrite(db,key) == NULL) {
dbAdd(db,key,val);
@@ -278,6 +332,7 @@ long long emptyDb(int dbnum, int flags, void(callback)(void*)) {
slotToKeyFlush();
}
}
+ if (dbnum == -1) flushSlaveKeysWithExpireList();
return removed;
}
@@ -361,7 +416,7 @@ void flushallCommand(client *c) {
/* Normally rdbSave() will reset dirty, but we don't want this here
* as otherwise FLUSHALL will not be replicated nor put into the AOF. */
int saved_dirty = server.dirty;
- rdbSave(server.rdb_filename);
+ rdbSave(server.rdb_filename,NULL);
server.dirty = saved_dirty;
}
server.dirty++;
@@ -419,7 +474,7 @@ void selectCommand(client *c) {
return;
}
if (selectDb(c,id) == C_ERR) {
- addReplyError(c,"invalid DB index");
+ addReplyError(c,"DB index is out of range");
} else {
addReply(c,shared.ok);
}
@@ -721,7 +776,7 @@ void typeCommand(client *c) {
robj *o;
char *type;
- o = lookupKeyRead(c->db,c->argv[1]);
+ o = lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH);
if (o == NULL) {
type = "none";
} else {
@@ -731,6 +786,10 @@ void typeCommand(client *c) {
case OBJ_SET: type = "set"; break;
case OBJ_ZSET: type = "zset"; break;
case OBJ_HASH: type = "hash"; break;
+ case OBJ_MODULE: {
+ moduleValue *mv = o->ptr;
+ type = mv->type->name;
+ }; break;
default: type = "unknown"; break;
}
}
@@ -795,7 +854,7 @@ void renameGenericCommand(client *c, int nx) {
dbDelete(c->db,c->argv[2]);
}
dbAdd(c->db,c->argv[2],o);
- if (expire != -1) setExpire(c->db,c->argv[2],expire);
+ if (expire != -1) setExpire(c,c->db,c->argv[2],expire);
dbDelete(c->db,c->argv[1]);
signalModifiedKey(c->db,c->argv[1]);
signalModifiedKey(c->db,c->argv[2]);
@@ -861,7 +920,7 @@ void moveCommand(client *c) {
return;
}
dbAdd(dst,c->argv[1],o);
- if (expire != -1) setExpire(dst,c->argv[1],expire);
+ if (expire != -1) setExpire(c,dst,c->argv[1],expire);
incrRefCount(o);
/* OK! key moved, free the entry in the source DB */
@@ -870,6 +929,91 @@ void moveCommand(client *c) {
addReply(c,shared.cone);
}
+/* Helper function for dbSwapDatabases(): scans the list of keys that have
+ * one or more blocked clients for B[LR]POP or other list blocking commands
+ * and signals the keys as ready if they are lists. See the comment where
+ * the function is used for more info. */
+void scanDatabaseForReadyLists(redisDb *db) {
+ dictEntry *de;
+ dictIterator *di = dictGetSafeIterator(db->blocking_keys);
+ while((de = dictNext(di)) != NULL) {
+ robj *key = dictGetKey(de);
+ robj *value = lookupKey(db,key,LOOKUP_NOTOUCH);
+ if (value && value->type == OBJ_LIST)
+ signalListAsReady(db, key);
+ }
+ dictReleaseIterator(di);
+}
+
+/* Swap two databases at runtime so that all clients will magically see
+ * the new database even if already connected. Note that the client
+ * structure c->db points to a given DB, so we need to be smarter and
+ * swap the underlying referenced structures, otherwise we would need
+ * to fix all the references to the Redis DB structure.
+ *
+ * Returns C_ERR if at least one of the DB ids is out of range, otherwise
+ * C_OK is returned. */
+int dbSwapDatabases(int id1, int id2) {
+ if (id1 < 0 || id1 >= server.dbnum ||
+ id2 < 0 || id2 >= server.dbnum) return C_ERR;
+ if (id1 == id2) return C_OK;
+ redisDb aux = server.db[id1];
+ redisDb *db1 = &server.db[id1], *db2 = &server.db[id2];
+
+ /* Swap hash tables. Note that we don't swap blocking_keys,
+ * ready_keys and watched_keys, since we want clients to
+ * remain in the same DB they were. */
+ db1->dict = db2->dict;
+ db1->expires = db2->expires;
+ db1->avg_ttl = db2->avg_ttl;
+
+ db2->dict = aux.dict;
+ db2->expires = aux.expires;
+ db2->avg_ttl = aux.avg_ttl;
+
+ /* Now we need to handle clients blocked on lists: as an effect
+ * of swapping the two DBs, a client that was waiting for list
+ * X in a given DB, may now actually be unblocked if X happens
+ * to exist in the new version of the DB, after the swap.
+ *
+ * However normally we only do this check for efficiency reasons
+ * in dbAdd() when a list is created. So here we need to rescan
+ * the list of clients blocked on lists and signal lists as ready
+ * if needed. */
+ scanDatabaseForReadyLists(db1);
+ scanDatabaseForReadyLists(db2);
+ return C_OK;
+}
+
+/* SWAPDB db1 db2 */
+void swapdbCommand(client *c) {
+ long id1, id2;
+
+ /* Not allowed in cluster mode: we have just DB 0 there. */
+ if (server.cluster_enabled) {
+ addReplyError(c,"SWAPDB is not allowed in cluster mode");
+ return;
+ }
+
+ /* Get the two DBs indexes. */
+ if (getLongFromObjectOrReply(c, c->argv[1], &id1,
+ "invalid first DB index") != C_OK)
+ return;
+
+ if (getLongFromObjectOrReply(c, c->argv[2], &id2,
+ "invalid second DB index") != C_OK)
+ return;
+
+ /* Swap... */
+ if (dbSwapDatabases(id1,id2) == C_ERR) {
+ addReplyError(c,"DB index is out of range");
+ return;
+ } else {
+ server.dirty++;
+ addReply(c,shared.ok);
+ }
+}
+
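
The central design choice in dbSwapDatabases() is to swap the contents of the two redisDb structures rather than any pointers, so that every client's c->db reference stays valid and simply observes the other dataset afterwards. A generic standalone sketch of that pattern with a toy struct, not the Redis one; from the user's point of view this is simply SWAPDB 0 1.

    #include <stdio.h>

    struct db { void *dict; long avg_ttl; };

    /* Swap the payload of two structs in place; external pointers to either
     * struct keep working and now observe the other content. */
    static void swap_contents(struct db *a, struct db *b) {
        struct db aux = *a;
        a->dict = b->dict;   a->avg_ttl = b->avg_ttl;
        b->dict = aux.dict;  b->avg_ttl = aux.avg_ttl;
    }

    int main(void) {
        int d0 = 0, d1 = 1;
        struct db db0 = { &d0, 100 }, db1 = { &d1, 200 };
        struct db *client_view = &db0;       /* like a client's c->db */
        swap_contents(&db0, &db1);
        printf("%d\n", *(int *) client_view->dict);   /* prints 1 */
        return 0;
    }
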
/*-----------------------------------------------------------------------------
* Expires API
*----------------------------------------------------------------------------*/
@@ -881,14 +1025,22 @@ int removeExpire(redisDb *db, robj *key) {
return dictDelete(db->expires,key->ptr) == DICT_OK;
}
-void setExpire(redisDb *db, robj *key, long long when) {
+/* Set an expire to the specified key. If the expire is set in the context
+ * of a user calling a command, 'c' is the client, otherwise 'c' is set
+ * to NULL. The 'when' parameter is the absolute unix time in milliseconds
+ * after which the key will no longer be considered valid. */
+void setExpire(client *c, redisDb *db, robj *key, long long when) {
dictEntry *kde, *de;
/* Reuse the sds from the main dict in the expire dict */
kde = dictFind(db->dict,key->ptr);
serverAssertWithInfo(NULL,key,kde != NULL);
- de = dictReplaceRaw(db->expires,dictGetKey(kde));
+ de = dictAddOrFind(db->expires,dictGetKey(kde));
dictSetSignedIntegerVal(de,when);
+
+ int writable_slave = server.masterhost && server.repl_slave_ro == 0;
+ if (c && writable_slave && !(c->flags & CLIENT_MASTER))
+ rememberSlaveKeyWithExpire(db,key);
}
/* Return the expire time of the specified key, or -1 if no expire
@@ -967,126 +1119,6 @@ int expireIfNeeded(redisDb *db, robj *key) {
dbSyncDelete(db,key);
}
-/*-----------------------------------------------------------------------------
- * Expires Commands
- *----------------------------------------------------------------------------*/
-
-/* This is the generic command implementation for EXPIRE, PEXPIRE, EXPIREAT
- * and PEXPIREAT. Because the commad second argument may be relative or absolute
- * the "basetime" argument is used to signal what the base time is (either 0
- * for *AT variants of the command, or the current time for relative expires).
- *
- * unit is either UNIT_SECONDS or UNIT_MILLISECONDS, and is only used for
- * the argv[2] parameter. The basetime is always specified in milliseconds. */
-void expireGenericCommand(client *c, long long basetime, int unit) {
- robj *key = c->argv[1], *param = c->argv[2];
- long long when; /* unix time in milliseconds when the key will expire. */
-
- if (getLongLongFromObjectOrReply(c, param, &when, NULL) != C_OK)
- return;
-
- if (unit == UNIT_SECONDS) when *= 1000;
- when += basetime;
-
- /* No key, return zero. */
- if (lookupKeyWrite(c->db,key) == NULL) {
- addReply(c,shared.czero);
- return;
- }
-
- /* EXPIRE with negative TTL, or EXPIREAT with a timestamp into the past
- * should never be executed as a DEL when load the AOF or in the context
- * of a slave instance.
- *
- * Instead we take the other branch of the IF statement setting an expire
- * (possibly in the past) and wait for an explicit DEL from the master. */
- if (when <= mstime() && !server.loading && !server.masterhost) {
- robj *aux;
-
- int deleted = server.lazyfree_lazy_expire ? dbAsyncDelete(c->db,key) :
- dbSyncDelete(c->db,key);
- serverAssertWithInfo(c,key,deleted);
- server.dirty++;
-
- /* Replicate/AOF this as an explicit DEL or UNLINK. */
- aux = server.lazyfree_lazy_expire ? shared.unlink : shared.del;
- rewriteClientCommandVector(c,2,aux,key);
- signalModifiedKey(c->db,key);
- notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
- addReply(c, shared.cone);
- return;
- } else {
- setExpire(c->db,key,when);
- addReply(c,shared.cone);
- signalModifiedKey(c->db,key);
- notifyKeyspaceEvent(NOTIFY_GENERIC,"expire",key,c->db->id);
- server.dirty++;
- return;
- }
-}
-
-void expireCommand(client *c) {
- expireGenericCommand(c,mstime(),UNIT_SECONDS);
-}
-
-void expireatCommand(client *c) {
- expireGenericCommand(c,0,UNIT_SECONDS);
-}
-
-void pexpireCommand(client *c) {
- expireGenericCommand(c,mstime(),UNIT_MILLISECONDS);
-}
-
-void pexpireatCommand(client *c) {
- expireGenericCommand(c,0,UNIT_MILLISECONDS);
-}
-
-void ttlGenericCommand(client *c, int output_ms) {
- long long expire, ttl = -1;
-
- /* If the key does not exist at all, return -2 */
- if (lookupKeyRead(c->db,c->argv[1]) == NULL) {
- addReplyLongLong(c,-2);
- return;
- }
- /* The key exists. Return -1 if it has no expire, or the actual
- * TTL value otherwise. */
- expire = getExpire(c->db,c->argv[1]);
- if (expire != -1) {
- ttl = expire-mstime();
- if (ttl < 0) ttl = 0;
- }
- if (ttl == -1) {
- addReplyLongLong(c,-1);
- } else {
- addReplyLongLong(c,output_ms ? ttl : ((ttl+500)/1000));
- }
-}
-
-void ttlCommand(client *c) {
- ttlGenericCommand(c, 0);
-}
-
-void pttlCommand(client *c) {
- ttlGenericCommand(c, 1);
-}
-
-void persistCommand(client *c) {
- dictEntry *de;
-
- de = dictFind(c->db->dict,c->argv[1]->ptr);
- if (de == NULL) {
- addReply(c,shared.czero);
- } else {
- if (removeExpire(c->db,c->argv[1])) {
- addReply(c,shared.cone);
- server.dirty++;
- } else {
- addReply(c,shared.czero);
- }
- }
-}
-
/* -----------------------------------------------------------------------------
* API to get key arguments from commands
* ---------------------------------------------------------------------------*/
diff --git a/src/debug.c b/src/debug.c
index 1e179caff..b8ad4e511 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -33,12 +33,14 @@
#include <arpa/inet.h>
#include <signal.h>
+#include <dlfcn.h>
#ifdef HAVE_BACKTRACE
#include <execinfo.h>
#include <ucontext.h>
#include <fcntl.h>
#include "bio.h"
+#include <unistd.h>
#endif /* HAVE_BACKTRACE */
#ifdef __CYGWIN__
@@ -250,14 +252,6 @@ void computeDatasetDigest(unsigned char *final) {
}
}
-#if defined(USE_JEMALLOC)
-void inputCatSds(void *result, const char *str) {
- /* result is actually a (sds *), so re-cast it here */
- sds *info = (sds *)result;
- *info = sdscat(*info, str);
-}
-#endif
-
void debugCommand(client *c) {
if (c->argc == 1) {
addReplyError(c,"You must specify a subcommand for DEBUG. Try DEBUG HELP for info.");
@@ -286,6 +280,8 @@ void debugCommand(client *c) {
blen++; addReplyStatus(c,
"sdslen <key> -- Show low level SDS string info representing key and value.");
blen++; addReplyStatus(c,
+ "ziplist <key> -- Show low level info about the ziplist encoding.");
+ blen++; addReplyStatus(c,
"populate <count> [prefix] -- Create <count> string keys named key:<num>. If a prefix is specified is used instead of the 'key' prefix.");
blen++; addReplyStatus(c,
"digest -- Outputs an hex signature representing the current DB content.");
@@ -301,10 +297,6 @@ void debugCommand(client *c) {
"structsize -- Return the size of different Redis core C structures.");
blen++; addReplyStatus(c,
"htstats <dbid> -- Return hash table statistics of the specified Redis database.");
- blen++; addReplyStatus(c,
- "jemalloc info -- Show internal jemalloc statistics.");
- blen++; addReplyStatus(c,
- "jemalloc purge -- Force jemalloc to release unused memory.");
setDeferredMultiBulkLength(c,blenp,blen);
} else if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
*((char*)-1) = 'x';
@@ -330,12 +322,12 @@ void debugCommand(client *c) {
if (c->argc >= 3) c->argv[2] = tryObjectEncoding(c->argv[2]);
serverAssertWithInfo(c,c->argv[0],1 == 2);
} else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
- if (rdbSave(server.rdb_filename) != C_OK) {
+ if (rdbSave(server.rdb_filename,NULL) != C_OK) {
addReply(c,shared.err);
return;
}
emptyDb(-1,EMPTYDB_NO_FLAGS,NULL);
- if (rdbLoad(server.rdb_filename) != C_OK) {
+ if (rdbLoad(server.rdb_filename,NULL) != C_OK) {
addReplyError(c,"Error trying to load the RDB dump");
return;
}
@@ -419,12 +411,26 @@ void debugCommand(client *c) {
addReplyError(c,"Not an sds encoded string.");
} else {
addReplyStatusFormat(c,
- "key_sds_len:%lld, key_sds_avail:%lld, "
- "val_sds_len:%lld, val_sds_avail:%lld",
+ "key_sds_len:%lld, key_sds_avail:%lld, key_zmalloc: %lld, "
+ "val_sds_len:%lld, val_sds_avail:%lld, val_zmalloc: %lld",
(long long) sdslen(key),
(long long) sdsavail(key),
+ (long long) sdsZmallocSize(key),
(long long) sdslen(val->ptr),
- (long long) sdsavail(val->ptr));
+ (long long) sdsavail(val->ptr),
+ (long long) getStringObjectSdsUsedMemory(val));
+ }
+ } else if (!strcasecmp(c->argv[1]->ptr,"ziplist") && c->argc == 3) {
+ robj *o;
+
+ if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nokeyerr))
+ == NULL) return;
+
+ if (o->encoding != OBJ_ENCODING_ZIPLIST) {
+ addReplyError(c,"Not a ziplist encoded object.");
+ } else {
+ ziplistRepr(o->ptr);
+ addReplyStatus(c,"Ziplist structure printed on stdout");
}
} else if (!strcasecmp(c->argv[1]->ptr,"populate") &&
(c->argc == 3 || c->argc == 4)) {
@@ -518,30 +524,6 @@ void debugCommand(client *c) {
stats = sdscat(stats,buf);
addReplyBulkSds(c,stats);
- } else if (!strcasecmp(c->argv[1]->ptr,"jemalloc") && c->argc == 3) {
-#if defined(USE_JEMALLOC)
- if (!strcasecmp(c->argv[2]->ptr, "info")) {
- sds info = sdsempty();
- je_malloc_stats_print(inputCatSds, &info, NULL);
- addReplyBulkSds(c, info);
- } else if (!strcasecmp(c->argv[2]->ptr, "purge")) {
- char tmp[32];
- unsigned narenas = 0;
- size_t sz = sizeof(unsigned);
- if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) {
- sprintf(tmp, "arena.%d.purge", narenas);
- if (!je_mallctl(tmp, NULL, 0, NULL, 0)) {
- addReply(c, shared.ok);
- return;
- }
- }
- addReplyError(c, "Error purging dirty pages");
- } else {
- addReplyErrorFormat(c, "Valid jemalloc debug fields: info, purge");
- }
-#else
- addReplyErrorFormat(c, "jemalloc support not available");
-#endif
} else {
addReplyErrorFormat(c, "Unknown DEBUG subcommand or wrong number of arguments for '%s'",
(char*)c->argv[1]->ptr);
@@ -550,7 +532,7 @@ void debugCommand(client *c) {
/* =========================== Crash handling ============================== */
-void _serverAssert(char *estr, char *file, int line) {
+void _serverAssert(const char *estr, const char *file, int line) {
bugReportStart();
serverLog(LL_WARNING,"=== ASSERTION FAILED ===");
serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
@@ -563,7 +545,7 @@ void _serverAssert(char *estr, char *file, int line) {
*((char*)-1) = 'x';
}
-void _serverAssertPrintClientInfo(client *c) {
+void _serverAssertPrintClientInfo(const client *c) {
int j;
bugReportStart();
@@ -587,7 +569,7 @@ void _serverAssertPrintClientInfo(client *c) {
}
}
-void serverLogObjectDebugInfo(robj *o) {
+void serverLogObjectDebugInfo(const robj *o) {
serverLog(LL_WARNING,"Object type: %d", o->type);
serverLog(LL_WARNING,"Object encoding: %d", o->encoding);
serverLog(LL_WARNING,"Object refcount: %d", o->refcount);
@@ -607,23 +589,23 @@ void serverLogObjectDebugInfo(robj *o) {
} else if (o->type == OBJ_ZSET) {
serverLog(LL_WARNING,"Sorted set size: %d", (int) zsetLength(o));
if (o->encoding == OBJ_ENCODING_SKIPLIST)
- serverLog(LL_WARNING,"Skiplist level: %d", (int) ((zset*)o->ptr)->zsl->level);
+ serverLog(LL_WARNING,"Skiplist level: %d", (int) ((const zset*)o->ptr)->zsl->level);
}
}
-void _serverAssertPrintObject(robj *o) {
+void _serverAssertPrintObject(const robj *o) {
bugReportStart();
serverLog(LL_WARNING,"=== ASSERTION FAILED OBJECT CONTEXT ===");
serverLogObjectDebugInfo(o);
}
-void _serverAssertWithInfo(client *c, robj *o, char *estr, char *file, int line) {
+void _serverAssertWithInfo(const client *c, const robj *o, const char *estr, const char *file, int line) {
if (c) _serverAssertPrintClientInfo(c);
if (o) _serverAssertPrintObject(o);
_serverAssert(estr,file,line);
}
-void _serverPanic(char *msg, char *file, int line) {
+void _serverPanic(const char *msg, const char *file, int line) {
bugReportStart();
serverLog(LL_WARNING,"------------------------------------------------");
serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue");
@@ -669,6 +651,8 @@ static void *getMcontextEip(ucontext_t *uc) {
return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 */
#elif defined(__ia64__) /* Linux IA64 */
return (void*) uc->uc_mcontext.sc_ip;
+ #elif defined(__arm__) /* Linux ARM */
+ return (void*) uc->uc_mcontext.arm_pc;
#endif
#else
return NULL;
@@ -970,6 +954,32 @@ int memtest_test_linux_anonymous_maps(void) {
}
#endif
+/* Scans the (assumed) x86 code starting at addr, for a max of `len`
+ * bytes, searching for E8 (callq) opcodes, and dumping the symbols
+ * and the call offset if they appear to be valid. */
+void dumpX86Calls(void *addr, size_t len) {
+ size_t j;
+ unsigned char *p = addr;
+ Dl_info info;
+ /* Hash table to best-effort avoid printing the same symbol
+ * multiple times. */
+ unsigned long ht[256] = {0};
+
+ if (len < 5) return;
+ for (j = 0; j < len-4; j++) {
+ if (p[j] != 0xE8) continue; /* Not an E8 CALL opcode. */
+ unsigned long target = (unsigned long)addr+j+5;
+ target += *((int32_t*)(p+j+1));
+ if (dladdr((void*)target, &info) != 0 && info.dli_sname != NULL) {
+ if (ht[target&0xff] != target) {
+ printf("Function at 0x%lx is %s\n",target,info.dli_sname);
+ ht[target&0xff] = target;
+ }
+ j += 4; /* Skip the 32 bit immediate. */
+ }
+ }
+}
+
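
dumpX86Calls() relies on the x86 near-call encoding: opcode E8 followed by a 32-bit signed displacement relative to the next instruction, hence the target = address of the E8 byte + 5 + rel32 computation above. A standalone sketch of that decode on a synthetic byte sequence:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Decode the target of an E8 (CALL rel32) instruction found at offset
     * 'off' inside 'code': target = address of next instruction + rel32. */
    static unsigned long call_target(const unsigned char *code,
                                     unsigned long base, size_t off) {
        int32_t rel;
        memcpy(&rel, code + off + 1, sizeof(rel));   /* skip the E8 opcode byte */
        return base + off + 5 + (long) rel;
    }

    int main(void) {
        /* E8 10 00 00 00 at offset 0: call to base + 5 + 0x10 = base + 0x15. */
        unsigned char code[] = { 0xE8, 0x10, 0x00, 0x00, 0x00 };
        printf("0x%lx\n", call_target(code, 0x400000UL, 0));   /* 0x400015 */
        return 0;
    }
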
void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
ucontext_t *uc = (ucontext_t*) secret;
void *eip = getMcontextEip(uc);
@@ -1020,19 +1030,49 @@ void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
bioKillThreads();
if (memtest_test_linux_anonymous_maps()) {
serverLogRaw(LL_WARNING|LL_RAW,
- "!!! MEMORY ERROR DETECTED! Check your memory ASAP !!!");
+ "!!! MEMORY ERROR DETECTED! Check your memory ASAP !!!\n");
} else {
serverLogRaw(LL_WARNING|LL_RAW,
- "Fast memory test PASSED, however your memory can still be broken. Please run a memory test for several hours if possible.");
+ "Fast memory test PASSED, however your memory can still be broken. Please run a memory test for several hours if possible.\n");
}
#endif
+ if (eip != NULL) {
+ Dl_info info;
+ if (dladdr(eip, &info) != 0) {
+ serverLog(LL_WARNING|LL_RAW,
+ "\n------ DUMPING CODE AROUND EIP ------\n"
+ "Symbol: %s (base: %p)\n"
+ "Module: %s (base %p)\n"
+ "$ xxd -r -p /tmp/dump.hex /tmp/dump.bin\n"
+ "$ objdump --adjust-vma=%p -D -b binary -m i386:x86-64 /tmp/dump.bin\n"
+ "------\n",
+ info.dli_sname, info.dli_saddr, info.dli_fname, info.dli_fbase,
+ info.dli_saddr);
+ size_t len = (long)eip - (long)info.dli_saddr;
+ unsigned long sz = sysconf(_SC_PAGESIZE);
+ if (len < 1<<13) { /* we don't have functions over 8k (verified) */
+ /* Find the address of the next page, which is our "safety"
+ * limit when dumping. Then try to dump just 128 bytes more
+ * than EIP if there is room, or stop sooner. */
+ unsigned long next = ((unsigned long)eip + sz) & ~(sz-1);
+ unsigned long end = (unsigned long)eip + 128;
+ if (end > next) end = next;
+ len = end - (unsigned long)info.dli_saddr;
+ serverLogHexDump(LL_WARNING, "dump of function",
+ info.dli_saddr ,len);
+ dumpX86Calls(info.dli_saddr,len);
+ }
+ }
+ }
+
serverLogRaw(LL_WARNING|LL_RAW,
"\n=== REDIS BUG REPORT END. Make sure to include from START to END. ===\n\n"
" Please report the crash by opening an issue on github:\n\n"
" http://github.com/antirez/redis/issues\n\n"
" Suspect RAM error? Use redis-server --test-memory to verify it.\n\n"
);
+
/* free(messages); Don't call free() with possibly corrupted memory. */
if (server.daemonize && server.supervised == 0) unlink(server.pidfile);
@@ -1053,7 +1093,7 @@ void serverLogHexDump(int level, char *descr, void *value, size_t len) {
unsigned char *v = value;
char charset[] = "0123456789abcdef";
- serverLog(level,"%s (hexdump):", descr);
+ serverLog(level,"%s (hexdump of %zu bytes):", descr, len);
b = buf;
while(len) {
b[0] = charset[(*v)>>4];
diff --git a/src/debugmacro.h b/src/debugmacro.h
new file mode 100644
index 000000000..ded2d2667
--- /dev/null
+++ b/src/debugmacro.h
@@ -0,0 +1,41 @@
+/* This file contains debugging macros to be used when investigating issues.
+ *
+ * -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#define D(...) \
+ do { \
+ FILE *fp = fopen("/tmp/log.txt","a"); \
+ fprintf(fp,"%s:%s:%d:\t", __FILE__, __func__, __LINE__); \
+ fprintf(fp,__VA_ARGS__); \
+ fprintf(fp,"\n"); \
+ fclose(fp); \
+ } while (0);
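
The D() macro above opens /tmp/log.txt in append mode on every use, prefixes the message with the file, function and line, and closes the file again. A minimal, hypothetical call site (not part of the patch) could look like this:

    #include "debugmacro.h"

    static void noteResize(int oldsize, int newsize) {
        /* Appends a line such as "dict.c:noteResize:42:  resize 4 -> 8"
         * to /tmp/log.txt each time it runs. */
        D("resize %d -> %d", oldsize, newsize);
    }

Since the file is reopened on every call, the macro is meant for ad-hoc debugging sessions, not for hot paths.
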
diff --git a/src/dict.c b/src/dict.c
index 887a0f65e..b9b2390f1 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -45,7 +45,11 @@
#include "dict.h"
#include "zmalloc.h"
+#ifndef DICT_BENCHMARK_MAIN
#include "redisassert.h"
+#else
+#include <assert.h>
+#endif
/* Using dictEnableResize() / dictDisableResize() we make it possible to
* enable/disable resizing of the hash table as needed. This is very important
@@ -62,23 +66,11 @@ static unsigned int dict_force_resize_ratio = 5;
static int _dictExpandIfNeeded(dict *ht);
static unsigned long _dictNextPower(unsigned long size);
-static int _dictKeyIndex(dict *ht, const void *key);
+static int _dictKeyIndex(dict *ht, const void *key, unsigned int hash, dictEntry **existing);
static int _dictInit(dict *ht, dictType *type, void *privDataPtr);
/* -------------------------- hash functions -------------------------------- */
-/* Thomas Wang's 32 bit Mix Function */
-unsigned int dictIntHashFunction(unsigned int key)
-{
- key += ~(key << 15);
- key ^= (key >> 10);
- key += (key << 3);
- key ^= (key >> 6);
- key += ~(key << 11);
- key ^= (key >> 16);
- return key;
-}
-
static uint32_t dict_hash_function_seed = 5381;
void dictSetHashFunctionSeed(uint32_t seed) {
@@ -321,29 +313,32 @@ static void _dictRehashStep(dict *d) {
/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
- dictEntry *entry = dictAddRaw(d,key);
+ dictEntry *entry = dictAddRaw(d,key,NULL);
if (!entry) return DICT_ERR;
dictSetVal(d, entry, val);
return DICT_OK;
}
-/* Low level add. This function adds the entry but instead of setting
- * a value returns the dictEntry structure to the user, that will make
- * sure to fill the value field as he wishes.
+/* Low level add or find:
+ * This function adds the entry but, instead of setting a value, returns the
+ * dictEntry structure to the caller, which is then responsible for filling
+ * in the value field as needed.
*
* This function is also directly exposed to the user API to be called
* mainly in order to store non-pointers inside the hash value, example:
*
- * entry = dictAddRaw(dict,mykey);
+ * entry = dictAddRaw(dict,mykey,NULL);
* if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
*
* Return values:
*
- * If key already exists NULL is returned.
+ * If key already exists NULL is returned, and "*existing" is populated
+ * with the existing entry if existing is not NULL.
+ *
* If key was added, the hash entry is returned to be manipulated by the caller.
*/
-dictEntry *dictAddRaw(dict *d, void *key)
+dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
int index;
dictEntry *entry;
@@ -353,7 +348,7 @@ dictEntry *dictAddRaw(dict *d, void *key)
/* Get the index of the new element, or -1 if
* the element already exists. */
- if ((index = _dictKeyIndex(d, key)) == -1)
+ if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
return NULL;
/* Allocate the memory and store the new entry.
@@ -371,51 +366,57 @@ dictEntry *dictAddRaw(dict *d, void *key)
return entry;
}
-/* Add an element, discarding the old if the key already exists.
+/* Add or Overwrite:
+ * Add an element, discarding the old value if the key already exists.
* Return 1 if the key was added from scratch, 0 if there was already an
* element with such key and dictReplace() just performed a value update
* operation. */
int dictReplace(dict *d, void *key, void *val)
{
- dictEntry *entry, auxentry;
+ dictEntry *entry, *existing, auxentry;
/* Try to add the element. If the key
* does not exist, dictAdd will succeed. */
- if (dictAdd(d, key, val) == DICT_OK)
+ entry = dictAddRaw(d,key,&existing);
+ if (entry) {
+ dictSetVal(d, entry, val);
return 1;
- /* It already exists, get the entry */
- entry = dictFind(d, key);
+ }
+
/* Set the new value and free the old one. Note that it is important
* to do that in this order, as the value may just be exactly the same
* as the previous one. In this context, think to reference counting,
* you want to increment (set), and then decrement (free), and not the
* reverse. */
- auxentry = *entry;
- dictSetVal(d, entry, val);
+ auxentry = *existing;
+ dictSetVal(d, existing, val);
dictFreeVal(d, &auxentry);
return 0;
}
-/* dictReplaceRaw() is simply a version of dictAddRaw() that always
+/* Add or Find:
+ * dictAddOrFind() is simply a version of dictAddRaw() that always
* returns the hash entry of the specified key, even if the key already
* exists and can't be added (in that case the entry of the already
* existing key is returned.)
*
* See dictAddRaw() for more information. */
-dictEntry *dictReplaceRaw(dict *d, void *key) {
- dictEntry *entry = dictFind(d,key);
-
- return entry ? entry : dictAddRaw(d,key);
+dictEntry *dictAddOrFind(dict *d, void *key) {
+ dictEntry *entry, *existing;
+ entry = dictAddRaw(d,key,&existing);
+ return entry ? entry : existing;
}
-/* Search and remove an element */
-static int dictGenericDelete(dict *d, const void *key, int nofree)
-{
+/* Search and remove an element. This is a helper function for
+ * dictDelete() and dictUnlink(), please check the top comment
+ * of those functions. */
+static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
unsigned int h, idx;
dictEntry *he, *prevHe;
int table;
- if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */
+ if (d->ht[0].used == 0 && d->ht[1].used == 0) return NULL;
+
if (dictIsRehashing(d)) _dictRehashStep(d);
h = dictHashKey(d, key);
@@ -433,27 +434,59 @@ static int dictGenericDelete(dict *d, const void *key, int nofree)
if (!nofree) {
dictFreeKey(d, he);
dictFreeVal(d, he);
+ zfree(he);
}
- zfree(he);
d->ht[table].used--;
- return DICT_OK;
+ return he;
}
prevHe = he;
he = he->next;
}
if (!dictIsRehashing(d)) break;
}
- return DICT_ERR; /* not found */
+ return NULL; /* not found */
}
+/* Remove an element, returning DICT_OK on success or DICT_ERR if the
+ * element was not found. */
int dictDelete(dict *ht, const void *key) {
- return dictGenericDelete(ht,key,0);
+ return dictGenericDelete(ht,key,0) ? DICT_OK : DICT_ERR;
}
-int dictDeleteNoFree(dict *ht, const void *key) {
+/* Remove an element from the table, but without actually releasing
+ * the key, value and dictionary entry. The dictionary entry is returned
+ * if the element was found (and unlinked from the table), and the user
+ * should later call `dictFreeUnlinkedEntry()` with it in order to release it.
+ * Otherwise if the key is not found, NULL is returned.
+ *
+ * This function is useful when we want to remove something from the hash
+ * table but want to use its value before actually deleting the entry.
+ * Without this function the pattern would require two lookups:
+ *
+ * entry = dictFind(...);
+ * // Do something with entry
+ * dictDelete(dictionary,key);
+ *
+ * Thanks to this function it is possible to avoid this, and use
+ * instead:
+ *
+ * entry = dictUnlink(dictionary,key);
+ * // Do something with entry
+ * dictFreeUnlinkedEntry(dictionary,entry); // <- This does not need to look up the key again.
+ */
+dictEntry *dictUnlink(dict *ht, const void *key) {
return dictGenericDelete(ht,key,1);
}
+/* You need to call this function to really free the entry after a call
+ * to dictUnlink(). It's safe to call this function with 'he' = NULL. */
+void dictFreeUnlinkedEntry(dict *d, dictEntry *he) {
+ if (he == NULL) return;
+ dictFreeKey(d, he);
+ dictFreeVal(d, he);
+ zfree(he);
+}
+
/* Destroy an entire dictionary */
int _dictClear(dict *d, dictht *ht, void(callback)(void *)) {
unsigned long i;
@@ -962,27 +995,29 @@ static unsigned long _dictNextPower(unsigned long size)
/* Returns the index of a free slot that can be populated with
* a hash entry for the given 'key'.
- * If the key already exists, -1 is returned.
+ * If the key already exists, -1 is returned
+ * and the optional output parameter may be filled.
*
* Note that if we are in the process of rehashing the hash table, the
* index is always returned in the context of the second (new) hash table. */
-static int _dictKeyIndex(dict *d, const void *key)
+static int _dictKeyIndex(dict *d, const void *key, unsigned int hash, dictEntry **existing)
{
- unsigned int h, idx, table;
+ unsigned int idx, table;
dictEntry *he;
+ if (existing) *existing = NULL;
/* Expand the hash table if needed */
if (_dictExpandIfNeeded(d) == DICT_ERR)
return -1;
- /* Compute the key hash value */
- h = dictHashKey(d, key);
for (table = 0; table <= 1; table++) {
- idx = h & d->ht[table].sizemask;
+ idx = hash & d->ht[table].sizemask;
/* Search if this slot does not already contain the given key */
he = d->ht[table].table[idx];
while(he) {
- if (key==he->key || dictCompareKeys(d, key, he->key))
+ if (key==he->key || dictCompareKeys(d, key, he->key)) {
+ if (existing) *existing = he;
return -1;
+ }
he = he->next;
}
if (!dictIsRehashing(d)) break;
@@ -1083,3 +1118,120 @@ void dictGetStats(char *buf, size_t bufsize, dict *d) {
/* Make sure there is a NULL term at the end. */
if (orig_bufsize) orig_buf[orig_bufsize-1] = '\0';
}
+
+/* ------------------------------- Benchmark ---------------------------------*/
+
+#ifdef DICT_BENCHMARK_MAIN
+
+#include "sds.h"
+
+unsigned int hashCallback(const void *key) {
+ return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
+}
+
+int compareCallback(void *privdata, const void *key1, const void *key2) {
+ int l1,l2;
+ DICT_NOTUSED(privdata);
+
+ l1 = sdslen((sds)key1);
+ l2 = sdslen((sds)key2);
+ if (l1 != l2) return 0;
+ return memcmp(key1, key2, l1) == 0;
+}
+
+void freeCallback(void *privdata, void *val) {
+ DICT_NOTUSED(privdata);
+
+ sdsfree(val);
+}
+
+dictType BenchmarkDictType = {
+ hashCallback,
+ NULL,
+ NULL,
+ compareCallback,
+ freeCallback,
+ NULL
+};
+
+#define start_benchmark() start = timeInMilliseconds()
+#define end_benchmark(msg) do { \
+ elapsed = timeInMilliseconds()-start; \
+ printf(msg ": %ld items in %lld ms\n", count, elapsed); \
+} while(0);
+
+/* dict-benchmark [count] */
+int main(int argc, char **argv) {
+ long j;
+ long long start, elapsed;
+ dict *dict = dictCreate(&BenchmarkDictType,NULL);
+ long count = 0;
+
+ if (argc == 2) {
+ count = strtol(argv[1],NULL,10);
+ } else {
+ count = 5000000;
+ }
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ int retval = dictAdd(dict,sdsfromlonglong(j),(void*)j);
+ assert(retval == DICT_OK);
+ }
+ end_benchmark("Inserting");
+ assert((long)dictSize(dict) == count);
+
+ /* Wait for rehashing. */
+ while (dictIsRehashing(dict)) {
+ dictRehashMilliseconds(dict,100);
+ }
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ sds key = sdsfromlonglong(j);
+ dictEntry *de = dictFind(dict,key);
+ assert(de != NULL);
+ sdsfree(key);
+ }
+ end_benchmark("Linear access of existing elements");
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ sds key = sdsfromlonglong(j);
+ dictEntry *de = dictFind(dict,key);
+ assert(de != NULL);
+ sdsfree(key);
+ }
+ end_benchmark("Linear access of existing elements (2nd round)");
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ sds key = sdsfromlonglong(rand() % count);
+ dictEntry *de = dictFind(dict,key);
+ assert(de != NULL);
+ sdsfree(key);
+ }
+ end_benchmark("Random access of existing elements");
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ sds key = sdsfromlonglong(rand() % count);
+ key[0] = 'X';
+ dictEntry *de = dictFind(dict,key);
+ assert(de == NULL);
+ sdsfree(key);
+ }
+ end_benchmark("Accessing missing");
+
+ start_benchmark();
+ for (j = 0; j < count; j++) {
+ sds key = sdsfromlonglong(j);
+ int retval = dictDelete(dict,key);
+ assert(retval == DICT_OK);
+ key[0] += 17; /* Change first number to letter. */
+ retval = dictAdd(dict,key,(void*)j);
+ assert(retval == DICT_OK);
+ }
+ end_benchmark("Removing and adding");
+}
+#endif
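
Taken together, dictAddRaw() with its new 'existing' output parameter and the dictUnlink()/dictFreeUnlinkedEntry() pair both remove a second lookup from common patterns. A short illustrative sketch, assuming a dict 'd' keyed by SDS strings and a key 'mykey' (placeholder names, not from the patch):

    dictEntry *de, *existing;

    /* Add-or-update in a single lookup: dictAddRaw() returns the new entry,
     * or NULL while filling 'existing' when the key is already present. */
    de = dictAddRaw(d, mykey, &existing);
    if (de)
        dictSetSignedIntegerVal(de, 1);
    else
        dictSetSignedIntegerVal(existing, dictGetSignedIntegerVal(existing)+1);

    /* Delete while still using the value: unlink first, release later. */
    de = dictUnlink(d, mykey);
    if (de) {
        /* ... use dictGetVal(de) or the integer value here ... */
        dictFreeUnlinkedEntry(d, de);   /* no second lookup needed */
    }
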
diff --git a/src/dict.h b/src/dict.h
index 967a238b6..04b247a25 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -106,19 +106,19 @@ typedef void (dictScanFunction)(void *privdata, const dictEntry *de);
#define dictSetVal(d, entry, _val_) do { \
if ((d)->type->valDup) \
- entry->v.val = (d)->type->valDup((d)->privdata, _val_); \
+ (entry)->v.val = (d)->type->valDup((d)->privdata, _val_); \
else \
- entry->v.val = (_val_); \
+ (entry)->v.val = (_val_); \
} while(0)
#define dictSetSignedIntegerVal(entry, _val_) \
- do { entry->v.s64 = _val_; } while(0)
+ do { (entry)->v.s64 = _val_; } while(0)
#define dictSetUnsignedIntegerVal(entry, _val_) \
- do { entry->v.u64 = _val_; } while(0)
+ do { (entry)->v.u64 = _val_; } while(0)
#define dictSetDoubleVal(entry, _val_) \
- do { entry->v.d = _val_; } while(0)
+ do { (entry)->v.d = _val_; } while(0)
#define dictFreeKey(d, entry) \
if ((d)->type->keyDestructor) \
@@ -126,9 +126,9 @@ typedef void (dictScanFunction)(void *privdata, const dictEntry *de);
#define dictSetKey(d, entry, _key_) do { \
if ((d)->type->keyDup) \
- entry->key = (d)->type->keyDup((d)->privdata, _key_); \
+ (entry)->key = (d)->type->keyDup((d)->privdata, _key_); \
else \
- entry->key = (_key_); \
+ (entry)->key = (_key_); \
} while(0)
#define dictCompareKeys(d, key1, key2) \
@@ -150,11 +150,12 @@ typedef void (dictScanFunction)(void *privdata, const dictEntry *de);
dict *dictCreate(dictType *type, void *privDataPtr);
int dictExpand(dict *d, unsigned long size);
int dictAdd(dict *d, void *key, void *val);
-dictEntry *dictAddRaw(dict *d, void *key);
+dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
+dictEntry *dictAddOrFind(dict *d, void *key);
int dictReplace(dict *d, void *key, void *val);
-dictEntry *dictReplaceRaw(dict *d, void *key);
int dictDelete(dict *d, const void *key);
-int dictDeleteNoFree(dict *d, const void *key);
+dictEntry *dictUnlink(dict *ht, const void *key);
+void dictFreeUnlinkedEntry(dict *d, dictEntry *he);
void dictRelease(dict *d);
dictEntry * dictFind(dict *d, const void *key);
void *dictFetchValue(dict *d, const void *key);
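
The extra parentheses around the 'entry' macro argument matter as soon as a caller passes anything more complex than a plain identifier. A hypothetical example of the hazard the change removes:

    /* Old expansion: cond ? a : b->v.s64 = 0    (mis-parses, binds to 'b' only)
     * New expansion: (cond ? a : b)->v.s64 = 0  (what the caller intended)     */
    dictSetSignedIntegerVal(cond ? a : b, 0);
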
diff --git a/src/evict.c b/src/evict.c
new file mode 100644
index 000000000..802997ce8
--- /dev/null
+++ b/src/evict.c
@@ -0,0 +1,524 @@
+/* Maxmemory directive handling (LRU eviction and other policies).
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+#include "bio.h"
+
+/* ----------------------------------------------------------------------------
+ * Data structures
+ * --------------------------------------------------------------------------*/
+
+/* To improve the quality of the LRU approximation we take a set of keys
+ * that are good candidates for eviction across freeMemoryIfNeeded() calls.
+ *
+ * Entries inside the eviction pool are kept ordered by idle time, putting
+ * greater idle times to the right (ascending order).
+ *
+ * When an LFU policy is used, a reverse frequency indication is stored
+ * instead of the idle time, so that we still evict by larger value (a larger
+ * inverse frequency means we evict the keys with the least frequent accesses).
+ *
+ * Empty entries have the key pointer set to NULL. */
+#define EVPOOL_SIZE 16
+#define EVPOOL_CACHED_SDS_SIZE 255
+struct evictionPoolEntry {
+ unsigned long long idle; /* Object idle time (inverse frequency for LFU) */
+ sds key; /* Key name. */
+ sds cached; /* Cached SDS object for key name. */
+ int dbid; /* Key DB number. */
+};
+
+static struct evictionPoolEntry *EvictionPoolLRU;
+
+unsigned long LFUDecrAndReturn(robj *o);
+
+/* ----------------------------------------------------------------------------
+ * Implementation of eviction, aging and LRU
+ * --------------------------------------------------------------------------*/
+
+/* Return the LRU clock, based on the clock resolution. This is a time
+ * in a reduced-bits format that can be used to set and check the
+ * object->lru field of redisObject structures. */
+unsigned int getLRUClock(void) {
+ return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
+}
+
+/* Given an object, returns the minimum number of milliseconds since the
+ * object was last requested, using an approximated LRU algorithm. */
+unsigned long long estimateObjectIdleTime(robj *o) {
+ unsigned long long lruclock = LRU_CLOCK();
+ if (lruclock >= o->lru) {
+ return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
+ } else {
+ return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
+ LRU_CLOCK_RESOLUTION;
+ }
+}
+
+/* freeMemoryIfNeeded() gets called when 'maxmemory' is set in the config
+ * file to limit the max memory used by the server, before processing a
+ * command.
+ *
+ * The goal of the function is to free enough memory to keep Redis under the
+ * configured memory limit.
+ *
+ * The function starts calculating how many bytes should be freed to keep
+ * Redis under the limit, and enters a loop selecting the best keys to
+ * evict according to the configured policy.
+ *
+ * If all the bytes needed to get back under the limit were freed, the
+ * function returns C_OK, otherwise C_ERR is returned, and the caller
+ * should block the execution of commands that will result in more memory
+ * used by the server.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * LRU approximation algorithm
+ *
+ * Redis uses an approximation of the LRU algorithm that runs in constant
+ * memory. Every time there is a key to expire, we sample N keys (with
+ * N very small, usually around 5) to populate a pool of best keys to
+ * evict of M keys (the pool size is defined by EVPOOL_SIZE).
+ *
+ * The N keys sampled are added to the pool of good keys to expire (the ones
+ * with an old access time) if they are better than one of the current keys
+ * in the pool.
+ *
+ * After the pool is populated, the best key we have in the pool is expired.
+ * However note that we don't remove keys from the pool when they are deleted
+ * so the pool may contain keys that no longer exist.
+ *
+ * When we try to evict a key and none of the entries in the pool exist,
+ * we populate it again. This time we'll be sure that the pool has at least
+ * one key that can be evicted, if there is at least one key that can be
+ * evicted in the whole database. */
+
+/* Create a new eviction pool. */
+void evictionPoolAlloc(void) {
+ struct evictionPoolEntry *ep;
+ int j;
+
+ ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
+ for (j = 0; j < EVPOOL_SIZE; j++) {
+ ep[j].idle = 0;
+ ep[j].key = NULL;
+ ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
+ ep[j].dbid = 0;
+ }
+ EvictionPoolLRU = ep;
+}
+
+/* This is a helper function for freeMemoryIfNeeded(); it is used in order
+ * to populate the evictionPool with a few entries every time we want to
+ * expire a key. Keys with idle time smaller than one of the current
+ * keys are added. Keys are always added if there are free entries.
+ *
+ * We insert keys in place in ascending order, so keys with a smaller
+ * idle time are on the left, and keys with a higher idle time on the
+ * right. */
+
+void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
+ int j, k, count;
+ dictEntry *samples[server.maxmemory_samples];
+
+ count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
+ for (j = 0; j < count; j++) {
+ unsigned long long idle;
+ sds key;
+ robj *o;
+ dictEntry *de;
+
+ de = samples[j];
+ key = dictGetKey(de);
+
+ /* If the dictionary we are sampling from is not the main
+ * dictionary (but the expires one) we need to lookup the key
+ * again in the key dictionary to obtain the value object. */
+ if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) {
+ if (sampledict != keydict) de = dictFind(keydict, key);
+ o = dictGetVal(de);
+ }
+
+ /* Calculate the idle time according to the policy. This is called
+ * idle just because the code initially handled LRU, but is in fact
+ * just a score where a higher score means a better candidate. */
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
+ idle = estimateObjectIdleTime(o);
+ } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ /* When we use an LRU policy, we sort the keys by idle time
+ * so that we expire keys starting from greater idle time.
+ * However when the policy is an LFU one, we have a frequency
+ * estimation, and we want to evict keys with lower frequency
+ * first. So inside the pool we put objects using the inverted
+ * frequency, subtracting the actual frequency from the maximum
+ * frequency of 255. */
+ idle = 255-LFUDecrAndReturn(o);
+ } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
+ /* In this case the sooner the expire the better. */
+ idle = ULLONG_MAX - (long)dictGetVal(de);
+ } else {
+ serverPanic("Unknown eviction policy in evictionPoolPopulate()");
+ }
+
+ /* Insert the element inside the pool.
+ * First, find the first empty bucket or the first populated
+ * bucket that has an idle time smaller than our idle time. */
+ k = 0;
+ while (k < EVPOOL_SIZE &&
+ pool[k].key &&
+ pool[k].idle < idle) k++;
+ if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
+ /* Can't insert if the element is < the worst element we have
+ * and there are no empty buckets. */
+ continue;
+ } else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
+ /* Inserting into empty position. No setup needed before insert. */
+ } else {
+ /* Inserting in the middle. Now k points to the first element
+ * greater than the element to insert. */
+ if (pool[EVPOOL_SIZE-1].key == NULL) {
+ /* Free space on the right? Insert at k shifting
+ * all the elements from k to end to the right. */
+
+ /* Save SDS before overwriting. */
+ sds cached = pool[EVPOOL_SIZE-1].cached;
+ memmove(pool+k+1,pool+k,
+ sizeof(pool[0])*(EVPOOL_SIZE-k-1));
+ pool[k].cached = cached;
+ } else {
+ /* No free space on right? Insert at k-1 */
+ k--;
+ /* Shift all elements on the left of k (included) to the
+ * left, so we discard the element with smaller idle time. */
+ sds cached = pool[0].cached; /* Save SDS before overwriting. */
+ if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
+ memmove(pool,pool+1,sizeof(pool[0])*k);
+ pool[k].cached = cached;
+ }
+ }
+
+ /* Try to reuse the cached SDS string allocated in the pool entry,
+ * because allocating and deallocating this object is costly
+ * (according to the profiler, not my fantasy. Remember:
+ * premature optimization bla bla bla). */
+ int klen = sdslen(key);
+ if (klen > EVPOOL_CACHED_SDS_SIZE) {
+ pool[k].key = sdsdup(key);
+ } else {
+ memcpy(pool[k].cached,key,klen+1);
+ sdssetlen(pool[k].cached,klen);
+ pool[k].key = pool[k].cached;
+ }
+ pool[k].idle = idle;
+ pool[k].dbid = dbid;
+ }
+}
+
+/* ----------------------------------------------------------------------------
+ * LFU (Least Frequently Used) implementation.
+
+ * We have 24 total bits of space in each object in order to implement
+ * an LFU (Least Frequently Used) eviction policy, since we re-use the
+ * LRU field for this purpose.
+ *
+ * We split the 24 bits into two fields:
+ *
+ *        16 bits        8 bits
+ * +----------------+--------+
+ * + Last decr time | LOG_C |
+ * +----------------+--------+
+ *
+ * LOG_C is a logarithmic counter that provides an indication of the access
+ * frequency. However this field must also be decremented otherwise what used
+ * to be a frequently accessed key in the past, will remain ranked like that
+ * forever, while we want the algorithm to adapt to access pattern changes.
+ *
+ * So the remaining 16 bits are used in order to store the "decrement time",
+ * a reduced-precision Unix time (we take 16 bits of the time converted
+ * in minutes since we don't care about wrapping around) where the LOG_C
+ * counter is halved if it has an high value, or just decremented if it
+ * has a low value.
+ *
+ * New keys don't start at zero, in order to have the ability to collect
+ * some accesses before being trashed away, so they start at COUNTER_INIT_VAL.
+ * The logarithmic increment performed on LOG_C takes care of COUNTER_INIT_VAL
+ * when incrementing the key, so that keys starting at COUNTER_INIT_VAL
+ * (or having a smaller value) have a very high chance of being incremented
+ * on access.
+ *
+ * During decrement, the value of the logarithmic counter is halved if
+ * its current value is greater than two times the COUNTER_INIT_VAL, otherwise
+ * it is just decremented by one.
+ * --------------------------------------------------------------------------*/
+
+/* Return the current time in minutes, just taking the least significant
+ * 16 bits. The returned time is suitable to be stored as LDT (last decrement
+ * time) for the LFU implementation. */
+unsigned long LFUGetTimeInMinutes(void) {
+ return (server.unixtime/60) & 65535;
+}
+
+/* Given an object last decrement time, compute the minimum number of minutes
+ * that elapsed since the last decrement. Handle overflow (ldt greater than
+ * the current 16 bits minutes time) considering the time as wrapping
+ * exactly once. */
+unsigned long LFUTimeElapsed(unsigned long ldt) {
+ unsigned long now = LFUGetTimeInMinutes();
+ if (now >= ldt) return now-ldt;
+ return 65535-ldt+now;
+}
+
+/* Logarithmically increment a counter. The greater the current counter value,
+ * the less likely it is that it gets really incremented. Saturate it at 255. */
+uint8_t LFULogIncr(uint8_t counter) {
+ if (counter == 255) return 255;
+ double r = (double)rand()/RAND_MAX;
+ double baseval = counter - LFU_INIT_VAL;
+ if (baseval < 0) baseval = 0;
+ double p = 1.0/(baseval*server.lfu_log_factor+1);
+ if (r < p) counter++;
+ return counter;
+}
+
+/* If the object decrement time is reached, decrement the LFU counter and
+ * update the decrement time field. Return the object frequency counter.
+ *
+ * This function is used in order to scan the dataset for the best object
+ * to evict: as we check the candidates, we incrementally decrement the
+ * counter of the scanned objects if needed. */
+#define LFU_DECR_INTERVAL 1
+unsigned long LFUDecrAndReturn(robj *o) {
+ unsigned long ldt = o->lru >> 8;
+ unsigned long counter = o->lru & 255;
+ if (LFUTimeElapsed(ldt) >= server.lfu_decay_time && counter) {
+ if (counter > LFU_INIT_VAL*2) {
+ counter /= 2;
+ if (counter < LFU_INIT_VAL*2) counter = LFU_INIT_VAL*2;
+ } else {
+ counter--;
+ }
+ o->lru = (LFUGetTimeInMinutes()<<8) | counter;
+ }
+ return counter;
+}
+
+/* ----------------------------------------------------------------------------
+ * The external API for eviction: freeMemoryIfNeeded() is called by the
+ * server when there is data to add in order to make space if needed.
+ * --------------------------------------------------------------------------*/
+
+int freeMemoryIfNeeded(void) {
+ size_t mem_reported, mem_used, mem_tofree, mem_freed;
+ int slaves = listLength(server.slaves);
+ mstime_t latency, eviction_latency;
+ long long delta;
+
+ /* Check if we are over the memory usage limit. If we are not, no need
+ * to subtract the slaves output buffers. We can just return ASAP. */
+ mem_reported = zmalloc_used_memory();
+ if (mem_reported <= server.maxmemory) return C_OK;
+
+ /* Remove the size of slaves output buffers and AOF buffer from the
+ * count of used memory. */
+ mem_used = mem_reported;
+ if (slaves) {
+ listIter li;
+ listNode *ln;
+
+ listRewind(server.slaves,&li);
+ while((ln = listNext(&li))) {
+ client *slave = listNodeValue(ln);
+ unsigned long obuf_bytes = getClientOutputBufferMemoryUsage(slave);
+ if (obuf_bytes > mem_used)
+ mem_used = 0;
+ else
+ mem_used -= obuf_bytes;
+ }
+ }
+ if (server.aof_state != AOF_OFF) {
+ mem_used -= sdslen(server.aof_buf);
+ mem_used -= aofRewriteBufferSize();
+ }
+
+ /* Check if we are still over the memory limit. */
+ if (mem_used <= server.maxmemory) return C_OK;
+
+ /* Compute how much memory we need to free. */
+ mem_tofree = mem_used - server.maxmemory;
+ mem_freed = 0;
+
+ if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION)
+ goto cant_free; /* We need to free memory, but policy forbids. */
+
+ latencyStartMonitor(latency);
+ while (mem_freed < mem_tofree) {
+ int j, k, i, keys_freed = 0;
+ static int next_db = 0;
+ sds bestkey = NULL;
+ int bestdbid;
+ redisDb *db;
+ dict *dict;
+ dictEntry *de;
+
+ if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) ||
+ server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL)
+ {
+ struct evictionPoolEntry *pool = EvictionPoolLRU;
+
+ while(bestkey == NULL) {
+ unsigned long total_keys = 0, keys;
+
+ /* We don't want to make local-db choices when expiring keys,
+ * so to start we populate the eviction pool sampling keys from
+ * every DB. */
+ for (i = 0; i < server.dbnum; i++) {
+ db = server.db+i;
+ dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ?
+ db->dict : db->expires;
+ if ((keys = dictSize(dict)) != 0) {
+ evictionPoolPopulate(i, dict, db->dict, pool);
+ total_keys += keys;
+ }
+ }
+ if (!total_keys) break; /* No keys to evict. */
+
+ /* Go backward from best to worst element to evict. */
+ for (k = EVPOOL_SIZE-1; k >= 0; k--) {
+ if (pool[k].key == NULL) continue;
+ bestdbid = pool[k].dbid;
+
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
+ de = dictFind(server.db[pool[k].dbid].dict,
+ pool[k].key);
+ } else {
+ de = dictFind(server.db[pool[k].dbid].expires,
+ pool[k].key);
+ }
+
+ /* Remove the entry from the pool. */
+ if (pool[k].key != pool[k].cached)
+ sdsfree(pool[k].key);
+ pool[k].key = NULL;
+ pool[k].idle = 0;
+
+ /* If the key exists, it is our pick. Otherwise it is
+ * a ghost and we need to try the next element. */
+ if (de) {
+ bestkey = dictGetKey(de);
+ break;
+ } else {
+ /* Ghost... Iterate again. */
+ }
+ }
+ }
+ }
+
+ /* volatile-random and allkeys-random policy */
+ else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
+ server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
+ {
+ /* When evicting a random key, we try to evict a key for
+ * each DB, so we use the static 'next_db' variable to
+ * incrementally visit all DBs. */
+ for (i = 0; i < server.dbnum; i++) {
+ j = (++next_db) % server.dbnum;
+ db = server.db+j;
+ dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ?
+ db->dict : db->expires;
+ if (dictSize(dict) != 0) {
+ de = dictGetRandomKey(dict);
+ bestkey = dictGetKey(de);
+ bestdbid = j;
+ break;
+ }
+ }
+ }
+
+ /* Finally remove the selected key. */
+ if (bestkey) {
+ db = server.db+bestdbid;
+ robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
+ propagateExpire(db,keyobj,server.lazyfree_lazy_eviction);
+ /* We compute the amount of memory freed by db*Delete() alone.
+ * It is possible that actually the memory needed to propagate
+ * the DEL in AOF and replication link is greater than the one
+ * we are freeing removing the key, but we can't account for
+ * that otherwise we would never exit the loop.
+ *
+ * AOF and Output buffer memory will be freed eventually so
+ * we only care about memory used by the key space. */
+ delta = (long long) zmalloc_used_memory();
+ latencyStartMonitor(eviction_latency);
+ if (server.lazyfree_lazy_eviction)
+ dbAsyncDelete(db,keyobj);
+ else
+ dbSyncDelete(db,keyobj);
+ latencyEndMonitor(eviction_latency);
+ latencyAddSampleIfNeeded("eviction-del",eviction_latency);
+ latencyRemoveNestedEvent(latency,eviction_latency);
+ delta -= (long long) zmalloc_used_memory();
+ mem_freed += delta;
+ server.stat_evictedkeys++;
+ notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
+ keyobj, db->id);
+ decrRefCount(keyobj);
+ keys_freed++;
+
+ /* When the memory to free starts to be big enough, we may
+ * start spending so much time here that is impossible to
+ * deliver data to the slaves fast enough, so we force the
+ * transmission here inside the loop. */
+ if (slaves) flushSlavesOutputBuffers();
+ }
+
+ if (!keys_freed) {
+ latencyEndMonitor(latency);
+ latencyAddSampleIfNeeded("eviction-cycle",latency);
+ goto cant_free; /* nothing to free... */
+ }
+ }
+ latencyEndMonitor(latency);
+ latencyAddSampleIfNeeded("eviction-cycle",latency);
+ return C_OK;
+
+cant_free:
+ /* We are here if we are not able to reclaim memory. There is only one
+ * last thing we can try: check if the lazyfree thread has jobs in queue
+ * and wait... */
+ while(bioPendingJobsOfType(BIO_LAZY_FREE)) {
+ if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree)
+ break;
+ usleep(1000);
+ }
+ return C_ERR;
+}
+
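
As a worked illustration of the 24-bit LFU encoding and of the logarithmic counter described in the comments above, the following standalone sketch packs/unpacks the field and prints the increment probability used by LFULogIncr(). The initial counter value of 5 and the log factor of 10 are assumed defaults here, not values taken from this patch:

    #include <stdio.h>

    #define ASSUMED_LFU_INIT_VAL 5      /* assumption: default initial counter */
    #define ASSUMED_LOG_FACTOR   10     /* assumption: default lfu-log-factor  */

    int main(void) {
        /* Pack: 16-bit last-decrement time (minutes) in the high bits,
         * 8-bit logarithmic counter in the low bits. */
        unsigned long ldt = 12345 & 65535, counter = ASSUMED_LFU_INIT_VAL;
        unsigned long lru = (ldt << 8) | counter;
        printf("ldt=%lu counter=%lu\n", lru >> 8, lru & 255);

        /* Probability that a single access increments the counter. */
        for (int c = ASSUMED_LFU_INIT_VAL; c <= 105; c += 25) {
            double baseval = c - ASSUMED_LFU_INIT_VAL;
            printf("counter=%3d p(incr)=%.4f\n", c,
                   1.0/(baseval*ASSUMED_LOG_FACTOR+1));
        }
        return 0;
    }

The quickly shrinking probability is what lets a single byte cover a very wide range of access frequencies.
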
diff --git a/src/expire.c b/src/expire.c
new file mode 100644
index 000000000..637139f63
--- /dev/null
+++ b/src/expire.c
@@ -0,0 +1,502 @@
+/* Implementation of EXPIRE (keys with fixed time to live).
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+
+/*-----------------------------------------------------------------------------
+ * Incremental collection of expired keys.
+ *
+ * When keys are accessed they are expired on-access. However we need a
+ * mechanism in order to ensure keys are eventually removed when expired even
+ * if no access is performed on them.
+ *----------------------------------------------------------------------------*/
+
+/* Helper function for the activeExpireCycle() function.
+ * This function will try to expire the key that is stored in the hash table
+ * entry 'de' of the 'expires' hash table of a Redis database.
+ *
+ * If the key is found to be expired, it is removed from the database and
+ * 1 is returned. Otherwise no operation is performed and 0 is returned.
+ *
+ * When a key is expired, server.stat_expiredkeys is incremented.
+ *
+ * The parameter 'now' is the current time in milliseconds, as it is passed
+ * to the function to avoid too many gettimeofday() syscalls. */
+int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) {
+ long long t = dictGetSignedIntegerVal(de);
+ if (now > t) {
+ sds key = dictGetKey(de);
+ robj *keyobj = createStringObject(key,sdslen(key));
+
+ propagateExpire(db,keyobj,server.lazyfree_lazy_expire);
+ if (server.lazyfree_lazy_expire)
+ dbAsyncDelete(db,keyobj);
+ else
+ dbSyncDelete(db,keyobj);
+ notifyKeyspaceEvent(NOTIFY_EXPIRED,
+ "expired",keyobj,db->id);
+ decrRefCount(keyobj);
+ server.stat_expiredkeys++;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/* Try to expire a few timed out keys. The algorithm used is adaptive and
+ * will use few CPU cycles if there are few expiring keys, otherwise
+ * it will get more aggressive to avoid too much memory being used by
+ * keys that can be removed from the keyspace.
+ *
+ * No more than CRON_DBS_PER_CALL databases are tested at every
+ * iteration.
+ *
+ * This kind of call is used when Redis detects that timelimit_exit is
+ * true, so there is more work to do, and we do it more incrementally from
+ * the beforeSleep() function of the event loop.
+ *
+ * Expire cycle type:
+ *
+ * If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a
+ * "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION
+ * microseconds, and is not repeated again before the same amount of time.
+ *
+ * If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is
+ * executed, where the time limit is a percentage of the REDIS_HZ period
+ * as specified by the REDIS_EXPIRELOOKUPS_TIME_PERC define. */
+
+void activeExpireCycle(int type) {
+ /* This function has some global state in order to continue the work
+ * incrementally across calls. */
+ static unsigned int current_db = 0; /* Last DB tested. */
+ static int timelimit_exit = 0; /* Time limit hit in previous call? */
+ static long long last_fast_cycle = 0; /* When last fast cycle ran. */
+
+ int j, iteration = 0;
+ int dbs_per_call = CRON_DBS_PER_CALL;
+ long long start = ustime(), timelimit;
+
+ if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
+ /* Don't start a fast cycle if the previous cycle did not exit
+ * because of the time limit. Also don't repeat a fast cycle for the same period
+ * as the fast cycle total duration itself. */
+ if (!timelimit_exit) return;
+ if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return;
+ last_fast_cycle = start;
+ }
+
+ /* We usually should test CRON_DBS_PER_CALL per iteration, with
+ * two exceptions:
+ *
+ * 1) Don't test more DBs than we have.
+ * 2) If last time we hit the time limit, we want to scan all DBs
+ * in this iteration, as there is work to do in some DB and we don't want
+ * expired keys to use memory for too much time. */
+ if (dbs_per_call > server.dbnum || timelimit_exit)
+ dbs_per_call = server.dbnum;
+
+ /* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time
+ * per iteration. Since this function gets called with a frequency of
+ * server.hz times per second, the following is the max amount of
+ * microseconds we can spend in this function. */
+ timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/server.hz/100;
+ timelimit_exit = 0;
+ if (timelimit <= 0) timelimit = 1;
+
+ if (type == ACTIVE_EXPIRE_CYCLE_FAST)
+ timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */
+
+ for (j = 0; j < dbs_per_call; j++) {
+ int expired;
+ redisDb *db = server.db+(current_db % server.dbnum);
+
+ /* Increment the DB now so we are sure if we run out of time
+ * in the current DB we'll restart from the next. This allows to
+ * distribute the time evenly across DBs. */
+ current_db++;
+
+ /* Continue to expire if at the end of the cycle more than 25%
+ * of the keys were expired. */
+ do {
+ unsigned long num, slots;
+ long long now, ttl_sum;
+ int ttl_samples;
+
+ /* If there is nothing to expire try next DB ASAP. */
+ if ((num = dictSize(db->expires)) == 0) {
+ db->avg_ttl = 0;
+ break;
+ }
+ slots = dictSlots(db->expires);
+ now = mstime();
+
+ /* When fewer than 1% of the slots are filled, getting random
+ * keys is expensive, so stop here waiting for better times...
+ * The dictionary will be resized asap. */
+ if (num && slots > DICT_HT_INITIAL_SIZE &&
+ (num*100/slots < 1)) break;
+
+ /* The main collection cycle. Sample random keys among keys
+ * with an expire set, checking for expired ones. */
+ expired = 0;
+ ttl_sum = 0;
+ ttl_samples = 0;
+
+ if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP)
+ num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP;
+
+ while (num--) {
+ dictEntry *de;
+ long long ttl;
+
+ if ((de = dictGetRandomKey(db->expires)) == NULL) break;
+ ttl = dictGetSignedIntegerVal(de)-now;
+ if (activeExpireCycleTryExpire(db,de,now)) expired++;
+ if (ttl > 0) {
+ /* We want the average TTL of keys yet not expired. */
+ ttl_sum += ttl;
+ ttl_samples++;
+ }
+ }
+
+ /* Update the average TTL stats for this database. */
+ if (ttl_samples) {
+ long long avg_ttl = ttl_sum/ttl_samples;
+
+ /* Do a simple running average with a few samples.
+ * We just use the current estimate with a weight of 2%
+ * and the previous estimate with a weight of 98%. */
+ if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
+ db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50);
+ }
+
+ /* We can't block forever here even if there are many keys to
+ * expire. So after a given amount of milliseconds return to the
+ * caller waiting for the other active expire cycle. */
+ iteration++;
+ if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */
+ long long elapsed = ustime()-start;
+
+ latencyAddSampleIfNeeded("expire-cycle",elapsed/1000);
+ if (elapsed > timelimit) timelimit_exit = 1;
+ }
+ if (timelimit_exit) return;
+ /* We don't repeat the cycle if there are less than 25% of keys
+ * found expired in the current DB. */
+ } while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4);
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ * Expires of keys created in writable slaves
+ *
+ * Normally slaves do not process expires: they wait for the master to synthesize
+ * DEL operations in order to retain consistency. However writable slaves are
+ * an exception: if a key is created in the slave and an expire is assigned
+ * to it, we need a way to expire such a key, since the master does not know
+ * anything about such a key.
+ *
+ * In order to do so, we track keys created in the slave side with an expire
+ * set, and call the expireSlaveKeys() function from time to time in order to
+ * reclaim the keys if they already expired.
+ *
+ * Note that the use case we are trying to cover here, is a popular one where
+ * slaves are put in writable mode in order to compute slow operations in
+ * the slave side that are mostly useful to actually read data in a more
+ * processed way. Think of set intersections in a temporary key, with an expire so
+ * that it is also used as a cache to avoid intersecting every time.
+ *
+ * This implementation is currently not perfect but a lot better than leaking
+ * the keys as implemented in 3.2.
+ *----------------------------------------------------------------------------*/
+
+/* The dictionary where we remember key names and database ID of keys we may
+ * want to expire from the slave. Since this feature is not often used, we
+ * don't even bother to initialize the dictionary at startup. We'll do it once
+ * the feature is used the first time, that is, when rememberSlaveKeyWithExpire()
+ * is called.
+ *
+ * The dictionary has an SDS string representing the key as the hash table
+ * key, while the value is a 64 bit unsigned integer with the bits corresponding
+ * to the DB where the keys may exist set to 1. Currently the keys created
+ * with a DB id > 63 are not expired, but a trivial fix is to set the bitmap
+ * to the max 64 bit unsigned value when we know there is a key with a DB
+ * ID greater than 63, and check all the configured DBs in such a case. */
+dict *slaveKeysWithExpire = NULL;
+
+/* Check the set of keys created in the writable slave with an expire set,
+ * in order to check if they should be expired. */
+void expireSlaveKeys(void) {
+ if (slaveKeysWithExpire == NULL ||
+ dictSize(slaveKeysWithExpire) == 0) return;
+
+ int cycles = 0, noexpire = 0;
+ mstime_t start = mstime();
+ while(1) {
+ dictEntry *de = dictGetRandomKey(slaveKeysWithExpire);
+ sds keyname = dictGetKey(de);
+ uint64_t dbids = dictGetUnsignedIntegerVal(de);
+ uint64_t new_dbids = 0;
+
+ /* Check the key against every database corresponding to the
+ * bits set in the value bitmap. */
+ int dbid = 0;
+ while(dbids && dbid < server.dbnum) {
+ if ((dbids & 1) != 0) {
+ redisDb *db = server.db+dbid;
+ dictEntry *expire = dictFind(db->expires,keyname);
+ int expired = 0;
+
+ if (expire &&
+ activeExpireCycleTryExpire(server.db+dbid,expire,start))
+ {
+ expired = 1;
+ }
+
+ /* If the key was not expired in this DB, we need to set the
+ * corresponding bit in the new bitmap we set as value.
+ * At the end of the loop if the bitmap is zero, it means we
+ * no longer need to keep track of this key. */
+ if (expire && !expired) {
+ noexpire++;
+ new_dbids |= (uint64_t)1 << dbid;
+ }
+ }
+ dbid++;
+ dbids >>= 1;
+ }
+
+ /* Set the new bitmap as the value of the key, in the dictionary
+ * of keys with an expire set directly in the writable slave. Otherwise
+ * if the bitmap is zero, we no longer need to keep track of it. */
+ if (new_dbids)
+ dictSetUnsignedIntegerVal(de,new_dbids);
+ else
+ dictDelete(slaveKeysWithExpire,keyname);
+
+ /* Stop conditions: found 3 keys we can't expire in a row or
+ * time limit was reached. */
+ cycles++;
+ if (noexpire > 3) break;
+ if ((cycles % 64) == 0 && mstime()-start > 1) break;
+ if (dictSize(slaveKeysWithExpire) == 0) break;
+ }
+}
+
+/* Track keys that received an EXPIRE or similar command in the context
+ * of a writable slave. */
+void rememberSlaveKeyWithExpire(redisDb *db, robj *key) {
+ if (slaveKeysWithExpire == NULL) {
+ static dictType dt = {
+ dictSdsHash, /* hash function */
+ NULL, /* key dup */
+ NULL, /* val dup */
+ dictSdsKeyCompare, /* key compare */
+ dictSdsDestructor, /* key destructor */
+ NULL /* val destructor */
+ };
+ slaveKeysWithExpire = dictCreate(&dt,NULL);
+ }
+ if (db->id > 63) return;
+
+ dictEntry *de = dictAddOrFind(slaveKeysWithExpire,key->ptr);
+ /* If the entry was just created, set it to a copy of the SDS string
+ * representing the key: we don't want to have to keep those keys
+ * in sync with the main DB. The keys will be removed by expireSlaveKeys()
+ * as it scans to find keys to remove. */
+ if (de->key == key->ptr) {
+ de->key = sdsdup(key->ptr);
+ dictSetUnsignedIntegerVal(de,0);
+ }
+
+ uint64_t dbids = dictGetUnsignedIntegerVal(de);
+ dbids |= (uint64_t)1 << db->id;
+ dictSetUnsignedIntegerVal(de,dbids);
+}
+
+/* Return the number of keys we are tracking. */
+size_t getSlaveKeyWithExpireCount(void) {
+ if (slaveKeysWithExpire == NULL) return 0;
+ return dictSize(slaveKeysWithExpire);
+}
+
+/* Remove the keys in the hash table. We need to do that when data is
+ * flushed from the server. We may receive new keys from the master with
+ * the same name/db and it is no longer a good idea to expire them.
+ *
+ * Note: technically we should handle the case of a single DB being flushed
+ * but it is not worth it since anyway race conditions using the same set
+ * of key names in a writable slave and in its master will lead to
+ * inconsistencies. This is just a best-effort thing we do. */
+void flushSlaveKeysWithExpireList(void) {
+ if (slaveKeysWithExpire) {
+ dictRelease(slaveKeysWithExpire);
+ slaveKeysWithExpire = NULL;
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ * Expires Commands
+ *----------------------------------------------------------------------------*/
+
+/* This is the generic command implementation for EXPIRE, PEXPIRE, EXPIREAT
+ * and PEXPIREAT. Because the command's second argument may be relative or absolute
+ * the "basetime" argument is used to signal what the base time is (either 0
+ * for *AT variants of the command, or the current time for relative expires).
+ *
+ * unit is either UNIT_SECONDS or UNIT_MILLISECONDS, and is only used for
+ * the argv[2] parameter. The basetime is always specified in milliseconds. */
+void expireGenericCommand(client *c, long long basetime, int unit) {
+ robj *key = c->argv[1], *param = c->argv[2];
+ long long when; /* unix time in milliseconds when the key will expire. */
+
+ if (getLongLongFromObjectOrReply(c, param, &when, NULL) != C_OK)
+ return;
+
+ if (unit == UNIT_SECONDS) when *= 1000;
+ when += basetime;
+
+ /* No key, return zero. */
+ if (lookupKeyWrite(c->db,key) == NULL) {
+ addReply(c,shared.czero);
+ return;
+ }
+
+ /* EXPIRE with negative TTL, or EXPIREAT with a timestamp into the past
+ * should never be executed as a DEL when loading the AOF or in the context
+ * of a slave instance.
+ *
+ * Instead we take the other branch of the IF statement setting an expire
+ * (possibly in the past) and wait for an explicit DEL from the master. */
+ if (when <= mstime() && !server.loading && !server.masterhost) {
+ robj *aux;
+
+ int deleted = server.lazyfree_lazy_expire ? dbAsyncDelete(c->db,key) :
+ dbSyncDelete(c->db,key);
+ serverAssertWithInfo(c,key,deleted);
+ server.dirty++;
+
+ /* Replicate/AOF this as an explicit DEL or UNLINK. */
+ aux = server.lazyfree_lazy_expire ? shared.unlink : shared.del;
+ rewriteClientCommandVector(c,2,aux,key);
+ signalModifiedKey(c->db,key);
+ notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
+ addReply(c, shared.cone);
+ return;
+ } else {
+ setExpire(c,c->db,key,when);
+ addReply(c,shared.cone);
+ signalModifiedKey(c->db,key);
+ notifyKeyspaceEvent(NOTIFY_GENERIC,"expire",key,c->db->id);
+ server.dirty++;
+ return;
+ }
+}
+
+/* EXPIRE key seconds */
+void expireCommand(client *c) {
+ expireGenericCommand(c,mstime(),UNIT_SECONDS);
+}
+
+/* EXPIREAT key time */
+void expireatCommand(client *c) {
+ expireGenericCommand(c,0,UNIT_SECONDS);
+}
+
+/* PEXPIRE key milliseconds */
+void pexpireCommand(client *c) {
+ expireGenericCommand(c,mstime(),UNIT_MILLISECONDS);
+}
+
+/* PEXPIREAT key ms_time */
+void pexpireatCommand(client *c) {
+ expireGenericCommand(c,0,UNIT_MILLISECONDS);
+}
+
+/* Implements TTL and PTTL */
+void ttlGenericCommand(client *c, int output_ms) {
+ long long expire, ttl = -1;
+
+ /* If the key does not exist at all, return -2 */
+ if (lookupKeyReadWithFlags(c->db,c->argv[1],LOOKUP_NOTOUCH) == NULL) {
+ addReplyLongLong(c,-2);
+ return;
+ }
+ /* The key exists. Return -1 if it has no expire, or the actual
+ * TTL value otherwise. */
+ expire = getExpire(c->db,c->argv[1]);
+ if (expire != -1) {
+ ttl = expire-mstime();
+ if (ttl < 0) ttl = 0;
+ }
+ if (ttl == -1) {
+ addReplyLongLong(c,-1);
+ } else {
+ addReplyLongLong(c,output_ms ? ttl : ((ttl+500)/1000));
+ }
+}
+
+/* TTL key */
+void ttlCommand(client *c) {
+ ttlGenericCommand(c, 0);
+}
+
+/* PTTL key */
+void pttlCommand(client *c) {
+ ttlGenericCommand(c, 1);
+}
+
+/* PERSIST key */
+void persistCommand(client *c) {
+ dictEntry *de;
+
+ de = dictFind(c->db->dict,c->argv[1]->ptr);
+ if (de == NULL) {
+ addReply(c,shared.czero);
+ } else {
+ if (removeExpire(c->db,c->argv[1])) {
+ addReply(c,shared.cone);
+ server.dirty++;
+ } else {
+ addReply(c,shared.czero);
+ }
+ }
+}
+
+/* TOUCH key1 [key2 key3 ... keyN] */
+void touchCommand(client *c) {
+ int touched = 0;
+ for (int j = 1; j < c->argc; j++)
+ if (lookupKeyRead(c->db,c->argv[j]) != NULL) touched++;
+ addReplyLongLong(c,touched);
+}
+
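
The four command wrappers above differ only in the base time and unit they pass to expireGenericCommand(), which always ends up with an absolute expire in milliseconds; TTL then rounds the remaining time to the nearest second. A small numeric sketch (the timestamps are made up for illustration):

    /* when = argv[2] (converted to ms if the unit is seconds) + basetime */
    long long now_ms = 1700000000000LL;                 /* pretend mstime()  */

    long long when_expire    = 10 * 1000 + now_ms;      /* EXPIRE key 10     */
    long long when_pexpireat = 1700000005000LL + 0;     /* PEXPIREAT key T   */

    /* TTL reply: remaining milliseconds rounded to the nearest second. */
    long long ttl_ms = when_expire - now_ms;            /* 10000             */
    long long ttl_s  = (ttl_ms + 500) / 1000;           /* 10                */
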
diff --git a/src/geo.c b/src/geo.c
index 2d351d8e0..8423931af 100644
--- a/src/geo.c
+++ b/src/geo.c
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
- * Copyright (c) 2015, Salvatore Sanfilippo <antirez@gmail.com>.
+ * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,7 @@
#include "geo.h"
#include "geohash_helper.h"
+#include "debugmacro.h"
/* Things exported from t_zset.c only for geo.c, since it is the only other
* part of Redis that requires close zset introspection. */
@@ -156,9 +157,13 @@ double extractDistanceOrReply(client *c, robj **argv,
return -1;
}
+ if (distance < 0) {
+ addReplyError(c,"radius cannot be negative");
+ return -1;
+ }
+
double to_meters = extractUnitOrReply(c,argv[1]);
if (to_meters < 0) {
- addReplyError(c,"radius cannot be negative");
return -1;
}
@@ -321,6 +326,7 @@ int membersOfGeoHashBox(robj *zobj, GeoHashBits hash, geoArray *ga, double lon,
int membersOfAllNeighbors(robj *zobj, GeoHashRadius n, double lon, double lat, double radius, geoArray *ga) {
GeoHashBits neighbors[9];
unsigned int i, count = 0, last_processed = 0;
+ int debugmsg = 0;
neighbors[0] = n.hash;
neighbors[1] = n.neighbors.north;
@@ -335,8 +341,26 @@ int membersOfAllNeighbors(robj *zobj, GeoHashRadius n, double lon, double lat, d
/* For each neighbor (*and* our own hashbox), get all the matching
* members and add them to the potential result list. */
for (i = 0; i < sizeof(neighbors) / sizeof(*neighbors); i++) {
- if (HASHISZERO(neighbors[i]))
+ if (HASHISZERO(neighbors[i])) {
+ if (debugmsg) D("neighbors[%d] is zero",i);
continue;
+ }
+
+ /* Debugging info. */
+ if (debugmsg) {
+ GeoHashRange long_range, lat_range;
+ geohashGetCoordRange(&long_range,&lat_range);
+ GeoHashArea myarea = {{0}};
+ geohashDecode(long_range, lat_range, neighbors[i], &myarea);
+
+ /* Dump center square. */
+ D("neighbors[%d]:\n",i);
+ D("area.longitude.min: %f\n", myarea.longitude.min);
+ D("area.longitude.max: %f\n", myarea.longitude.max);
+ D("area.latitude.min: %f\n", myarea.latitude.min);
+ D("area.latitude.max: %f\n", myarea.latitude.max);
+ D("\n");
+ }
/* When a huge Radius (in the 5000 km range or more) is used,
* adjacent neighbors can be the same, leading to duplicated
@@ -345,7 +369,11 @@ int membersOfAllNeighbors(robj *zobj, GeoHashRadius n, double lon, double lat, d
if (last_processed &&
neighbors[i].bits == neighbors[last_processed].bits &&
neighbors[i].step == neighbors[last_processed].step)
+ {
+ if (debugmsg)
+ D("Skipping processing of %d, same as previous\n",i);
continue;
+ }
count += membersOfGeoHashBox(zobj, neighbors[i], ga, lon, lat, radius);
last_processed = i;
}
@@ -656,16 +684,15 @@ void geohashCommand(client *c) {
int j;
/* Look up the requested zset */
- robj *zobj = NULL;
- if ((zobj = lookupKeyReadOrReply(c, c->argv[1], shared.emptymultibulk))
- == NULL || checkType(c, zobj, OBJ_ZSET)) return;
+ robj *zobj = lookupKeyRead(c->db, c->argv[1]);
+ if (zobj && checkType(c, zobj, OBJ_ZSET)) return;
/* Geohash elements one after the other, using a null bulk reply for
* missing elements. */
addReplyMultiBulkLen(c,c->argc-2);
for (j = 2; j < c->argc; j++) {
double score;
- if (zsetScore(zobj, c->argv[j]->ptr, &score) == C_ERR) {
+ if (!zobj || zsetScore(zobj, c->argv[j]->ptr, &score) == C_ERR) {
addReply(c,shared.nullbulk);
} else {
/* The internal format we use for geocoding is a bit different
@@ -710,16 +737,15 @@ void geoposCommand(client *c) {
int j;
/* Look up the requested zset */
- robj *zobj = NULL;
- if ((zobj = lookupKeyReadOrReply(c, c->argv[1], shared.emptymultibulk))
- == NULL || checkType(c, zobj, OBJ_ZSET)) return;
+ robj *zobj = lookupKeyRead(c->db, c->argv[1]);
+ if (zobj && checkType(c, zobj, OBJ_ZSET)) return;
/* Report elements one after the other, using a null bulk reply for
* missing elements. */
addReplyMultiBulkLen(c,c->argc-2);
for (j = 2; j < c->argc; j++) {
double score;
- if (zsetScore(zobj, c->argv[j]->ptr, &score) == C_ERR) {
+ if (!zobj || zsetScore(zobj, c->argv[j]->ptr, &score) == C_ERR) {
addReply(c,shared.nullmultibulk);
} else {
/* Decode... */
@@ -754,7 +780,7 @@ void geodistCommand(client *c) {
/* Look up the requested zset */
robj *zobj = NULL;
- if ((zobj = lookupKeyReadOrReply(c, c->argv[1], shared.emptybulk))
+ if ((zobj = lookupKeyReadOrReply(c, c->argv[1], shared.nullbulk))
== NULL || checkType(c, zobj, OBJ_ZSET)) return;
/* Get the scores. We need both otherwise NULL is returned. */
diff --git a/deps/geohash-int/geohash.c b/src/geohash.c
index d3bc7de25..1ae7a7e05 100644
--- a/deps/geohash-int/geohash.c
+++ b/src/geohash.c
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
* Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
- * Copyright (c) 2015, Salvatore Sanfilippo <antirez@gmail.com>.
+ * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -118,7 +118,7 @@ void geohashGetCoordRange(GeoHashRange *long_range, GeoHashRange *lat_range) {
lat_range->min = GEO_LAT_MIN;
}
-int geohashEncode(GeoHashRange *long_range, GeoHashRange *lat_range,
+int geohashEncode(const GeoHashRange *long_range, const GeoHashRange *lat_range,
double longitude, double latitude, uint8_t step,
GeoHashBits *hash) {
/* Check basic arguments sanity. */
@@ -151,7 +151,7 @@ int geohashEncode(GeoHashRange *long_range, GeoHashRange *lat_range,
}
int geohashEncodeType(double longitude, double latitude, uint8_t step, GeoHashBits *hash) {
- GeoHashRange r[2] = { { 0 } };
+ GeoHashRange r[2] = {{0}};
geohashGetCoordRange(&r[0], &r[1]);
return geohashEncode(&r[0], &r[1], longitude, latitude, step, hash);
}
@@ -194,7 +194,7 @@ int geohashDecode(const GeoHashRange long_range, const GeoHashRange lat_range,
}
int geohashDecodeType(const GeoHashBits hash, GeoHashArea *area) {
- GeoHashRange r[2] = { { 0 } };
+ GeoHashRange r[2] = {{0}};
geohashGetCoordRange(&r[0], &r[1]);
return geohashDecode(r[0], r[1], hash, area);
}
@@ -211,7 +211,7 @@ int geohashDecodeAreaToLongLat(const GeoHashArea *area, double *xy) {
}
int geohashDecodeToLongLatType(const GeoHashBits hash, double *xy) {
- GeoHashArea area = { { 0 } };
+ GeoHashArea area = {{0}};
if (!xy || !geohashDecodeType(hash, &area))
return 0;
return geohashDecodeAreaToLongLat(&area, xy);
diff --git a/deps/geohash-int/geohash.h b/src/geohash.h
index c2f57bed0..ed2ef9336 100644
--- a/deps/geohash-int/geohash.h
+++ b/src/geohash.h
@@ -95,7 +95,7 @@ typedef struct {
* -1:failed
*/
void geohashGetCoordRange(GeoHashRange *long_range, GeoHashRange *lat_range);
-int geohashEncode(GeoHashRange *long_range, GeoHashRange *lat_range,
+int geohashEncode(const GeoHashRange *long_range, const GeoHashRange *lat_range,
double longitude, double latitude, uint8_t step,
GeoHashBits *hash);
int geohashEncodeType(double longitude, double latitude,
diff --git a/deps/geohash-int/geohash_helper.c b/src/geohash_helper.c
index 4c3762faf..77d8ab392 100644
--- a/deps/geohash-int/geohash_helper.c
+++ b/src/geohash_helper.c
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2013-2014, yinqiwen <yinqiwen@gmail.com>
* Copyright (c) 2014, Matt Stancliff <matt@genges.com>.
- * Copyright (c) 2015, Salvatore Sanfilippo <antirez@gmail.com>.
+ * Copyright (c) 2015-2016, Salvatore Sanfilippo <antirez@gmail.com>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -34,7 +34,10 @@
* https://github.com/yinqiwen/ardb/blob/d42503/src/geo/geohash_helper.cpp
*/
+#include "fmacros.h"
#include "geohash_helper.h"
+#include "debugmacro.h"
+#include <math.h>
#define D_R (M_PI / 180.0)
#define R_MAJOR 6378137.0
@@ -54,8 +57,8 @@ const double MERCATOR_MIN = -20037726.37;
static inline double deg_rad(double ang) { return ang * D_R; }
static inline double rad_deg(double ang) { return ang / D_R; }
-/* You must *ONLY* estimate steps when you are encoding.
- * If you are decoding, always decode to GEO_STEP_MAX (26). */
+/* This function is used in order to estimate the step (bits precision)
+ * of the 9 search area boxes during radius queries. */
uint8_t geohashEstimateStepsByRadius(double range_meters, double lat) {
if (range_meters == 0) return 26;
int step = 1;
@@ -63,55 +66,45 @@ uint8_t geohashEstimateStepsByRadius(double range_meters, double lat) {
range_meters *= 2;
step++;
}
- step -= 2; /* Make sure range is included in the worst case. */
+ step -= 2; /* Make sure range is included in most of the base cases. */
+
/* Wider range torwards the poles... Note: it is possible to do better
* than this approximation by computing the distance between meridians
* at this latitude, but this does the trick for now. */
- if (lat > 67 || lat < -67) step--;
- if (lat > 80 || lat < -80) step--;
+ if (lat > 66 || lat < -66) {
+ step--;
+ if (lat > 80 || lat < -80) step--;
+ }
/* Frame to valid range. */
if (step < 1) step = 1;
- if (step > 26) step = 25;
+ if (step > 26) step = 26;
return step;
}
-int geohashBitsComparator(const GeoHashBits *a, const GeoHashBits *b) {
- /* If step not equal, compare on step. Else, compare on bits. */
- return a->step != b->step ? a->step - b->step : a->bits - b->bits;
-}
-
+/* Return the bounding box of the search area centered at latitude,longitude
+ * having a radius of radius_meters. bounds[0] and bounds[2] are the minimum
+ * and maximum longitude, while bounds[1] and bounds[3] are the minimum and
+ * maximum latitude. */
int geohashBoundingBox(double longitude, double latitude, double radius_meters,
double *bounds) {
if (!bounds) return 0;
- double lonr, latr;
- lonr = deg_rad(longitude);
- latr = deg_rad(latitude);
-
- double distance = radius_meters / EARTH_RADIUS_IN_METERS;
- double min_latitude = latr - distance;
- double max_latitude = latr + distance;
-
- /* Note: we're being lazy and not accounting for coordinates near poles */
- double min_longitude, max_longitude;
- double difference_longitude = asin(sin(distance) / cos(latr));
- min_longitude = lonr - difference_longitude;
- max_longitude = lonr + difference_longitude;
-
- bounds[0] = rad_deg(min_longitude);
- bounds[1] = rad_deg(min_latitude);
- bounds[2] = rad_deg(max_longitude);
- bounds[3] = rad_deg(max_latitude);
+ bounds[0] = longitude - rad_deg(radius_meters/EARTH_RADIUS_IN_METERS/cos(deg_rad(latitude)));
+ bounds[2] = longitude + rad_deg(radius_meters/EARTH_RADIUS_IN_METERS/cos(deg_rad(latitude)));
+ bounds[1] = latitude - rad_deg(radius_meters/EARTH_RADIUS_IN_METERS);
+ bounds[3] = latitude + rad_deg(radius_meters/EARTH_RADIUS_IN_METERS);
return 1;
}
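As a quick illustration of the formula above (editor's sketch, not part of the patch): the radius in meters is turned into degrees by dividing by the Earth radius, and the longitude delta is additionally divided by cos(latitude), which is why the box widens towards the poles. Assuming the same EARTH_RADIUS_IN_METERS constant used by geohash_helper.c:

#include <math.h>
#include <stdio.h>

static const double EARTH_RADIUS_IN_METERS = 6372797.560856; /* Same constant as above (assumed). */
#define D_R (M_PI / 180.0)
static double deg_rad(double ang) { return ang * D_R; }
static double rad_deg(double ang) { return ang / D_R; }

int main(void) {
    /* Hypothetical query: 200 km around lon=15.087269, lat=37.502669. */
    double lon = 15.087269, lat = 37.502669, radius_meters = 200000;
    double dlat = rad_deg(radius_meters/EARTH_RADIUS_IN_METERS);
    double dlon = rad_deg(radius_meters/EARTH_RADIUS_IN_METERS/cos(deg_rad(lat)));
    printf("lon range [%f,%f] lat range [%f,%f]\n",
           lon-dlon, lon+dlon, lat-dlat, lat+dlat);
    return 0;
}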
+/* Return a set of areas (center + 8) that are able to cover a range query
+ * for the specified position and radius. */
GeoHashRadius geohashGetAreasByRadius(double longitude, double latitude, double radius_meters) {
GeoHashRange long_range, lat_range;
- GeoHashRadius radius = { { 0 } };
- GeoHashBits hash = { 0 };
- GeoHashNeighbors neighbors = { { 0 } };
- GeoHashArea area = { { 0 } };
+ GeoHashRadius radius;
+ GeoHashBits hash;
+ GeoHashNeighbors neighbors;
+ GeoHashArea area;
double min_lon, max_lon, min_lat, max_lat;
double bounds[4];
int steps;
@@ -124,12 +117,43 @@ GeoHashRadius geohashGetAreasByRadius(double longitude, double latitude, double
steps = geohashEstimateStepsByRadius(radius_meters,latitude);
- geohashGetCoordRange(&long_range, &lat_range);
- geohashEncode(&long_range, &lat_range, longitude, latitude, steps, &hash);
- geohashNeighbors(&hash, &neighbors);
- geohashGetCoordRange(&long_range, &lat_range);
- geohashDecode(long_range, lat_range, hash, &area);
+ geohashGetCoordRange(&long_range,&lat_range);
+ geohashEncode(&long_range,&lat_range,longitude,latitude,steps,&hash);
+ geohashNeighbors(&hash,&neighbors);
+ geohashDecode(long_range,lat_range,hash,&area);
+
+ /* Check if the step is enough at the limits of the covered area.
+ * Sometimes when the search area is near an edge of the covered
+ * area, the estimated step is not small enough, since one of the
+ * north / south / west / east squares is too close to the search area
+ * to cover everything. */
+ int decrease_step = 0;
+ {
+ GeoHashArea north, south, east, west;
+
+ geohashDecode(long_range, lat_range, neighbors.north, &north);
+ geohashDecode(long_range, lat_range, neighbors.south, &south);
+ geohashDecode(long_range, lat_range, neighbors.east, &east);
+ geohashDecode(long_range, lat_range, neighbors.west, &west);
+
+ if (geohashGetDistance(longitude,latitude,longitude,north.latitude.max)
+ < radius_meters) decrease_step = 1;
+ if (geohashGetDistance(longitude,latitude,longitude,south.latitude.min)
+ < radius_meters) decrease_step = 1;
+ if (geohashGetDistance(longitude,latitude,east.longitude.max,latitude)
+ < radius_meters) decrease_step = 1;
+ if (geohashGetDistance(longitude,latitude,west.longitude.min,latitude)
+ < radius_meters) decrease_step = 1;
+ }
+
+ if (steps > 1 && decrease_step) {
+ steps--;
+ geohashEncode(&long_range,&lat_range,longitude,latitude,steps,&hash);
+ geohashNeighbors(&hash,&neighbors);
+ geohashDecode(long_range,lat_range,hash,&area);
+ }
+ /* Exclude the search areas that are useless. */
if (area.latitude.min < min_lat) {
GZERO(neighbors.south);
GZERO(neighbors.south_west);
diff --git a/deps/geohash-int/geohash_helper.h b/src/geohash_helper.h
index bff111dbe..eb0dda38a 100644
--- a/deps/geohash-int/geohash_helper.h
+++ b/src/geohash_helper.h
@@ -32,7 +32,6 @@
#ifndef GEOHASH_HELPER_HPP_
#define GEOHASH_HELPER_HPP_
-#include <math.h>
#include "geohash.h"
#define GZERO(s) s.bits = s.step = 0;
diff --git a/src/help.h b/src/help.h
index 673b71155..5f927c303 100644
--- a/src/help.h
+++ b/src/help.h
@@ -52,6 +52,11 @@ struct commandHelp {
"Count set bits in a string",
1,
"2.6.0" },
+ { "BITFIELD",
+ "key [GET type offset] [SET type offset value] [INCRBY type offset increment] [OVERFLOW WRAP|SAT|FAIL]",
+ "Perform arbitrary bitfield integer operations on strings",
+ 1,
+ "3.2.0" },
{ "BITOP",
"operation destkey key [key ...]",
"Perform bitwise operations between strings",
@@ -326,32 +331,32 @@ struct commandHelp {
"key longitude latitude member [longitude latitude member ...]",
"Add one or more geospatial items in the geospatial index represented using a sorted set",
13,
- "" },
+ "3.2.0" },
{ "GEODIST",
"key member1 member2 [unit]",
"Returns the distance between two members of a geospatial index",
13,
- "" },
+ "3.2.0" },
{ "GEOHASH",
"key member [member ...]",
"Returns members of a geospatial index as standard geohash strings",
13,
- "" },
+ "3.2.0" },
{ "GEOPOS",
"key member [member ...]",
"Returns longitude and latitude of members of a geospatial index",
13,
- "" },
+ "3.2.0" },
{ "GEORADIUS",
- "key longitude latitude radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC]",
+ "key longitude latitude radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC] [STORE key] [STOREDIST key]",
"Query a sorted set representing a geospatial index to fetch members matching a given maximum distance from a point",
13,
- "" },
+ "3.2.0" },
{ "GEORADIUSBYMEMBER",
- "key member radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC]",
+ "key member radius m|km|ft|mi [WITHCOORD] [WITHDIST] [WITHHASH] [COUNT count] [ASC|DESC] [STORE key] [STOREDIST key]",
"Query a sorted set representing a geospatial index to fetch members matching a given maximum distance from a member",
13,
- "" },
+ "3.2.0" },
{ "GET",
"key",
"Get the value of a key",
diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index 8ccc16be2..0800bf59d 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -994,32 +994,21 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
serverPanic("Unknown HyperLogLog encoding in hllCount()");
}
- /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */
- E = (1/E)*alpha*m*m;
-
- /* Use the LINEARCOUNTING algorithm for small cardinalities.
- * For larger values but up to 72000 HyperLogLog raw approximation is
- * used since linear counting error starts to increase. However HyperLogLog
- * shows a strong bias in the range 2.5*16384 - 72000, so we try to
- * compensate for it. */
- if (E < m*2.5 && ez != 0) {
- E = m*log(m/ez); /* LINEARCOUNTING() */
- } else if (m == 16384 && E < 72000) {
- /* We did polynomial regression of the bias for this range, this
- * way we can compute the bias for a given cardinality and correct
- * according to it. Only apply the correction for P=14 that's what
- * we use and the value the correction was verified with. */
- double bias = 5.9119*1.0e-18*(E*E*E*E)
- -1.4253*1.0e-12*(E*E*E)+
- 1.2940*1.0e-7*(E*E)
- -5.2921*1.0e-3*E+
- 83.3216;
- E -= E*(bias/100);
- }
- /* We don't apply the correction for E > 1/30 of 2^32 since we use
- * a 64 bit function and 6 bit counters. To apply the correction for
- * 1/30 of 2^64 is not needed since it would require a huge set
- * to approach such a value. */
+ /* Apply loglog-beta to the raw estimate. See:
+ * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation
+ * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung
+ * arXiv:1612.02284 */
+ double zl = log(ez + 1);
+ double beta = -0.370393911*ez +
+ 0.070471823*zl +
+ 0.17393686*pow(zl,2) +
+ 0.16339839*pow(zl,3) +
+ -0.09237745*pow(zl,4) +
+ 0.03738027*pow(zl,5) +
+ -0.005384159*pow(zl,6) +
+ 0.00042419*pow(zl,7);
+
+ E = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
return (uint64_t) E;
}
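Restating the new estimator in math form (editor's note): let m be the number of registers, z the number of zero registers (ez in the code), z_l = ln(z+1), and \hat{E} = \sum_j 2^{-M[j]} the raw harmonic sum accumulated in E above. Then the code computes

    \beta(z) = -0.370393911\,z + 0.070471823\,z_l + 0.17393686\,z_l^{2}
             + 0.16339839\,z_l^{3} - 0.09237745\,z_l^{4} + 0.03738027\,z_l^{5}
             - 0.005384159\,z_l^{6} + 0.00042419\,z_l^{7}

    E = \operatorname{round}\!\left( \alpha_m \, m \, (m - z) \cdot \frac{1}{\hat{E} + \beta(z)} \right)

which is exactly what the llroundl() line evaluates, replacing the old LINEARCOUNTING branch and the polynomial bias correction with a single formula.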
diff --git a/src/intset.c b/src/intset.c
index ac003519b..198c90aa1 100644
--- a/src/intset.c
+++ b/src/intset.c
@@ -272,7 +272,7 @@ uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) {
}
/* Return intset length */
-uint32_t intsetLen(intset *is) {
+uint32_t intsetLen(const intset *is) {
return intrev32ifbe(is->length);
}
diff --git a/src/intset.h b/src/intset.h
index 30a854f89..8119e6636 100644
--- a/src/intset.h
+++ b/src/intset.h
@@ -44,7 +44,7 @@ intset *intsetRemove(intset *is, int64_t value, int *success);
uint8_t intsetFind(intset *is, int64_t value);
int64_t intsetRandom(intset *is);
uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value);
-uint32_t intsetLen(intset *is);
+uint32_t intsetLen(const intset *is);
size_t intsetBlobLen(intset *is);
#ifdef REDIS_TEST
diff --git a/src/latency.c b/src/latency.c
index 6f8b2a59f..53e0ec7be 100644
--- a/src/latency.c
+++ b/src/latency.c
@@ -79,7 +79,7 @@ int THPIsEnabled(void) {
* value of the function is non-zero, the process is being targeted by
* THP support, and is likely to have memory usage / latency issues. */
int THPGetAnonHugePagesSize(void) {
- return zmalloc_get_smap_bytes_by_field("AnonHugePages:");
+ return zmalloc_get_smap_bytes_by_field("AnonHugePages:",-1);
}
/* ---------------------------- Latency API --------------------------------- */
diff --git a/src/lazyfree.c b/src/lazyfree.c
index dba3f00e2..c05252159 100644
--- a/src/lazyfree.c
+++ b/src/lazyfree.c
@@ -57,7 +57,7 @@ int dbAsyncDelete(redisDb *db, robj *key) {
/* If the value is composed of a few allocations, to free in a lazy way
* is actually just slower... So under a certain limit we just free
* the object synchronously. */
- dictEntry *de = dictFind(db->dict,key->ptr);
+ dictEntry *de = dictUnlink(db->dict,key->ptr);
if (de) {
robj *val = dictGetVal(de);
size_t free_effort = lazyfreeGetFreeEffort(val);
@@ -73,7 +73,8 @@ int dbAsyncDelete(redisDb *db, robj *key) {
/* Release the key-val pair, or just the key if we set the val
* field to NULL in order to lazy free it later. */
- if (dictDelete(db->dict,key->ptr) == DICT_OK) {
+ if (de) {
+ dictFreeUnlinkedEntry(db->dict,de);
if (server.cluster_enabled) slotToKeyDel(key);
return 1;
} else {
diff --git a/src/module.c b/src/module.c
index e738678ce..a5b3d52ae 100644
--- a/src/module.c
+++ b/src/module.c
@@ -1,3 +1,32 @@
+/*
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
#include "server.h"
#include "cluster.h"
#include <dlfcn.h>
@@ -17,6 +46,7 @@ struct RedisModule {
char *name; /* Module name. */
int ver; /* Module version. We use just progressive integers. */
int apiver; /* Module API version as requested during initialization.*/
+ list *types; /* Module data types. */
};
typedef struct RedisModule RedisModule;
@@ -75,6 +105,7 @@ struct RedisModuleCtx {
int flags; /* REDISMODULE_CTX_... flags. */
void **postponed_arrays; /* To set with RM_ReplySetArrayLength(). */
int postponed_arrays_count; /* Number of entries in postponed_arrays. */
+ void *blocked_privdata; /* Privdata set when unblocking a client. */
/* Used if there is the REDISMODULE_CTX_KEYS_POS_REQUEST flag set. */
int *keys_pos;
@@ -84,10 +115,12 @@ struct RedisModuleCtx {
};
typedef struct RedisModuleCtx RedisModuleCtx;
-#define REDISMODULE_CTX_INIT {(void*)(unsigned long)&RM_GetApi, NULL, NULL, NULL, 0, 0, 0, NULL, 0, NULL, 0, NULL}
+#define REDISMODULE_CTX_INIT {(void*)(unsigned long)&RM_GetApi, NULL, NULL, NULL, 0, 0, 0, NULL, 0, NULL, NULL, 0, NULL}
#define REDISMODULE_CTX_MULTI_EMITTED (1<<0)
#define REDISMODULE_CTX_AUTO_MEMORY (1<<1)
#define REDISMODULE_CTX_KEYS_POS_REQUEST (1<<2)
+#define REDISMODULE_CTX_BLOCKED_REPLY (1<<3)
+#define REDISMODULE_CTX_BLOCKED_TIMEOUT (1<<4)
/* This represents a Redis key opened with RM_OpenKey(). */
struct RedisModuleKey {
@@ -153,6 +186,23 @@ typedef struct RedisModuleCallReply {
} val;
} RedisModuleCallReply;
+/* Structure representing a blocked client. We get a pointer to such
+ * an object when blocking from modules. */
+typedef struct RedisModuleBlockedClient {
+ client *client; /* Pointer to the blocked client, or NULL if the client
+ was destroyed during the life of this object. */
+ RedisModule *module; /* Module blocking the client. */
+ RedisModuleCmdFunc reply_callback; /* Reply callback on normal completion.*/
+ RedisModuleCmdFunc timeout_callback; /* Reply callback on timeout. */
+ void (*free_privdata)(void *); /* privdata cleanup callback. */
+ void *privdata; /* Module private data that may be used by the reply
+ or timeout callback. It is set via the
+ RedisModule_UnblockClient() API. */
+} RedisModuleBlockedClient;
+
+static pthread_mutex_t moduleUnblockedClientsMutex = PTHREAD_MUTEX_INITIALIZER;
+static list *moduleUnblockedClients;
+
/* --------------------------------------------------------------------------
* Prototypes
* -------------------------------------------------------------------------- */
@@ -162,7 +212,45 @@ void RM_CloseKey(RedisModuleKey *key);
void autoMemoryCollect(RedisModuleCtx *ctx);
robj **moduleCreateArgvFromUserFormat(const char *cmdname, const char *fmt, int *argcp, int *flags, va_list ap);
void moduleReplicateMultiIfNeeded(RedisModuleCtx *ctx);
-void RM_ZsetRangeStop(RedisModuleKey *key);
+void RM_ZsetRangeStop(RedisModuleKey *kp);
+static void zsetKeyReset(RedisModuleKey *key);
+
+/* --------------------------------------------------------------------------
+ * Heap allocation raw functions
+ * -------------------------------------------------------------------------- */
+
+/* Use like malloc(). Memory allocated with this function is reported in
+ * Redis INFO memory, used for keys eviction according to maxmemory settings
+ * and in general is taken into account as memory allocated by Redis.
+ * You should avoid using malloc(). */
+void *RM_Alloc(size_t bytes) {
+ return zmalloc(bytes);
+}
+
+/* Use like calloc(). Memory allocated with this function is reported in
+ * Redis INFO memory, used for keys eviction according to maxmemory settings
+ * and in general is taken into account as memory allocated by Redis.
+ * You should avoid using calloc() directly. */
+void *RM_Calloc(size_t nmemb, size_t size) {
+ return zcalloc(nmemb*size);
+}
+
+/* Use like realloc() for memory obtained with RedisModule_Alloc(). */
+void* RM_Realloc(void *ptr, size_t bytes) {
+ return zrealloc(ptr,bytes);
+}
+
+/* Use like free() for memory obtained by RedisModule_Alloc() and
+ * RedisModule_Realloc(). However you should never try to free with
+ * RedisModule_Free() memory allocated with malloc() inside your module. */
+void RM_Free(void *ptr) {
+ zfree(ptr);
+}
+
+/* Like strdup() but returns memory allocated with RedisModule_Alloc(). */
+char *RM_Strdup(const char *str) {
+ return zstrdup(str);
+}
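A minimal sketch of how a module might use these wrappers (editor's example; the command name and buffer contents are hypothetical), so that the memory is tracked by Redis like any other allocation:

#include <string.h>
#include "redismodule.h"

/* Hypothetical command handler: allocates, grows and frees a buffer
 * through the module allocation API instead of libc malloc(). */
int MemDemo_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    char *buf = RedisModule_Alloc(16);
    memcpy(buf,"hello",6);
    buf = RedisModule_Realloc(buf,64);        /* Still tracked by zmalloc. */
    RedisModule_ReplyWithSimpleString(ctx,buf);
    RedisModule_Free(buf);                    /* Never free() this pointer. */
    return REDISMODULE_OK;
}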
/* --------------------------------------------------------------------------
* Pool allocator
@@ -335,26 +423,36 @@ void moduleFreeContext(RedisModuleCtx *ctx) {
}
}
-/* This Redis command binds the normal Redis command invocation with commands
- * exported by modules. */
-void RedisModuleCommandDispatcher(client *c) {
- RedisModuleCommandProxy *cp = (void*)(unsigned long)c->cmd->getkeys_proc;
- RedisModuleCtx ctx = REDISMODULE_CTX_INIT;
+/* Helper function for when a command callback is called, in order to handle
+ * details needed to correctly replicate commands. */
+void moduleHandlePropagationAfterCommandCallback(RedisModuleCtx *ctx) {
+ client *c = ctx->client;
- ctx.module = cp->module;
- ctx.client = c;
- cp->func(&ctx,(void**)c->argv,c->argc);
+ /* We don't want any automatic propagation here since in modules we handle
+ * replication / AOF propagation in explicit ways. */
preventCommandPropagation(c);
/* Handle the replication of the final EXEC, since whatever a command
* emits is always wrappered around MULTI/EXEC. */
- if (ctx.flags & REDISMODULE_CTX_MULTI_EMITTED) {
+ if (ctx->flags & REDISMODULE_CTX_MULTI_EMITTED) {
robj *propargv[1];
propargv[0] = createStringObject("EXEC",4);
alsoPropagate(server.execCommand,c->db->id,propargv,1,
PROPAGATE_AOF|PROPAGATE_REPL);
decrRefCount(propargv[0]);
}
+}
+
+/* This Redis command binds the normal Redis command invocation with commands
+ * exported by modules. */
+void RedisModuleCommandDispatcher(client *c) {
+ RedisModuleCommandProxy *cp = (void*)(unsigned long)c->cmd->getkeys_proc;
+ RedisModuleCtx ctx = REDISMODULE_CTX_INIT;
+
+ ctx.module = cp->module;
+ ctx.client = c;
+ cp->func(&ctx,(void**)c->argv,c->argc);
+ moduleHandlePropagationAfterCommandCallback(&ctx);
moduleFreeContext(&ctx);
}
@@ -546,9 +644,15 @@ void RM_SetModuleAttribs(RedisModuleCtx *ctx, const char *name, int ver, int api
module->name = sdsnew((char*)name);
module->ver = ver;
module->apiver = apiver;
+ module->types = listCreate();
ctx->module = module;
}
+/* Return the current UNIX time in milliseconds. */
+long long RM_Milliseconds(void) {
+ return mstime();
+}
+
/* --------------------------------------------------------------------------
* Automatic memory management for modules
* -------------------------------------------------------------------------- */
@@ -575,21 +679,38 @@ void autoMemoryAdd(RedisModuleCtx *ctx, int type, void *ptr) {
}
/* Mark an object as freed in the auto release queue, so that users can still
- * free things manually if they want. */
-void autoMemoryFreed(RedisModuleCtx *ctx, int type, void *ptr) {
- if (!(ctx->flags & REDISMODULE_CTX_AUTO_MEMORY)) return;
+ * free things manually if they want.
+ *
+ * The function returns 1 if the object was actually found in the auto memory
+ * pool, otherwise 0 is returned. */
+int autoMemoryFreed(RedisModuleCtx *ctx, int type, void *ptr) {
+ if (!(ctx->flags & REDISMODULE_CTX_AUTO_MEMORY)) return 0;
+
+ int count = (ctx->amqueue_used+1)/2;
+ for (int j = 0; j < count; j++) {
+ for (int side = 0; side < 2; side++) {
+ /* For side = 0 check right side of the array, for
+ * side = 1 check the left side instead (zig-zag scanning). */
+ int i = (side == 0) ? (ctx->amqueue_used - 1 - j) : j;
+ if (ctx->amqueue[i].type == type &&
+ ctx->amqueue[i].ptr == ptr)
+ {
+ ctx->amqueue[i].type = REDISMODULE_AM_FREED;
- int j;
- for (j = 0; j < ctx->amqueue_used; j++) {
- if (ctx->amqueue[j].type == type &&
- ctx->amqueue[j].ptr == ptr)
- {
- ctx->amqueue[j].type = REDISMODULE_AM_FREED;
- /* Optimization: if this is the last element, we can
- * reuse it. */
- if (j == ctx->amqueue_used-1) ctx->amqueue_used--;
+ /* Switch the freed element and the last element, to avoid growing
+ * the queue unnecessarily if we allocate/free in a loop */
+ if (i != ctx->amqueue_used-1) {
+ ctx->amqueue[i] = ctx->amqueue[ctx->amqueue_used-1];
+ }
+
+ /* Reduce the size of the queue because we either moved the top
+ * element elsewhere or freed it */
+ ctx->amqueue_used--;
+ return 1;
+ }
}
}
+ return 0;
}
/* Release all the objects in queue. */
@@ -624,13 +745,33 @@ void autoMemoryCollect(RedisModuleCtx *ctx) {
*
* The string is created by copying the `len` bytes starting
* at `ptr`. No reference is retained to the passed buffer. */
-RedisModuleString *RM_CreateString(RedisModuleCtx *ctx, const char *ptr, size_t len)
-{
+RedisModuleString *RM_CreateString(RedisModuleCtx *ctx, const char *ptr, size_t len) {
RedisModuleString *o = createStringObject(ptr,len);
autoMemoryAdd(ctx,REDISMODULE_AM_STRING,o);
return o;
}
+
+/* Create a new module string object from a printf format and arguments.
+ * The returned string must be freed with RedisModule_FreeString(), unless
+ * automatic memory is enabled.
+ *
+ * The string is created using the sds formatter function sdscatvprintf(). */
+RedisModuleString *RM_CreateStringPrintf(RedisModuleCtx *ctx, const char *fmt, ...) {
+ sds s = sdsempty();
+
+ va_list ap;
+ va_start(ap, fmt);
+ s = sdscatvprintf(s, fmt, ap);
+ va_end(ap);
+
+ RedisModuleString *o = createObject(OBJ_STRING, s);
+ autoMemoryAdd(ctx,REDISMODULE_AM_STRING,o);
+
+ return o;
+}
+
+
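A possible usage sketch (editor's example), relying on automatic memory management so the created string does not need an explicit FreeString() call:

#include "redismodule.h"

/* Hypothetical command callback using the printf-style constructor. */
int Fmt_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    RedisModule_AutoMemory(ctx);  /* 's' below is released automatically. */
    RedisModuleString *s = RedisModule_CreateStringPrintf(ctx,"user:%d:%s",42,"profile");
    return RedisModule_ReplyWithString(ctx,s);
}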
/* Like RedisModule_CreatString(), but creates a string starting from a long long
* integer instead of taking a buffer and its length.
*
@@ -642,6 +783,17 @@ RedisModuleString *RM_CreateStringFromLongLong(RedisModuleCtx *ctx, long long ll
return RM_CreateString(ctx,buf,len);
}
+/* Like RedisModule_CreateString(), but creates a string starting from another
+ * RedisModuleString.
+ *
+ * The returned string must be released with RedisModule_FreeString() or by
+ * enabling automatic memory management. */
+RedisModuleString *RM_CreateStringFromString(RedisModuleCtx *ctx, const RedisModuleString *str) {
+ RedisModuleString *o = dupStringObject(str);
+ autoMemoryAdd(ctx,REDISMODULE_AM_STRING,o);
+ return o;
+}
+
/* Free a module string object obtained with one of the Redis modules API calls
* that return new string objects.
*
@@ -653,19 +805,65 @@ void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str) {
autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str);
}
+/* Every call to this function will make the string 'str' require
+ * an additional call to RedisModule_FreeString() in order to really
+ * free the string. Note that the automatic freeing of the string obtained
+ * by enabling the modules automatic memory management counts for one
+ * RedisModule_FreeString() call (it is just executed automatically).
+ *
+ * Normally you want to call this function when, at the same time
+ * the following conditions are true:
+ *
+ * 1) You have automatic memory management enabled.
+ * 2) You want to create string objects.
+ * 3) Those string objects you create need to live *after* the callback
+ * function (for example, a command implementation) creating them returns.
+ *
+ * Usually you want this in order to store the created string object
+ * into your own data structure, for example when implementing a new data
+ * type.
+ *
+ * Note that when memory management is turned off, you don't need
+ * any call to RetainString() since creating a string will always result
+ * in a string that lives after the callback function returns, if
+ * no FreeString() call is performed. */
+void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str) {
+ if (!autoMemoryFreed(ctx,REDISMODULE_AM_STRING,str)) {
+ /* Increment the string reference counting only if we can't
+ * just remove the object from the list of objects that should
+ * be reclaimed. Why we do that, instead of just incrementing
+ * the refcount in any case, and let the automatic FreeString()
+ * call at the end to bring the refcount back at the desired
+ * value? Because this way we ensure that the object refcount
+ * value is 1 (instead of going to 2 to be dropped later to 1)
+ * after the call to this function. This is needed for functions
+ * like RedisModule_StringAppendBuffer() to work. */
+ incrRefCount(str);
+ }
+}
+
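A sketch of the intended pattern (editor's example; the module-global variable is hypothetical): a string created inside a command callback is retained so it can outlive the callback and be stored in the module's own structures:

#include "redismodule.h"

static RedisModuleString *last_seen = NULL;   /* Hypothetical module-global state. */

int Remember_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    if (argc != 2) return RedisModule_WrongArity(ctx);
    RedisModule_AutoMemory(ctx);
    RedisModuleString *copy = RedisModule_CreateStringFromString(ctx,argv[1]);
    RedisModule_RetainString(ctx,copy);       /* Survives after this callback returns. */
    if (last_seen) RedisModule_FreeString(ctx,last_seen);
    last_seen = copy;
    return RedisModule_ReplyWithSimpleString(ctx,"OK");
}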
/* Given a string module object, this function returns the string pointer
* and length of the string. The returned pointer and length should only
* be used for read only accesses and never modified. */
-const char *RM_StringPtrLen(RedisModuleString *str, size_t *len) {
+const char *RM_StringPtrLen(const RedisModuleString *str, size_t *len) {
+ if (str == NULL) {
+ const char *errmsg = "(NULL string reply referenced in module)";
+ if (len) *len = strlen(errmsg);
+ return errmsg;
+ }
if (len) *len = sdslen(str->ptr);
return str->ptr;
}
+/* --------------------------------------------------------------------------
+ * Higher level string operations
+ * ------------------------------------------------------------------------- */
+
/* Convert the string into a long long integer, storing it at `*ll`.
* Returns REDISMODULE_OK on success. If the string can't be parsed
* as a valid, strict long long (no spaces before/after), REDISMODULE_ERR
* is returned. */
-int RM_StringToLongLong(RedisModuleString *str, long long *ll) {
+int RM_StringToLongLong(const RedisModuleString *str, long long *ll) {
return string2ll(str->ptr,sdslen(str->ptr),ll) ? REDISMODULE_OK :
REDISMODULE_ERR;
}
@@ -673,11 +871,52 @@ int RM_StringToLongLong(RedisModuleString *str, long long *ll) {
/* Convert the string into a double, storing it at `*d`.
* Returns REDISMODULE_OK on success or REDISMODULE_ERR if the string is
* not a valid string representation of a double value. */
-int RM_StringToDouble(RedisModuleString *str, double *d) {
+int RM_StringToDouble(const RedisModuleString *str, double *d) {
int retval = getDoubleFromObject(str,d);
return (retval == C_OK) ? REDISMODULE_OK : REDISMODULE_ERR;
}
+/* Compare two string objects, returning -1, 0 or 1 respectively if
+ * a < b, a == b, a > b. Strings are compared byte by byte as two
+ * binary blobs without any encoding care / collation attempt. */
+int RM_StringCompare(RedisModuleString *a, RedisModuleString *b) {
+ return compareStringObjects(a,b);
+}
+
+/* Return the (possibly modified in encoding) input 'str' object if
+ * the string is unshared, otherwise NULL is returned. */
+RedisModuleString *moduleAssertUnsharedString(RedisModuleString *str) {
+ if (str->refcount != 1) {
+ serverLog(LL_WARNING,
+ "Module attempted to use an in-place string modify operation "
+ "with a string referenced multiple times. Please check the code "
+ "for API usage correctness.");
+ return NULL;
+ }
+ if (str->encoding == OBJ_ENCODING_EMBSTR) {
+ /* Note: here we "leak" the additional allocation that was
+ * used in order to store the embedded string in the object. */
+ str->ptr = sdsnewlen(str->ptr,sdslen(str->ptr));
+ str->encoding = OBJ_ENCODING_RAW;
+ } else if (str->encoding == OBJ_ENCODING_INT) {
+ /* Convert the string from integer to raw encoding. */
+ str->ptr = sdsfromlonglong((long)str->ptr);
+ str->encoding = OBJ_ENCODING_RAW;
+ }
+ return str;
+}
+
+/* Append the specified buffer to the string 'str'. The string must be a
+ * string created by the user that is referenced only a single time, otherwise
+ * REDISMODULE_ERR is returned and the operation is not performed. */
+int RM_StringAppendBuffer(RedisModuleCtx *ctx, RedisModuleString *str, const char *buf, size_t len) {
+ UNUSED(ctx);
+ str = moduleAssertUnsharedString(str);
+ if (str == NULL) return REDISMODULE_ERR;
+ str->ptr = sdscatlen(str->ptr,buf,len);
+ return REDISMODULE_OK;
+}
+
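A small sketch (editor's example) showing the append API on a freshly created, unshared string:

#include "redismodule.h"

int Concat_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
    REDISMODULE_NOT_USED(argv);
    REDISMODULE_NOT_USED(argc);
    RedisModule_AutoMemory(ctx);
    RedisModuleString *s = RedisModule_CreateString(ctx,"abc",3);
    /* Works because 's' is referenced only once (refcount == 1). */
    if (RedisModule_StringAppendBuffer(ctx,s,"def",3) != REDISMODULE_OK)
        return RedisModule_ReplyWithError(ctx,"ERR append failed");
    return RedisModule_ReplyWithString(ctx,s);   /* Replies "abcdef". */
}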
/* --------------------------------------------------------------------------
* Reply APIs
*
@@ -1016,7 +1255,7 @@ void *RM_OpenKey(RedisModuleCtx *ctx, robj *keyname, int mode) {
kp->value = value;
kp->iter = NULL;
kp->mode = mode;
- RM_ZsetRangeStop(kp);
+ zsetKeyReset(kp);
autoMemoryAdd(ctx,REDISMODULE_AM_KEY,kp);
return (void*)kp;
}
@@ -1044,6 +1283,7 @@ int RM_KeyType(RedisModuleKey *key) {
case OBJ_SET: return REDISMODULE_KEYTYPE_SET;
case OBJ_ZSET: return REDISMODULE_KEYTYPE_ZSET;
case OBJ_HASH: return REDISMODULE_KEYTYPE_HASH;
+ case OBJ_MODULE: return REDISMODULE_KEYTYPE_MODULE;
default: return 0;
}
}
@@ -1102,7 +1342,7 @@ int RM_SetExpire(RedisModuleKey *key, mstime_t expire) {
return REDISMODULE_ERR;
if (expire != REDISMODULE_NO_EXPIRE) {
expire += mstime();
- setExpire(key->db,key->key,expire);
+ setExpire(key->ctx->client,key->db,key->key,expire);
} else {
removeExpire(key->db,key->key);
}
@@ -1401,6 +1641,12 @@ int RM_ZsetScore(RedisModuleKey *key, RedisModuleString *ele, double *score) {
* Key API for Sorted Set iterator
* -------------------------------------------------------------------------- */
+void zsetKeyReset(RedisModuleKey *key) {
+ key->ztype = REDISMODULE_ZSET_RANGE_NONE;
+ key->zcurrent = NULL;
+ key->zer = 1;
+}
+
/* Stop a sorted set iteration. */
void RM_ZsetRangeStop(RedisModuleKey *key) {
/* Free resources if needed. */
@@ -1409,9 +1655,7 @@ void RM_ZsetRangeStop(RedisModuleKey *key) {
/* Setup sensible values so that misused iteration API calls when an
* iterator is not active will result into something more sensible
* than crashing. */
- key->ztype = REDISMODULE_ZSET_RANGE_NONE;
- key->zcurrent = NULL;
- key->zer = 1;
+ zsetKeyReset(key);
}
/* Return the "End of range" flag value to signal the end of the iteration. */
@@ -1614,12 +1858,12 @@ int RM_ZsetRangeNext(RedisModuleKey *key) {
} else {
/* Are we still within the range? */
if (key->ztype == REDISMODULE_ZSET_RANGE_SCORE &&
- !zslValueLteMax(ln->score,&key->zrs))
+ !zslValueLteMax(next->score,&key->zrs))
{
key->zer = 1;
return 0;
} else if (key->ztype == REDISMODULE_ZSET_RANGE_LEX) {
- if (!zslLexValueLteMax(ln->ele,&key->zlrs)) {
+ if (!zslLexValueLteMax(next->ele,&key->zlrs)) {
key->zer = 1;
return 0;
}
@@ -1677,7 +1921,7 @@ int RM_ZsetRangePrev(RedisModuleKey *key) {
} else {
/* Are we still within the range? */
if (key->ztype == REDISMODULE_ZSET_RANGE_SCORE &&
- !zslValueGteMin(ln->score,&key->zrs))
+ !zslValueGteMin(prev->score,&key->zrs))
{
key->zer = 1;
return 0;
@@ -1788,17 +2032,20 @@ int RM_HashSet(RedisModuleKey *key, int flags, ...) {
continue;
}
+ int low_flags = HASH_SET_COPY;
/* If CFIELDS is active, we can pass the ownership of the
* SDS object to the low level function that sets the field
* to avoid a useless copy. */
- int low_flags = HASH_SET_COPY;
if (flags & REDISMODULE_HASH_CFIELDS)
low_flags |= HASH_SET_TAKE_FIELD;
updated += hashTypeSet(key->value, field->ptr, value->ptr, low_flags);
- field->ptr = NULL; /* Ownership is now of hashTypeSet() */
- /* Cleanup */
- if (flags & REDISMODULE_HASH_CFIELDS) decrRefCount(field);
+ /* If CFIELDS is active, SDS string ownership is now of hashTypeSet(),
+ * however we still have to release the 'field' object shell. */
+ if (flags & REDISMODULE_HASH_CFIELDS) {
+ field->ptr = NULL; /* Prevent the SDS string from being freed. */
+ decrRefCount(field);
+ }
}
va_end(ap);
moduleDelKeyIfEmpty(key);
@@ -2039,12 +2286,15 @@ void RM_FreeCallReply_Rec(RedisModuleCallReply *reply, int freenested){
* to have the first level function to return on nested replies, but only
* if called by the module API. */
void RM_FreeCallReply(RedisModuleCallReply *reply) {
+
+ RedisModuleCtx *ctx = reply->ctx;
RM_FreeCallReply_Rec(reply,0);
- autoMemoryFreed(reply->ctx,REDISMODULE_AM_REPLY,reply);
+ autoMemoryFreed(ctx,REDISMODULE_AM_REPLY,reply);
}
/* Return the reply type. */
int RM_CallReplyType(RedisModuleCallReply *reply) {
+ if (!reply) return REDISMODULE_REPLY_UNKNOWN;
return reply->type;
}
@@ -2278,6 +2528,736 @@ const char *RM_CallReplyProto(RedisModuleCallReply *reply, size_t *len) {
}
/* --------------------------------------------------------------------------
+ * Modules data types
+ *
+ * When String DMA or using existing data structures is not enough, it is
+ * possible to create new data types from scratch and export them to
+ * Redis. The module must provide a set of callbacks for handling the
+ * new values exported (for example in order to provide RDB saving/loading,
+ * AOF rewrite, and so forth). In this section we define this API.
+ * -------------------------------------------------------------------------- */
+
+/* Turn a 9 character name in the specified charset and a 10 bit encver into
+ * a single 64 bit unsigned integer that represents this exact module name
+ * and version. This final number is called a "type ID" and is used when
+ * writing module exported values to RDB files, in order to re-associate the
+ * value to the right module to load them during RDB loading.
+ *
+ * If the string is not of the right length or the charset is wrong, or
+ * if encver is outside the unsigned 10 bit integer range, 0 is returned,
+ * otherwise the function returns the right type ID.
+ *
+ * The resulting 64 bit integer is composed as follows:
+ *
+ * (high order bits) 6|6|6|6|6|6|6|6|6|10 (low order bits)
+ *
+ * The first 6 bits value is the first character, name[0], while the last
+ * 6 bits value, immediately before the 10 bits integer, is name[8].
+ * The last 10 bits are the encoding version.
+ *
+ * Note that a name and encver combo of "AAAAAAAAA" and 0 will produce
+ * zero as the return value, which is the same value we use to signal errors,
+ * thus this combination is invalid, and also useless since type names should
+ * try to vary to avoid collisions. */
+
+const char *ModuleTypeNameCharSet =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-_";
+
+uint64_t moduleTypeEncodeId(const char *name, int encver) {
+ /* We use 64 symbols so that we can map each character into 6 bits
+ * of the final output. */
+ const char *cset = ModuleTypeNameCharSet;
+ if (strlen(name) != 9) return 0;
+ if (encver < 0 || encver > 1023) return 0;
+
+ uint64_t id = 0;
+ for (int j = 0; j < 9; j++) {
+ char *p = strchr(cset,name[j]);
+ if (!p) return 0;
+ unsigned long pos = p-cset;
+ id = (id << 6) | pos;
+ }
+ id = (id << 10) | encver;
+ return id;
+}
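For illustration (editor's sketch, assuming the two internal helpers defined in this file are visible): a name/encver pair round-trips through the 64 bit ID as nine 6-bit character indexes plus the 10-bit encoding version in the low bits:

/* "tree-AntZ" and encver 4 are hypothetical values. */
uint64_t id = moduleTypeEncodeId("tree-AntZ",4);
int encver = id & 1023;          /* Low 10 bits: encoding version, here 4. */
char name[10];
moduleTypeNameByID(name,id);     /* Recovers "tree-AntZ", ignoring the encver bits. */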
+
+/* Search, in the list of exported data types of all the modules registered,
+ * a type with the same name as the one given. Returns the moduleType
+ * structure pointer if such a module is found, or NULL otherwise. */
+moduleType *moduleTypeLookupModuleByName(const char *name) {
+ dictIterator *di = dictGetIterator(modules);
+ dictEntry *de;
+
+ while ((de = dictNext(di)) != NULL) {
+ struct RedisModule *module = dictGetVal(de);
+ listIter li;
+ listNode *ln;
+
+ listRewind(module->types,&li);
+ while((ln = listNext(&li))) {
+ moduleType *mt = ln->value;
+ if (memcmp(name,mt->name,sizeof(mt->name)) == 0) {
+ dictReleaseIterator(di);
+ return mt;
+ }
+ }
+ }
+ dictReleaseIterator(di);
+ return NULL;
+}
+
+/* Lookup a module by ID, with caching. This function is used during RDB
+ * loading. Modules exporting data types should never be able to unload, so
+ * our cache does not need to expire. */
+#define MODULE_LOOKUP_CACHE_SIZE 3
+
+moduleType *moduleTypeLookupModuleByID(uint64_t id) {
+ static struct {
+ uint64_t id;
+ moduleType *mt;
+ } cache[MODULE_LOOKUP_CACHE_SIZE];
+
+ /* Search in cache to start. */
+ int j;
+ for (j = 0; j < MODULE_LOOKUP_CACHE_SIZE; j++)
+ if (cache[j].id == id) return cache[j].mt;
+
+ /* Slow module by module lookup. */
+ moduleType *mt = NULL;
+ dictIterator *di = dictGetIterator(modules);
+ dictEntry *de;
+
+ while ((de = dictNext(di)) != NULL) {
+ struct RedisModule *module = dictGetVal(de);
+ listIter li;
+ listNode *ln;
+
+ listRewind(module->types,&li);
+ while((ln = listNext(&li))) {
+ mt = ln->value;
+ /* Compare only the 54 bit module identifier and not the
+ * encoding version. */
+ if (mt->id >> 10 == id >> 10) break;
+ }
+ }
+ dictReleaseIterator(di);
+
+ /* Add to cache if possible. */
+ if (mt && j < MODULE_LOOKUP_CACHE_SIZE) {
+ cache[j].id = id;
+ cache[j].mt = mt;
+ }
+ return mt;
+}
+
+/* Turn an (unresolved) module ID into a type name, to show the user an
+ * error when RDB files contain module data we can't load. */
+void moduleTypeNameByID(char *name, uint64_t moduleid) {
+ const char *cset = ModuleTypeNameCharSet;
+
+ name[9] = '\0';
+ char *p = name+8;
+ moduleid >>= 10;
+ for (int j = 0; j < 9; j++) {
+ *p-- = cset[moduleid & 63];
+ moduleid >>= 6;
+ }
+}
+
+/* Register a new data type exported by the module. The parameters are the
+ * following. Please for in depth documentation check the modules API
+ * documentation, especially the TYPES.md file.
+ *
+ * * **name**: A 9 character data type name that MUST be unique in the Redis
+ * Modules ecosystem. Be creative... and there will be no collisions. Use
+ * the charset A-Z a-z 0-9, plus the two "-_" characters. A good
+ * idea is to use, for example `<typename>-<vendor>`. For example
+ * "tree-AntZ" may mean "Tree data structure by @antirez". To use both
+ * lower case and upper case letters helps in order to prevent collisions.
+ * * **encver**: Encoding version, which is, the version of the serialization
+ * that a module used in order to persist data. As long as the "name"
+ * matches, the RDB loading will be dispatched to the type callbacks
+ * whatever 'encver' is used, however the module can understand if
+ * the encoding it must load is of an older version of the module.
+ * For example the module "tree-AntZ" initially used encver=0. Later
+ * after an upgrade, it started to serialize data in a different format
+ * and to register the type with encver=1. However this module may
+ * still load old data produced by an older version if the rdb_load
+ * callback is able to check the encver value and act accordingly.
+ * The encver must be a non-negative value between 0 and 1023.
+ * * **typemethods_ptr** is a pointer to a RedisModuleTypeMethods structure
+ * that should be populated with the methods callbacks and structure
+ * version, like in the following example:
+ *
+ * RedisModuleTypeMethods tm = {
+ * .version = REDISMODULE_TYPE_METHOD_VERSION,
+ * .rdb_load = myType_RDBLoadCallBack,
+ * .rdb_save = myType_RDBSaveCallBack,
+ * .aof_rewrite = myType_AOFRewriteCallBack,
+ * .free = myType_FreeCallBack,
+ *
+ * // Optional fields
+ * .digest = myType_DigestCallBack,
+ * .mem_usage = myType_MemUsageCallBack,
+ * }
+ *
+ * * **rdb_load**: A callback function pointer that loads data from RDB files.
+ * * **rdb_save**: A callback function pointer that saves data to RDB files.
+ * * **aof_rewrite**: A callback function pointer that rewrites data as commands.
+ * * **digest**: A callback function pointer that is used for `DEBUG DIGEST`.
+ * * **free**: A callback function pointer that can free a type value.
+ *
+ * The **digest** and **mem_usage** methods should currently be omitted since
+ * they are not yet implemented inside the Redis modules core.
+ *
+ * Note: the module name "AAAAAAAAA" is reserved and produces an error, it
+ * happens to be pretty lame as well.
+ *
+ * If there is already a module registering a type with the same name,
+ * or if the module name or encver is invalid, NULL is returned.
+ * Otherwise the new type is registered into Redis, and a reference of
+ * type RedisModuleType is returned: the caller of the function should store
+ * this reference into a global variable to make future use of it in the
+ * modules type API, since a single module may register multiple types.
+ * Example code fragment:
+ *
+ * static RedisModuleType *BalancedTreeType;
+ *
+ * int RedisModule_OnLoad(RedisModuleCtx *ctx) {
+ * // some code here ...
+ * BalancedTreeType = RM_CreateDataType(...);
+ * }
+ */
+moduleType *RM_CreateDataType(RedisModuleCtx *ctx, const char *name, int encver, void *typemethods_ptr) {
+ uint64_t id = moduleTypeEncodeId(name,encver);
+ if (id == 0) return NULL;
+ if (moduleTypeLookupModuleByName(name) != NULL) return NULL;
+
+ long typemethods_version = ((long*)typemethods_ptr)[0];
+ if (typemethods_version == 0) return NULL;
+
+ struct typemethods {
+ uint64_t version;
+ moduleTypeLoadFunc rdb_load;
+ moduleTypeSaveFunc rdb_save;
+ moduleTypeRewriteFunc aof_rewrite;
+ moduleTypeDigestFunc digest;
+ moduleTypeMemUsageFunc mem_usage;
+ moduleTypeFreeFunc free;
+ } *tms = (struct typemethods*) typemethods_ptr;
+
+ moduleType *mt = zcalloc(sizeof(*mt));
+ mt->id = id;
+ mt->module = ctx->module;
+ mt->rdb_load = tms->rdb_load;
+ mt->rdb_save = tms->rdb_save;
+ mt->aof_rewrite = tms->aof_rewrite;
+ mt->mem_usage = tms->mem_usage;
+ mt->digest = tms->digest;
+ mt->free = tms->free;
+ memcpy(mt->name,name,sizeof(mt->name));
+ listAddNodeTail(ctx->module->types,mt);
+ return mt;
+}
+
+/* If the key is open for writing, set the specified module type object
+ * as the value of the key, deleting the old value if any.
+ * On success REDISMODULE_OK is returned. If the key is not open for
+ * writing or there is an active iterator, REDISMODULE_ERR is returned. */
+int RM_ModuleTypeSetValue(RedisModuleKey *key, moduleType *mt, void *value) {
+ if (!(key->mode & REDISMODULE_WRITE) || key->iter) return REDISMODULE_ERR;
+ RM_DeleteKey(key);
+ robj *o = createModuleObject(mt,value);
+ setKey(key->db,key->key,o);
+ decrRefCount(o);
+ key->value = o;
+ return REDISMODULE_OK;
+}
+
+/* Assuming RedisModule_KeyType() returned REDISMODULE_KEYTYPE_MODULE on
+ * the key, returns the module type pointer of the value stored at key.
+ *
+ * If the key is NULL, is not associated with a module type, or is empty,
+ * then NULL is returned instead. */
+moduleType *RM_ModuleTypeGetType(RedisModuleKey *key) {
+ if (key == NULL ||
+ key->value == NULL ||
+ RM_KeyType(key) != REDISMODULE_KEYTYPE_MODULE) return NULL;
+ moduleValue *mv = key->value->ptr;
+ return mv->type;
+}
+
+/* Assuming RedisModule_KeyType() returned REDISMODULE_KEYTYPE_MODULE on
+ * the key, returns the module type low-level value stored at key, as
+ * it was set by the user via RedisModule_ModuleTypeSet().
+ *
+ * If the key is NULL, is not associated with a module type, or is empty,
+ * then NULL is returned instead. */
+void *RM_ModuleTypeGetValue(RedisModuleKey *key) {
+ if (key == NULL ||
+ key->value == NULL ||
+ RM_KeyType(key) != REDISMODULE_KEYTYPE_MODULE) return NULL;
+ moduleValue *mv = key->value->ptr;
+ return mv->value;
+}
+
+/* --------------------------------------------------------------------------
+ * RDB loading and saving functions
+ * -------------------------------------------------------------------------- */
+
+/* Called when there is a load error in the context of a module. This cannot
+ * be recovered like for the built-in types. */
+void moduleRDBLoadError(RedisModuleIO *io) {
+ serverLog(LL_WARNING,
+ "Error loading data from RDB (short read or EOF). "
+ "Read performed by module '%s' about type '%s' "
+ "after reading '%llu' bytes of a value.",
+ io->type->module->name,
+ io->type->name,
+ (unsigned long long)io->bytes);
+ exit(1);
+}
+
+/* Save an unsigned 64 bit value into the RDB file. This function should only
+ * be called in the context of the rdb_save method of modules implementing new
+ * data types. */
+void RM_SaveUnsigned(RedisModuleIO *io, uint64_t value) {
+ if (io->error) return;
+ int retval = rdbSaveLen(io->rio, value);
+ if (retval == -1) {
+ io->error = 1;
+ } else {
+ io->bytes += retval;
+ }
+}
+
+/* Load an unsigned 64 bit value from the RDB file. This function should only
+ * be called in the context of the rdb_load method of modules implementing
+ * new data types. */
+uint64_t RM_LoadUnsigned(RedisModuleIO *io) {
+ uint64_t value;
+ int retval = rdbLoadLenByRef(io->rio, NULL, &value);
+ if (retval == -1) {
+ moduleRDBLoadError(io);
+ return 0; /* Never reached. */
+ }
+ return value;
+}
+
+/* Like RedisModule_SaveUnsigned() but for signed 64 bit values. */
+void RM_SaveSigned(RedisModuleIO *io, int64_t value) {
+ union {uint64_t u; int64_t i;} conv;
+ conv.i = value;
+ RM_SaveUnsigned(io,conv.u);
+}
+
+/* Like RedisModule_LoadUnsigned() but for signed 64 bit values. */
+int64_t RM_LoadSigned(RedisModuleIO *io) {
+ union {uint64_t u; int64_t i;} conv;
+ conv.u = RM_LoadUnsigned(io);
+ return conv.i;
+}
+
+/* In the context of the rdb_save method of a module type, saves a
+ * string into the RDB file taking as input a RedisModuleString.
+ *
+ * The string can be later loaded with RedisModule_LoadString() or
+ * other Load family functions expecting a serialized string inside
+ * the RDB file. */
+void RM_SaveString(RedisModuleIO *io, RedisModuleString *s) {
+ if (io->error) return;
+ int retval = rdbSaveStringObject(io->rio,s);
+ if (retval == -1) {
+ io->error = 1;
+ } else {
+ io->bytes += retval;
+ }
+}
+
+/* Like RedisModule_SaveString() but takes a raw C pointer and length
+ * as input. */
+void RM_SaveStringBuffer(RedisModuleIO *io, const char *str, size_t len) {
+ if (io->error) return;
+ int retval = rdbSaveRawString(io->rio,(unsigned char*)str,len);
+ if (retval == -1) {
+ io->error = 1;
+ } else {
+ io->bytes += retval;
+ }
+}
+
+/* Implements RM_LoadString() and RM_LoadStringBuffer() */
+void *moduleLoadString(RedisModuleIO *io, int plain, size_t *lenptr) {
+ void *s = rdbGenericLoadStringObject(io->rio,
+ plain ? RDB_LOAD_PLAIN : RDB_LOAD_NONE, lenptr);
+ if (s == NULL) {
+ moduleRDBLoadError(io);
+ return NULL; /* Never reached. */
+ }
+ return s;
+}
+
+/* In the context of the rdb_load method of a module data type, loads a string
+ * from the RDB file, that was previously saved with RedisModule_SaveString()
+ * functions family.
+ *
+ * The returned string is a newly allocated RedisModuleString object, and
+ * the user should at some point free it with a call to RedisModule_FreeString().
+ *
+ * If the data structure does not store strings as RedisModuleString objects,
+ * the similar function RedisModule_LoadStringBuffer() could be used instead. */
+RedisModuleString *RM_LoadString(RedisModuleIO *io) {
+ return moduleLoadString(io,0,NULL);
+}
+
+/* Like RedisModule_LoadString() but returns a heap allocated string that
+ * was allocated with RedisModule_Alloc(), and can be resized or freed with
+ * RedisModule_Realloc() or RedisModule_Free().
+ *
+ * The size of the string is stored at '*lenptr' if not NULL.
+ * The returned string is not automatically NULL terminated, it is loaded
+ * exactly as it was stored inside the RDB file. */
+char *RM_LoadStringBuffer(RedisModuleIO *io, size_t *lenptr) {
+ return moduleLoadString(io,1,lenptr);
+}
+
+/* In the context of the rdb_save method of a module data type, saves a double
+ * value to the RDB file. The double can be a valid number, a NaN or infinity.
+ * It is possible to load back the value with RedisModule_LoadDouble(). */
+void RM_SaveDouble(RedisModuleIO *io, double value) {
+ if (io->error) return;
+ int retval = rdbSaveBinaryDoubleValue(io->rio, value);
+ if (retval == -1) {
+ io->error = 1;
+ } else {
+ io->bytes += retval;
+ }
+}
+
+/* In the context of the rdb_load method of a module data type, loads back the
+ * double value saved by RedisModule_SaveDouble(). */
+double RM_LoadDouble(RedisModuleIO *io) {
+ double value;
+ int retval = rdbLoadBinaryDoubleValue(io->rio, &value);
+ if (retval == -1) {
+ moduleRDBLoadError(io);
+ return 0; /* Never reached. */
+ }
+ return value;
+}
+
+/* In the context of the rdb_save method of a module data type, saves a float
+ * value to the RDB file. The float can be a valid number, a NaN or infinity.
+ * It is possible to load back the value with RedisModule_LoadFloat(). */
+void RM_SaveFloat(RedisModuleIO *io, float value) {
+ if (io->error) return;
+ int retval = rdbSaveBinaryFloatValue(io->rio, value);
+ if (retval == -1) {
+ io->error = 1;
+ } else {
+ io->bytes += retval;
+ }
+}
+
+/* In the context of the rdb_load method of a module data type, loads back the
+ * float value saved by RedisModule_SaveFloat(). */
+float RM_LoadFloat(RedisModuleIO *io) {
+ float value;
+ int retval = rdbLoadBinaryFloatValue(io->rio, &value);
+ if (retval == -1) {
+ moduleRDBLoadError(io);
+ return 0; /* Never reached. */
+ }
+ return value;
+}
+
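To tie these together, a hedged sketch (editor's example; the type, its fields and the encoding version are hypothetical) of how a module's rdb_save and rdb_load callbacks might use the functions above:

#include <stdint.h>
#include <stddef.h>
#include "redismodule.h"

struct MyCounter { uint64_t hits; char *label; size_t label_len; };

/* rdb_save callback: serialize the two fields in order. */
void MyCounter_RdbSave(RedisModuleIO *io, void *value) {
    struct MyCounter *o = value;
    RedisModule_SaveUnsigned(io,o->hits);
    RedisModule_SaveStringBuffer(io,o->label,o->label_len);
}

/* rdb_load callback: read the fields back in the same order. */
void *MyCounter_RdbLoad(RedisModuleIO *io, int encver) {
    if (encver != 0) return NULL;   /* Unknown encoding version for this sketch. */
    struct MyCounter *o = RedisModule_Alloc(sizeof(*o));
    o->hits = RedisModule_LoadUnsigned(io);
    o->label = RedisModule_LoadStringBuffer(io,&o->label_len);
    return o;
}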
+/* --------------------------------------------------------------------------
+ * AOF API for modules data types
+ * -------------------------------------------------------------------------- */
+
+/* Emits a command into the AOF during the AOF rewriting process. This function
+ * is only called in the context of the aof_rewrite method of data types exported
+ * by a module. The command works exactly like RedisModule_Call() in the way
+ * the parameters are passed, but it does not return anything as the error
+ * handling is performed by Redis itself. */
+void RM_EmitAOF(RedisModuleIO *io, const char *cmdname, const char *fmt, ...) {
+ if (io->error) return;
+ struct redisCommand *cmd;
+ robj **argv = NULL;
+ int argc = 0, flags = 0, j;
+ va_list ap;
+
+ cmd = lookupCommandByCString((char*)cmdname);
+ if (!cmd) {
+ serverLog(LL_WARNING,
+ "Fatal: AOF method for module data type '%s' tried to "
+ "emit unknown command '%s'",
+ io->type->name, cmdname);
+ io->error = 1;
+ errno = EINVAL;
+ return;
+ }
+
+ /* Emit the arguments into the AOF in Redis protocol format. */
+ va_start(ap, fmt);
+ argv = moduleCreateArgvFromUserFormat(cmdname,fmt,&argc,&flags,ap);
+ va_end(ap);
+ if (argv == NULL) {
+ serverLog(LL_WARNING,
+ "Fatal: AOF method for module data type '%s' tried to "
+ "call RedisModule_EmitAOF() with wrong format specifiers '%s'",
+ io->type->name, fmt);
+ io->error = 1;
+ errno = EINVAL;
+ return;
+ }
+
+ /* Bulk count. */
+ if (!io->error && rioWriteBulkCount(io->rio,'*',argc) == 0)
+ io->error = 1;
+
+ /* Arguments. */
+ for (j = 0; j < argc; j++) {
+ if (!io->error && rioWriteBulkObject(io->rio,argv[j]) == 0)
+ io->error = 1;
+ decrRefCount(argv[j]);
+ }
+ zfree(argv);
+ return;
+}
+
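Continuing the hypothetical type from the RDB sketch above (editor's example; MYCOUNTER.SET is a made-up command), an aof_rewrite callback would typically re-emit a single command able to rebuild the value:

/* aof_rewrite callback for the hypothetical MyCounter type. */
void MyCounter_AofRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value) {
    struct MyCounter *o = value;
    /* Format: "s" = RedisModuleString, "l" = long long, "b" = buffer + length. */
    RedisModule_EmitAOF(aof,"MYCOUNTER.SET","slb",
        key,(long long)o->hits,o->label,o->label_len);
}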
+/* --------------------------------------------------------------------------
+ * IO context handling
+ * -------------------------------------------------------------------------- */
+
+RedisModuleCtx *RM_GetContextFromIO(RedisModuleIO *io) {
+ if (io->ctx) return io->ctx; /* Can't have more than one... */
+ RedisModuleCtx ctxtemplate = REDISMODULE_CTX_INIT;
+ io->ctx = zmalloc(sizeof(RedisModuleCtx));
+ *(io->ctx) = ctxtemplate;
+ io->ctx->module = io->type->module;
+ io->ctx->client = NULL;
+ return io->ctx;
+}
+
+/* --------------------------------------------------------------------------
+ * Logging
+ * -------------------------------------------------------------------------- */
+
+/* This is the low level function implementing both:
+ *
+ * RM_Log()
+ * RM_LogIOError()
+ *
+ */
+void RM_LogRaw(RedisModule *module, const char *levelstr, const char *fmt, va_list ap) {
+ char msg[LOG_MAX_LEN];
+ size_t name_len;
+ int level;
+
+ if (!strcasecmp(levelstr,"debug")) level = LL_DEBUG;
+ else if (!strcasecmp(levelstr,"verbose")) level = LL_VERBOSE;
+ else if (!strcasecmp(levelstr,"notice")) level = LL_NOTICE;
+ else if (!strcasecmp(levelstr,"warning")) level = LL_WARNING;
+ else level = LL_VERBOSE; /* Default. */
+
+ name_len = snprintf(msg, sizeof(msg),"<%s> ", module->name);
+ vsnprintf(msg + name_len, sizeof(msg) - name_len, fmt, ap);
+ serverLogRaw(level,msg);
+}
+
+/*
+ * Produces a log message to the standard Redis log, the format accepts
+ * printf-alike specifiers, while level is a string describing the log
+ * level to use when emitting the log, and must be one of the following:
+ *
+ * * "debug"
+ * * "verbose"
+ * * "notice"
+ * * "warning"
+ *
+ * If the specified log level is invalid, verbose is used by default.
+ * There is a fixed limit to the length of the log line this function is able
+ * to emit; this limit is not specified but is guaranteed to be more than
+ * a few lines of text.
+ */
+void RM_Log(RedisModuleCtx *ctx, const char *levelstr, const char *fmt, ...) {
+ if (!ctx->module) return; /* Can only log if module is initialized */
+
+ va_list ap;
+ va_start(ap, fmt);
+ RM_LogRaw(ctx->module,levelstr,fmt,ap);
+ va_end(ap);
+}
+
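Typical usage is a one-liner inside any module callback, for example (editor's sketch, hypothetical variable names):

int corrupted = 3;
RedisModule_Log(ctx,"warning","skipping %d corrupted entries for key '%s'",
                corrupted,"mykey");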
+/* Log errors from RDB / AOF serialization callbacks.
+ *
+ * This function should be used when a callback is returning a critical
+ * error to the caller since it cannot load or save the data for some
+ * critical reason. */
+void RM_LogIOError(RedisModuleIO *io, const char *levelstr, const char *fmt, ...) {
+ va_list ap;
+ va_start(ap, fmt);
+ RM_LogRaw(io->type->module,levelstr,fmt,ap);
+ va_end(ap);
+}
+
+/* --------------------------------------------------------------------------
+ * Blocking clients from modules
+ * -------------------------------------------------------------------------- */
+
+/* This is called from blocked.c in order to unblock a client: may be called
+ * for multiple reasons while the client is in the middle of being blocked
+ * because the client is terminated, but is also called for cleanup when a
+ * client is unblocked in a clean way after replying.
+ *
+ * What we do here is just to set the client to NULL in the redis module
+ * blocked client handle. This way if the client is terminated while there
+ * is a pending threaded operation involving the blocked client, we'll know
+ * that the client no longer exists and no reply callback should be called.
+ *
+ * The structure RedisModuleBlockedClient will be always deallocated when
+ * running the list of clients blocked by a module that need to be unblocked. */
+void unblockClientFromModule(client *c) {
+ RedisModuleBlockedClient *bc = c->bpop.module_blocked_handle;
+ bc->client = NULL;
+}
+
+/* Block a client in the context of a blocking command, returning a handle
+ * which will be used, later, in order to unblock the client with a call to
+ * RedisModule_UnblockClient(). The arguments specify callback functions
+ * and a timeout after which the client is unblocked.
+ *
+ * The callbacks are called in the following contexts:
+ *
+ * reply_callback: called after a successful RedisModule_UnblockClient() call
+ * in order to reply to the client and unblock it.
+ * timeout_callback: called when the timeout is reached in order to send an
+ * error to the client.
+ * free_privdata: called in order to free the private data that is passed
+ * by the RedisModule_UnblockClient() call.
+ */
+RedisModuleBlockedClient *RM_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc reply_callback, RedisModuleCmdFunc timeout_callback, void (*free_privdata)(void*), long long timeout_ms) {
+ client *c = ctx->client;
+ c->bpop.module_blocked_handle = zmalloc(sizeof(RedisModuleBlockedClient));
+ RedisModuleBlockedClient *bc = c->bpop.module_blocked_handle;
+
+ bc->client = c;
+ bc->module = ctx->module;
+ bc->reply_callback = reply_callback;
+ bc->timeout_callback = timeout_callback;
+ bc->free_privdata = free_privdata;
+ bc->privdata = NULL;
+ c->bpop.timeout = timeout_ms ? (mstime()+timeout_ms) : 0;
+
+ blockClient(c,BLOCKED_MODULE);
+ return bc;
+}
+
+/* Unblock a client blocked by `RedisModule_BlockClient()`. This will trigger
+ * the reply callbacks to be called in order to reply to the client.
+ * The 'privdata' argument will be accessible by the reply callback, so
+ * the caller of this function can pass any value that is needed in order to
+ * actually reply to the client.
+ *
+ * A common usage for 'privdata' is a thread that computes something that
+ * needs to be passed to the client, including but not limited to some slow
+ * to compute reply or some reply obtained via networking.
+ *
+ * Note: this function can be called from threads spawned by the module. */
+int RM_UnblockClient(RedisModuleBlockedClient *bc, void *privdata) {
+ pthread_mutex_lock(&moduleUnblockedClientsMutex);
+ bc->privdata = privdata;
+ listAddNodeTail(moduleUnblockedClients,bc);
+ pthread_mutex_unlock(&moduleUnblockedClientsMutex);
+ return REDISMODULE_OK;
+}
+
+/* Abort a blocked client blocking operation: the client will be unblocked
+ * without firing the reply callback. */
+int RM_AbortBlock(RedisModuleBlockedClient *bc) {
+ bc->reply_callback = NULL;
+ return RM_UnblockClient(bc,NULL);
+}
+
+/* This function will check the moduleUnblockedClients queue in order to
+ * call the reply callback and really unblock the client.
+ *
+ * Clients end up in this list because of calls to RM_UnblockClient(),
+ * however it is possible that while the module was doing work for the
+ * blocked client, it was terminated by Redis (for timeout or other reasons).
+ * When this happens the RedisModuleBlockedClient structure in the queue
+ * will have the 'client' field set to NULL. */
+void moduleHandleBlockedClients(void) {
+ listNode *ln;
+ RedisModuleBlockedClient *bc;
+
+ pthread_mutex_lock(&moduleUnblockedClientsMutex);
+ while (listLength(moduleUnblockedClients)) {
+ ln = listFirst(moduleUnblockedClients);
+ bc = ln->value;
+ client *c = bc->client;
+ listDelNode(moduleUnblockedClients,ln);
+ pthread_mutex_unlock(&moduleUnblockedClientsMutex);
+
+ /* Release the lock during the loop, as long as we don't
+ * touch the shared list. */
+
+ if (c != NULL && bc->reply_callback != NULL) {
+ RedisModuleCtx ctx = REDISMODULE_CTX_INIT;
+ ctx.flags |= REDISMODULE_CTX_BLOCKED_REPLY;
+ ctx.blocked_privdata = bc->privdata;
+ ctx.module = bc->module;
+ ctx.client = bc->client;
+ bc->reply_callback(&ctx,(void**)c->argv,c->argc);
+ moduleHandlePropagationAfterCommandCallback(&ctx);
+ moduleFreeContext(&ctx);
+ }
+ if (bc->privdata && bc->free_privdata)
+ bc->free_privdata(bc->privdata);
+ zfree(bc);
+ if (c != NULL) unblockClient(c);
+
+ /* Lock again before to iterate the loop. */
+ pthread_mutex_lock(&moduleUnblockedClientsMutex);
+ }
+ pthread_mutex_unlock(&moduleUnblockedClientsMutex);
+}
+
+/* Called when our client timed out. After this function, unblockClient()
+ * is called, which will invalidate the blocked client. So this function
+ * does not need to do any cleanup. Eventually the module will call the
+ * API to unblock the client and the memory will be released. */
+void moduleBlockedClientTimedOut(client *c) {
+ RedisModuleBlockedClient *bc = c->bpop.module_blocked_handle;
+ RedisModuleCtx ctx = REDISMODULE_CTX_INIT;
+ ctx.flags |= REDISMODULE_CTX_BLOCKED_TIMEOUT;
+ ctx.module = bc->module;
+ ctx.client = bc->client;
+ bc->timeout_callback(&ctx,(void**)c->argv,c->argc);
+ moduleFreeContext(&ctx);
+}
+
+/* Return non-zero if a module command was called in order to fill the
+ * reply for a blocked client. */
+int RM_IsBlockedReplyRequest(RedisModuleCtx *ctx) {
+ return (ctx->flags & REDISMODULE_CTX_BLOCKED_REPLY) != 0;
+}
+
+/* Return non-zero if a module command was called in order to fill the
+ * reply for a blocked client that timed out. */
+int RM_IsBlockedTimeoutRequest(RedisModuleCtx *ctx) {
+ return (ctx->flags & REDISMODULE_CTX_BLOCKED_TIMEOUT) != 0;
+}
+
+/* Get the private data set by RedisModule_UnblockClient() */
+void *RM_GetBlockedClientPrivateData(RedisModuleCtx *ctx) {
+ return ctx->blocked_privdata;
+}
+
+/* --------------------------------------------------------------------------
* Modules API internals
* -------------------------------------------------------------------------- */
@@ -2309,77 +3289,12 @@ int moduleRegisterApi(const char *funcname, void *funcptr) {
#define REGISTER_API(name) \
moduleRegisterApi("RedisModule_" #name, (void *)(unsigned long)RM_ ## name)
-/* Register all the APIs we export. */
-void moduleRegisterCoreAPI(void) {
- server.moduleapi = dictCreate(&moduleAPIDictType,NULL);
- REGISTER_API(CreateCommand);
- REGISTER_API(SetModuleAttribs);
- REGISTER_API(WrongArity);
- REGISTER_API(ReplyWithLongLong);
- REGISTER_API(ReplyWithError);
- REGISTER_API(ReplyWithSimpleString);
- REGISTER_API(ReplyWithArray);
- REGISTER_API(ReplySetArrayLength);
- REGISTER_API(ReplyWithString);
- REGISTER_API(ReplyWithStringBuffer);
- REGISTER_API(ReplyWithNull);
- REGISTER_API(ReplyWithCallReply);
- REGISTER_API(ReplyWithDouble);
- REGISTER_API(GetSelectedDb);
- REGISTER_API(SelectDb);
- REGISTER_API(OpenKey);
- REGISTER_API(CloseKey);
- REGISTER_API(KeyType);
- REGISTER_API(ValueLength);
- REGISTER_API(ListPush);
- REGISTER_API(ListPop);
- REGISTER_API(StringToLongLong);
- REGISTER_API(StringToDouble);
- REGISTER_API(Call);
- REGISTER_API(CallReplyProto);
- REGISTER_API(FreeCallReply);
- REGISTER_API(CallReplyInteger);
- REGISTER_API(CallReplyType);
- REGISTER_API(CallReplyLength);
- REGISTER_API(CallReplyArrayElement);
- REGISTER_API(CallReplyStringPtr);
- REGISTER_API(CreateStringFromCallReply);
- REGISTER_API(CreateString);
- REGISTER_API(CreateStringFromLongLong);
- REGISTER_API(FreeString);
- REGISTER_API(StringPtrLen);
- REGISTER_API(AutoMemory);
- REGISTER_API(Replicate);
- REGISTER_API(ReplicateVerbatim);
- REGISTER_API(DeleteKey);
- REGISTER_API(StringSet);
- REGISTER_API(StringDMA);
- REGISTER_API(StringTruncate);
- REGISTER_API(SetExpire);
- REGISTER_API(GetExpire);
- REGISTER_API(ZsetAdd);
- REGISTER_API(ZsetIncrby);
- REGISTER_API(ZsetScore);
- REGISTER_API(ZsetRem);
- REGISTER_API(ZsetRangeStop);
- REGISTER_API(ZsetFirstInScoreRange);
- REGISTER_API(ZsetLastInScoreRange);
- REGISTER_API(ZsetFirstInLexRange);
- REGISTER_API(ZsetLastInLexRange);
- REGISTER_API(ZsetRangeCurrentElement);
- REGISTER_API(ZsetRangeNext);
- REGISTER_API(ZsetRangePrev);
- REGISTER_API(ZsetRangeEndReached);
- REGISTER_API(HashSet);
- REGISTER_API(HashGet);
- REGISTER_API(IsKeysPositionRequest);
- REGISTER_API(KeyAtPos);
- REGISTER_API(GetClientId);
- REGISTER_API(PoolAlloc);
-}
-
/* Global initialization at Redis startup. */
+void moduleRegisterCoreAPI(void);
+
void moduleInitModulesSystem(void) {
+ moduleUnblockedClients = listCreate();
+
server.loadmodule_queue = listCreate();
modules = dictCreate(&modulesDictType,NULL);
moduleRegisterCoreAPI();
@@ -2400,25 +3315,28 @@ void moduleLoadFromQueue(void) {
listRewind(server.loadmodule_queue,&li);
while((ln = listNext(&li))) {
- sds modulepath = ln->value;
- if (moduleLoad(modulepath) == C_ERR) {
+ struct moduleLoadQueueEntry *loadmod = ln->value;
+ if (moduleLoad(loadmod->path,(void **)loadmod->argv,loadmod->argc)
+ == C_ERR)
+ {
serverLog(LL_WARNING,
"Can't load module from %s: server aborting",
- modulepath);
+ loadmod->path);
exit(1);
}
}
}
void moduleFreeModuleStructure(struct RedisModule *module) {
+ listRelease(module->types);
sdsfree(module->name);
zfree(module);
}
/* Load a module and initialize it. On success C_OK is returned, otherwise
* C_ERR is returned. */
-int moduleLoad(const char *path) {
- int (*onload)(void *);
+int moduleLoad(const char *path, void **module_argv, int module_argc) {
+ int (*onload)(void *, void **, int);
void *handle;
RedisModuleCtx ctx = REDISMODULE_CTX_INIT;
@@ -2427,14 +3345,14 @@ int moduleLoad(const char *path) {
serverLog(LL_WARNING, "Module %s failed to load: %s", path, dlerror());
return C_ERR;
}
- onload = (int (*)(void *))(unsigned long) dlsym(handle,"RedisModule_OnLoad");
+ onload = (int (*)(void *, void **, int))(unsigned long) dlsym(handle,"RedisModule_OnLoad");
if (onload == NULL) {
serverLog(LL_WARNING,
"Module %s does not export RedisModule_OnLoad() "
"symbol. Module not loaded.",path);
return C_ERR;
}
- if (onload((void*)&ctx) == REDISMODULE_ERR) {
+ if (onload((void*)&ctx,module_argv,module_argc) == REDISMODULE_ERR) {
if (ctx.module) moduleFreeModuleStructure(ctx.module);
dlclose(handle);
serverLog(LL_WARNING,
@@ -2446,6 +3364,7 @@ int moduleLoad(const char *path) {
dictAdd(modules,ctx.module->name,ctx.module);
ctx.module->handle = handle;
serverLog(LL_NOTICE,"Module '%s' loaded from %s",ctx.module->name,path);
+ moduleFreeContext(&ctx);
return C_OK;
}
@@ -2453,14 +3372,21 @@ int moduleLoad(const char *path) {
* C_OK is returned, otherwise C_ERR is returned and errno is set
* to the following values depending on the type of error:
*
- * ENONET: No such module having the specified name. */
+ * ENOENT: No such module having the specified name.
+ * EBUSY: The module exports a new data type and can only be reloaded. */
int moduleUnload(sds name) {
struct RedisModule *module = dictFetchValue(modules,name);
+
if (module == NULL) {
errno = ENOENT;
return REDISMODULE_ERR;
}
+ if (listLength(module->types)) {
+ errno = EBUSY;
+ return REDISMODULE_ERR;
+ }
+
/* Unregister all the commands registered by this module. */
dictIterator *di = dictGetSafeIterator(server.commands);
dictEntry *de;
@@ -2494,21 +3420,28 @@ int moduleUnload(sds name) {
/* Remove from list of modules. */
serverLog(LL_NOTICE,"Module %s unloaded",module->name);
dictDelete(modules,module->name);
-
- /* Free the module structure. */
- zfree(module);
+ module->name = NULL; /* The name was already freed by dictDelete(). */
+ moduleFreeModuleStructure(module);
return REDISMODULE_OK;
}
/* Redis MODULE command.
*
- * MODULE LOAD <path> */
+ * MODULE LOAD <path> [args...] */
void moduleCommand(client *c) {
char *subcmd = c->argv[1]->ptr;
- if (!strcasecmp(subcmd,"load") && c->argc == 3) {
- if (moduleLoad(c->argv[2]->ptr) == C_OK)
+ if (!strcasecmp(subcmd,"load") && c->argc >= 3) {
+ robj **argv = NULL;
+ int argc = 0;
+
+ if (c->argc > 3) {
+ argc = c->argc - 3;
+ argv = &c->argv[3];
+ }
+
+ if (moduleLoad(c->argv[2]->ptr,(void **)argv,argc) == C_OK)
addReply(c,shared.ok);
else
addReplyError(c,
@@ -2517,9 +3450,17 @@ void moduleCommand(client *c) {
if (moduleUnload(c->argv[2]->ptr) == C_OK)
addReply(c,shared.ok);
else {
- char *errmsg = "operation not possible.";
+ char *errmsg;
switch(errno) {
- case ENOENT: errmsg = "no such module with that name";
+ case ENOENT:
+ errmsg = "no such module with that name";
+ break;
+ case EBUSY:
+ errmsg = "the module exports one or more module-side data types, can't unload";
+ break;
+ default:
+ errmsg = "operation not possible.";
+ break;
}
addReplyErrorFormat(c,"Error unloading module: %s",errmsg);
}
@@ -2542,3 +3483,110 @@ void moduleCommand(client *c) {
addReply(c,shared.syntaxerr);
}
}
+
+/* Register all the APIs we export. Keep this function at the end of the
+ * file so that it's easy to seek to it when adding new entries. */
+void moduleRegisterCoreAPI(void) {
+ server.moduleapi = dictCreate(&moduleAPIDictType,NULL);
+ REGISTER_API(Alloc);
+ REGISTER_API(Calloc);
+ REGISTER_API(Realloc);
+ REGISTER_API(Free);
+ REGISTER_API(Strdup);
+ REGISTER_API(CreateCommand);
+ REGISTER_API(SetModuleAttribs);
+ REGISTER_API(WrongArity);
+ REGISTER_API(ReplyWithLongLong);
+ REGISTER_API(ReplyWithError);
+ REGISTER_API(ReplyWithSimpleString);
+ REGISTER_API(ReplyWithArray);
+ REGISTER_API(ReplySetArrayLength);
+ REGISTER_API(ReplyWithString);
+ REGISTER_API(ReplyWithStringBuffer);
+ REGISTER_API(ReplyWithNull);
+ REGISTER_API(ReplyWithCallReply);
+ REGISTER_API(ReplyWithDouble);
+ REGISTER_API(GetSelectedDb);
+ REGISTER_API(SelectDb);
+ REGISTER_API(OpenKey);
+ REGISTER_API(CloseKey);
+ REGISTER_API(KeyType);
+ REGISTER_API(ValueLength);
+ REGISTER_API(ListPush);
+ REGISTER_API(ListPop);
+ REGISTER_API(StringToLongLong);
+ REGISTER_API(StringToDouble);
+ REGISTER_API(Call);
+ REGISTER_API(CallReplyProto);
+ REGISTER_API(FreeCallReply);
+ REGISTER_API(CallReplyInteger);
+ REGISTER_API(CallReplyType);
+ REGISTER_API(CallReplyLength);
+ REGISTER_API(CallReplyArrayElement);
+ REGISTER_API(CallReplyStringPtr);
+ REGISTER_API(CreateStringFromCallReply);
+ REGISTER_API(CreateString);
+ REGISTER_API(CreateStringFromLongLong);
+ REGISTER_API(CreateStringFromString);
+ REGISTER_API(CreateStringPrintf);
+ REGISTER_API(FreeString);
+ REGISTER_API(StringPtrLen);
+ REGISTER_API(AutoMemory);
+ REGISTER_API(Replicate);
+ REGISTER_API(ReplicateVerbatim);
+ REGISTER_API(DeleteKey);
+ REGISTER_API(StringSet);
+ REGISTER_API(StringDMA);
+ REGISTER_API(StringTruncate);
+ REGISTER_API(SetExpire);
+ REGISTER_API(GetExpire);
+ REGISTER_API(ZsetAdd);
+ REGISTER_API(ZsetIncrby);
+ REGISTER_API(ZsetScore);
+ REGISTER_API(ZsetRem);
+ REGISTER_API(ZsetRangeStop);
+ REGISTER_API(ZsetFirstInScoreRange);
+ REGISTER_API(ZsetLastInScoreRange);
+ REGISTER_API(ZsetFirstInLexRange);
+ REGISTER_API(ZsetLastInLexRange);
+ REGISTER_API(ZsetRangeCurrentElement);
+ REGISTER_API(ZsetRangeNext);
+ REGISTER_API(ZsetRangePrev);
+ REGISTER_API(ZsetRangeEndReached);
+ REGISTER_API(HashSet);
+ REGISTER_API(HashGet);
+ REGISTER_API(IsKeysPositionRequest);
+ REGISTER_API(KeyAtPos);
+ REGISTER_API(GetClientId);
+ REGISTER_API(PoolAlloc);
+ REGISTER_API(CreateDataType);
+ REGISTER_API(ModuleTypeSetValue);
+ REGISTER_API(ModuleTypeGetType);
+ REGISTER_API(ModuleTypeGetValue);
+ REGISTER_API(SaveUnsigned);
+ REGISTER_API(LoadUnsigned);
+ REGISTER_API(SaveSigned);
+ REGISTER_API(LoadSigned);
+ REGISTER_API(SaveString);
+ REGISTER_API(SaveStringBuffer);
+ REGISTER_API(LoadString);
+ REGISTER_API(LoadStringBuffer);
+ REGISTER_API(SaveDouble);
+ REGISTER_API(LoadDouble);
+ REGISTER_API(SaveFloat);
+ REGISTER_API(LoadFloat);
+ REGISTER_API(EmitAOF);
+ REGISTER_API(Log);
+ REGISTER_API(LogIOError);
+ REGISTER_API(StringAppendBuffer);
+ REGISTER_API(RetainString);
+ REGISTER_API(StringCompare);
+ REGISTER_API(GetContextFromIO);
+ REGISTER_API(BlockClient);
+ REGISTER_API(UnblockClient);
+ REGISTER_API(IsBlockedReplyRequest);
+ REGISTER_API(IsBlockedTimeoutRequest);
+ REGISTER_API(GetBlockedClientPrivateData);
+ REGISTER_API(AbortBlock);
+ REGISTER_API(Milliseconds);
+}
diff --git a/src/modules/API.md b/src/modules/API.md
index 0ff303d0e..e90429e3b 100644
--- a/src/modules/API.md
+++ b/src/modules/API.md
@@ -1,5 +1,60 @@
# Modules API reference
+## `RM_Alloc`
+
+ void *RM_Alloc(size_t bytes);
+
+Use like malloc(). Memory allocated with this function is reported in
+Redis INFO memory, used for keys eviction according to maxmemory settings
+and in general is taken into account as memory allocated by Redis.
+You should avoid using malloc().
+
+## `RM_Calloc`
+
+ void *RM_Calloc(size_t nmemb, size_t size);
+
+Use like calloc(). Memory allocated with this function is reported in
+Redis INFO memory, used for keys eviction according to maxmemory settings
+and in general is taken into account as memory allocated by Redis.
+You should avoid using calloc() directly.
+
+## `RM_Realloc`
+
+ void* RM_Realloc(void *ptr, size_t bytes);
+
+Use like realloc() for memory obtained with `RedisModule_Alloc()`.
+
+## `RM_Free`
+
+ void RM_Free(void *ptr);
+
+Use like free() for memory obtained by `RedisModule_Alloc()` and
+`RedisModule_Realloc()`. However you should never try to free with
+`RedisModule_Free()` memory allocated with malloc() inside your module.
+
+## `RM_Strdup`
+
+ char *RM_Strdup(const char *str);
+
+Like strdup() but returns memory allocated with `RedisModule_Alloc()`.
+
+## `RM_PoolAlloc`
+
+ void *RM_PoolAlloc(RedisModuleCtx *ctx, size_t bytes);
+
+Return heap allocated memory that will be freed automatically when the
+module callback function returns. Mostly suitable for small allocations
+that are short lived and must be released when the callback returns
+anyway. The returned memory is aligned to the architecture word size
+if at least word size bytes are requested, otherwise it is just
+aligned to the next power of two, so for example a 3 bytes request is
+4 bytes aligned while a 2 bytes request is 2 bytes aligned.
+
+There is no realloc style function: when reallocation is needed, using the
+pool allocator is not a good idea in the first place.
+
+The function returns NULL if `bytes` is 0.
+
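+As a minimal usage sketch of the allocation family (the `mytype` structure
+and its helper functions are hypothetical names; only the `RedisModule_*`
+calls are part of the API described above):
+
+    struct mytype {
+        char *name; /* Owned copy, released with RedisModule_Free(). */
+    };
+
+    struct mytype *mytype_create(const char *name) {
+        struct mytype *o = RedisModule_Alloc(sizeof(*o));
+        o->name = RedisModule_Strdup(name); /* Accounted in INFO memory. */
+        return o;
+    }
+
+    void mytype_release(struct mytype *o) {
+        RedisModule_Free(o->name);
+        RedisModule_Free(o);
+    }
+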
## `RM_GetApi`
int RM_GetApi(const char *funcname, void **targetPtrPtr);
@@ -104,6 +159,12 @@ Called by `RM_Init()` to setup the `ctx->module` structure.
This is an internal function, Redis modules developers don't need
to use it.
+## `RM_Milliseconds`
+
+ long long RM_Milliseconds(void);
+
+Return the current UNIX time in milliseconds.
+
## `RM_AutoMemory`
void RM_AutoMemory(RedisModuleCtx *ctx);
@@ -123,6 +184,16 @@ with `RedisModule_FreeString()`, unless automatic memory is enabled.
The string is created by copying the `len` bytes starting
at `ptr`. No reference is retained to the passed buffer.
+## `RM_CreateStringPrintf`
+
+ RedisModuleString *RM_CreateStringPrintf(RedisModuleCtx *ctx, const char *fmt, ...);
+
+Create a new module string object from a printf format and arguments.
+The returned string must be freed with `RedisModule_FreeString()`, unless
+automatic memory is enabled.
+
+The string is created using the sds formatter function sdscatvprintf().
+
## `RM_CreateStringFromLongLong`
RedisModuleString *RM_CreateStringFromLongLong(RedisModuleCtx *ctx, long long ll);
@@ -133,6 +204,16 @@ integer instead of taking a buffer and its length.
The returned string must be released with `RedisModule_FreeString()` or by
enabling automatic memory management.
+## `RM_CreateStringFromString`
+
+ RedisModuleString *RM_CreateStringFromString(RedisModuleCtx *ctx, const RedisModuleString *str);
+
+Like `RedisModule_CreateString()`, but creates a string starting from another
+RedisModuleString.
+
+The returned string must be released with `RedisModule_FreeString()` or by
+enabling automatic memory management.
+
## `RM_FreeString`
void RM_FreeString(RedisModuleCtx *ctx, RedisModuleString *str);
@@ -144,9 +225,36 @@ It is possible to call this function even when automatic memory management
is enabled. In that case the string will be released ASAP and removed
from the pool of string to release at the end.
+## `RM_RetainString`
+
+ void RM_RetainString(RedisModuleCtx *ctx, RedisModuleString *str);
+
+Every call to this function will make the string 'str' require an
+additional call to `RedisModule_FreeString()` in order to really
+free the string. Note that the automatic freeing of the string obtained
+by enabling modules automatic memory management counts for one
+`RedisModule_FreeString()` call (it is just executed automatically).
+
+Normally you want to call this function when, at the same time,
+the following conditions are true:
+
+1) You have automatic memory management enabled.
+2) You want to create string objects.
+3) Those string objects you create need to live *after* the callback
+ function (for example a command implementation) creating them returns.
+
+Usually you want this in order to store the created string object
+into your own data structure, for example when implementing a new data
+type.
+
+Note that when memory management is turned off, you don't need
+any call to RetainString() since creating a string will always result
+in a string that lives after the callback function returns, if
+no FreeString() call is performed.
+
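+The following minimal sketch shows the pattern (the global `latest` pointer
+is just a hypothetical place where the module stores the string):
+
+    static RedisModuleString *latest = NULL;
+
+    int MyCommand_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+        RedisModule_AutoMemory(ctx);
+        RedisModuleString *s = RedisModule_CreateStringFromString(ctx,argv[1]);
+        /* Without the following call 's' would be released when this
+         * callback returns, since automatic memory management is on.
+         * A real module would also release the previously stored string. */
+        RedisModule_RetainString(ctx,s);
+        latest = s;
+        return RedisModule_ReplyWithSimpleString(ctx,"OK");
+    }
+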
## `RM_StringPtrLen`
- const char *RM_StringPtrLen(RedisModuleString *str, size_t *len);
+ const char *RM_StringPtrLen(const RedisModuleString *str, size_t *len);
Given a string module object, this function returns the string pointer
and length of the string. The returned pointer and length should only
@@ -154,7 +262,7 @@ be used for read only accesses and never modified.
## `RM_StringToLongLong`
- int RM_StringToLongLong(RedisModuleString *str, long long *ll);
+ int RM_StringToLongLong(const RedisModuleString *str, long long *ll);
Convert the string into a long long integer, storing it at `*ll`.
Returns `REDISMODULE_OK` on success. If the string can't be parsed
@@ -163,12 +271,28 @@ is returned.
## `RM_StringToDouble`
- int RM_StringToDouble(RedisModuleString *str, double *d);
+ int RM_StringToDouble(const RedisModuleString *str, double *d);
Convert the string into a double, storing it at `*d`.
Returns `REDISMODULE_OK` on success or `REDISMODULE_ERR` if the string is
not a valid string representation of a double value.
+## `RM_StringCompare`
+
+ int RM_StringCompare(RedisModuleString *a, RedisModuleString *b);
+
+Compare two string objects, returning -1, 0 or 1 respectively if
+a < b, a == b, a > b. Strings are compared byte by byte as two
+binary blobs without any encoding care / collation attempt.
+
+## `RM_StringAppendBuffer`
+
+ int RM_StringAppendBuffer(RedisModuleCtx *ctx, RedisModuleString *str, const char *buf, size_t len);
+
+Append the specified buffer to the string 'str'. The string must be a
+string created by the user that is referenced only a single time, otherwise
+`REDISMODULE_ERR` is returned and the operation is not performed.
+
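+For example, assuming `a` and `b` are strings created by the module itself
+(so that appending to `a` is allowed), a minimal sketch is:
+
+    RedisModuleString *a = RedisModule_CreateString(ctx,"foo",3);
+    RedisModuleString *b = RedisModule_CreateString(ctx,"foobar",6);
+    RedisModule_StringAppendBuffer(ctx,a,"bar",3);
+    /* Now RedisModule_StringCompare(a,b) returns 0. */
+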
## `RM_WrongArity`
int RM_WrongArity(RedisModuleCtx *ctx);
@@ -579,9 +703,9 @@ The output flags are:
On success the function returns `REDISMODULE_OK`. On the following errors
`REDISMODULE_ERR` is returned:
-- The key was not opened for writing.
-- The key is of the wrong type.
-- 'score' double value is not a number (NaN).
+* The key was not opened for writing.
+* The key is of the wrong type.
+* 'score' double value is not a number (NaN).
## `RM_ZsetIncrby`
@@ -609,8 +733,8 @@ Remove the specified element from the sorted set.
The function returns `REDISMODULE_OK` on success, and `REDISMODULE_ERR`
on one of the following conditions:
-- The key was not opened for writing.
-- The key is of the wrong type.
+* The key was not opened for writing.
+* The key is of the wrong type.
The return value does NOT indicate the fact the element was really
removed (since it existed) or not, just if the function was executed
@@ -632,9 +756,9 @@ On success retrieve the double score associated at the sorted set element
'ele' and returns `REDISMODULE_OK`. Otherwise `REDISMODULE_ERR` is returned
to signal one of the following conditions:
-- There is no such element 'ele' in the sorted set.
-- The key is not a sorted set.
-- The key is an open empty key.
+* There is no such element 'ele' in the sorted set.
+* The key is not a sorted set.
+* The key is an open empty key.
## `RM_ZsetRangeStop`
@@ -774,8 +898,8 @@ specified because of the XX or NX options).
In the following case the return value is always zero:
-- The key was not open for writing.
-- The key was associated with a non Hash value.
+* The key was not open for writing.
+* The key was associated with a non Hash value.
## `RM_HashGet`
@@ -893,3 +1017,313 @@ EPERM: operation in Cluster instance with key in non local slot.
Return a pointer, and a length, to the protocol returned by the command
that returned the reply object.
+## `RM_CreateDataType`
+
+ moduleType *RM_CreateDataType(RedisModuleCtx *ctx, const char *name, int encver, void *typemethods_ptr);
+
+Register a new data type exported by the module. The parameters are the
+following. Please for in depth documentation check the modules API
+documentation, especially the TYPES.md file.
+
+* **name**: A 9 characters data type name that MUST be unique in the Redis
+ Modules ecosystem. Be creative... and there will be no collisions. Use
+ the charset A-Z a-z 0-9, plus the two "-_" characters. A good
+ idea is to use, for example `<typename>-<vendor>`. For example
+ "tree-AntZ" may mean "Tree data structure by @antirez". To use both
+ lower case and upper case letters helps in order to prevent collisions.
+* **encver**: Encoding version, which is, the version of the serialization
+ that a module used in order to persist data. As long as the "name"
+ matches, the RDB loading will be dispatched to the type callbacks
+ whatever 'encver' is used, however the module can understand if
+ the encoding it must load is of an older version of the module.
+ For example the module "tree-AntZ" initially used encver=0. Later
+ after an upgrade, it started to serialize data in a different format
+ and to register the type with encver=1. However this module may
+ still load old data produced by an older version if the rdb_load
+ callback is able to check the encver value and act accordingly.
+ The encver must be a positive value between 0 and 1023.
+* **typemethods_ptr** is a pointer to a RedisModuleTypeMethods structure
+ that should be populated with the methods callbacks and structure
+ version, like in the following example:
+
+ RedisModuleTypeMethods tm = {
+ .version = REDISMODULE_TYPE_METHOD_VERSION,
+ .rdb_load = myType_RDBLoadCallBack,
+ .rdb_save = myType_RDBSaveCallBack,
+ .aof_rewrite = myType_AOFRewriteCallBack,
+ .free = myType_FreeCallBack,
+
+ // Optional fields
+ .digest = myType_DigestCallBack,
+ .mem_usage = myType_MemUsageCallBack,
+ };
+
+* **rdb_load**: A callback function pointer that loads data from RDB files.
+* **rdb_save**: A callback function pointer that saves data to RDB files.
+* **aof_rewrite**: A callback function pointer that rewrites data as commands.
+* **digest**: A callback function pointer that is used for `DEBUG DIGEST`.
+* **mem_usage**: A callback function pointer that is used for `MEMORY`.
+* **free**: A callback function pointer that can free a type value.
+
+The **digest** and **mem_usage** methods should currently be omitted since
+they are not yet implemented inside the Redis modules core.
+
+Note: the module name "AAAAAAAAA" is reserved and produces an error, it
+happens to be pretty lame as well.
+
+If there is already a module registering a type with the same name,
+or if the module name or encver is invalid, NULL is returned.
+Otherwise the new type is registered into Redis, and a reference of
+type RedisModuleType is returned: the caller of the function should store
+this reference into a global variable to make future use of it in the
+modules type API, since a single module may register multiple types.
+Example code fragment:
+
+    static RedisModuleType *BalancedTreeType;
+
+    int RedisModule_OnLoad(RedisModuleCtx *ctx) {
+        // some code here ...
+        BalancedTreeType = RedisModule_CreateDataType(...);
+    }
+
+## `RM_ModuleTypeSetValue`
+
+ int RM_ModuleTypeSetValue(RedisModuleKey *key, moduleType *mt, void *value);
+
+If the key is open for writing, set the specified module type object
+as the value of the key, deleting the old value if any.
+On success `REDISMODULE_OK` is returned. If the key is not open for
+writing or there is an active iterator, `REDISMODULE_ERR` is returned.
+
+## `RM_ModuleTypeGetType`
+
+ moduleType *RM_ModuleTypeGetType(RedisModuleKey *key);
+
+Assuming `RedisModule_KeyType()` returned `REDISMODULE_KEYTYPE_MODULE` on
+the key, returns the module type pointer of the value stored at key.
+
+If the key is NULL, is not associated with a module type, or is empty,
+then NULL is returned instead.
+
+## `RM_ModuleTypeGetValue`
+
+ void *RM_ModuleTypeGetValue(RedisModuleKey *key);
+
+Assuming `RedisModule_KeyType()` returned `REDISMODULE_KEYTYPE_MODULE` on
+the key, returns the module type low-level value stored at key, as
+it was set by the user via `RedisModule_ModuleTypeSetValue()`.
+
+If the key is NULL, is not associated with a module type, or is empty,
+then NULL is returned instead.
+
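+A minimal sketch of how these functions are usually combined inside a
+command implementation follows; `MyType`, `mytype_create()` and
+`struct mytype` are hypothetical names registered and defined elsewhere
+by the module:
+
+    RedisModuleKey *key = RedisModule_OpenKey(ctx,argv[1],
+        REDISMODULE_READ|REDISMODULE_WRITE);
+    int type = RedisModule_KeyType(key);
+    if (type == REDISMODULE_KEYTYPE_EMPTY) {
+        /* Create a new value and associate it with the key. */
+        struct mytype *o = mytype_create();
+        RedisModule_ModuleTypeSetValue(key,MyType,o);
+    } else if (RedisModule_ModuleTypeGetType(key) == MyType) {
+        /* Reuse the existing value. */
+        struct mytype *o = RedisModule_ModuleTypeGetValue(key);
+        /* ... operate on 'o' ... */
+    } else {
+        return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE);
+    }
+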
+## `RM_SaveUnsigned`
+
+ void RM_SaveUnsigned(RedisModuleIO *io, uint64_t value);
+
+Save an unsigned 64 bit value into the RDB file. This function should only
+be called in the context of the rdb_save method of modules implementing new
+data types.
+
+## `RM_LoadUnsigned`
+
+ uint64_t RM_LoadUnsigned(RedisModuleIO *io);
+
+Load an unsigned 64 bit value from the RDB file. This function should only
+be called in the context of the rdb_load method of modules implementing
+new data types.
+
+## `RM_SaveSigned`
+
+ void RM_SaveSigned(RedisModuleIO *io, int64_t value);
+
+Like `RedisModule_SaveUnsigned()` but for signed 64 bit values.
+
+## `RM_LoadSigned`
+
+ int64_t RM_LoadSigned(RedisModuleIO *io);
+
+Like `RedisModule_LoadUnsigned()` but for signed 64 bit values.
+
+## `RM_SaveString`
+
+ void RM_SaveString(RedisModuleIO *io, RedisModuleString *s);
+
+In the context of the rdb_save method of a module type, saves a
+string into the RDB file taking as input a RedisModuleString.
+
+The string can be later loaded with `RedisModule_LoadString()` or
+other Load family functions expecting a serialized string inside
+the RDB file.
+
+## `RM_SaveStringBuffer`
+
+ void RM_SaveStringBuffer(RedisModuleIO *io, const char *str, size_t len);
+
+Like `RedisModule_SaveString()` but takes a raw C pointer and length
+as input.
+
+## `RM_LoadString`
+
+ RedisModuleString *RM_LoadString(RedisModuleIO *io);
+
+In the context of the rdb_load method of a module data type, loads a string
+from the RDB file, that was previously saved with `RedisModule_SaveString()`
+functions family.
+
+The returned string is a newly allocated RedisModuleString object, and
+the user should at some point free it with a call to `RedisModule_FreeString()`.
+
+If the data structure does not store strings as RedisModuleString objects,
+the similar function `RedisModule_LoadStringBuffer()` could be used instead.
+
+## `RM_LoadStringBuffer`
+
+ char *RM_LoadStringBuffer(RedisModuleIO *io, size_t *lenptr);
+
+Like `RedisModule_LoadString()` but returns a heap allocated string that
+was allocated with `RedisModule_Alloc()`, and can be resized or freed with
+`RedisModule_Realloc()` or `RedisModule_Free()`.
+
+The size of the string is stored at '*lenptr' if not NULL.
+The returned string is not automatically NULL terminated: it is loaded
+exactly as it was stored inside the RDB file.
+
+## `RM_SaveDouble`
+
+ void RM_SaveDouble(RedisModuleIO *io, double value);
+
+In the context of the rdb_save method of a module data type, saves a double
+value to the RDB file. The double can be a valid number, a NaN or infinity.
+It is possible to load back the value with `RedisModule_LoadDouble()`.
+
+## `RM_LoadDouble`
+
+ double RM_LoadDouble(RedisModuleIO *io);
+
+In the context of the rdb_load method of a module data type, loads back the
+double value saved by `RedisModule_SaveDouble()`.
+
+## `RM_SaveFloat`
+
+ void RM_SaveFloat(RedisModuleIO *io, float value);
+
+In the context of the rdb_save method of a module data type, saves a float
+value to the RDB file. The float can be a valid number, a NaN or infinity.
+It is possible to load back the value with `RedisModule_LoadFloat()`.
+
+## `RM_LoadFloat`
+
+ float RM_LoadFloat(RedisModuleIO *io);
+
+In the context of the rdb_load method of a module data type, loads back the
+float value saved by `RedisModule_SaveFloat()`.
+
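+As a sketch, a type storing a counter and a name could implement its RDB
+callbacks as follows (the `struct mytype` layout is a hypothetical example):
+
+    void MyTypeRDBSave(RedisModuleIO *io, void *value) {
+        struct mytype *o = value;
+        RedisModule_SaveUnsigned(io,o->counter);
+        /* Save the trailing NUL byte too, so that the loaded buffer can be
+         * used directly as a C string. */
+        RedisModule_SaveStringBuffer(io,o->name,strlen(o->name)+1);
+    }
+
+    void *MyTypeRDBLoad(RedisModuleIO *io, int encver) {
+        if (encver != 0) return NULL; /* Unknown encoding version. */
+        struct mytype *o = RedisModule_Alloc(sizeof(*o));
+        o->counter = RedisModule_LoadUnsigned(io);
+        o->name = RedisModule_LoadStringBuffer(io,NULL);
+        return o;
+    }
+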
+## `RM_EmitAOF`
+
+ void RM_EmitAOF(RedisModuleIO *io, const char *cmdname, const char *fmt, ...);
+
+Emits a command into the AOF during the AOF rewriting process. This function
+is only called in the context of the aof_rewrite method of data types exported
+by a module. The command works exactly like `RedisModule_Call()` in the way
+the parameters are passed, but it does not return anything as the error
+handling is performed by Redis itself.
+
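+For example, an aof_rewrite callback could rewrite a hypothetical counter
+type as a single `MYTYPE.INCRBY` command (the command name and the
+`struct mytype` layout are just assumptions for this sketch):
+
+    void MyTypeAOFRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value) {
+        struct mytype *o = value;
+        RedisModule_EmitAOF(aof,"MYTYPE.INCRBY","sl",key,(long long)o->counter);
+    }
+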
+## `RM_LogRaw`
+
+ void RM_LogRaw(RedisModule *module, const char *levelstr, const char *fmt, va_list ap);
+
+This is the low level function implementing both:
+
+ `RM_Log()`
+ `RM_LogIOError()`
+
+## `RM_Log`
+
+ void RM_Log(RedisModuleCtx *ctx, const char *levelstr, const char *fmt, ...);
+
+Produces a log message to the standard Redis log, the format accepts
+printf-alike specifiers, while level is a string describing the log
+level to use when emitting the log, and must be one of the following:
+
+* "debug"
+* "verbose"
+* "notice"
+* "warning"
+
+If the specified log level is invalid, verbose is used by default.
+There is a fixed limit to the length of the log line this function is able
+to emit; this limit is not specified, but it is guaranteed to be more than
+a few lines of text.
+
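+For example (assuming `buf` is a C string available in the calling scope):
+
+    RedisModule_Log(ctx,"warning","Unable to parse '%s' as an integer",buf);
+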
+## `RM_LogIOError`
+
+ void RM_LogIOError(RedisModuleIO *io, const char *levelstr, const char *fmt, ...);
+
+Log errors from RDB / AOF serialization callbacks.
+
+This function should be used when a callback is returning a critical
+error to the caller since it cannot load or save the data for some
+critical reason.
+
+## `RM_BlockClient`
+
+ RedisModuleBlockedClient *RM_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc reply_callback, RedisModuleCmdFunc timeout_callback, void (*free_privdata)(void*), long long timeout_ms);
+
+Block a client in the context of a blocking command, returning a handle
+which will be used, later, in order to unblock the client with a call to
+`RedisModule_UnblockClient()`. The arguments specify callback functions
+and a timeout after which the client is unblocked.
+
+The callbacks are called in the following contexts:
+
+reply_callback: called after a successful `RedisModule_UnblockClient()` call
+ in order to reply to the client and unblock it.
+timeout_callback: called when the timeout is reached in order to send an
+ error to the client.
+free_privdata: called in order to free the private data that is passed
+ by the `RedisModule_UnblockClient()` call.
+
+## `RM_UnblockClient`
+
+ int RM_UnblockClient(RedisModuleBlockedClient *bc, void *privdata);
+
+Unblock a client blocked by `RedisModule_BlockClient()`. This will trigger
+the reply callbacks to be called in order to reply to the client.
+The 'privdata' argument will be accessible by the reply callback, so
+the caller of this function can pass any value that is needed in order to
+actually reply to the client.
+
+A common usage for 'privdata' is a thread that computes something that
+needs to be passed to the client, including but not limited to some slow
+to compute reply or some reply obtained via networking.
+
+Note: this function can be called from threads spawned by the module.
+
+## `RM_AbortBlock`
+
+ int RM_AbortBlock(RedisModuleBlockedClient *bc);
+
+Abort a blocked client blocking operation: the client will be unblocked
+without firing the reply callback.
+
+## `RM_IsBlockedReplyRequest`
+
+ int RM_IsBlockedReplyRequest(RedisModuleCtx *ctx);
+
+Return non-zero if a module command was called in order to fill the
+reply for a blocked client.
+
+## `RM_IsBlockedTimeoutRequest`
+
+ int RM_IsBlockedTimeoutRequest(RedisModuleCtx *ctx);
+
+Return non-zero if a module command was called in order to fill the
+reply for a blocked client that timed out.
+
+## `RM_GetBlockedClientPrivateData`
+
+ void *RM_GetBlockedClientPrivateData(RedisModuleCtx *ctx);
+
+Get the private data set by `RedisModule_UnblockClient()`.
+
diff --git a/src/modules/BLOCK.md b/src/modules/BLOCK.md
new file mode 100644
index 000000000..d4f3c93bc
--- /dev/null
+++ b/src/modules/BLOCK.md
@@ -0,0 +1,265 @@
+Blocking commands in Redis modules
+===
+
+Redis has a few blocking commands among the built-in set of commands.
+One of the most used is `BLPOP` (or the symmetric `BRPOP`) which blocks
+waiting for elements arriving in a list.
+
+The interesting fact about blocking commands is that they do not block
+the whole server, but just the client calling them. Usually the reason to
+block is that we expect some external event to happen: this can be
+some change in the Redis data structures like in the `BLPOP` case, a
+long computation happening in a thread, some data arriving from the
+network, and so forth.
+
+Redis modules have the ability to implement blocking commands as well.
+This documentation shows how the API works and describes a few patterns
+that can be used in order to model blocking commands.
+
+How blocking and resuming works.
+---
+
+_Note: You may want to check the `helloblock.c` example in the Redis source tree
+inside the `src/modules` directory, for a simple to understand example
+on how the blocking API is applied._
+
+In Redis modules, commands are implemented by callback functions that
+are invoked by the Redis core when the specific command is called
+by the user. Normally the callback terminates its execution sending
+some reply to the client. Using the following function instead, the
+function implementing the module command may request that the client
+is put into the blocked state:
+
+ RedisModuleBlockedClient *RedisModule_BlockClient(RedisModuleCtx *ctx, RedisModuleCmdFunc reply_callback, RedisModuleCmdFunc timeout_callback, void (*free_privdata)(void*), long long timeout_ms);
+
+The function returns a `RedisModuleBlockedClient` object, which is later
+used in order to unblock the client. The arguments have the following
+meaning:
+
+* `ctx` is the command execution context as usually in the rest of the API.
+* `reply_callback` is the callback, having the same prototype as a normal command function, that is called when the client is unblocked in order to return a reply to the client.
+* `timeout_callback` is the callback, having the same prototype as a normal command function, that is called when the client reaches the `ms` timeout.
+* `free_privdata` is the callback that is called in order to free the private data. Private data is a pointer to some data that is passed between the API used to unblock the client, to the callback that will send the reply to the client. We'll see how this mechanism works later in this document.
+* `ms` is the timeout in milliseconds. When the timeout is reached, the timeout callback is called and the client is automatically aborted.
+
+Once a client is blocked, it can be unblocked with the following API:
+
+ int RedisModule_UnblockClient(RedisModuleBlockedClient *bc, void *privdata);
+
+The function takes as argument the blocked client object returned by
+the previous call to `RedisModule_BlockClient()`, and unblocks the client.
+Immediately before the client gets unblocked, the `reply_callback` function
+specified when the client was blocked is called: this function will
+have access to the `privdata` pointer used here.
+
+IMPORTANT: The above function is thread safe, and can be called from within
+a thread doing some work in order to implement the command that blocked
+the client.
+
+The `privdata` data will be freed automatically using the `free_privdata`
+callback when the client is unblocked. This is useful **since the reply
+callback may never be called** in case the client times out or disconnects
+from the server, so it's important that an external function has the
+responsibility to free the data passed if needed.
+
+To better understand how the API works, we can imagine writing a command
+that blocks a client for one second, and then sends "Hello!" as a reply.
+
+Note: arity checks and other non essential details are not implemented
+in this command, in order to keep the example simple.
+
+ int Example_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ RedisModuleBlockedClient *bc =
+ RedisModule_BlockClient(ctx,reply_func,timeout_func,NULL,0);
+
+ pthread_t tid;
+ pthread_create(&tid,NULL,threadmain,bc);
+
+ return REDISMODULE_OK;
+ }
+
+ void *threadmain(void *arg) {
+ RedisModuleBlockedClient *bc = arg;
+
+ sleep(1); /* Wait one second and unblock. */
+ RedisModule_UnblockClient(bc,NULL);
+ }
+
+The above command blocks the client ASAP, spawning a thread that will
+wait a second and will unblock the client. Let's check the reply and
+timeout callbacks, which are in our case very similar, since they
+just reply to the client with a different reply type.
+
+ int reply_func(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ return RedisModule_ReplyWithSimpleString(ctx,"Hello!");
+ }
+
+ int timeout_func(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ return RedisModule_ReplyWithNull(ctx);
+ }
+
+The reply callback just sends the "Hello!" string to the client.
+The important bit here is that the reply callback is called when the
+client is unblocked from the thread.
+
+The timeout callback replies with a NULL value, as it often happens with
+actual Redis blocking commands timing out.
+
+Passing reply data when unblocking
+---
+
+The above example is simple to understand but lacks an important
+real world aspect of an actual blocking command implementation: often
+the reply function will need to know what to reply to the client,
+and this information is often provided as the client is unblocked.
+
+We could modify the above example so that the thread generates a
+random number after waiting one second. You can think of it as an
+actually expensive operation of some kind. Then this random number
+can be passed to the reply function so that we return it to the command
+caller. In order to make this work, we modify the functions as follows:
+
+ void *threadmain(void *arg) {
+ RedisModuleBlockedClient *bc = arg;
+
+ sleep(1); /* Wait one second and unblock. */
+
+ long *mynumber = RedisModule_Alloc(sizeof(long));
+ *mynumber = rand();
+ RedisModule_UnblockClient(bc,mynumber);
+ }
+
+As you can see, now the unblocking call is passing some private data,
+that is the `mynumber` pointer, to the reply callback. In order to
+obtain this private data, the reply callback will use the following
+function:
+
+ void *RedisModule_GetBlockedClientPrivateData(RedisModuleCtx *ctx);
+
+So our reply callback is modified like that:
+
+ int reply_func(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ long *mynumber = RedisModule_GetBlockedClientPrivateData(ctx);
+ /* IMPORTANT: don't free mynumber here, but in the
+ * free privdata callback. */
+ return RedisModule_ReplyWithLongLong(ctx,*mynumber);
+ }
+
+Note that we also need to pass a `free_privdata` function when blocking
+the client with `RedisModule_BlockClient()`, since the allocated
+long value must be freed. Our callback will look like the following:
+
+ void free_privdata(void *privdata) {
+ RedisModule_Free(privdata);
+ }
+
+NOTE: It is important to stress that the private data is best freed in the
+`free_privdata` callback because the reply function may not be called
+if the client disconnects or times out.
+
+Also note that the private data is also accessible from the timeout
+callback, always using the `GetBlockedClientPrivateData()` API.
+
+Aborting the blocking of a client
+---
+
+One problem that sometimes arises is that we need to allocate resources
+in order to implement the blocking command. So we block the client,
+then, for example, try to create a thread, but the thread creation function
+returns an error. What to do in such a condition in order to recover? We
+don't want to leave the client blocked, nor do we want to call `UnblockClient()`
+because this will trigger the reply callback to be called.
+
+In this case the best thing to do is to use the following function:
+
+ int RedisModule_AbortBlock(RedisModuleBlockedClient *bc);
+
+Practically this is how to use it:
+
+ int Example_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ RedisModuleBlockedClient *bc =
+ RedisModule_BlockClient(ctx,reply_func,timeout_func,NULL,0);
+
+ pthread_t tid;
+ if (pthread_create(&tid,NULL,threadmain,bc) != 0) {
+ RedisModule_AbortBlock(bc);
+ RedisModule_ReplyWithError(ctx,"Sorry can't create a thread");
+ }
+
+ return REDISMODULE_OK;
+ }
+
+The client will be unblocked but the reply callback will not be called.
+
+Implementing the command, reply and timeout callback using a single function
+---
+
+The following functions can be used in order to implement the reply and
+callback with the same function that implements the primary command
+function:
+
+ int RedisModule_IsBlockedReplyRequest(RedisModuleCtx *ctx);
+ int RedisModule_IsBlockedTimeoutRequest(RedisModuleCtx *ctx);
+
+So I could rewrite the example command without using separate
+reply and timeout callbacks:
+
+ int Example_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
+ int argc)
+ {
+ if (RedisModule_IsBlockedReplyRequest(ctx)) {
+ long *mynumber = RedisModule_GetBlockedClientPrivateData(ctx);
+ return RedisModule_ReplyWithLongLong(ctx,*mynumber);
+ } else if (RedisModule_IsBlockedTimeoutRequest(ctx)) {
+ return RedisModule_ReplyWithNull(ctx);
+ }
+
+ RedisModuleBlockedClient *bc =
+ RedisModule_BlockClient(ctx,reply_func,timeout_func,NULL,0);
+
+ pthread_t tid;
+ if (pthread_create(&tid,NULL,threadmain,bc) != 0) {
+ RedisModule_AbortBlock(bc);
+ RedisModule_ReplyWithError(ctx,"Sorry can't create a thread");
+ }
+
+ return REDISMODULE_OK;
+ }
+
+Functionally it is the same, but some people will prefer the less
+verbose implementation that concentrates most of the command logic in a
+single function.
+
+Working on copies of data inside a thread
+---
+
+An interesting pattern in order to work with threads implementing the
+slow part of a command, is to work with a copy of the data, so that
+while some operation is performed on a key, the user continues to see
+the old version. However when the thread terminates its work, the
+representations are swapped and the new, processed version, is used.
+
+An example of this approach is the
+[Neural Redis module](https://github.com/antirez/neural-redis)
+where neural networks are trained in different threads while the
+user can still execute and inspect their older versions.
+
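+A minimal sketch of the pattern follows. Everything prefixed with `mydata`
+is a hypothetical module-side data structure, and `db` is a hypothetical
+global holding the current version; error and timeout handling are omitted
+for brevity:
+
+    struct job {
+        RedisModuleBlockedClient *bc;
+        mydata *copy;
+    };
+
+    int MyCommand_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
+                               int argc)
+    {
+        struct job *j = RedisModule_Alloc(sizeof(*j));
+        j->bc = RedisModule_BlockClient(ctx,reply_func,timeout_func,NULL,0);
+        j->copy = mydata_duplicate(db); /* Copy made in the main thread. */
+
+        pthread_t tid;
+        if (pthread_create(&tid,NULL,threadmain,j) != 0) {
+            RedisModule_AbortBlock(j->bc);
+            mydata_free(j->copy);
+            RedisModule_Free(j);
+            return RedisModule_ReplyWithError(ctx,"Sorry can't create a thread");
+        }
+        return REDISMODULE_OK;
+    }
+
+    void *threadmain(void *arg) {
+        struct job *j = arg;
+        mydata_long_computation(j->copy); /* Slow work on the private copy. */
+        RedisModule_UnblockClient(j->bc,j); /* Hand the new version back. */
+        return NULL;
+    }
+
+    int reply_func(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+        struct job *j = RedisModule_GetBlockedClientPrivateData(ctx);
+        mydata *old = db;
+        db = j->copy; /* The swap happens in the main Redis thread. */
+        mydata_free(old);
+        RedisModule_Free(j);
+        return RedisModule_ReplyWithSimpleString(ctx,"OK");
+    }
+
+A real module would also pass a `free_privdata` callback to
+`RedisModule_BlockClient()`, so that the job and the copy are released if
+the client disconnects or times out before the reply callback runs.
+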
+Future work
+---
+
+An API is work in progress right now in order to allow Redis modules APIs
+to be called in a safe way from threads, so that the threaded command
+can access the data space and do incremental operations.
+
+There is no ETA for this feature but it may appear in the course of the
+Redis 4.0 release at some point.
diff --git a/src/modules/INTRO.md b/src/modules/INTRO.md
index c64a50078..3ac6a4673 100644
--- a/src/modules/INTRO.md
+++ b/src/modules/INTRO.md
@@ -1,6 +1,13 @@
-Redis Modules API reference manual
+Redis Modules: an introduction to the API
===
+The modules documentation is composed of the following files:
+
+* `INTRO.md` (this file). An overview about Redis Modules system and API. It's a good idea to start your reading here.
+* `API.md` is generated from module.c top comments of RedisModule functions. It is a good reference in order to understand how each function works.
+* `TYPES.md` covers the implementation of native data types into modules.
+* `BLOCK.md` shows how to write blocking commands that will not reply immediately, but will block the client, without blocking the Redis server, and will provide a reply whenever it will be possible.
+
Redis modules make possible to extend Redis functionality using external
modules, implementing new Redis commands at a speed and with features
similar to what can be done inside the core itself.
@@ -59,7 +66,7 @@ simple module that implements a command that outputs a random number.
return REDISMODULE_OK;
}
- int RedisModule_OnLoad(RedisModuleCtx *ctx) {
+ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
if (RedisModule_Init(ctx,"helloworld",1,REDISMODULE_APIVER_1)
== REDISMODULE_ERR) return REDISMODULE_ERR;
@@ -150,6 +157,24 @@ exported.
The module will be able to load into different versions of Redis.
+# Passing configuration parameters to Redis modules
+
+When the module is loaded with the `MODULE LOAD` command, or using the
+`loadmodule` directive in the `redis.conf` file, the user is able to pass
+configuration parameters to the module by adding arguments after the module
+file name:
+
+ loadmodule mymodule.so foo bar 1234
+
+In the above example the strings `foo`, `bar` and `1234` will be passed
+to the module `OnLoad()` function in the `argv` argument as an array
+of RedisModuleString pointers. The number of arguments passed is in `argc`.
+
+The way you can access those strings will be explained in the rest of this
+document. Normally the module will store the module configuration parameters
+in some `static` global variable that can be accessed module wide, so that
+the configuration can change the behavior of different commands.
+
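+For example, a module could accept an optional numeric parameter and store
+it in a global variable, along the lines of the following sketch (the
+module name and the `mymodule_max_size` variable are hypothetical):
+
+    static long long mymodule_max_size = 1024; /* Default value. */
+
+    int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+        if (RedisModule_Init(ctx,"mymodule",1,REDISMODULE_APIVER_1)
+            == REDISMODULE_ERR) return REDISMODULE_ERR;
+
+        /* argv[0] is the first argument after the module path. */
+        if (argc > 0 &&
+            RedisModule_StringToLongLong(argv[0],&mymodule_max_size)
+            == REDISMODULE_ERR)
+        {
+            return REDISMODULE_ERR; /* Not a valid number. */
+        }
+        return REDISMODULE_OK;
+    }
+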
# Working with RedisModuleString objects
The command argument vector `argv` passed to module commands, and the
@@ -162,7 +187,7 @@ There are a few functions in order to work with string objects:
const char *RedisModule_StringPtrLen(RedisModuleString *string, size_t *len);
-The above function accesses a string by returning its pointer and setting its
+The above function accesses a string by returning its pointer and setting its
length in `len`.
You should never write to a string object pointer, as you can see from the
`const` pointer qualifier.
@@ -344,7 +369,7 @@ section).
# Releasing call reply objects
-Reply objects must be freed using `RedisModule_FreeCallRelpy`. For arrays,
+Reply objects must be freed using `RedisModule_FreeCallReply`. For arrays,
you need to free only the top level reply, not the nested replies.
Currently the module implementation provides a protection in order to avoid
crashing if you free a nested reply object for error, however this feature
@@ -623,7 +648,7 @@ access) for speed. The API will return a pointer and a length, so that's
possible to access and, if needed, modify the string directly.
size_t len, j;
- char *myptr = RedisModule_StringDMA(key,REDISMODULE_WRITE,&len);
+ char *myptr = RedisModule_StringDMA(key,&len,REDISMODULE_WRITE);
for (j = 0; j < len; j++) myptr[j] = 'A';
In the above example we write directly on the string. Note that if you want
@@ -777,10 +802,56 @@ Automatic memory management is usually the way to go, however experienced
C programmers may not use it in order to gain some speed and memory usage
benefit.
+# Allocating memory into modules
+
+Normal C programs use `malloc()` and `free()` in order to allocate and
+release memory dynamically. While in Redis modules the use of malloc is
+not technically forbidden, it is a lot better to use the Redis Modules
+specific functions, that are exact replacements for `malloc`, `free`,
+`realloc` and `strdup`. These functions are:
+
+ void *RedisModule_Alloc(size_t bytes);
+ void* RedisModule_Realloc(void *ptr, size_t bytes);
+ void RedisModule_Free(void *ptr);
+ void RedisModule_Calloc(size_t nmemb, size_t size);
+ char *RedisModule_Strdup(const char *str);
+
+They work exactly like their `libc` equivalent calls, however they use
+the same allocator Redis uses, and the memory allocated using these
+functions is reported by the `INFO` command in the memory section, is
+accounted when enforcing the `maxmemory` policy, and in general is
+a first class citizen of the Redis executable. On the contrary, the memory
+allocated inside modules with libc `malloc()` is invisible to Redis.
+
+Another reason to use the modules functions in order to allocate memory
+is that, when creating native data types inside modules, the RDB loading
+functions can return deserialized strings (from the RDB file) directly
+as `RedisModule_Alloc()` allocations, so they can be used directly to
+populate data structures after loading, instead of having to copy them
+to the data structure.
+
+## Pool allocator
+
+Sometimes in command implementations, it is required to perform many
+small allocations that will not be retained at the end of the command
+execution, but are just functional to execute the command itself.
+
+This work can be more easily accomplished using the Redis pool allocator:
+
+ void *RedisModule_PoolAlloc(RedisModuleCtx *ctx, size_t bytes);
+
+It works similarly to `malloc()`, and returns memory aligned to the
+next power of two greater than or equal to `bytes` (for a maximum alignment
+of 8 bytes). However it allocates memory in blocks, so the overhead
+of the allocations is small, and more importantly, the memory allocated
+is automatically released when the command returns.
+
+So in general, short lived allocations are good candidates for the pool
+allocator.
+
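+The following sketch shows the intended usage pattern: a temporary array
+needed only for the duration of the command is obtained from the pool, so
+no explicit free is required before returning (the command itself is a
+hypothetical example):
+
+    int MyCommand_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+        /* One temporary slot per argument, released automatically when
+         * this callback returns. */
+        size_t *lengths = RedisModule_PoolAlloc(ctx,sizeof(size_t)*argc);
+        size_t total = 0;
+        int j;
+
+        for (j = 0; j < argc; j++) {
+            RedisModule_StringPtrLen(argv[j],&lengths[j]);
+            total += lengths[j];
+        }
+        return RedisModule_ReplyWithLongLong(ctx,total);
+    }
+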
# Writing commands compatible with Redis Cluster
Documentation missing, please check the following functions inside `module.c`:
RedisModule_IsKeysPositionRequest(ctx);
RedisModule_KeyAtPos(ctx,pos);
-
diff --git a/src/modules/Makefile b/src/modules/Makefile
index 0c91361a1..066e65e9b 100644
--- a/src/modules/Makefile
+++ b/src/modules/Makefile
@@ -4,16 +4,16 @@ uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
# Compile flags for linux / osx
ifeq ($(uname_S),Linux)
- SHOBJ_CFLAGS ?= -fno-common -g -ggdb
+ SHOBJ_CFLAGS ?= -W -Wall -fno-common -g -ggdb -std=c99 -O2
SHOBJ_LDFLAGS ?= -shared
else
- SHOBJ_CFLAGS ?= -dynamic -fno-common -g -ggdb
+ SHOBJ_CFLAGS ?= -W -Wall -dynamic -fno-common -g -ggdb -std=c99 -O2
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
endif
.SUFFIXES: .c .so .xo .o
-all: helloworld.so
+all: helloworld.so hellotype.so helloblock.so testmodule.so
.c.xo:
$(CC) -I. $(CFLAGS) $(SHOBJ_CFLAGS) -fPIC -c $< -o $@
@@ -23,5 +23,20 @@ helloworld.xo: ../redismodule.h
helloworld.so: helloworld.xo
$(LD) -o $@ $< $(SHOBJ_LDFLAGS) $(LIBS) -lc
+hellotype.xo: ../redismodule.h
+
+hellotype.so: hellotype.xo
+ $(LD) -o $@ $< $(SHOBJ_LDFLAGS) $(LIBS) -lc
+
+helloblock.xo: ../redismodule.h
+
+helloblock.so: helloblock.xo
+ $(LD) -o $@ $< $(SHOBJ_LDFLAGS) $(LIBS) -lpthread -lc
+
+testmodule.xo: ../redismodule.h
+
+testmodule.so: testmodule.xo
+ $(LD) -o $@ $< $(SHOBJ_LDFLAGS) $(LIBS) -lc
+
clean:
rm -rf *.xo *.so
diff --git a/src/modules/TYPES.md b/src/modules/TYPES.md
new file mode 100644
index 000000000..4d497356a
--- /dev/null
+++ b/src/modules/TYPES.md
@@ -0,0 +1,379 @@
+Native types in Redis modules
+===
+
+Redis modules can access Redis built-in data structures both at high level,
+by calling Redis commands, and at low level, by manipulating the data structures
+directly.
+
+By using these capabilities in order to build new abstractions on top of existing
+Redis data structures, or by using strings DMA in order to encode modules
+data structures into Redis strings, it is possible to create modules that
+*feel like* they are exporting new data types. However, for more complex
+problems, this is not enough, and the implementation of new data structures
+inside the module is needed.
+
+We call the ability of Redis modules to implement new data structures that
+feel like native Redis ones **native types support**. This document describes
+the API exported by the Redis modules system in order to create new data
+structures and handle the serialization in RDB files, the rewriting process
+in AOF, the type reporting via the `TYPE` command, and so forth.
+
+Overview of native types
+---
+
+A module exporting a native type is composed of the following main parts:
+
+* The implementation of some kind of new data structure and of commands operating on the new data structure.
+* A set of callbacks that handle: RDB saving, RDB loading, AOF rewriting, releasing of a value associated with a key, calculation of a value digest (hash) to be used with the `DEBUG DIGEST` command.
+* A 9 character name that is unique to each module native data type.
+* An encoding version, used to persist into RDB files a module-specific data version, so that a module will be able to load older representations from RDB files.
+
+While handling RDB loading, saving and AOF rewriting may look complex at first glance, the modules API provides very high level functions for handling all of this, without requiring the user to handle read/write errors, so in practical terms writing a new data structure for Redis is a simple task.
+
+A **very easy** to understand but complete example of a native type implementation
+is available inside the Redis distribution in the `/modules/hellotype.c` file.
+The reader is encouraged to read this documentation alongside that example
+implementation to see how things are applied in practice.
+
+Registering a new data type
+===
+
+In order to register a new native type into the Redis core, the module needs
+to declare a global variable that will hold a reference to the data type.
+The API to register the data type will return a data type reference that will
+be stored in the global variable.
+
+ static RedisModuleType *MyType;
+ #define MYTYPE_ENCODING_VERSION 0
+
+ int RedisModule_OnLoad(RedisModuleCtx *ctx) {
+ RedisModuleTypeMethods tm = {
+ .version = REDISMODULE_TYPE_METHOD_VERSION,
+ .rdb_load = MyTypeRDBLoad,
+ .rdb_save = MyTypeRDBSave,
+ .aof_rewrite = MyTypeAOFRewrite,
+ .free = MyTypeFree
+ };
+
+ MyType = RedisModule_CreateDataType(ctx, "MyType-AZ",
+ MYTYPE_ENCODING_VERSION, &tm);
+ if (MyType == NULL) return REDISMODULE_ERR;
+ }
+
+As you can see from the example above, a single API call is needed in order to
+register the new type. However a number of function pointers are passed as
+arguments. Some are optional while others are mandatory. The above set
+of methods *must* be passed, while `.digest` and `.mem_usage` are optional
+and are currently not actually supported by the modules internals, so for
+now you can just ignore them.
+
+The `ctx` argument is the context that we receive in the `OnLoad` function.
+The type `name` is a 9 character name using a character set that includes
+`A-Z`, `a-z`, `0-9`, plus the underscore `_` and minus `-` characters.
+
+Note that **this name must be unique** for each data type in the Redis
+ecosystem, so be creative, use both lower-case and upper case if it makes
+sense, and try to use the convention of mixing the type name with the name
+of the author of the module, to create a 9 character unique name.
+
+**NOTE:** It is very important that the name is exactly 9 chars or the
+registration of the type will fail. Read more to understand why.
+
+For example if I'm building a *b-tree* data structure and my name is *antirez*
+I'll call my type **btree1-az**. The name, converted to a 64 bit integer,
+is stored inside the RDB file when saving the type, and will be used when the
+RDB data is loaded in order to resolve what module can load the data. If Redis
+finds no matching module, the integer is converted back to a name in order to
+provide some clue to the user about what module is missing in order to load
+the data.
+
+The type name is also used as a reply for the `TYPE` command when called
+with a key holding the registered type.
+
+The `encver` argument is the encoding version used by the module to store data
+inside the RDB file. For example I can start with an encoding version of 0,
+but later when I release version 2.0 of my module, I can switch encoding to
+something better. The new module will register with an encoding version of 1,
+so when it saves new RDB files, the new version will be stored on disk. However
+when loading RDB files, the module `rdb_load` method will be called even if
+data with a different encoding version is found (and the encoding version
+is passed as argument to `rdb_load`), so that the module can still load old
+RDB files.
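+
+As a hedged sketch (the helper loaders below are hypothetical, not part of
+the API), an `rdb_load` method supporting both the old and the new encoding
+could dispatch like this:
+
+    void *MyTypeRDBLoad(RedisModuleIO *rdb, int encver) {
+        if (encver == 0) return MyTypeLoadV0(rdb); /* Old on-disk layout. */
+        if (encver == 1) return MyTypeLoadV1(rdb); /* Current layout. */
+        return NULL; /* Unknown version: Redis will refuse to load the value. */
+    }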
+
+The last argument is a structure used in order to pass the type methods to the
+registration function: `rdb_load`, `rdb_save`, `aof_rewrite`, `digest`,
+`mem_usage` and `free` are all callbacks with the following prototypes and uses:
+
+ typedef void *(*RedisModuleTypeLoadFunc)(RedisModuleIO *rdb, int encver);
+ typedef void (*RedisModuleTypeSaveFunc)(RedisModuleIO *rdb, void *value);
+ typedef void (*RedisModuleTypeRewriteFunc)(RedisModuleIO *aof, RedisModuleString *key, void *value);
+ typedef size_t (*RedisModuleTypeMemUsageFunc)(void *value);
+ typedef void (*RedisModuleTypeDigestFunc)(RedisModuleDigest *digest, void *value);
+ typedef void (*RedisModuleTypeFreeFunc)(void *value);
+
+* `rdb_load` is called when loading data from the RDB file. It loads data in the same format as `rdb_save` produces.
+* `rdb_save` is called when saving data to the RDB file.
+* `aof_rewrite` is called when the AOF is being rewritten, and the module needs to tell Redis what is the sequence of commands to recreate the content of a given key.
+* `digest` is called when `DEBUG DIGEST` is executed and a key holding this module type is found. Currently this is not yet implemented so the function can be left empty.
+* `mem_usage` is called when the `MEMORY` command asks for the total memory consumed by a specific key, and is used in order to get the amount of bytes used by the module value.
+* `free` is called when a key with the module native type is deleted via `DEL` or by any other means, in order to let the module reclaim the memory associated with such a value.
+
+Ok, but *why* do module types require a 9 character name?
+---
+
+Oh, I understand you need to understand this, so here is a very specific
+explanation.
+
+When Redis persists to RDB files, module-specific data types need to
+be persisted as well. Now RDB files are sequences of key-value pairs
+like the following:
+
+ [1 byte type] [key] [a type specific value]
+
+The 1 byte type identifies strings, lists, sets, and so forth. In the case
+of module data, it is set to a special `module data` value, but of
+course this is not enough: we also need the information to link a specific
+value with the specific module type that is able to load and handle it.
+
+So when we save a `type specific value` about a module, we prefix it with
+a 64 bit integer. 64 bits is large enough to store the information needed
+in order to look up the module that can handle that specific type, but is
+short enough that we can prefix each module value we store inside the RDB
+without making the final RDB file too big. At the same time, this solution
+of prefixing the value with a 64 bit *signature* does not require doing
+strange things like defining a list of module-specific types in the RDB
+header. Everything is pretty simple.
+
+So, what can you store in 64 bits in order to identify a given module in
+a reliable way? Well, if you build a character set of 64 symbols, you can
+easily store 9 characters of 6 bits each, and you are left with 10 bits,
+which are used to store the *encoding version* of the type, so that
+the same type can evolve in the future and provide a different, more
+efficient or updated serialization format for RDB files.
+
+So the 64 bit prefix stored before each module value is like the following:
+
+ 6|6|6|6|6|6|6|6|6|10
+
+The first 9 elements are 6-bit characters, and the final 10 bits are the
+encoding version.
+
+When the RDB file is loaded back, Redis reads the 64 bit value, masks off
+the final 10 bits, and searches for a matching module in the module types cache.
+When a matching one is found, the method to load the RDB file value is called
+with the 10 bit encoding version as argument, so that the module knows
+what version of the data layout to load, if it supports multiple versions.
+
+The interesting thing about all this is that, if the module type cannot
+be resolved because there is no loaded module having this signature,
+we can convert the 64 bit value back into a 9 character name, and print
+an error to the user that includes the module type name! So that she or he
+immediately realizes what's wrong.
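+
+To make the layout concrete, here is an illustrative sketch (not the actual
+Redis implementation; it assumes `<stdint.h>` and `<string.h>` are included,
+and uses one possible 64 symbol table) of how such a value could be packed
+and unpacked:
+
+    static const char charset[] =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+
+    /* Pack a 9 character name and a 10 bit encoding version into 64 bits. */
+    uint64_t encode_type_id(const char *name, int encver) {
+        uint64_t id = 0;
+        for (int j = 0; j < 9; j++)
+            id = (id << 6) | (uint64_t)(strchr(charset,name[j]) - charset);
+        return (id << 10) | (encver & 1023);
+    }
+
+    /* Recover the name, for example to print it in an error message. */
+    void decode_type_name(uint64_t id, char *name) {
+        id >>= 10; /* Discard the encoding version. */
+        for (int j = 8; j >= 0; j--) {
+            name[j] = charset[id & 63];
+            id >>= 6;
+        }
+        name[9] = '\0';
+    }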
+
+Setting and getting keys
+---
+
+After registering our new data type in the `RedisModule_OnLoad()` function,
+we also need to be able to set Redis keys having our native type as value.
+
+This normally happens in the context of commands that write data to a key.
+The native types API allows setting and getting keys holding module native data
+types, and testing whether a given key is already associated with a value of a
+specific data type.
+
+The API uses the normal modules `RedisModule_OpenKey()` low level key access
+interface in order to deal with this. This is an example of setting a
+native type private data structure as the value of a Redis key:
+
+ RedisModuleKey *key = RedisModule_OpenKey(ctx,keyname,REDISMODULE_WRITE);
+ struct some_private_struct *data = createMyDataStructure();
+ RedisModule_ModuleTypeSetValue(key,MyType,data);
+
+The function `RedisModule_ModuleTypeSetValue()` is used with a key handle open
+for writing, and takes three arguments: the key handle, the reference to the
+native type, as obtained during the type registration, and finally a `void*`
+pointer that contains the private data implementing the module native type.
+
+Note that Redis has no clue at all about what your data contains. It will
+just call the callbacks you provided during the type registration in order
+to perform operations on the type.
+
+Similarly we can retrieve the private data from a key using this function:
+
+ struct some_private_struct *data;
+ data = RedisModule_ModuleTypeGetValue(key);
+
+We can also test whether a key holds our native type as its value:
+
+ if (RedisModule_ModuleTypeGetType(key) == MyType) {
+ /* ... do something ... */
+ }
+
+However, for the calls to do the right thing, we need to check whether the key
+is empty, whether it contains a value of the right kind, and so forth. So
+the idiomatic code to implement a command writing to our native type
+is along these lines:
+
+ RedisModuleKey *key = RedisModule_OpenKey(ctx,argv[1],
+ REDISMODULE_READ|REDISMODULE_WRITE);
+ int type = RedisModule_KeyType(key);
+ if (type != REDISMODULE_KEYTYPE_EMPTY &&
+ RedisModule_ModuleTypeGetType(key) != MyType)
+ {
+ return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE);
+ }
+
+Then, having verified that the key is not of the wrong type, and since
+we are going to write to it, we usually want to create a new data structure if
+the key is empty, or retrieve the reference to the value associated with the
+key if there is already one:
+
+ /* Create an empty value object if the key is currently empty. */
+ struct some_private_struct *data;
+ if (type == REDISMODULE_KEYTYPE_EMPTY) {
+ data = createMyDataStructure();
+        RedisModule_ModuleTypeSetValue(key,MyType,data);
+ } else {
+ data = RedisModule_ModuleTypeGetValue(key);
+ }
+ /* Do something with 'data'... */
+
+Free method
+---
+
+As already mentioned, when Redis needs to free a key holding a native type
+value, it needs help from the module in order to release the memory. This
+is the reason why we pass a `free` callback during the type registration:
+
+ typedef void (*RedisModuleTypeFreeFunc)(void *value);
+
+A trivial implementation of the free method can be something like this,
+assuming our data structure is composed of a single allocation:
+
+ void MyTypeFreeCallback(void *value) {
+ RedisModule_Free(value);
+ }
+
+However a more real world implementation will call some function that performs
+more complex memory reclaiming, casting the void pointer to some structure
+and freeing all the resources composing the value, as in the sketch below.
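+
+For example, for the `struct double_array` type used later in this document
+as the RDB example (a header plus a separately allocated array of values),
+a free callback could be sketched as:
+
+    void DoubleArrayFreeCallback(void *value) {
+        struct double_array *da = value;
+        RedisModule_Free(da->values); /* Release the inner array first... */
+        RedisModule_Free(da);         /* ...then the containing structure. */
+    }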
+
+RDB load and save methods
+---
+
+The RDB saving and loading callbacks need to create (and load back) a
+representation of the data type on disk. Redis offers a high level API
+that can automatically store inside the RDB file the following types:
+
+* Unsigned 64 bit integers.
+* Signed 64 bit integers.
+* Doubles.
+* Strings.
+
+It is up to the module to find a viable representation using the above base
+types. However note that while the integer and double values are stored
+and loaded in an architecture and *endianness* agnostic way, if you use
+the raw string saving API to, for example, save a structure on disk, you
+have to take care of those details yourself.
+
+This is the list of functions performing RDB saving and loading:
+
+ void RedisModule_SaveUnsigned(RedisModuleIO *io, uint64_t value);
+ uint64_t RedisModule_LoadUnsigned(RedisModuleIO *io);
+ void RedisModule_SaveSigned(RedisModuleIO *io, int64_t value);
+ int64_t RedisModule_LoadSigned(RedisModuleIO *io);
+ void RedisModule_SaveString(RedisModuleIO *io, RedisModuleString *s);
+ void RedisModule_SaveStringBuffer(RedisModuleIO *io, const char *str, size_t len);
+ RedisModuleString *RedisModule_LoadString(RedisModuleIO *io);
+ char *RedisModule_LoadStringBuffer(RedisModuleIO *io, size_t *lenptr);
+ void RedisModule_SaveDouble(RedisModuleIO *io, double value);
+ double RedisModule_LoadDouble(RedisModuleIO *io);
+
+The functions don't require any error checking from the module, which can
+always assume that calls succeed.
+
+As an example, imagine I have a native type that implements an array of
+double values, with the following structure:
+
+ struct double_array {
+ size_t count;
+ double *values;
+ };
+
+My `rdb_save` method may look like the following:
+
+ void DoubleArrayRDBSave(RedisModuleIO *io, void *ptr) {
+        struct double_array *da = ptr;
+ RedisModule_SaveUnsigned(io,da->count);
+ for (size_t j = 0; j < da->count; j++)
+ RedisModule_SaveDouble(io,da->values[j]);
+ }
+
+What we did was store the number of elements followed by each double
+value. So later, when we have to load the structure in the `rdb_load`
+method, we'll do something like this:
+
+ void *DoubleArrayRDBLoad(RedisModuleIO *io, int encver) {
+ if (encver != DOUBLE_ARRAY_ENC_VER) {
+ /* We should actually log an error here, or try to implement
+ the ability to load older versions of our data structure. */
+ return NULL;
+ }
+
+ struct double_array *da;
+ da = RedisModule_Alloc(sizeof(*da));
+ da->count = RedisModule_LoadUnsigned(io);
+ da->values = RedisModule_Alloc(da->count * sizeof(double));
+ for (size_t j = 0; j < da->count; j++)
+            da->values[j] = RedisModule_LoadDouble(io);
+ return da;
+ }
+
+The load callback just reconstructs the data structure from the data
+we stored in the RDB file.
+
+Note that while there is no error handling in the API that writes and reads
+from disk, the load callback can still return NULL on errors, in case what
+it reads does not look correct. Redis will just panic in that case.
+
+AOF rewriting
+---
+
+ void RedisModule_EmitAOF(RedisModuleIO *io, const char *cmdname, const char *fmt, ...);
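+
+As a concrete reference, this is how the `hellotype.c` example module shipped
+with Redis implements its AOF rewriting callback, emitting one
+`HELLOTYPE.INSERT` command per element of its list (the "sl" format meaning a
+RedisModuleString followed by a long long):
+
+    void HelloTypeAofRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value) {
+        struct HelloTypeObject *hto = value;
+        struct HelloTypeNode *node = hto->head;
+        while(node) {
+            RedisModule_EmitAOF(aof,"HELLOTYPE.INSERT","sl",key,node->value);
+            node = node->next;
+        }
+    }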
+
+Handling multiple encodings
+---
+
+ WORK IN PROGRESS
+
+Allocating memory
+---
+
+Module data types should use the `RedisModule_Alloc()` family of functions
+to allocate, reallocate and release the heap memory used to implement the native data structures (see the other Redis Modules documentation for detailed information).
+
+This is not just useful for letting Redis account for the memory used by the module; there are also other advantages:
+
+* Redis uses the `jemalloc` allocator, which often prevents the fragmentation problems that could be caused by using the libc allocator.
+* When loading strings from the RDB file, the native types API is able to return strings allocated directly with `RedisModule_Alloc()`, so that the module can directly link this memory into the data structure representation, avoiding a useless copy of the data.
+
+Even if you are using external libraries implementing your data structures, the
+allocation functions provided by the module API are exactly compatible with
+`malloc()`, `realloc()`, `free()` and `strdup()`, so converting the libraries
+in order to use these functions should be trivial.
+
+In case you have an external library that uses libc `malloc()`, and you want
+to avoid manually replacing all the calls with the Redis Modules API calls,
+one approach is to use simple macros in order to replace the libc calls
+with the Redis API calls. Something like this could work:
+
+ #define malloc RedisModule_Alloc
+ #define realloc RedisModule_Realloc
+ #define free RedisModule_Free
+ #define strdup RedisModule_Strdup
+
+However keep in mind that mixing libc calls with Redis API calls will result
+in trouble and crashes, so if you replace calls using macros, you need to
+make sure that all the calls are correctly replaced, and that the code with
+the substituted calls will never, for example, attempt to call
+`RedisModule_Free()` with a pointer allocated using libc `malloc()`.
diff --git a/src/modules/helloblock.c b/src/modules/helloblock.c
new file mode 100644
index 000000000..71ec9b121
--- /dev/null
+++ b/src/modules/helloblock.c
@@ -0,0 +1,122 @@
+/* Helloblock module -- An example of blocking command implementation
+ * with threads.
+ *
+ * -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../redismodule.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <unistd.h>
+
+/* Reply callback for blocking command HELLO.BLOCK */
+int HelloBlock_Reply(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+ int *myint = RedisModule_GetBlockedClientPrivateData(ctx);
+ return RedisModule_ReplyWithLongLong(ctx,*myint);
+}
+
+/* Timeout callback for blocking command HELLO.BLOCK */
+int HelloBlock_Timeout(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+ return RedisModule_ReplyWithSimpleString(ctx,"Request timedout");
+}
+
+/* Private data freeing callback for HELLO.BLOCK command. */
+void HelloBlock_FreeData(void *privdata) {
+ RedisModule_Free(privdata);
+}
+
+/* The thread entry point that actually executes the blocking part
+ * of the command HELLO.BLOCK. */
+void *HelloBlock_ThreadMain(void *arg) {
+ void **targ = arg;
+ RedisModuleBlockedClient *bc = targ[0];
+ long long delay = (unsigned long)targ[1];
+ RedisModule_Free(targ);
+
+ sleep(delay);
+ int *r = RedisModule_Alloc(sizeof(int));
+ *r = rand();
+ RedisModule_UnblockClient(bc,r);
+ return NULL;
+}
+
+/* HELLO.BLOCK <delay> <timeout> -- Block for <delay> seconds, then reply with
+ * a random number. Timeout is the command timeout, so that you can test
+ * what happens when the delay is greater than the timeout. */
+int HelloBlock_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ if (argc != 3) return RedisModule_WrongArity(ctx);
+ long long delay;
+ long long timeout;
+
+ if (RedisModule_StringToLongLong(argv[1],&delay) != REDISMODULE_OK) {
+ return RedisModule_ReplyWithError(ctx,"ERR invalid count");
+ }
+
+ if (RedisModule_StringToLongLong(argv[2],&timeout) != REDISMODULE_OK) {
+ return RedisModule_ReplyWithError(ctx,"ERR invalid count");
+ }
+
+ pthread_t tid;
+ RedisModuleBlockedClient *bc = RedisModule_BlockClient(ctx,HelloBlock_Reply,HelloBlock_Timeout,HelloBlock_FreeData,timeout);
+
+ /* Now that we setup a blocking client, we need to pass the control
+ * to the thread. However we need to pass arguments to the thread:
+ * the delay and a reference to the blocked client handle. */
+ void **targ = RedisModule_Alloc(sizeof(void*)*2);
+ targ[0] = bc;
+ targ[1] = (void*)(unsigned long) delay;
+
+ if (pthread_create(&tid,NULL,HelloBlock_ThreadMain,targ) != 0) {
+ RedisModule_AbortBlock(bc);
+ return RedisModule_ReplyWithError(ctx,"-ERR Can't start thread");
+ }
+ return REDISMODULE_OK;
+}
+
+/* This function must be present on each Redis module. It is used in order to
+ * register the commands into the Redis server. */
+int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ if (RedisModule_Init(ctx,"helloblock",1,REDISMODULE_APIVER_1)
+ == REDISMODULE_ERR) return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"hello.block",
+ HelloBlock_RedisCommand,"",0,0,0) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ return REDISMODULE_OK;
+}
diff --git a/src/modules/hellotype.c b/src/modules/hellotype.c
new file mode 100644
index 000000000..02a5bb477
--- /dev/null
+++ b/src/modules/hellotype.c
@@ -0,0 +1,272 @@
+/* This file implements a new module native data type called "HELLOTYPE".
+ * The data structure implemented is a very simple ordered linked list of
+ * 64 bit integers, in order to have something that is real world enough, but
+ * at the same time, extremely simple to understand, to show how the API
+ * works, how a new data type is created, and how to write basic methods
+ * for RDB loading, saving and AOF rewriting.
+ *
+ * -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../redismodule.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+
+static RedisModuleType *HelloType;
+
+/* ========================== Internal data structure =======================
+ * This is just a linked list of 64 bit integers where elements are inserted
+ * in-place, so it's ordered. There is no pop/push operation but just insert
+ * because it is enough to show the implementation of new data types without
+ * making things complex. */
+
+struct HelloTypeNode {
+ int64_t value;
+ struct HelloTypeNode *next;
+};
+
+struct HelloTypeObject {
+ struct HelloTypeNode *head;
+ size_t len; /* Number of elements added. */
+};
+
+struct HelloTypeObject *createHelloTypeObject(void) {
+ struct HelloTypeObject *o;
+ o = RedisModule_Alloc(sizeof(*o));
+ o->head = NULL;
+ o->len = 0;
+ return o;
+}
+
+void HelloTypeInsert(struct HelloTypeObject *o, int64_t ele) {
+ struct HelloTypeNode *next = o->head, *newnode, *prev = NULL;
+
+ while(next && next->value < ele) {
+ prev = next;
+ next = next->next;
+ }
+ newnode = RedisModule_Alloc(sizeof(*newnode));
+ newnode->value = ele;
+ newnode->next = next;
+ if (prev) {
+ prev->next = newnode;
+ } else {
+ o->head = newnode;
+ }
+ o->len++;
+}
+
+void HelloTypeReleaseObject(struct HelloTypeObject *o) {
+ struct HelloTypeNode *cur, *next;
+ cur = o->head;
+ while(cur) {
+ next = cur->next;
+ RedisModule_Free(cur);
+ cur = next;
+ }
+ RedisModule_Free(o);
+}
+
+/* ========================= "hellotype" type commands ======================= */
+
+/* HELLOTYPE.INSERT key value */
+int HelloTypeInsert_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ RedisModule_AutoMemory(ctx); /* Use automatic memory management. */
+
+ if (argc != 3) return RedisModule_WrongArity(ctx);
+ RedisModuleKey *key = RedisModule_OpenKey(ctx,argv[1],
+ REDISMODULE_READ|REDISMODULE_WRITE);
+ int type = RedisModule_KeyType(key);
+ if (type != REDISMODULE_KEYTYPE_EMPTY &&
+ RedisModule_ModuleTypeGetType(key) != HelloType)
+ {
+ return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE);
+ }
+
+ long long value;
+ if ((RedisModule_StringToLongLong(argv[2],&value) != REDISMODULE_OK)) {
+ return RedisModule_ReplyWithError(ctx,"ERR invalid value: must be a signed 64 bit integer");
+ }
+
+ /* Create an empty value object if the key is currently empty. */
+ struct HelloTypeObject *hto;
+ if (type == REDISMODULE_KEYTYPE_EMPTY) {
+ hto = createHelloTypeObject();
+ RedisModule_ModuleTypeSetValue(key,HelloType,hto);
+ } else {
+ hto = RedisModule_ModuleTypeGetValue(key);
+ }
+
+ /* Insert the new element. */
+ HelloTypeInsert(hto,value);
+
+ RedisModule_ReplyWithLongLong(ctx,hto->len);
+ RedisModule_ReplicateVerbatim(ctx);
+ return REDISMODULE_OK;
+}
+
+/* HELLOTYPE.RANGE key first count */
+int HelloTypeRange_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ RedisModule_AutoMemory(ctx); /* Use automatic memory management. */
+
+ if (argc != 4) return RedisModule_WrongArity(ctx);
+ RedisModuleKey *key = RedisModule_OpenKey(ctx,argv[1],
+ REDISMODULE_READ|REDISMODULE_WRITE);
+ int type = RedisModule_KeyType(key);
+ if (type != REDISMODULE_KEYTYPE_EMPTY &&
+ RedisModule_ModuleTypeGetType(key) != HelloType)
+ {
+ return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE);
+ }
+
+ long long first, count;
+ if (RedisModule_StringToLongLong(argv[2],&first) != REDISMODULE_OK ||
+ RedisModule_StringToLongLong(argv[3],&count) != REDISMODULE_OK ||
+ first < 0 || count < 0)
+ {
+ return RedisModule_ReplyWithError(ctx,
+ "ERR invalid first or count parameters");
+ }
+
+ struct HelloTypeObject *hto = RedisModule_ModuleTypeGetValue(key);
+ struct HelloTypeNode *node = hto ? hto->head : NULL;
+ RedisModule_ReplyWithArray(ctx,REDISMODULE_POSTPONED_ARRAY_LEN);
+ long long arraylen = 0;
+ while(node && count--) {
+ RedisModule_ReplyWithLongLong(ctx,node->value);
+ arraylen++;
+ node = node->next;
+ }
+ RedisModule_ReplySetArrayLength(ctx,arraylen);
+ return REDISMODULE_OK;
+}
+
+/* HELLOTYPE.LEN key */
+int HelloTypeLen_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ RedisModule_AutoMemory(ctx); /* Use automatic memory management. */
+
+ if (argc != 2) return RedisModule_WrongArity(ctx);
+ RedisModuleKey *key = RedisModule_OpenKey(ctx,argv[1],
+ REDISMODULE_READ|REDISMODULE_WRITE);
+ int type = RedisModule_KeyType(key);
+ if (type != REDISMODULE_KEYTYPE_EMPTY &&
+ RedisModule_ModuleTypeGetType(key) != HelloType)
+ {
+ return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE);
+ }
+
+ struct HelloTypeObject *hto = RedisModule_ModuleTypeGetValue(key);
+ RedisModule_ReplyWithLongLong(ctx,hto ? hto->len : 0);
+ return REDISMODULE_OK;
+}
+
+
+/* ========================== "hellotype" type methods ======================= */
+
+void *HelloTypeRdbLoad(RedisModuleIO *rdb, int encver) {
+ if (encver != 0) {
+ /* RedisModule_Log("warning","Can't load data with version %d", encver);*/
+ return NULL;
+ }
+ uint64_t elements = RedisModule_LoadUnsigned(rdb);
+ struct HelloTypeObject *hto = createHelloTypeObject();
+ while(elements--) {
+ int64_t ele = RedisModule_LoadSigned(rdb);
+ HelloTypeInsert(hto,ele);
+ }
+ return hto;
+}
+
+void HelloTypeRdbSave(RedisModuleIO *rdb, void *value) {
+ struct HelloTypeObject *hto = value;
+ struct HelloTypeNode *node = hto->head;
+ RedisModule_SaveUnsigned(rdb,hto->len);
+ while(node) {
+ RedisModule_SaveSigned(rdb,node->value);
+ node = node->next;
+ }
+}
+
+void HelloTypeAofRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value) {
+ struct HelloTypeObject *hto = value;
+ struct HelloTypeNode *node = hto->head;
+ while(node) {
+ RedisModule_EmitAOF(aof,"HELLOTYPE.INSERT","sl",key,node->value);
+ node = node->next;
+ }
+}
+
+void HelloTypeDigest(RedisModuleDigest *digest, void *value) {
+ REDISMODULE_NOT_USED(digest);
+ REDISMODULE_NOT_USED(value);
+ /* TODO: The DIGEST module interface is yet not implemented. */
+}
+
+void HelloTypeFree(void *value) {
+ HelloTypeReleaseObject(value);
+}
+
+/* This function must be present on each Redis module. It is used in order to
+ * register the commands into the Redis server. */
+int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ if (RedisModule_Init(ctx,"hellotype",1,REDISMODULE_APIVER_1)
+ == REDISMODULE_ERR) return REDISMODULE_ERR;
+
+ RedisModuleTypeMethods tm = {
+ .version = REDISMODULE_TYPE_METHOD_VERSION,
+ .rdb_load = HelloTypeRdbLoad,
+ .rdb_save = HelloTypeRdbSave,
+ .aof_rewrite = HelloTypeAofRewrite,
+ .free = HelloTypeFree
+ };
+
+ HelloType = RedisModule_CreateDataType(ctx,"hellotype",0,&tm);
+ if (HelloType == NULL) return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"hellotype.insert",
+ HelloTypeInsert_RedisCommand,"write deny-oom",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"hellotype.range",
+ HelloTypeRange_RedisCommand,"readonly",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"hellotype.len",
+ HelloTypeLen_RedisCommand,"readonly",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ return REDISMODULE_OK;
+}
diff --git a/src/modules/helloworld.c b/src/modules/helloworld.c
index 2d6a71a20..4e30af2a0 100644
--- a/src/modules/helloworld.c
+++ b/src/modules/helloworld.c
@@ -1,3 +1,39 @@
+/* Helloworld module -- A few examples of the Redis Modules API in the form
+ * of commands showing how to accomplish common tasks.
+ *
+ * This module does not do anything useful, except for exposing a few example
+ * commands designed to show the API.
+ *
+ * -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
#include "../redismodule.h"
#include <stdio.h>
#include <stdlib.h>
@@ -10,6 +46,8 @@
* fetch the currently selected DB, the other in order to send the client
* an integer reply as response. */
int HelloSimple_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
RedisModule_ReplyWithLongLong(ctx,RedisModule_GetSelectedDb(ctx));
return REDISMODULE_OK;
}
@@ -201,6 +239,8 @@ int HelloRandArray_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, i
* comments the function implementation). */
int HelloRepl1_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc)
{
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
RedisModuleCallReply *reply;
RedisModule_AutoMemory(ctx);
@@ -483,7 +523,7 @@ int HelloLeftPad_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int
/* If the string is already larger than the target len, just return
* the string itself. */
- if (strlen >= padlen)
+ if (strlen >= (size_t)padlen)
return RedisModule_ReplyWithString(ctx,argv[1]);
/* Padding must be a single character in this simple implementation. */
@@ -494,7 +534,7 @@ int HelloLeftPad_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int
/* Here we use our pool allocator, for our throw-away allocation. */
padlen -= strlen;
char *buf = RedisModule_PoolAlloc(ctx,padlen+strlen);
- for (size_t j = 0; j < padlen; j++) buf[j] = *ch;
+ for (long long j = 0; j < padlen; j++) buf[j] = *ch;
memcpy(buf+padlen,str,strlen);
RedisModule_ReplyWithStringBuffer(ctx,buf,padlen+strlen);
@@ -503,10 +543,16 @@ int HelloLeftPad_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int
/* This function must be present on each Redis module. It is used in order to
* register the commands into the Redis server. */
-int RedisModule_OnLoad(RedisModuleCtx *ctx) {
+int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
if (RedisModule_Init(ctx,"helloworld",1,REDISMODULE_APIVER_1)
== REDISMODULE_ERR) return REDISMODULE_ERR;
+    /* Log the list of parameters passed when loading the module. */
+ for (int j = 0; j < argc; j++) {
+ const char *s = RedisModule_StringPtrLen(argv[j],NULL);
+ printf("Module loaded with ARGV[%d] = %s\n", j, s);
+ }
+
if (RedisModule_CreateCommand(ctx,"hello.simple",
HelloSimple_RedisCommand,"readonly",0,0,0) == REDISMODULE_ERR)
return REDISMODULE_ERR;
diff --git a/src/modules/testmodule.c b/src/modules/testmodule.c
new file mode 100644
index 000000000..8da45c0ea
--- /dev/null
+++ b/src/modules/testmodule.c
@@ -0,0 +1,237 @@
+/* Module designed to test the Redis modules subsystem.
+ *
+ * -----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "../redismodule.h"
+#include <string.h>
+
+/* --------------------------------- Helpers -------------------------------- */
+
+/* Return true if the reply and the C null term string match. */
+int TestMatchReply(RedisModuleCallReply *reply, char *str) {
+ RedisModuleString *mystr;
+ mystr = RedisModule_CreateStringFromCallReply(reply);
+ if (!mystr) return 0;
+ const char *ptr = RedisModule_StringPtrLen(mystr,NULL);
+ return strcmp(ptr,str) == 0;
+}
+
+/* ------------------------------- Test units ------------------------------- */
+
+/* TEST.CALL -- Test Call() API. */
+int TestCall(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ RedisModule_AutoMemory(ctx);
+ RedisModuleCallReply *reply;
+
+ RedisModule_Call(ctx,"DEL","c","mylist");
+ RedisModuleString *mystr = RedisModule_CreateString(ctx,"foo",3);
+ RedisModule_Call(ctx,"RPUSH","csl","mylist",mystr,(long long)1234);
+ reply = RedisModule_Call(ctx,"LRANGE","ccc","mylist","0","-1");
+ long long items = RedisModule_CallReplyLength(reply);
+ if (items != 2) goto fail;
+
+ RedisModuleCallReply *item0, *item1;
+
+ item0 = RedisModule_CallReplyArrayElement(reply,0);
+ item1 = RedisModule_CallReplyArrayElement(reply,1);
+ if (!TestMatchReply(item0,"foo")) goto fail;
+ if (!TestMatchReply(item1,"1234")) goto fail;
+
+ RedisModule_ReplyWithSimpleString(ctx,"OK");
+ return REDISMODULE_OK;
+
+fail:
+ RedisModule_ReplyWithSimpleString(ctx,"ERR");
+ return REDISMODULE_OK;
+}
+
+/* TEST.STRING.APPEND -- Test appending to an existing string object. */
+int TestStringAppend(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ RedisModuleString *s = RedisModule_CreateString(ctx,"foo",3);
+ RedisModule_StringAppendBuffer(ctx,s,"bar",3);
+ RedisModule_ReplyWithString(ctx,s);
+ RedisModule_FreeString(ctx,s);
+ return REDISMODULE_OK;
+}
+
+/* TEST.STRING.APPEND.AM -- Test append with retain when auto memory is on. */
+int TestStringAppendAM(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ RedisModule_AutoMemory(ctx);
+ RedisModuleString *s = RedisModule_CreateString(ctx,"foo",3);
+ RedisModule_RetainString(ctx,s);
+ RedisModule_StringAppendBuffer(ctx,s,"bar",3);
+ RedisModule_ReplyWithString(ctx,s);
+ RedisModule_FreeString(ctx,s);
+ return REDISMODULE_OK;
+}
+
+/* TEST.STRING.PRINTF -- Test string formatting. */
+int TestStringPrintf(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ RedisModule_AutoMemory(ctx);
+ if (argc < 3) {
+ return RedisModule_WrongArity(ctx);
+ }
+ RedisModuleString *s = RedisModule_CreateStringPrintf(ctx,
+ "Got %d args. argv[1]: %s, argv[2]: %s",
+ argc,
+ RedisModule_StringPtrLen(argv[1], NULL),
+ RedisModule_StringPtrLen(argv[2], NULL)
+ );
+
+ RedisModule_ReplyWithString(ctx,s);
+
+ return REDISMODULE_OK;
+}
+
+
+/* ----------------------------- Test framework ----------------------------- */
+
+/* Return 1 if the reply matches the specified string, otherwise log errors
+ * in the server log and return 0. */
+int TestAssertStringReply(RedisModuleCtx *ctx, RedisModuleCallReply *reply, char *str, size_t len) {
+ RedisModuleString *mystr, *expected;
+
+ if (RedisModule_CallReplyType(reply) != REDISMODULE_REPLY_STRING) {
+ RedisModule_Log(ctx,"warning","Unexpected reply type %d",
+ RedisModule_CallReplyType(reply));
+ return 0;
+ }
+ mystr = RedisModule_CreateStringFromCallReply(reply);
+ expected = RedisModule_CreateString(ctx,str,len);
+ if (RedisModule_StringCompare(mystr,expected) != 0) {
+ const char *mystr_ptr = RedisModule_StringPtrLen(mystr,NULL);
+ const char *expected_ptr = RedisModule_StringPtrLen(expected,NULL);
+ RedisModule_Log(ctx,"warning",
+ "Unexpected string reply '%s' (instead of '%s')",
+ mystr_ptr, expected_ptr);
+ return 0;
+ }
+ return 1;
+}
+
+/* Return 1 if the reply matches the specified integer, otherwise log errors
+ * in the server log and return 0. */
+int TestAssertIntegerReply(RedisModuleCtx *ctx, RedisModuleCallReply *reply, long long expected) {
+ if (RedisModule_CallReplyType(reply) != REDISMODULE_REPLY_INTEGER) {
+ RedisModule_Log(ctx,"warning","Unexpected reply type %d",
+ RedisModule_CallReplyType(reply));
+ return 0;
+ }
+ long long val = RedisModule_CallReplyInteger(reply);
+ if (val != expected) {
+ RedisModule_Log(ctx,"warning",
+ "Unexpected integer reply '%lld' (instead of '%lld')",
+ val, expected);
+ return 0;
+ }
+ return 1;
+}
+
+#define T(name,...) \
+ do { \
+ RedisModule_Log(ctx,"warning","Testing %s", name); \
+ reply = RedisModule_Call(ctx,name,__VA_ARGS__); \
+ } while (0);
+
+/* TEST.IT -- Run all the tests. */
+int TestIt(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ RedisModule_AutoMemory(ctx);
+ RedisModuleCallReply *reply;
+
+ /* Make sure the DB is empty before to proceed. */
+ T("dbsize","");
+ if (!TestAssertIntegerReply(ctx,reply,0)) goto fail;
+
+ T("ping","");
+ if (!TestAssertStringReply(ctx,reply,"PONG",4)) goto fail;
+
+ T("test.call","");
+ if (!TestAssertStringReply(ctx,reply,"OK",2)) goto fail;
+
+ T("test.string.append","");
+ if (!TestAssertStringReply(ctx,reply,"foobar",6)) goto fail;
+
+ T("test.string.append.am","");
+ if (!TestAssertStringReply(ctx,reply,"foobar",6)) goto fail;
+
+ T("test.string.printf", "cc", "foo", "bar");
+ if (!TestAssertStringReply(ctx,reply,"Got 3 args. argv[1]: foo, argv[2]: bar",38)) goto fail;
+
+ RedisModule_ReplyWithSimpleString(ctx,"ALL TESTS PASSED");
+ return REDISMODULE_OK;
+
+fail:
+ RedisModule_ReplyWithSimpleString(ctx,
+ "SOME TEST NOT PASSED! Check server logs");
+ return REDISMODULE_OK;
+}
+
+int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
+ REDISMODULE_NOT_USED(argv);
+ REDISMODULE_NOT_USED(argc);
+
+ if (RedisModule_Init(ctx,"test",1,REDISMODULE_APIVER_1)
+ == REDISMODULE_ERR) return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"test.call",
+ TestCall,"write deny-oom",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"test.string.append",
+ TestStringAppend,"write deny-oom",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"test.string.append.am",
+ TestStringAppendAM,"write deny-oom",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"test.string.printf",
+ TestStringPrintf,"write deny-oom",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ if (RedisModule_CreateCommand(ctx,"test.it",
+ TestIt,"readonly",1,1,1) == REDISMODULE_ERR)
+ return REDISMODULE_ERR;
+
+ return REDISMODULE_OK;
+}
diff --git a/src/networking.c b/src/networking.c
index d50d2c852..343a910e2 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -30,8 +30,9 @@
#include "server.h"
#include <sys/uio.h>
#include <math.h>
+#include <ctype.h>
-static void setProtocolError(client *c, int pos);
+static void setProtocolError(const char *errstr, client *c, int pos);
/* Return the size consumed from the allocator, for the specified SDS string,
* including internal fragmentation. This function is used in order to compute
@@ -109,6 +110,7 @@ client *createClient(int fd) {
c->repl_ack_off = 0;
c->repl_ack_time = 0;
c->slave_listening_port = 0;
+ c->slave_ip[0] = '\0';
c->slave_capa = SLAVE_CAPA_NONE;
c->reply = listCreate();
c->reply_bytes = 0;
@@ -351,6 +353,14 @@ void addReplySds(client *c, sds s) {
}
}
+/* This low level function just adds whatever protocol you send it to the
+ * client buffer, trying the static buffer initially, and using the string
+ * of objects if not possible.
+ *
+ * It is efficient because it does not create an SDS object nor a Redis object
+ * if not needed. The object will only be created by calling
+ * _addReplyStringToList() if we fail to extend the existing tail object
+ * in the list of objects. */
void addReplyString(client *c, const char *s, size_t len) {
if (prepareClientToWrite(c) != C_OK) return;
if (_addReplyToBuffer(c,s,len) != C_OK)
@@ -1030,7 +1040,7 @@ int processInlineBuffer(client *c) {
if (newline == NULL) {
if (sdslen(c->querybuf) > PROTO_INLINE_MAX_SIZE) {
addReplyError(c,"Protocol error: too big inline request");
- setProtocolError(c,0);
+ setProtocolError("too big inline request",c,0);
}
return C_ERR;
}
@@ -1046,7 +1056,7 @@ int processInlineBuffer(client *c) {
sdsfree(aux);
if (argv == NULL) {
addReplyError(c,"Protocol error: unbalanced quotes in request");
- setProtocolError(c,0);
+ setProtocolError("unbalanced quotes in inline request",c,0);
return C_ERR;
}
@@ -1080,11 +1090,29 @@ int processInlineBuffer(client *c) {
/* Helper function. Trims query buffer to make the function that processes
* multi bulk requests idempotent. */
-static void setProtocolError(client *c, int pos) {
+#define PROTO_DUMP_LEN 128
+static void setProtocolError(const char *errstr, client *c, int pos) {
if (server.verbosity <= LL_VERBOSE) {
sds client = catClientInfoString(sdsempty(),c);
+
+        /* Sample some protocol to give an idea about what was inside. */
+ char buf[256];
+ if (sdslen(c->querybuf) < PROTO_DUMP_LEN) {
+ snprintf(buf,sizeof(buf),"Query buffer during protocol error: '%s'", c->querybuf);
+ } else {
+ snprintf(buf,sizeof(buf),"Query buffer during protocol error: '%.*s' (... more %zu bytes ...) '%.*s'", PROTO_DUMP_LEN/2, c->querybuf, sdslen(c->querybuf)-PROTO_DUMP_LEN, PROTO_DUMP_LEN/2, c->querybuf+sdslen(c->querybuf)-PROTO_DUMP_LEN/2);
+ }
+
+ /* Remove non printable chars. */
+ char *p = buf;
+ while (*p != '\0') {
+ if (!isprint(*p)) *p = '.';
+ p++;
+ }
+
+ /* Log all the client and protocol info. */
serverLog(LL_VERBOSE,
- "Protocol error from client: %s", client);
+ "Protocol error (%s) from client: %s. %s", errstr, client, buf);
sdsfree(client);
}
c->flags |= CLIENT_CLOSE_AFTER_REPLY;
@@ -1105,7 +1133,7 @@ int processMultibulkBuffer(client *c) {
if (newline == NULL) {
if (sdslen(c->querybuf) > PROTO_INLINE_MAX_SIZE) {
addReplyError(c,"Protocol error: too big mbulk count string");
- setProtocolError(c,0);
+ setProtocolError("too big mbulk count string",c,0);
}
return C_ERR;
}
@@ -1120,7 +1148,7 @@ int processMultibulkBuffer(client *c) {
ok = string2ll(c->querybuf+1,newline-(c->querybuf+1),&ll);
if (!ok || ll > 1024*1024) {
addReplyError(c,"Protocol error: invalid multibulk length");
- setProtocolError(c,pos);
+ setProtocolError("invalid mbulk count",c,pos);
return C_ERR;
}
@@ -1146,7 +1174,7 @@ int processMultibulkBuffer(client *c) {
if (sdslen(c->querybuf) > PROTO_INLINE_MAX_SIZE) {
addReplyError(c,
"Protocol error: too big bulk count string");
- setProtocolError(c,0);
+ setProtocolError("too big bulk count string",c,0);
return C_ERR;
}
break;
@@ -1160,14 +1188,14 @@ int processMultibulkBuffer(client *c) {
addReplyErrorFormat(c,
"Protocol error: expected '$', got '%c'",
c->querybuf[pos]);
- setProtocolError(c,pos);
+ setProtocolError("expected $ but got something else",c,pos);
return C_ERR;
}
ok = string2ll(c->querybuf+pos+1,newline-(c->querybuf+pos+1),&ll);
if (!ok || ll < 0 || ll > 512*1024*1024) {
addReplyError(c,"Protocol error: invalid bulk length");
- setProtocolError(c,pos);
+ setProtocolError("invalid bulk length",c,pos);
return C_ERR;
}
@@ -1241,8 +1269,10 @@ void processInputBuffer(client *c) {
/* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is
* written to the client. Make sure to not let the reply grow after
- * this flag has been set (i.e. don't process more commands). */
- if (c->flags & CLIENT_CLOSE_AFTER_REPLY) break;
+ * this flag has been set (i.e. don't process more commands).
+ *
+ * The same applies for clients we want to terminate ASAP. */
+ if (c->flags & (CLIENT_CLOSE_AFTER_REPLY|CLIENT_CLOSE_ASAP)) break;
/* Determine request type when unknown. */
if (!c->reqtype) {
@@ -1318,7 +1348,11 @@ void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
sdsIncrLen(c->querybuf,nread);
c->lastinteraction = server.unixtime;
- if (c->flags & CLIENT_MASTER) c->reploff += nread;
+ if (c->flags & CLIENT_MASTER) {
+ c->reploff += nread;
+ replicationFeedSlavesFromMasterStream(server.slaves,
+ c->querybuf+qblen,nread);
+ }
server.stat_net_input_bytes += nread;
if (sdslen(c->querybuf) > server.client_max_querybuf_len) {
sds ci = catClientInfoString(sdsempty(),c), bytes = sdsempty();
@@ -1605,8 +1639,28 @@ void clientCommand(client *c) {
pauseClients(duration);
addReply(c,shared.ok);
} else {
- addReplyError(c, "Syntax error, try CLIENT (LIST | KILL ip:port | GETNAME | SETNAME connection-name)");
+ addReplyError(c, "Syntax error, try CLIENT (LIST | KILL | GETNAME | SETNAME | PAUSE | REPLY)");
+ }
+}
+
+/* This callback is bound to POST and "Host:" command names. Those are not
+ * really commands, but are used in security attacks in order to talk to
+ * Redis instances via HTTP, with a technique called "cross protocol scripting"
+ * which exploits the fact that services like Redis will discard invalid
+ * HTTP headers and will process what follows.
+ *
+ * As a protection against this attack, Redis will terminate the connection
+ * when a POST or "Host:" header is seen, and will log the event from
+ * time to time (to avoid creating a DOS as a result of too many logs). */
+void securityWarningCommand(client *c) {
+ static time_t logged_time;
+ time_t now = time(NULL);
+
+ if (labs(now-logged_time) > 60) {
+ serverLog(LL_WARNING,"Possible SECURITY ATTACK detected. It looks like somebody is sending POST or Host: commands to Redis. This is likely due to an attacker attempting to use Cross Protocol Scripting to compromise your Redis instance. Connection aborted.");
+ logged_time = now;
}
+ freeClientAsync(c);
}
/* Rewrite the command vector of the client. All the new objects ref count
diff --git a/src/object.c b/src/object.c
index 167290d49..1ae37c9d1 100644
--- a/src/object.c
+++ b/src/object.c
@@ -36,6 +36,8 @@
#define strtold(a,b) ((long double)strtod((a),(b)))
#endif
+/* ===================== Creation and parsing of objects ==================== */
+
robj *createObject(int type, void *ptr) {
robj *o = zmalloc(sizeof(*o));
o->type = type;
@@ -43,8 +45,13 @@ robj *createObject(int type, void *ptr) {
o->ptr = ptr;
o->refcount = 1;
- /* Set the LRU to the current lruclock (minutes resolution). */
- o->lru = LRU_CLOCK();
+ /* Set the LRU to the current lruclock (minutes resolution), or
+ * alternatively the LFU counter. */
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
+ } else {
+ o->lru = LRU_CLOCK();
+ }
return o;
}
@@ -68,7 +75,7 @@ robj *makeObjectShared(robj *o) {
/* Create a string object with encoding OBJ_ENCODING_RAW, that is a plain
* string object where o->ptr points to a proper sds string. */
robj *createRawStringObject(const char *ptr, size_t len) {
- return createObject(OBJ_STRING,sdsnewlen(ptr,len));
+ return createObject(OBJ_STRING, sdsnewlen(ptr,len));
}
/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is
@@ -82,7 +89,11 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) {
o->encoding = OBJ_ENCODING_EMBSTR;
o->ptr = sh+1;
o->refcount = 1;
- o->lru = LRU_CLOCK();
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
+ } else {
+ o->lru = LRU_CLOCK();
+ }
sh->len = len;
sh->alloc = len;
@@ -97,7 +108,7 @@ robj *createEmbeddedStringObject(const char *ptr, size_t len) {
}
/* Create a string object with EMBSTR encoding if it is smaller than
- * REIDS_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is
+ * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is
* used.
*
* The current limit of 39 is chosen so that the biggest string object
@@ -147,7 +158,7 @@ robj *createStringObjectFromLongDouble(long double value, int humanfriendly) {
* will always result in a fresh object that is unshared (refcount == 1).
*
* The resulting object always has refcount set to 1. */
-robj *dupStringObject(robj *o) {
+robj *dupStringObject(const robj *o) {
robj *d;
serverAssert(o->type == OBJ_STRING);
@@ -221,6 +232,13 @@ robj *createZsetZiplistObject(void) {
return o;
}
+robj *createModuleObject(moduleType *mt, void *value) {
+ moduleValue *mv = zmalloc(sizeof(*mv));
+ mv->type = mt;
+ mv->value = value;
+ return createObject(OBJ_MODULE,mv);
+}
+
void freeStringObject(robj *o) {
if (o->encoding == OBJ_ENCODING_RAW) {
sdsfree(o->ptr);
@@ -281,6 +299,12 @@ void freeHashObject(robj *o) {
}
}
+void freeModuleObject(robj *o) {
+ moduleValue *mv = o->ptr;
+ mv->type->free(mv->value);
+ zfree(mv);
+}
+
void incrRefCount(robj *o) {
if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount++;
}
@@ -293,6 +317,7 @@ void decrRefCount(robj *o) {
case OBJ_SET: freeSetObject(o); break;
case OBJ_ZSET: freeZsetObject(o); break;
case OBJ_HASH: freeHashObject(o); break;
+ case OBJ_MODULE: freeModuleObject(o); break;
default: serverPanic("Unknown object type"); break;
}
zfree(o);
@@ -371,17 +396,16 @@ robj *tryObjectEncoding(robj *o) {
if (o->refcount > 1) return o;
/* Check if we can represent this string as a long integer.
- * Note that we are sure that a string larger than 21 chars is not
+ * Note that we are sure that a string larger than 20 chars is not
* representable as a 32 nor 64 bit integer. */
len = sdslen(s);
- if (len <= 21 && string2l(s,len,&value)) {
+ if (len <= 20 && string2l(s,len,&value)) {
/* This object is encodable as a long. Try to use a shared object.
* Note that we avoid using shared integers when maxmemory is used
* because every object needs to have a private LRU field for the LRU
* algorithm to work well. */
if ((server.maxmemory == 0 ||
- (server.maxmemory_policy != MAXMEMORY_VOLATILE_LRU &&
- server.maxmemory_policy != MAXMEMORY_ALLKEYS_LRU)) &&
+ !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) &&
value >= 0 &&
value < OBJ_SHARED_INTEGERS)
{
@@ -525,7 +549,7 @@ size_t stringObjectLen(robj *o) {
}
}
-int getDoubleFromObject(robj *o, double *target) {
+int getDoubleFromObject(const robj *o, double *target) {
double value;
char *eptr;
@@ -536,7 +560,7 @@ int getDoubleFromObject(robj *o, double *target) {
if (sdsEncodedObject(o)) {
errno = 0;
value = strtod(o->ptr, &eptr);
- if (isspace(((char*)o->ptr)[0]) ||
+ if (isspace(((const char*)o->ptr)[0]) ||
eptr[0] != '\0' ||
(errno == ERANGE &&
(value == HUGE_VAL || value == -HUGE_VAL || value == 0)) ||
@@ -605,20 +629,6 @@ int getLongDoubleFromObjectOrReply(client *c, robj *o, long double *target, cons
return C_OK;
}
-/* Helper function for getLongLongFromObject(). The function parses the string
- * as a long long value in a strict way (no spaces before/after). On success
- * C_OK is returned, otherwise C_ERR is returned. */
-int strict_strtoll(char *str, long long *vp) {
- char *eptr;
- long long value;
-
- errno = 0;
- value = strtoll(str, &eptr, 10);
- if (isspace(str[0]) || eptr[0] != '\0' || errno == ERANGE) return C_ERR;
- if (vp) *vp = value;
- return C_OK;
-}
-
int getLongLongFromObject(robj *o, long long *target) {
long long value;
@@ -627,7 +637,7 @@ int getLongLongFromObject(robj *o, long long *target) {
} else {
serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
if (sdsEncodedObject(o)) {
- if (strict_strtoll(o->ptr,&value) == C_ERR) return C_ERR;
+ if (string2ll(o->ptr,sdslen(o->ptr),&value) == 0) return C_ERR;
} else if (o->encoding == OBJ_ENCODING_INT) {
value = (long)o->ptr;
} else {
@@ -682,18 +692,299 @@ char *strEncoding(int encoding) {
}
}
-/* Given an object returns the min number of milliseconds the object was never
- * requested, using an approximated LRU algorithm. */
-unsigned long long estimateObjectIdleTime(robj *o) {
- unsigned long long lruclock = LRU_CLOCK();
- if (lruclock >= o->lru) {
- return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
+/* =========================== Memory introspection ========================== */
+
+/* Returns the size in bytes consumed by the key's value in RAM.
+ * Note that the returned value is just an approximation, especially in the
+ * case of aggregated data types where only "sample_size" elements
+ * are checked and averaged to estimate the total size. */
+#define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size. */
+size_t objectComputeSize(robj *o, size_t sample_size) {
+ sds ele, ele2;
+ dict *d;
+ dictIterator *di;
+ struct dictEntry *de;
+ size_t asize = 0, elesize = 0, samples = 0;
+
+ if (o->type == OBJ_STRING) {
+ if(o->encoding == OBJ_ENCODING_INT) {
+ asize = sizeof(*o);
+ } else if(o->encoding == OBJ_ENCODING_RAW) {
+ asize = sdsAllocSize(o->ptr)+sizeof(*o);
+ } else if(o->encoding == OBJ_ENCODING_EMBSTR) {
+ asize = sdslen(o->ptr)+2+sizeof(*o);
+ } else {
+ serverPanic("Unknown string encoding");
+ }
+ } else if (o->type == OBJ_LIST) {
+ if (o->encoding == OBJ_ENCODING_QUICKLIST) {
+ quicklist *ql = o->ptr;
+ quicklistNode *node = ql->head;
+ asize = sizeof(*o)+sizeof(quicklist);
+ do {
+ elesize += sizeof(quicklistNode)+ziplistBlobLen(node->zl);
+ samples++;
+ } while ((node = node->next) && samples < sample_size);
+ asize += (double)elesize/samples*listTypeLength(o);
+ } else if (o->encoding == OBJ_ENCODING_ZIPLIST) {
+ asize = sizeof(*o)+ziplistBlobLen(o->ptr);
+ } else {
+ serverPanic("Unknown list encoding");
+ }
+ } else if (o->type == OBJ_SET) {
+ if (o->encoding == OBJ_ENCODING_HT) {
+ d = o->ptr;
+ di = dictGetIterator(d);
+ asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+ while((de = dictNext(di)) != NULL && samples < sample_size) {
+ ele = dictGetKey(de);
+ elesize += sizeof(struct dictEntry) + sdsAllocSize(ele);
+ samples++;
+ }
+ dictReleaseIterator(di);
+ if (samples) asize += (double)elesize/samples*dictSize(d);
+ } else if (o->encoding == OBJ_ENCODING_INTSET) {
+ intset *is = o->ptr;
+ asize = sizeof(*o)+sizeof(*is)+is->encoding*is->length;
+ } else {
+ serverPanic("Unknown set encoding");
+ }
+ } else if (o->type == OBJ_ZSET) {
+ if (o->encoding == OBJ_ENCODING_ZIPLIST) {
+ asize = sizeof(*o)+(ziplistBlobLen(o->ptr));
+ } else if (o->encoding == OBJ_ENCODING_SKIPLIST) {
+ d = ((zset*)o->ptr)->dict;
+ zskiplist *zsl = ((zset*)o->ptr)->zsl;
+ zskiplistNode *znode = zsl->header->level[0].forward;
+ asize = sizeof(*o)+sizeof(zset)+(sizeof(struct dictEntry*)*dictSlots(d));
+ while(znode != NULL && samples < sample_size) {
+ elesize += sdsAllocSize(znode->ele);
+ elesize += sizeof(struct dictEntry) + zmalloc_size(znode);
+ samples++;
+ znode = znode->level[0].forward;
+ }
+ if (samples) asize += (double)elesize/samples*dictSize(d);
+ } else {
+ serverPanic("Unknown sorted set encoding");
+ }
+ } else if (o->type == OBJ_HASH) {
+ if (o->encoding == OBJ_ENCODING_ZIPLIST) {
+ asize = sizeof(*o)+(ziplistBlobLen(o->ptr));
+ } else if (o->encoding == OBJ_ENCODING_HT) {
+ d = o->ptr;
+ di = dictGetIterator(d);
+ asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+ while((de = dictNext(di)) != NULL && samples < sample_size) {
+ ele = dictGetKey(de);
+ ele2 = dictGetVal(de);
+ elesize += sdsAllocSize(ele) + sdsAllocSize(ele2);
+ elesize += sizeof(struct dictEntry);
+ samples++;
+ }
+ dictReleaseIterator(di);
+ if (samples) asize += (double)elesize/samples*dictSize(d);
+ } else {
+ serverPanic("Unknown hash encoding");
+ }
} else {
- return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
- LRU_CLOCK_RESOLUTION;
+ serverPanic("Unknown object type");
+ }
+ return asize;
+}
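/*
 * objectComputeSize() above estimates aggregate sizes by sampling: it sums
 * the size of the first "sample_size" elements it encounters, averages them,
 * and multiplies by the total element count. A minimal standalone sketch of
 * the same estimator over a plain array (hypothetical helper, not Redis API):
 */
#include <stddef.h>

size_t estimate_total_size(const size_t *elem_sizes, size_t nelems,
                           size_t sample_size, size_t fixed_overhead)
{
    size_t sampled = 0, samples = 0;

    for (size_t i = 0; i < nelems && samples < sample_size; i++) {
        sampled += elem_sizes[i];
        samples++;
    }
    if (samples == 0) return fixed_overhead;
    /* Average the sampled elements and extrapolate to all elements. */
    return fixed_overhead + (size_t)((double)sampled / samples * nelems);
}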
+
+/* Release data obtained with getMemoryOverheadData(). */
+void freeMemoryOverheadData(struct redisMemOverhead *mh) {
+ zfree(mh->db);
+ zfree(mh);
+}
+
+/* Return a struct redisMemOverhead filled with memory overhead
+ * information used for the MEMORY STATS and INFO commands. The returned
+ * structure pointer should be freed calling freeMemoryOverheadData(). */
+struct redisMemOverhead *getMemoryOverheadData(void) {
+ int j;
+ size_t mem_total = 0;
+ size_t mem = 0;
+ size_t zmalloc_used = zmalloc_used_memory();
+ struct redisMemOverhead *mh = zcalloc(sizeof(*mh));
+
+ mh->total_allocated = zmalloc_used;
+ mh->startup_allocated = server.initial_memory_usage;
+ mh->peak_allocated = server.stat_peak_memory;
+ mh->fragmentation =
+ zmalloc_get_fragmentation_ratio(server.resident_set_size);
+ mem_total += server.initial_memory_usage;
+
+ mem = 0;
+ if (server.repl_backlog)
+ mem += zmalloc_size(server.repl_backlog);
+ mh->repl_backlog = mem;
+ mem_total += mem;
+
+ mem = 0;
+ if (listLength(server.slaves)) {
+ listIter li;
+ listNode *ln;
+
+ listRewind(server.slaves,&li);
+ while((ln = listNext(&li))) {
+ client *c = listNodeValue(ln);
+ mem += getClientOutputBufferMemoryUsage(c);
+ mem += sdsAllocSize(c->querybuf);
+ mem += sizeof(client);
+ }
+ }
+ mh->clients_slaves = mem;
+ mem_total+=mem;
+
+ mem = 0;
+ if (listLength(server.clients)) {
+ listIter li;
+ listNode *ln;
+
+ listRewind(server.clients,&li);
+ while((ln = listNext(&li))) {
+ client *c = listNodeValue(ln);
+ if (c->flags & CLIENT_SLAVE)
+ continue;
+ mem += getClientOutputBufferMemoryUsage(c);
+ mem += sdsAllocSize(c->querybuf);
+ mem += sizeof(client);
+ }
+ }
+ mh->clients_normal = mem;
+ mem_total+=mem;
+
+ mem = 0;
+ if (server.aof_state != AOF_OFF) {
+ mem += sdslen(server.aof_buf);
+ mem += aofRewriteBufferSize();
+ }
+ mh->aof_buffer = mem;
+ mem_total+=mem;
+
+ for (j = 0; j < server.dbnum; j++) {
+ redisDb *db = server.db+j;
+ long long keyscount = dictSize(db->dict);
+ if (keyscount==0) continue;
+
+ mh->total_keys += keyscount;
+ mh->db = zrealloc(mh->db,sizeof(mh->db[0])*(mh->num_dbs+1));
+ mh->db[mh->num_dbs].dbid = j;
+
+ mem = dictSize(db->dict) * sizeof(dictEntry) +
+ dictSlots(db->dict) * sizeof(dictEntry*) +
+ dictSize(db->dict) * sizeof(robj);
+ mh->db[mh->num_dbs].overhead_ht_main = mem;
+ mem_total+=mem;
+
+ mem = dictSize(db->expires) * sizeof(dictEntry) +
+ dictSlots(db->expires) * sizeof(dictEntry*);
+ mh->db[mh->num_dbs].overhead_ht_expires = mem;
+ mem_total+=mem;
+
+ mh->num_dbs++;
}
+
+ mh->overhead_total = mem_total;
+ mh->dataset = zmalloc_used - mem_total;
+ mh->peak_perc = (float)zmalloc_used*100/mh->peak_allocated;
+
+ /* Metrics computed after subtracting the startup memory from
+ * the total memory. */
+ size_t net_usage = 1;
+ if (zmalloc_used > mh->startup_allocated)
+ net_usage = zmalloc_used - mh->startup_allocated;
+ mh->dataset_perc = (float)mh->dataset*100/net_usage;
+ mh->bytes_per_key = mh->total_keys ? (net_usage / mh->total_keys) : 0;
+
+ return mh;
}
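/*
 * Worked example of the final computations above (illustrative numbers):
 * with zmalloc_used = 100 MB, startup_allocated = 4 MB and an accumulated
 * overhead_total of 20 MB, the function reports dataset = 80 MB,
 * net_usage = 96 MB, dataset_perc = 80/96 * 100 ~= 83.3%, and, with one
 * million keys, bytes_per_key = 96 MB / 1e6 ~= 100 bytes per key.
 */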
+/* Helper for "MEMORY allocator-stats", used as a callback for the jemalloc
+ * stats output. */
+void inputCatSds(void *result, const char *str) {
+ /* result is actually a (sds *), so re-cast it here */
+ sds *info = (sds *)result;
+ *info = sdscat(*info, str);
+}
+
+/* This implements MEMORY DOCTOR. A human readable analysis of the Redis
+ * memory condition. */
+sds getMemoryDoctorReport(void) {
+ int empty = 0; /* Instance is empty or almost empty. */
+ int big_peak = 0; /* Memory peak is much larger than used mem. */
+ int high_frag = 0; /* High fragmentation. */
+ int big_slave_buf = 0; /* Slave buffers are too big. */
+ int big_client_buf = 0; /* Client buffers are too big. */
+ int num_reports = 0;
+ struct redisMemOverhead *mh = getMemoryOverheadData();
+
+ if (mh->total_allocated < (1024*1024*5)) {
+ empty = 1;
+ num_reports++;
+ } else {
+ /* Peak is > 150% of current used memory? */
+ if (((float)mh->peak_allocated / mh->total_allocated) > 1.5) {
+ big_peak = 1;
+ num_reports++;
+ }
+
+ /* Fragmentation is higher than 1.4? */
+ if (mh->fragmentation > 1.4) {
+ high_frag = 1;
+ num_reports++;
+ }
+
+        /* Clients using more than 200k each on average? Guard against a
+         * zero divisor in the corner case of no connected clients/slaves. */
+        long numslaves = listLength(server.slaves);
+        long numclients = listLength(server.clients)-numslaves;
+        if (numclients > 0 && mh->clients_normal / numclients > (1024*200)) {
+            big_client_buf = 1;
+            num_reports++;
+        }
+
+        /* Slaves using more than 10 MB each? */
+        if (numslaves > 0 && mh->clients_slaves / numslaves > (1024*1024*10)) {
+            big_slave_buf = 1;
+            num_reports++;
+        }
+ }
+
+ sds s;
+ if (num_reports == 0) {
+ s = sdsnew(
+ "Hi Sam, I can't find any memory issue in your instance. "
+ "I can only account for what occurs on this base.\n");
+ } else if (empty == 1) {
+ s = sdsnew(
+ "Hi Sam, this instance is empty or is using very little memory, "
+ "my issues detector can't be used in these conditions. "
+ "Please, leave for your mission on Earth and fill it with some data. "
+ "The new Sam and I will be back to our programming as soon as I "
+ "finished rebooting.\n");
+ } else {
+ s = sdsnew("Sam, I detected a few issues in this Redis instance memory implants:\n\n");
+ if (big_peak) {
+ s = sdscat(s," * Peak memory: In the past this instance used more than 150% the memory that is currently using. The allocator is normally not able to release memory after a peak, so you can expect to see a big fragmentation ratio, however this is actually harmless and is only due to the memory peak, and if the Redis instance Resident Set Size (RSS) is currently bigger than expected, the memory will be used as soon as you fill the Redis instance with more data. If the memory peak was only occasional and you want to try to reclaim memory, please try the MEMORY PURGE command, otherwise the only other option is to shutdown and restart the instance.\n\n");
+ }
+ if (high_frag) {
+ s = sdscatprintf(s," * High fragmentation: This instance has a memory fragmentation greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
+ }
+ if (big_slave_buf) {
+ s = sdscat(s," * Big slave buffers: The slave output buffers in this instance are greater than 10MB for each slave (on average). This likely means that there is some slave instance that is struggling receiving data, either because it is too slow or because of networking issues. As a result, data piles on the master output buffers. Please try to identify what slave is not receiving data correctly and why. You can use the INFO output in order to check the slaves delays and the CLIENT LIST command to check the output buffers of each slave.\n\n");
+ }
+ if (big_client_buf) {
+ s = sdscat(s," * Big client buffers: The clients output buffers in this instance are greater than 200K per client (on average). This may result from different causes, like Pub/Sub clients subscribed to channels bot not receiving data fast enough, so that data piles on the Redis instance output buffer, or clients sending commands with large replies or very large sequences of commands in the same pipeline. Please use the CLIENT LIST command in order to investigate the issue if it causes problems in your instance, or to understand better why certain clients are using a big amount of memory.\n\n");
+ }
+ s = sdscat(s,"I'm here to keep you safe, Sam. I want to help you.\n");
+ }
+ freeMemoryOverheadData(mh);
+ return s;
+}
+
+/* ======================= The OBJECT and MEMORY commands =================== */
+
/* This is a helper function for the OBJECT command. We need to lookup keys
* without any modification of LRU or other parameters. */
robj *objectCommandLookup(client *c, robj *key) {
@@ -726,9 +1017,156 @@ void objectCommand(client *c) {
} else if (!strcasecmp(c->argv[1]->ptr,"idletime") && c->argc == 3) {
if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nullbulk))
== NULL) return;
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
+ addReplyError(c,"An LFU maxmemory policy is selected, idle time not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust.");
+ return;
+ }
addReplyLongLong(c,estimateObjectIdleTime(o)/1000);
+ } else if (!strcasecmp(c->argv[1]->ptr,"freq") && c->argc == 3) {
+ if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nullbulk))
+ == NULL) return;
+ if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
+ addReplyError(c,"An LRU maxmemory policy is selected, access frequency not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust.");
+ return;
+ }
+ addReplyLongLong(c,o->lru&255);
} else {
- addReplyError(c,"Syntax error. Try OBJECT (refcount|encoding|idletime)");
+ addReplyError(c,"Syntax error. Try OBJECT (refcount|encoding|idletime|freq)");
}
}
+/* The memory command will eventually be a complete interface for the
+ * memory introspection capabilities of Redis.
+ *
+ * Usage: MEMORY usage <key> */
+void memoryCommand(client *c) {
+ robj *o;
+
+ if (!strcasecmp(c->argv[1]->ptr,"usage") && c->argc >= 3) {
+ long long samples = OBJ_COMPUTE_SIZE_DEF_SAMPLES;
+ for (int j = 3; j < c->argc; j++) {
+ if (!strcasecmp(c->argv[j]->ptr,"samples") &&
+ j+1 < c->argc)
+ {
+ if (getLongLongFromObjectOrReply(c,c->argv[j+1],&samples,NULL)
+ == C_ERR) return;
+ if (samples < 0) {
+ addReply(c,shared.syntaxerr);
+ return;
+ }
+            if (samples == 0) samples = LLONG_MAX;
+ j++; /* skip option argument. */
+ } else {
+ addReply(c,shared.syntaxerr);
+ return;
+ }
+ }
+ if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nullbulk))
+ == NULL) return;
+ size_t usage = objectComputeSize(o,samples);
+        usage += sdsAllocSize(c->argv[2]->ptr); /* Size of the key name itself. */
+ usage += sizeof(dictEntry);
+ addReplyLongLong(c,usage);
+ } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) {
+ struct redisMemOverhead *mh = getMemoryOverheadData();
+
+ addReplyMultiBulkLen(c,(14+mh->num_dbs)*2);
+
+ addReplyBulkCString(c,"peak.allocated");
+ addReplyLongLong(c,mh->peak_allocated);
+
+ addReplyBulkCString(c,"total.allocated");
+ addReplyLongLong(c,mh->total_allocated);
+
+ addReplyBulkCString(c,"startup.allocated");
+ addReplyLongLong(c,mh->startup_allocated);
+
+ addReplyBulkCString(c,"replication.backlog");
+ addReplyLongLong(c,mh->repl_backlog);
+
+ addReplyBulkCString(c,"clients.slaves");
+ addReplyLongLong(c,mh->clients_slaves);
+
+ addReplyBulkCString(c,"clients.normal");
+ addReplyLongLong(c,mh->clients_normal);
+
+ addReplyBulkCString(c,"aof.buffer");
+ addReplyLongLong(c,mh->aof_buffer);
+
+ for (size_t j = 0; j < mh->num_dbs; j++) {
+ char dbname[32];
+ snprintf(dbname,sizeof(dbname),"db.%zd",mh->db[j].dbid);
+ addReplyBulkCString(c,dbname);
+ addReplyMultiBulkLen(c,4);
+
+ addReplyBulkCString(c,"overhead.hashtable.main");
+ addReplyLongLong(c,mh->db[j].overhead_ht_main);
+
+ addReplyBulkCString(c,"overhead.hashtable.expires");
+ addReplyLongLong(c,mh->db[j].overhead_ht_expires);
+ }
+
+ addReplyBulkCString(c,"overhead.total");
+ addReplyLongLong(c,mh->overhead_total);
+
+ addReplyBulkCString(c,"keys.count");
+ addReplyLongLong(c,mh->total_keys);
+
+ addReplyBulkCString(c,"keys.bytes-per-key");
+ addReplyLongLong(c,mh->bytes_per_key);
+
+ addReplyBulkCString(c,"dataset.bytes");
+ addReplyLongLong(c,mh->dataset);
+
+ addReplyBulkCString(c,"dataset.percentage");
+ addReplyDouble(c,mh->dataset_perc);
+
+ addReplyBulkCString(c,"peak.percentage");
+ addReplyDouble(c,mh->peak_perc);
+
+ addReplyBulkCString(c,"fragmentation");
+ addReplyDouble(c,mh->fragmentation);
+
+ freeMemoryOverheadData(mh);
+ } else if (!strcasecmp(c->argv[1]->ptr,"malloc-stats") && c->argc == 2) {
+#if defined(USE_JEMALLOC)
+ sds info = sdsempty();
+ je_malloc_stats_print(inputCatSds, &info, NULL);
+ addReplyBulkSds(c, info);
+#else
+ addReplyBulkCString(c,"Stats not supported for the current allocator");
+#endif
+ } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
+ sds report = getMemoryDoctorReport();
+ addReplyBulkSds(c,report);
+ } else if (!strcasecmp(c->argv[1]->ptr,"purge") && c->argc == 2) {
+#if defined(USE_JEMALLOC)
+ char tmp[32];
+ unsigned narenas = 0;
+ size_t sz = sizeof(unsigned);
+ if (!je_mallctl("arenas.narenas", &narenas, &sz, NULL, 0)) {
+ sprintf(tmp, "arena.%d.purge", narenas);
+ if (!je_mallctl(tmp, NULL, 0, NULL, 0)) {
+ addReply(c, shared.ok);
+ return;
+ }
+ }
+ addReplyError(c, "Error purging dirty pages");
+#else
+ addReply(c, shared.ok);
+ /* Nothing to do for other allocators. */
+#endif
+ } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
+ addReplyMultiBulkLen(c,4);
+ addReplyBulkCString(c,
+"MEMORY USAGE <key> [SAMPLES <count>] - Estimate memory usage of key");
+ addReplyBulkCString(c,
+"MEMORY STATS - Show memory usage details");
+ addReplyBulkCString(c,
+"MEMORY PURGE - Ask the allocator to release memory");
+ addReplyBulkCString(c,
+"MEMORY MALLOC-STATS - Show allocator internal stats");
+ } else {
+ addReplyError(c,"Syntax error. Try MEMORY HELP");
+ }
+}
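/*
 * Illustrative redis-cli session for the subcommands implemented above
 * (the key name and reply values are made up; only the reply shapes and the
 * DOCTOR text come from the code):
 *
 *   127.0.0.1:6379> MEMORY USAGE somekey SAMPLES 0
 *   (integer) 73
 *   127.0.0.1:6379> MEMORY DOCTOR
 *   "Hi Sam, I can't find any memory issue in your instance. ..."
 *   127.0.0.1:6379> MEMORY PURGE
 *   OK
 */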
diff --git a/src/quicklist.c b/src/quicklist.c
index be02e3276..c8b72743c 100644
--- a/src/quicklist.c
+++ b/src/quicklist.c
@@ -149,7 +149,7 @@ REDIS_STATIC quicklistNode *quicklistCreateNode(void) {
}
/* Return cached quicklist count */
-unsigned int quicklistCount(quicklist *ql) { return ql->count; }
+unsigned int quicklistCount(const quicklist *ql) { return ql->count; }
/* Free entire quicklist. */
void quicklistRelease(quicklist *quicklist) {
@@ -671,6 +671,7 @@ int quicklistReplaceAtIndex(quicklist *quicklist, long index, void *data,
/* quicklistIndex provides an uncompressed node */
entry.node->zl = ziplistDelete(entry.node->zl, &entry.zi);
entry.node->zl = ziplistInsert(entry.node->zl, entry.zi, data, sz);
+ quicklistNodeUpdateSz(entry.node);
quicklistCompress(quicklist, entry.node);
return 1;
} else {
@@ -1191,12 +1192,12 @@ quicklist *quicklistDup(quicklist *orig) {
current = current->next) {
quicklistNode *node = quicklistCreateNode();
- if (node->encoding == QUICKLIST_NODE_ENCODING_LZF) {
- quicklistLZF *lzf = (quicklistLZF *)node->zl;
+ if (current->encoding == QUICKLIST_NODE_ENCODING_LZF) {
+ quicklistLZF *lzf = (quicklistLZF *)current->zl;
size_t lzf_sz = sizeof(*lzf) + lzf->sz;
node->zl = zmalloc(lzf_sz);
memcpy(node->zl, current->zl, lzf_sz);
- } else if (node->encoding == QUICKLIST_NODE_ENCODING_RAW) {
+ } else if (current->encoding == QUICKLIST_NODE_ENCODING_RAW) {
node->zl = zmalloc(current->sz);
memcpy(node->zl, current->zl, current->sz);
}
diff --git a/src/quicklist.h b/src/quicklist.h
index e040368e5..8f3875900 100644
--- a/src/quicklist.h
+++ b/src/quicklist.h
@@ -154,7 +154,7 @@ int quicklistPopCustom(quicklist *quicklist, int where, unsigned char **data,
void *(*saver)(unsigned char *data, unsigned int sz));
int quicklistPop(quicklist *quicklist, int where, unsigned char **data,
unsigned int *sz, long long *slong);
-unsigned int quicklistCount(quicklist *ql);
+unsigned int quicklistCount(const quicklist *ql);
int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len);
size_t quicklistGetLzf(const quicklistNode *node, void **data);
diff --git a/src/rdb.c b/src/rdb.c
index 57b759278..2689b172d 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -41,18 +41,30 @@
#include <sys/stat.h>
#include <sys/param.h>
-#define RDB_LOAD_NONE 0
-#define RDB_LOAD_ENC (1<<0)
-#define RDB_LOAD_PLAIN (1<<1)
-#define RDB_LOAD_SDS (1<<2)
-
-#define rdbExitReportCorruptRDB(reason) rdbCheckThenExit(reason, __LINE__);
-
-void rdbCheckThenExit(char *reason, int where) {
- serverLog(LL_WARNING, "Corrupt RDB detected at rdb.c:%d (%s). "
- "Running 'redis-check-rdb %s'",
- where, reason, server.rdb_filename);
- redis_check_rdb(server.rdb_filename);
+#define rdbExitReportCorruptRDB(...) rdbCheckThenExit(__LINE__,__VA_ARGS__)
+
+extern int rdbCheckMode;
+void rdbCheckError(const char *fmt, ...);
+void rdbCheckSetError(const char *fmt, ...);
+
+void rdbCheckThenExit(int linenum, char *reason, ...) {
+ va_list ap;
+ char msg[1024];
+ int len;
+
+ len = snprintf(msg,sizeof(msg),
+ "Internal error in RDB reading function at rdb.c:%d -> ", linenum);
+ va_start(ap,reason);
+ vsnprintf(msg+len,sizeof(msg)-len,reason,ap);
+ va_end(ap);
+
+ if (!rdbCheckMode) {
+ serverLog(LL_WARNING, "%s", msg);
+ char *argv[2] = {"",server.rdb_filename};
+ redis_check_rdb_main(2,argv);
+ } else {
+ rdbCheckError("%s",msg);
+ }
exit(1);
}
@@ -95,7 +107,7 @@ long long rdbLoadMillisecondTime(rio *rdb) {
/* Saves an encoded length. The first two bits in the first byte are used to
* hold the encoding type. See the RDB_* definitions for more information
* on the types of encoding. */
-int rdbSaveLen(rio *rdb, uint32_t len) {
+int rdbSaveLen(rio *rdb, uint64_t len) {
unsigned char buf[2];
size_t nwritten;
@@ -110,44 +122,79 @@ int rdbSaveLen(rio *rdb, uint32_t len) {
buf[1] = len&0xFF;
if (rdbWriteRaw(rdb,buf,2) == -1) return -1;
nwritten = 2;
- } else {
+ } else if (len <= UINT32_MAX) {
/* Save a 32 bit len */
- buf[0] = (RDB_32BITLEN<<6);
+ buf[0] = RDB_32BITLEN;
if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
- len = htonl(len);
- if (rdbWriteRaw(rdb,&len,4) == -1) return -1;
+ uint32_t len32 = htonl(len);
+ if (rdbWriteRaw(rdb,&len32,4) == -1) return -1;
nwritten = 1+4;
+ } else {
+ /* Save a 64 bit len */
+ buf[0] = RDB_64BITLEN;
+ if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
+ len = htonu64(len);
+ if (rdbWriteRaw(rdb,&len,8) == -1) return -1;
+ nwritten = 1+8;
}
return nwritten;
}
-/* Load an encoded length. The "isencoded" argument is set to 1 if the length
- * is not actually a length but an "encoding type". See the RDB_ENC_*
- * definitions in rdb.h for more information. */
-uint32_t rdbLoadLen(rio *rdb, int *isencoded) {
+
+/* Load an encoded length. If the loaded length is a normal length as stored
+ * with rdbSaveLen(), the read length is set to '*lenptr'. If instead the
+ * loaded length describes a special encoding that follows, then '*isencoded'
+ * is set to 1 and the encoding format is stored at '*lenptr'.
+ *
+ * See the RDB_ENC_* definitions in rdb.h for more information on special
+ * encodings.
+ *
+ * The function returns -1 on error, 0 on success. */
+int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr) {
unsigned char buf[2];
- uint32_t len;
int type;
if (isencoded) *isencoded = 0;
- if (rioRead(rdb,buf,1) == 0) return RDB_LENERR;
+ if (rioRead(rdb,buf,1) == 0) return -1;
type = (buf[0]&0xC0)>>6;
if (type == RDB_ENCVAL) {
/* Read a 6 bit encoding type. */
if (isencoded) *isencoded = 1;
- return buf[0]&0x3F;
+ *lenptr = buf[0]&0x3F;
} else if (type == RDB_6BITLEN) {
/* Read a 6 bit len. */
- return buf[0]&0x3F;
+ *lenptr = buf[0]&0x3F;
} else if (type == RDB_14BITLEN) {
/* Read a 14 bit len. */
- if (rioRead(rdb,buf+1,1) == 0) return RDB_LENERR;
- return ((buf[0]&0x3F)<<8)|buf[1];
- } else {
+ if (rioRead(rdb,buf+1,1) == 0) return -1;
+ *lenptr = ((buf[0]&0x3F)<<8)|buf[1];
+ } else if (buf[0] == RDB_32BITLEN) {
/* Read a 32 bit len. */
- if (rioRead(rdb,&len,4) == 0) return RDB_LENERR;
- return ntohl(len);
+ uint32_t len;
+ if (rioRead(rdb,&len,4) == 0) return -1;
+ *lenptr = ntohl(len);
+ } else if (buf[0] == RDB_64BITLEN) {
+ /* Read a 64 bit len. */
+ uint64_t len;
+ if (rioRead(rdb,&len,8) == 0) return -1;
+ *lenptr = ntohu64(len);
+ } else {
+ rdbExitReportCorruptRDB(
+ "Unknown length encoding %d in rdbLoadLen()",type);
+ return -1; /* Never reached. */
}
+ return 0;
+}
+
+/* This is like rdbLoadLenByRef() but directly returns the value read
+ * from the RDB stream, signaling an error by returning RDB_LENERR
+ * (since it is a count too large to be applicable to any Redis data
+ * structure). */
+uint64_t rdbLoadLen(rio *rdb, int *isencoded) {
+ uint64_t len;
+
+ if (rdbLoadLenByRef(rdb,isencoded,&len) == -1) return RDB_LENERR;
+ return len;
}
/* Encodes the "value" argument as integer when it fits in the supported ranges
@@ -179,7 +226,7 @@ int rdbEncodeInteger(long long value, unsigned char *enc) {
/* Loads an integer-encoded object with the specified encoding type "enctype".
* The returned value changes according to the flags, see
 * rdbGenericLoadStringObject() for more info. */
-void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags) {
+void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) {
int plain = flags & RDB_LOAD_PLAIN;
int sds = flags & RDB_LOAD_SDS;
int encode = flags & RDB_LOAD_ENC;
@@ -201,11 +248,12 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags) {
val = (int32_t)v;
} else {
val = 0; /* anti-warning */
- rdbExitReportCorruptRDB("Unknown RDB integer encoding type");
+ rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype);
}
if (plain || sds) {
char buf[LONG_STR_SIZE], *p;
int len = ll2string(buf,sizeof(buf),val);
+ if (lenptr) *lenptr = len;
p = plain ? zmalloc(len) : sdsnewlen(NULL,len);
memcpy(p,buf,len);
return p;
@@ -281,10 +329,10 @@ ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) {
/* Load an LZF compressed string in RDB format. The returned value
* changes according to 'flags'. For more info check the
* rdbGenericLoadStringObject() function. */
-void *rdbLoadLzfStringObject(rio *rdb, int flags) {
+void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) {
int plain = flags & RDB_LOAD_PLAIN;
int sds = flags & RDB_LOAD_SDS;
- unsigned int len, clen;
+ uint64_t len, clen;
unsigned char *c = NULL;
char *val = NULL;
@@ -295,13 +343,17 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags) {
/* Allocate our target according to the uncompressed size. */
if (plain) {
val = zmalloc(len);
+ if (lenptr) *lenptr = len;
} else {
val = sdsnewlen(NULL,len);
}
/* Load the compressed representation and uncompress it to target. */
if (rioRead(rdb,c,clen) == 0) goto err;
- if (lzf_decompress(c,clen,val,len) == 0) goto err;
+ if (lzf_decompress(c,clen,val,len) == 0) {
+ if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string");
+ goto err;
+ }
zfree(c);
if (plain || sds) {
@@ -393,13 +445,15 @@ int rdbSaveStringObject(rio *rdb, robj *obj) {
* RDB_LOAD_PLAIN: Return a plain string allocated with zmalloc()
* instead of a Redis object with an sds in it.
* RDB_LOAD_SDS: Return an SDS string instead of a Redis object.
-*/
-void *rdbGenericLoadStringObject(rio *rdb, int flags) {
+ *
+ * On I/O error NULL is returned.
+ */
+void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) {
int encode = flags & RDB_LOAD_ENC;
int plain = flags & RDB_LOAD_PLAIN;
int sds = flags & RDB_LOAD_SDS;
int isencoded;
- uint32_t len;
+ uint64_t len;
len = rdbLoadLen(rdb,&isencoded);
if (isencoded) {
@@ -407,17 +461,18 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags) {
case RDB_ENC_INT8:
case RDB_ENC_INT16:
case RDB_ENC_INT32:
- return rdbLoadIntegerObject(rdb,len,flags);
+ return rdbLoadIntegerObject(rdb,len,flags,lenptr);
case RDB_ENC_LZF:
- return rdbLoadLzfStringObject(rdb,flags);
+ return rdbLoadLzfStringObject(rdb,flags,lenptr);
default:
- rdbExitReportCorruptRDB("Unknown RDB encoding type");
+ rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len);
}
}
if (len == RDB_LENERR) return NULL;
if (plain || sds) {
void *buf = plain ? zmalloc(len) : sdsnewlen(NULL,len);
+ if (lenptr) *lenptr = len;
if (len && rioRead(rdb,buf,len) == 0) {
if (plain)
zfree(buf);
@@ -438,11 +493,11 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags) {
}
robj *rdbLoadStringObject(rio *rdb) {
- return rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE);
+ return rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE,NULL);
}
robj *rdbLoadEncodedStringObject(rio *rdb) {
- return rdbGenericLoadStringObject(rdb,RDB_LOAD_ENC);
+ return rdbGenericLoadStringObject(rdb,RDB_LOAD_ENC,NULL);
}
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
@@ -505,6 +560,37 @@ int rdbLoadDoubleValue(rio *rdb, double *val) {
}
}
+/* Saves a double for RDB 8 or greater, where IEEE 754 binary64 format is assumed.
+ * We just make sure the integer is always stored in little endian, otherwise
+ * the value is copied verbatim from memory to disk.
+ *
+ * Return -1 on error, the size of the serialized value on success. */
+int rdbSaveBinaryDoubleValue(rio *rdb, double val) {
+ memrev64ifbe(&val);
+ return rdbWriteRaw(rdb,&val,sizeof(val));
+}
+
+/* Loads a double from RDB 8 or greater. See rdbSaveBinaryDoubleValue() for
+ * more info. On error -1 is returned, otherwise 0. */
+int rdbLoadBinaryDoubleValue(rio *rdb, double *val) {
+ if (rioRead(rdb,val,sizeof(*val)) == 0) return -1;
+ memrev64ifbe(val);
+ return 0;
+}
+
+/* Like rdbSaveBinaryDoubleValue() but single precision. */
+int rdbSaveBinaryFloatValue(rio *rdb, float val) {
+ memrev32ifbe(&val);
+ return rdbWriteRaw(rdb,&val,sizeof(val));
+}
+
+/* Like rdbLoadBinaryDoubleValue() but single precision. */
+int rdbLoadBinaryFloatValue(rio *rdb, float *val) {
+ if (rioRead(rdb,val,sizeof(*val)) == 0) return -1;
+ memrev32ifbe(val);
+ return 0;
+}
+
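/*
 * Minimal sketch (no rio) of the fixed-byte-order serialization used by the
 * two pairs of functions above: the 8 bytes are always written in little
 * endian, swapping only on big endian hosts, so the on-disk format is host
 * independent. The __builtin_bswap64() call assumes GCC/Clang.
 */
#include <stdint.h>
#include <string.h>

static void put_double_le(unsigned char *buf, double val) {
    uint64_t bits;
    memcpy(&bits, &val, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    bits = __builtin_bswap64(bits);   /* same effect as memrev64ifbe() */
#endif
    memcpy(buf, &bits, 8);
}

static double get_double_le(const unsigned char *buf) {
    uint64_t bits;
    double val;
    memcpy(&bits, buf, 8);
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    bits = __builtin_bswap64(bits);
#endif
    memcpy(&val, &bits, 8);
    return val;
}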
/* Save the object type of object "o". */
int rdbSaveObjectType(rio *rdb, robj *o) {
switch (o->type) {
@@ -526,7 +612,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) {
if (o->encoding == OBJ_ENCODING_ZIPLIST)
return rdbSaveType(rdb,RDB_TYPE_ZSET_ZIPLIST);
else if (o->encoding == OBJ_ENCODING_SKIPLIST)
- return rdbSaveType(rdb,RDB_TYPE_ZSET);
+ return rdbSaveType(rdb,RDB_TYPE_ZSET_2);
else
serverPanic("Unknown sorted set encoding");
case OBJ_HASH:
@@ -536,6 +622,8 @@ int rdbSaveObjectType(rio *rdb, robj *o) {
return rdbSaveType(rdb,RDB_TYPE_HASH);
else
serverPanic("Unknown hash encoding");
+ case OBJ_MODULE:
+ return rdbSaveType(rdb,RDB_TYPE_MODULE);
default:
serverPanic("Unknown object type");
}
@@ -629,7 +717,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
if ((n = rdbSaveRawString(rdb,(unsigned char*)ele,sdslen(ele)))
== -1) return -1;
nwritten += n;
- if ((n = rdbSaveDoubleValue(rdb,*score)) == -1) return -1;
+ if ((n = rdbSaveBinaryDoubleValue(rdb,*score)) == -1) return -1;
nwritten += n;
}
dictReleaseIterator(di);
@@ -667,6 +755,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
serverPanic("Unknown hash encoding");
}
+ } else if (o->type == OBJ_MODULE) {
+ /* Save a module-specific value. */
+ RedisModuleIO io;
+ moduleValue *mv = o->ptr;
+ moduleType *mt = mv->type;
+ moduleInitIOContext(io,mt,rdb);
+
+ /* Write the "module" identifier as prefix, so that we'll be able
+ * to call the right module during loading. */
+ int retval = rdbSaveLen(rdb,mt->id);
+ if (retval == -1) return -1;
+ io.bytes += retval;
+
+ /* Then write the module-specific representation. */
+ mt->rdb_save(&io,mv->value);
+ if (io.ctx) {
+ moduleFreeContext(io.ctx);
+ zfree(io.ctx);
+ }
+ return io.error ? -1 : (ssize_t)io.bytes;
} else {
serverPanic("Unknown object type");
}
@@ -727,14 +835,28 @@ int rdbSaveAuxFieldStrInt(rio *rdb, char *key, long long val) {
}
/* Save a few default AUX fields with information about the RDB generated. */
-int rdbSaveInfoAuxFields(rio *rdb) {
+int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) {
int redis_bits = (sizeof(void*) == 8) ? 64 : 32;
+ int aof_preamble = (flags & RDB_SAVE_AOF_PREAMBLE) != 0;
/* Add a few fields about the state when the RDB was created. */
if (rdbSaveAuxFieldStrStr(rdb,"redis-ver",REDIS_VERSION) == -1) return -1;
if (rdbSaveAuxFieldStrInt(rdb,"redis-bits",redis_bits) == -1) return -1;
if (rdbSaveAuxFieldStrInt(rdb,"ctime",time(NULL)) == -1) return -1;
if (rdbSaveAuxFieldStrInt(rdb,"used-mem",zmalloc_used_memory()) == -1) return -1;
+
+ /* Handle saving options that generate aux fields. */
+ if (rsi) {
+ if (rsi->repl_stream_db &&
+ rdbSaveAuxFieldStrInt(rdb,"repl-stream-db",rsi->repl_stream_db)
+ == -1)
+ {
+ return -1;
+ }
+ }
+ if (rdbSaveAuxFieldStrInt(rdb,"aof-preamble",aof_preamble) == -1) return -1;
+ if (rdbSaveAuxFieldStrStr(rdb,"repl-id",server.replid) == -1) return -1;
+ if (rdbSaveAuxFieldStrInt(rdb,"repl-offset",server.master_repl_offset) == -1) return -1;
return 1;
}
@@ -746,19 +868,20 @@ int rdbSaveInfoAuxFields(rio *rdb) {
* When the function returns C_ERR and if 'error' is not NULL, the
* integer pointed by 'error' is set to the value of errno just after the I/O
* error. */
-int rdbSaveRio(rio *rdb, int *error) {
+int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
dictIterator *di = NULL;
dictEntry *de;
char magic[10];
int j;
long long now = mstime();
uint64_t cksum;
+ size_t processed = 0;
if (server.rdb_checksum)
rdb->update_cksum = rioGenericUpdateChecksum;
snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
- if (rdbSaveInfoAuxFields(rdb) == -1) goto werr;
+ if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr;
for (j = 0; j < server.dbnum; j++) {
redisDb *db = server.db+j;
@@ -795,6 +918,16 @@ int rdbSaveRio(rio *rdb, int *error) {
initStaticStringObject(key,keystr);
expire = getExpire(db,&key);
if (rdbSaveKeyValuePair(rdb,&key,o,expire,now) == -1) goto werr;
+
+ /* When this RDB is produced as part of an AOF rewrite, move
+ * accumulated diff from parent to child while rewriting in
+ * order to have a smaller final write. */
+ if (flags & RDB_SAVE_AOF_PREAMBLE &&
+ rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES)
+ {
+ processed = rdb->processed_bytes;
+ aofReadDiffFromParent();
+ }
}
dictReleaseIterator(di);
}
@@ -824,7 +957,7 @@ werr:
* While the suffix is the 40 bytes hex string we announced in the prefix.
* This way processes receiving the payload can understand when it ends
* without doing any processing of the content. */
-int rdbSaveRioWithEOFMark(rio *rdb, int *error) {
+int rdbSaveRioWithEOFMark(rio *rdb, int *error, rdbSaveInfo *rsi) {
char eofmark[RDB_EOF_MARK_SIZE];
getRandomHexChars(eofmark,RDB_EOF_MARK_SIZE);
@@ -832,7 +965,7 @@ int rdbSaveRioWithEOFMark(rio *rdb, int *error) {
if (rioWrite(rdb,"$EOF:",5) == 0) goto werr;
if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr;
if (rioWrite(rdb,"\r\n",2) == 0) goto werr;
- if (rdbSaveRio(rdb,error) == C_ERR) goto werr;
+ if (rdbSaveRio(rdb,error,RDB_SAVE_NONE,rsi) == C_ERR) goto werr;
if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr;
return C_OK;
@@ -843,7 +976,7 @@ werr: /* Write error. */
}
/* Save the DB on disk. Return C_ERR on error, C_OK on success. */
-int rdbSave(char *filename) {
+int rdbSave(char *filename, rdbSaveInfo *rsi) {
char tmpfile[256];
char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */
FILE *fp;
@@ -864,7 +997,7 @@ int rdbSave(char *filename) {
}
rioInitWithFile(&rdb,fp);
- if (rdbSaveRio(&rdb,&error) == C_ERR) {
+ if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) {
errno = error;
goto werr;
}
@@ -902,14 +1035,15 @@ werr:
return C_ERR;
}
-int rdbSaveBackground(char *filename) {
+int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
pid_t childpid;
long long start;
- if (server.rdb_child_pid != -1) return C_ERR;
+ if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
server.dirty_before_bgsave = server.dirty;
server.lastbgsave_try = time(NULL);
+ openChildInfoPipe();
start = ustime();
if ((childpid = fork()) == 0) {
@@ -918,15 +1052,18 @@ int rdbSaveBackground(char *filename) {
/* Child */
closeListeningSockets(0);
redisSetProcTitle("redis-rdb-bgsave");
- retval = rdbSave(filename);
+ retval = rdbSave(filename,rsi);
if (retval == C_OK) {
- size_t private_dirty = zmalloc_get_private_dirty();
+ size_t private_dirty = zmalloc_get_private_dirty(-1);
if (private_dirty) {
serverLog(LL_NOTICE,
"RDB: %zu MB of memory used by copy-on-write",
private_dirty/(1024*1024));
}
+
+ server.child_info_data.cow_size = private_dirty;
+ sendChildInfo(CHILD_INFO_TYPE_RDB);
}
exitFromChild((retval == C_OK) ? 0 : 1);
} else {
@@ -935,6 +1072,7 @@ int rdbSaveBackground(char *filename) {
server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
if (childpid == -1) {
+ closeChildInfoPipe();
server.lastbgsave_status = C_ERR;
serverLog(LL_WARNING,"Can't save in background: fork: %s",
strerror(errno));
@@ -961,7 +1099,7 @@ void rdbRemoveTempFile(pid_t childpid) {
* On success a newly allocated object is returned, otherwise NULL. */
robj *rdbLoadObject(int rdbtype, rio *rdb) {
robj *o = NULL, *ele, *dec;
- size_t len;
+ uint64_t len;
unsigned int i;
if (rdbtype == RDB_TYPE_STRING) {
@@ -1005,8 +1143,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
long long llval;
sds sdsele;
- if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
+ if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
if (o->encoding == OBJ_ENCODING_INTSET) {
/* Fetch integer value from element. */
@@ -1026,9 +1164,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
sdsfree(sdsele);
}
}
- } else if (rdbtype == RDB_TYPE_ZSET) {
+ } else if (rdbtype == RDB_TYPE_ZSET_2 || rdbtype == RDB_TYPE_ZSET) {
/* Read list/set value. */
- size_t zsetlen;
+ uint64_t zsetlen;
size_t maxelelen = 0;
zset *zs;
@@ -1042,9 +1180,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
double score;
zskiplistNode *znode;
- if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
- if (rdbLoadDoubleValue(rdb,&score) == -1) return NULL;
+ if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
+
+ if (rdbtype == RDB_TYPE_ZSET_2) {
+ if (rdbLoadBinaryDoubleValue(rdb,&score) == -1) return NULL;
+ } else {
+ if (rdbLoadDoubleValue(rdb,&score) == -1) return NULL;
+ }
/* Don't care about integer-encoded strings. */
if (sdslen(sdsele) > maxelelen) maxelelen = sdslen(sdsele);
@@ -1058,7 +1201,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
maxelelen <= server.zset_max_ziplist_value)
zsetConvert(o,OBJ_ENCODING_ZIPLIST);
} else if (rdbtype == RDB_TYPE_HASH) {
- size_t len;
+ uint64_t len;
int ret;
sds field, value;
@@ -1075,10 +1218,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
while (o->encoding == OBJ_ENCODING_ZIPLIST && len > 0) {
len--;
/* Load raw strings */
- if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
- if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
+ if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
+ if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
/* Add pair to ziplist */
o->ptr = ziplistPush(o->ptr, (unsigned char*)field,
@@ -1103,10 +1246,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
while (o->encoding == OBJ_ENCODING_HT && len > 0) {
len--;
/* Load encoded strings */
- if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
- if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS)) == NULL)
- return NULL;
+ if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
+ if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
+ == NULL) return NULL;
/* Add pair to hash table */
ret = dictAdd((dict*)o->ptr, field, value);
@@ -1124,7 +1267,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
server.list_compress_depth);
while (len--) {
- unsigned char *zl = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN);
+ unsigned char *zl =
+ rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
if (zl == NULL) return NULL;
quicklistAppendZiplist(o->ptr, zl);
}
@@ -1134,7 +1278,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
rdbtype == RDB_TYPE_ZSET_ZIPLIST ||
rdbtype == RDB_TYPE_HASH_ZIPLIST)
{
- unsigned char *encoded = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN);
+ unsigned char *encoded =
+ rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
if (encoded == NULL) return NULL;
o = createObject(OBJ_STRING,encoded); /* Obj type fixed below. */
@@ -1198,11 +1343,32 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
hashTypeConvert(o, OBJ_ENCODING_HT);
break;
default:
- rdbExitReportCorruptRDB("Unknown encoding");
+ rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
break;
}
+ } else if (rdbtype == RDB_TYPE_MODULE) {
+ uint64_t moduleid = rdbLoadLen(rdb,NULL);
+ moduleType *mt = moduleTypeLookupModuleByID(moduleid);
+ char name[10];
+
+ if (mt == NULL) {
+ moduleTypeNameByID(name,moduleid);
+ serverLog(LL_WARNING,"The RDB file contains module data I can't load: no matching module '%s'", name);
+ exit(1);
+ }
+ RedisModuleIO io;
+ moduleInitIOContext(io,mt,rdb);
+ /* Call the rdb_load method of the module providing the 10 bit
+ * encoding version in the lower 10 bits of the module ID. */
+ void *ptr = mt->rdb_load(&io,moduleid&1023);
+ if (ptr == NULL) {
+ moduleTypeNameByID(name,moduleid);
+ serverLog(LL_WARNING,"The RDB file contains module data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name);
+ exit(1);
+ }
+ o = createModuleObject(mt,ptr);
} else {
- rdbExitReportCorruptRDB("Unknown object type");
+ rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
}
return o;
}
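/*
 * Sketch of the module type ID split used when loading OBJ_MODULE values
 * above: per the comment in the hunk, the low 10 bits of the 64-bit module
 * ID carry the encoding version, while the remaining bits identify the type
 * (how the type name is packed into those bits is not shown in this diff).
 */
#include <stdint.h>

static inline uint64_t module_encver(uint64_t moduleid) {
    return moduleid & 1023;   /* same mask as mt->rdb_load(&io,moduleid&1023) */
}

static inline uint64_t module_name_bits(uint64_t moduleid) {
    return moduleid >> 10;    /* opaque type identifier portion */
}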
@@ -1254,67 +1420,61 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
}
}
-int rdbLoad(char *filename) {
- uint32_t dbid;
+/* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned,
+ * otherwise C_ERR is returned and 'errno' is set accordingly. */
+int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
+ uint64_t dbid;
int type, rdbver;
redisDb *db = server.db+0;
char buf[1024];
long long expiretime, now = mstime();
- FILE *fp;
- rio rdb;
- if ((fp = fopen(filename,"r")) == NULL) return C_ERR;
-
- rioInitWithFile(&rdb,fp);
- rdb.update_cksum = rdbLoadProgressCallback;
- rdb.max_processing_chunk = server.loading_process_events_interval_bytes;
- if (rioRead(&rdb,buf,9) == 0) goto eoferr;
+ rdb->update_cksum = rdbLoadProgressCallback;
+ rdb->max_processing_chunk = server.loading_process_events_interval_bytes;
+ if (rioRead(rdb,buf,9) == 0) goto eoferr;
buf[9] = '\0';
if (memcmp(buf,"REDIS",5) != 0) {
- fclose(fp);
serverLog(LL_WARNING,"Wrong signature trying to load DB from file");
errno = EINVAL;
return C_ERR;
}
rdbver = atoi(buf+5);
if (rdbver < 1 || rdbver > RDB_VERSION) {
- fclose(fp);
serverLog(LL_WARNING,"Can't handle RDB format version %d",rdbver);
errno = EINVAL;
return C_ERR;
}
- startLoading(fp);
while(1) {
robj *key, *val;
expiretime = -1;
/* Read type. */
- if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+ if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
/* Handle special types. */
if (type == RDB_OPCODE_EXPIRETIME) {
/* EXPIRETIME: load an expire associated with the next key
* to load. Note that after loading an expire we need to
* load the actual type, and continue. */
- if ((expiretime = rdbLoadTime(&rdb)) == -1) goto eoferr;
+ if ((expiretime = rdbLoadTime(rdb)) == -1) goto eoferr;
/* We read the time so we need to read the object type again. */
- if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+ if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
/* the EXPIRETIME opcode specifies time in seconds, so convert
* into milliseconds. */
expiretime *= 1000;
} else if (type == RDB_OPCODE_EXPIRETIME_MS) {
/* EXPIRETIME_MS: milliseconds precision expire times introduced
             * with RDB v3. Like EXPIRETIME but with more precision. */
- if ((expiretime = rdbLoadMillisecondTime(&rdb)) == -1) goto eoferr;
+ if ((expiretime = rdbLoadMillisecondTime(rdb)) == -1) goto eoferr;
/* We read the time so we need to read the object type again. */
- if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+ if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
} else if (type == RDB_OPCODE_EOF) {
/* EOF: End of file, exit the main loop. */
break;
} else if (type == RDB_OPCODE_SELECTDB) {
/* SELECTDB: Select the specified database. */
- if ((dbid = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
goto eoferr;
if (dbid >= (unsigned)server.dbnum) {
serverLog(LL_WARNING,
@@ -1328,10 +1488,10 @@ int rdbLoad(char *filename) {
} else if (type == RDB_OPCODE_RESIZEDB) {
/* RESIZEDB: Hint about the size of the keys in the currently
* selected data base, in order to avoid useless rehashing. */
- uint32_t db_size, expires_size;
- if ((db_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ uint64_t db_size, expires_size;
+ if ((db_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
goto eoferr;
- if ((expires_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
goto eoferr;
dictExpand(db->dict,db_size);
dictExpand(db->expires,expires_size);
@@ -1343,8 +1503,8 @@ int rdbLoad(char *filename) {
*
* An AUX field is composed of two strings: key and value. */
robj *auxkey, *auxval;
- if ((auxkey = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
- if ((auxval = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
+ if ((auxkey = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
+ if ((auxval = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
if (((char*)auxkey->ptr)[0] == '%') {
            /* All the fields with a name starting with '%' are considered
@@ -1353,6 +1513,15 @@ int rdbLoad(char *filename) {
serverLog(LL_NOTICE,"RDB '%s': %s",
(char*)auxkey->ptr,
(char*)auxval->ptr);
+ } else if (!strcasecmp(auxkey->ptr,"repl-stream-db")) {
+ if (rsi) rsi->repl_stream_db = atoi(auxval->ptr);
+ } else if (!strcasecmp(auxkey->ptr,"repl-id")) {
+ if (rsi && sdslen(auxval->ptr) == CONFIG_RUN_ID_SIZE) {
+ memcpy(rsi->repl_id,auxval->ptr,CONFIG_RUN_ID_SIZE+1);
+ rsi->repl_id_is_set = 1;
+ }
+ } else if (!strcasecmp(auxkey->ptr,"repl-offset")) {
+ if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10);
} else {
/* We ignore fields we don't understand, as by AUX field
* contract. */
@@ -1366,9 +1535,9 @@ int rdbLoad(char *filename) {
}
/* Read key */
- if ((key = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
+ if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
/* Read value */
- if ((val = rdbLoadObject(type,&rdb)) == NULL) goto eoferr;
+ if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr;
/* Check if the key already expired. This function is used when loading
* an RDB file from disk, either at startup, or when an RDB was
* received from the master. In the latter case, the master is
@@ -1383,15 +1552,15 @@ int rdbLoad(char *filename) {
dbAdd(db,key,val);
/* Set the expire time if needed */
- if (expiretime != -1) setExpire(db,key,expiretime);
+ if (expiretime != -1) setExpire(NULL,db,key,expiretime);
decrRefCount(key);
}
/* Verify the checksum if RDB version is >= 5 */
if (rdbver >= 5 && server.rdb_checksum) {
- uint64_t cksum, expected = rdb.cksum;
+ uint64_t cksum, expected = rdb->cksum;
- if (rioRead(&rdb,&cksum,8) == 0) goto eoferr;
+ if (rioRead(rdb,&cksum,8) == 0) goto eoferr;
memrev64ifbe(&cksum);
if (cksum == 0) {
serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed.");
@@ -1400,9 +1569,6 @@ int rdbLoad(char *filename) {
rdbExitReportCorruptRDB("RDB CRC error");
}
}
-
- fclose(fp);
- stopLoading();
return C_OK;
eoferr: /* unexpected end of file is handled here with a fatal exit */
@@ -1411,6 +1577,27 @@ eoferr: /* unexpected end of file is handled here with a fatal exit */
return C_ERR; /* Just to avoid warning */
}
+/* Like rdbLoadRio() but takes a filename instead of a rio stream. The
+ * filename is opened for reading and a rio stream object is created in order
+ * to do the actual loading. Moreover the ETA displayed in the INFO
+ * output is initialized and finalized.
+ *
+ * If you pass an 'rsi' structure initialized with RDB_SAVE_OPTION_INIT, the
+ * loading code will fill the information fields in the structure. */
+int rdbLoad(char *filename, rdbSaveInfo *rsi) {
+ FILE *fp;
+ rio rdb;
+ int retval;
+
+ if ((fp = fopen(filename,"r")) == NULL) return C_ERR;
+ startLoading(fp);
+ rioInitWithFile(&rdb,fp);
+ retval = rdbLoadRio(&rdb,rsi);
+ fclose(fp);
+ stopLoading();
+ return retval;
+}
+
/* A background saving child (BGSAVE) terminated its work. Handle this.
* This function covers the case of actual BGSAVEs. */
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
@@ -1558,7 +1745,7 @@ void backgroundSaveDoneHandler(int exitcode, int bysignal) {
/* Spawn an RDB child that writes the RDB to the sockets of the slaves
* that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */
-int rdbSaveToSlavesSockets(void) {
+int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
int *fds;
uint64_t *clientids;
int numfds;
@@ -1568,7 +1755,7 @@ int rdbSaveToSlavesSockets(void) {
long long start;
int pipefds[2];
- if (server.rdb_child_pid != -1) return C_ERR;
+ if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
/* Before to fork, create a pipe that will be used in order to
* send back to the parent the IDs of the slaves that successfully
@@ -1603,6 +1790,7 @@ int rdbSaveToSlavesSockets(void) {
}
/* Create the child process. */
+ openChildInfoPipe();
start = ustime();
if ((childpid = fork()) == 0) {
/* Child */
@@ -1615,12 +1803,12 @@ int rdbSaveToSlavesSockets(void) {
closeListeningSockets(0);
redisSetProcTitle("redis-rdb-to-slaves");
- retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL);
+ retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi);
if (retval == C_OK && rioFlush(&slave_sockets) == 0)
retval = C_ERR;
if (retval == C_OK) {
- size_t private_dirty = zmalloc_get_private_dirty();
+ size_t private_dirty = zmalloc_get_private_dirty(-1);
if (private_dirty) {
serverLog(LL_NOTICE,
@@ -1628,6 +1816,9 @@ int rdbSaveToSlavesSockets(void) {
private_dirty/(1024*1024));
}
+ server.child_info_data.cow_size = private_dirty;
+ sendChildInfo(CHILD_INFO_TYPE_RDB);
+
/* If we are returning OK, at least one slave was served
* with the RDB file as expected, so we need to send a report
* to the parent via the pipe. The format of the message is:
@@ -1696,6 +1887,7 @@ int rdbSaveToSlavesSockets(void) {
}
close(pipefds[0]);
close(pipefds[1]);
+ closeChildInfoPipe();
} else {
serverLog(LL_NOTICE,"Background RDB transfer started by pid %d",
childpid);
@@ -1716,19 +1908,41 @@ void saveCommand(client *c) {
addReplyError(c,"Background save already in progress");
return;
}
- if (rdbSave(server.rdb_filename) == C_OK) {
+ if (rdbSave(server.rdb_filename,NULL) == C_OK) {
addReply(c,shared.ok);
} else {
addReply(c,shared.err);
}
}
+/* BGSAVE [SCHEDULE] */
void bgsaveCommand(client *c) {
+ int schedule = 0;
+
+ /* The SCHEDULE option changes the behavior of BGSAVE when an AOF rewrite
+ * is in progress. Instead of returning an error a BGSAVE gets scheduled. */
+ if (c->argc > 1) {
+ if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"schedule")) {
+ schedule = 1;
+ } else {
+ addReply(c,shared.syntaxerr);
+ return;
+ }
+ }
+
if (server.rdb_child_pid != -1) {
addReplyError(c,"Background save already in progress");
} else if (server.aof_child_pid != -1) {
- addReplyError(c,"Can't BGSAVE while AOF log rewriting is in progress");
- } else if (rdbSaveBackground(server.rdb_filename) == C_OK) {
+ if (schedule) {
+ server.rdb_bgsave_scheduled = 1;
+ addReplyStatus(c,"Background saving scheduled");
+ } else {
+ addReplyError(c,
+ "An AOF log rewriting in progress: can't BGSAVE right now. "
+ "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
+ "possible.");
+ }
+ } else if (rdbSaveBackground(server.rdb_filename,NULL) == C_OK) {
addReplyStatus(c,"Background saving started");
} else {
addReply(c,shared.err);
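/*
 * Illustrative use of the new SCHEDULE option implemented above (the replies
 * are the status strings from the code; which path is taken depends on
 * whether an AOF rewrite child is running at the time):
 *
 *   127.0.0.1:6379> BGSAVE SCHEDULE        <- while an AOF rewrite is running
 *   Background saving scheduled
 *   127.0.0.1:6379> BGSAVE                 <- no child running
 *   Background saving started
 */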
diff --git a/src/rdb.h b/src/rdb.h
index 48a064a19..efe932255 100644
--- a/src/rdb.h
+++ b/src/rdb.h
@@ -38,16 +38,17 @@
/* The current RDB version. When the format changes in a way that is no longer
* backward compatible this number gets incremented. */
-#define RDB_VERSION 7
+#define RDB_VERSION 8
/* Defines related to the dump file format. To store 32 bits lengths for short
* keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
*
- * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
- * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte
- * 10|000000 [32 bit integer] => if it's 01, a full 32 bit len will follow
- * 11|000000 this means: specially encoded object will follow. The six bits
+ * 00|XXXXXX => if the two MSB are 00 the len is the 6 bits of this byte
+ * 01|XXXXXX XXXXXXXX => 01, the len is 14 bits, 6 bits + 8 bits of next byte
+ * 10|000000 [32 bit integer] => A full 32 bit len in net byte order will follow
+ * 10|000001 [64 bit integer] => A full 64 bit len in net byte order will follow
+ * 11|OBKIND this means: specially encoded object will follow. The six bits
* number specify the kind of object that follows.
* See the RDB_ENC_* defines.
*
@@ -55,12 +56,13 @@
* values, will fit inside. */
#define RDB_6BITLEN 0
#define RDB_14BITLEN 1
-#define RDB_32BITLEN 2
+#define RDB_32BITLEN 0x80
+#define RDB_64BITLEN 0x81
#define RDB_ENCVAL 3
-#define RDB_LENERR UINT_MAX
+#define RDB_LENERR UINT64_MAX
/* When a length of a string object stored on disk has the first two bits
- * set, the remaining two bits specify a special encoding for the object
+ * set, the remaining six bits specify a special encoding for the object
* accordingly to the following defines: */
#define RDB_ENC_INT8 0 /* 8 bit signed integer */
#define RDB_ENC_INT16 1 /* 16 bit signed integer */
@@ -74,6 +76,8 @@
#define RDB_TYPE_SET 2
#define RDB_TYPE_ZSET 3
#define RDB_TYPE_HASH 4
+#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */
+#define RDB_TYPE_MODULE 6
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
/* Object types for encoded objects. */
@@ -86,7 +90,7 @@
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
/* Test if a type is an object type. */
-#define rdbIsObjectType(t) ((t >= 0 && t <= 4) || (t >= 9 && t <= 14))
+#define rdbIsObjectType(t) ((t >= 0 && t <= 6) || (t >= 9 && t <= 14))
/* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */
#define RDB_OPCODE_AUX 250
@@ -96,24 +100,42 @@
#define RDB_OPCODE_SELECTDB 254
#define RDB_OPCODE_EOF 255
+/* rdbLoad...() functions flags. */
+#define RDB_LOAD_NONE 0
+#define RDB_LOAD_ENC (1<<0)
+#define RDB_LOAD_PLAIN (1<<1)
+#define RDB_LOAD_SDS (1<<2)
+
+#define RDB_SAVE_NONE 0
+#define RDB_SAVE_AOF_PREAMBLE (1<<0)
+
int rdbSaveType(rio *rdb, unsigned char type);
int rdbLoadType(rio *rdb);
int rdbSaveTime(rio *rdb, time_t t);
time_t rdbLoadTime(rio *rdb);
-int rdbSaveLen(rio *rdb, uint32_t len);
-uint32_t rdbLoadLen(rio *rdb, int *isencoded);
+int rdbSaveLen(rio *rdb, uint64_t len);
+uint64_t rdbLoadLen(rio *rdb, int *isencoded);
+int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr);
int rdbSaveObjectType(rio *rdb, robj *o);
int rdbLoadObjectType(rio *rdb);
-int rdbLoad(char *filename);
-int rdbSaveBackground(char *filename);
-int rdbSaveToSlavesSockets(void);
+int rdbLoad(char *filename, rdbSaveInfo *rsi);
+int rdbSaveBackground(char *filename, rdbSaveInfo *rsi);
+int rdbSaveToSlavesSockets(rdbSaveInfo *rsi);
void rdbRemoveTempFile(pid_t childpid);
-int rdbSave(char *filename);
+int rdbSave(char *filename, rdbSaveInfo *rsi);
ssize_t rdbSaveObject(rio *rdb, robj *o);
size_t rdbSavedObjectLen(robj *o);
robj *rdbLoadObject(int type, rio *rdb);
void backgroundSaveDoneHandler(int exitcode, int bysignal);
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime, long long now);
robj *rdbLoadStringObject(rio *rdb);
+int rdbSaveStringObject(rio *rdb, robj *obj);
+ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len);
+void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr);
+int rdbSaveBinaryDoubleValue(rio *rdb, double val);
+int rdbLoadBinaryDoubleValue(rio *rdb, double *val);
+int rdbSaveBinaryFloatValue(rio *rdb, float val);
+int rdbLoadBinaryFloatValue(rio *rdb, float *val);
+int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi);
#endif
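The save/load prototypes above now thread an rdbSaveInfo pointer through every entry point. The replication.c hunk later in this diff shows the intended call pattern; as a quick orientation (only repl_stream_db appears in this excerpt, everything else here is illustrative):

    /* Sketch of the new calling convention, mirroring the replication.c
     * change below. RDB_SAVE_INFO_INIT and repl_stream_db come from this
     * patch; the log message is invented for the example. */
    rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
    if (server.master) rsi.repl_stream_db = server.master->db->id;
    if (rdbSaveBackground(server.rdb_filename, &rsi) != C_OK)
        serverLog(LL_WARNING, "BGSAVE for replication could not start");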
diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c
index 2829df913..f382019a4 100644
--- a/src/redis-benchmark.c
+++ b/src/redis-benchmark.c
@@ -65,6 +65,7 @@ static struct config {
int randomkeys_keyspacelen;
int keepalive;
int pipeline;
+ int showerrors;
long long start;
long long totlatency;
long long *latency;
@@ -212,6 +213,16 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
exit(1);
}
+ if (config.showerrors) {
+ static time_t lasterr_time = 0;
+ time_t now = time(NULL);
+ redisReply *r = reply;
+ if (r->type == REDIS_REPLY_ERROR && lasterr_time != now) {
+ lasterr_time = now;
+ printf("Error from server: %s\n", r->str);
+ }
+ }
+
freeReplyObject(reply);
/* This is an OK for prefix commands such as auth and select.*/
if (c->prefix_pending > 0) {
@@ -227,7 +238,7 @@ static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
c->randptr[j] -= c->prefixlen;
c->prefixlen = 0;
}
- continue;
+ continue;
}
if (config.requests_finished < config.requests)
@@ -518,6 +529,8 @@ int parseOptions(int argc, const char **argv) {
config.loop = 1;
} else if (!strcmp(argv[i],"-I")) {
config.idlemode = 1;
+ } else if (!strcmp(argv[i],"-e")) {
+ config.showerrors = 1;
} else if (!strcmp(argv[i],"-t")) {
if (lastarg) goto invalid;
/* We get the list of tests to run as a string in the form
@@ -552,7 +565,7 @@ invalid:
usage:
printf(
-"Usage: redis-benchmark [-h <host>] [-p <port>] [-c <clients>] [-n <requests]> [-k <boolean>]\n\n"
+"Usage: redis-benchmark [-h <host>] [-p <port>] [-c <clients>] [-n <requests>] [-k <boolean>]\n\n"
" -h <hostname> Server hostname (default 127.0.0.1)\n"
" -p <port> Server port (default 6379)\n"
" -s <socket> Server socket (overrides host and port)\n"
@@ -569,6 +582,8 @@ usage:
" is executed. Default tests use this to hit random keys in the\n"
" specified range.\n"
" -P <numreq> Pipeline <numreq> requests. Default 1 (no pipeline).\n"
+" -e If server replies with errors, show them on stdout.\n"
+" (no more than 1 error per second is displayed)\n"
" -q Quiet. Just show query/sec values\n"
" --csv Output in CSV format\n"
" -l Loop. Run the tests forever\n"
@@ -649,6 +664,7 @@ int main(int argc, const char **argv) {
config.keepalive = 1;
config.datasize = 3;
config.pipeline = 1;
+ config.showerrors = 0;
config.randomkeys = 0;
config.randomkeys_keyspacelen = 0;
config.quiet = 0;
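The new -e option prints at most one error reply per second. A standalone sketch of that throttling pattern (not taken from the patch), for reference:

    #include <stdio.h>
    #include <time.h>

    /* Print a diagnostic at most once per wall-clock second, as the -e
     * handler above does for error replies from the server. */
    static void printErrorThrottled(const char *errstr) {
        static time_t lasterr_time = 0;
        time_t now = time(NULL);
        if (now != lasterr_time) {
            lasterr_time = now;
            printf("Error from server: %s\n", errstr);
        }
    }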
diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c
index 7bb93b60b..08be40f6a 100644
--- a/src/redis-check-rdb.c
+++ b/src/redis-check-rdb.c
@@ -1,6 +1,5 @@
/*
- * Copyright (c) 2009-2012, Pieter Noordhuis <pcnoordhuis at gmail dot com>
- * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -28,683 +27,324 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
-
#include "server.h"
#include "rdb.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include "lzf.h"
-#include "crc64.h"
-
-#define ERROR(...) { \
- serverLog(LL_WARNING, __VA_ARGS__); \
- exit(1); \
-}
-
-/* data type to hold offset in file and size */
-typedef struct {
- void *data;
- size_t size;
- size_t offset;
-} pos;
-
-static unsigned char level = 0;
-static pos positions[16];
-
-#define CURR_OFFSET (positions[level].offset)
-
-/* Hold a stack of errors */
-typedef struct {
- char error[16][1024];
- size_t offset[16];
- size_t level;
-} errors_t;
-static errors_t errors;
-
-#define SHIFT_ERROR(provided_offset, ...) { \
- sprintf(errors.error[errors.level], __VA_ARGS__); \
- errors.offset[errors.level] = provided_offset; \
- errors.level++; \
-}
-
-/* Data type to hold opcode with optional key name an success status */
-typedef struct {
- char* key;
- int type;
- char success;
-} entry;
-
-#define MAX_TYPES_NUM 256
-#define MAX_TYPE_NAME_LEN 16
-/* store string types for output */
-static char types[MAX_TYPES_NUM][MAX_TYPE_NAME_LEN];
-
-/* Return true if 't' is a valid object type. */
-static int rdbCheckType(unsigned char t) {
- /* In case a new object type is added, update the following
- * condition as necessary. */
- return
- (t >= RDB_TYPE_HASH_ZIPMAP && t <= RDB_TYPE_HASH_ZIPLIST) ||
- t <= RDB_TYPE_HASH ||
- t >= RDB_OPCODE_EXPIRETIME_MS;
-}
-
-/* when number of bytes to read is negative, do a peek */
-static int readBytes(void *target, long num) {
- char peek = (num < 0) ? 1 : 0;
- num = (num < 0) ? -num : num;
-
- pos p = positions[level];
- if (p.offset + num > p.size) {
- return 0;
- } else {
- memcpy(target, (void*)((size_t)p.data + p.offset), num);
- if (!peek) positions[level].offset += num;
- }
- return 1;
-}
-
-int processHeader(void) {
- char buf[10] = "_________";
- int dump_version;
-
- if (!readBytes(buf, 9)) {
- ERROR("Cannot read header");
- }
-
- /* expect the first 5 bytes to equal REDIS */
- if (memcmp(buf,"REDIS",5) != 0) {
- ERROR("Wrong signature in header");
- }
-
- dump_version = (int)strtol(buf + 5, NULL, 10);
- if (dump_version < 1 || dump_version > 6) {
- ERROR("Unknown RDB format version: %d", dump_version);
- }
- return dump_version;
-}
-
-static int loadType(entry *e) {
- uint32_t offset = CURR_OFFSET;
-
- /* this byte needs to qualify as type */
- unsigned char t;
- if (readBytes(&t, 1)) {
- if (rdbCheckType(t)) {
- e->type = t;
- return 1;
- } else {
- SHIFT_ERROR(offset, "Unknown type (0x%02x)", t);
- }
- } else {
- SHIFT_ERROR(offset, "Could not read type");
- }
-
- /* failure */
- return 0;
-}
-
-static int peekType() {
- unsigned char t;
- if (readBytes(&t, -1) && (rdbCheckType(t)))
- return t;
- return -1;
-}
-
-/* discard time, just consume the bytes */
-static int processTime(int type) {
- uint32_t offset = CURR_OFFSET;
- unsigned char t[8];
- int timelen = (type == RDB_OPCODE_EXPIRETIME_MS) ? 8 : 4;
-
- if (readBytes(t,timelen)) {
- return 1;
- } else {
- SHIFT_ERROR(offset, "Could not read time");
- }
-
- /* failure */
- return 0;
-}
-static uint32_t loadLength(int *isencoded) {
- unsigned char buf[2];
- uint32_t len;
- int type;
-
- if (isencoded) *isencoded = 0;
- if (!readBytes(buf, 1)) return RDB_LENERR;
- type = (buf[0] & 0xC0) >> 6;
- if (type == RDB_6BITLEN) {
- /* Read a 6 bit len */
- return buf[0] & 0x3F;
- } else if (type == RDB_ENCVAL) {
- /* Read a 6 bit len encoding type */
- if (isencoded) *isencoded = 1;
- return buf[0] & 0x3F;
- } else if (type == RDB_14BITLEN) {
- /* Read a 14 bit len */
- if (!readBytes(buf+1,1)) return RDB_LENERR;
- return ((buf[0] & 0x3F) << 8) | buf[1];
- } else {
- /* Read a 32 bit len */
- if (!readBytes(&len, 4)) return RDB_LENERR;
- return (unsigned int)ntohl(len);
- }
+#include <stdarg.h>
+
+void createSharedObjects(void);
+void rdbLoadProgressCallback(rio *r, const void *buf, size_t len);
+long long rdbLoadMillisecondTime(rio *rdb);
+int rdbCheckMode = 0;
+
+struct {
+ rio *rio;
+ robj *key; /* Current key we are reading. */
+ int key_type; /* Current key type if != -1. */
+ unsigned long keys; /* Number of keys processed. */
+ unsigned long expires; /* Number of keys with an expire. */
+ unsigned long already_expired; /* Number of keys already expired. */
+ int doing; /* The state while reading the RDB. */
+ int error_set; /* True if error is populated. */
+ char error[1024];
+} rdbstate;
+
+/* At every loading step try to remember what we were about to do, so that
+ * we can log this information when an error is encountered. */
+#define RDB_CHECK_DOING_START 0
+#define RDB_CHECK_DOING_READ_TYPE 1
+#define RDB_CHECK_DOING_READ_EXPIRE 2
+#define RDB_CHECK_DOING_READ_KEY 3
+#define RDB_CHECK_DOING_READ_OBJECT_VALUE 4
+#define RDB_CHECK_DOING_CHECK_SUM 5
+#define RDB_CHECK_DOING_READ_LEN 6
+#define RDB_CHECK_DOING_READ_AUX 7
+
+char *rdb_check_doing_string[] = {
+ "start",
+ "read-type",
+ "read-expire",
+ "read-key",
+ "read-object-value",
+ "check-sum",
+ "read-len",
+ "read-aux"
+};
+
+char *rdb_type_string[] = {
+ "string",
+ "list-linked",
+ "set-hashtable",
+ "zset-v1",
+ "hash-hashtable",
+ "zset-v2",
+ "module-value",
+ "","",
+ "hash-zipmap",
+ "list-ziplist",
+ "set-intset",
+ "zset-ziplist",
+ "hash-ziplist",
+ "quicklist"
+};
+
+/* Show a few stats collected into 'rdbstate' */
+void rdbShowGenericInfo(void) {
+ printf("[info] %lu keys read\n", rdbstate.keys);
+ printf("[info] %lu expires\n", rdbstate.expires);
+ printf("[info] %lu already expired\n", rdbstate.already_expired);
}
-static char *loadIntegerObject(int enctype) {
- uint32_t offset = CURR_OFFSET;
- unsigned char enc[4];
- long long val;
-
- if (enctype == RDB_ENC_INT8) {
- uint8_t v;
- if (!readBytes(enc, 1)) return NULL;
- v = enc[0];
- val = (int8_t)v;
- } else if (enctype == RDB_ENC_INT16) {
- uint16_t v;
- if (!readBytes(enc, 2)) return NULL;
- v = enc[0]|(enc[1]<<8);
- val = (int16_t)v;
- } else if (enctype == RDB_ENC_INT32) {
- uint32_t v;
- if (!readBytes(enc, 4)) return NULL;
- v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
- val = (int32_t)v;
- } else {
- SHIFT_ERROR(offset, "Unknown integer encoding (0x%02x)", enctype);
- return NULL;
- }
-
- /* convert val into string */
- char *buf;
- buf = zmalloc(sizeof(char) * 128);
- sprintf(buf, "%lld", val);
- return buf;
+/* Called on RDB errors. Provides details about the RDB and the offset
+ * we were at when the error was detected. */
+void rdbCheckError(const char *fmt, ...) {
+ char msg[1024];
+ va_list ap;
+
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
+
+ printf("--- RDB ERROR DETECTED ---\n");
+ printf("[offset %llu] %s\n",
+ (unsigned long long) (rdbstate.rio ?
+ rdbstate.rio->processed_bytes : 0), msg);
+ printf("[additional info] While doing: %s\n",
+ rdb_check_doing_string[rdbstate.doing]);
+ if (rdbstate.key)
+ printf("[additional info] Reading key '%s'\n",
+ (char*)rdbstate.key->ptr);
+ if (rdbstate.key_type != -1)
+ printf("[additional info] Reading type %d (%s)\n",
+ rdbstate.key_type,
+ ((unsigned)rdbstate.key_type <
+ sizeof(rdb_type_string)/sizeof(char*)) ?
+ rdb_type_string[rdbstate.key_type] : "unknown");
+ rdbShowGenericInfo();
}
-static char* loadLzfStringObject() {
- unsigned int slen, clen;
- char *c, *s;
-
- if ((clen = loadLength(NULL)) == RDB_LENERR) return NULL;
- if ((slen = loadLength(NULL)) == RDB_LENERR) return NULL;
+/* Print information during RDB checking. */
+void rdbCheckInfo(const char *fmt, ...) {
+ char msg[1024];
+ va_list ap;
- c = zmalloc(clen);
- if (!readBytes(c, clen)) {
- zfree(c);
- return NULL;
- }
-
- s = zmalloc(slen+1);
- if (lzf_decompress(c,clen,s,slen) == 0) {
- zfree(c); zfree(s);
- return NULL;
- }
+ va_start(ap, fmt);
+ vsnprintf(msg, sizeof(msg), fmt, ap);
+ va_end(ap);
- zfree(c);
- return s;
+ printf("[offset %llu] %s\n",
+ (unsigned long long) (rdbstate.rio ?
+ rdbstate.rio->processed_bytes : 0), msg);
}
-/* returns NULL when not processable, char* when valid */
-static char* loadStringObject() {
- uint32_t offset = CURR_OFFSET;
- int isencoded;
- uint32_t len;
-
- len = loadLength(&isencoded);
- if (isencoded) {
- switch(len) {
- case RDB_ENC_INT8:
- case RDB_ENC_INT16:
- case RDB_ENC_INT32:
- return loadIntegerObject(len);
- case RDB_ENC_LZF:
- return loadLzfStringObject();
- default:
- /* unknown encoding */
- SHIFT_ERROR(offset, "Unknown string encoding (0x%02x)", len);
- return NULL;
- }
- }
-
- if (len == RDB_LENERR) return NULL;
+/* Used inside rdb.c in order to log specific errors happening inside
+ * the RDB loading internals. */
+void rdbCheckSetError(const char *fmt, ...) {
+ va_list ap;
- char *buf = zmalloc(sizeof(char) * (len+1));
- if (buf == NULL) return NULL;
- buf[len] = '\0';
- if (!readBytes(buf, len)) {
- zfree(buf);
- return NULL;
- }
- return buf;
+ va_start(ap, fmt);
+ vsnprintf(rdbstate.error, sizeof(rdbstate.error), fmt, ap);
+ va_end(ap);
+ rdbstate.error_set = 1;
}
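rdbCheckSetError() is meant to be called from the loading internals in rdb.c so that the checker can report the precise reason for a failure; a purely hypothetical call site (the condition and message are invented for illustration, not taken from rdb.c) would look like:

    /* Hypothetical: inside an rdb.c loading routine, record a specific
     * failure reason when running under redis-check-rdb. */
    if (rdbCheckMode)
        rdbCheckSetError("Declared string length exceeds remaining payload");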
-static int processStringObject(char** store) {
- unsigned long offset = CURR_OFFSET;
- char *key = loadStringObject();
- if (key == NULL) {
- SHIFT_ERROR(offset, "Error reading string object");
- zfree(key);
- return 0;
- }
+/* During RDB check we set up a special signal handler for memory violations
+ * and similar conditions, so that we can log the offending part of the RDB
+ * if the crash is due to broken content. */
+void rdbCheckHandleCrash(int sig, siginfo_t *info, void *secret) {
+ UNUSED(sig);
+ UNUSED(info);
+ UNUSED(secret);
- if (store != NULL) {
- *store = key;
- } else {
- zfree(key);
- }
- return 1;
+ rdbCheckError("Server crash checking the specified RDB file!");
+ exit(1);
}
-static double* loadDoubleValue() {
- char buf[256];
- unsigned char len;
- double* val;
-
- if (!readBytes(&len,1)) return NULL;
-
- val = zmalloc(sizeof(double));
- switch(len) {
- case 255: *val = R_NegInf; return val;
- case 254: *val = R_PosInf; return val;
- case 253: *val = R_Nan; return val;
- default:
- if (!readBytes(buf, len)) {
- zfree(val);
- return NULL;
- }
- buf[len] = '\0';
- sscanf(buf, "%lg", val);
- return val;
- }
-}
-
-static int processDoubleValue(double** store) {
- unsigned long offset = CURR_OFFSET;
- double *val = loadDoubleValue();
- if (val == NULL) {
- SHIFT_ERROR(offset, "Error reading double value");
- zfree(val);
- return 0;
- }
+void rdbCheckSetupSignals(void) {
+ struct sigaction act;
- if (store != NULL) {
- *store = val;
- } else {
- zfree(val);
- }
- return 1;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_NODEFER | SA_RESETHAND | SA_SIGINFO;
+ act.sa_sigaction = rdbCheckHandleCrash;
+ sigaction(SIGSEGV, &act, NULL);
+ sigaction(SIGBUS, &act, NULL);
+ sigaction(SIGFPE, &act, NULL);
+ sigaction(SIGILL, &act, NULL);
}
-static int loadPair(entry *e) {
- uint32_t offset = CURR_OFFSET;
- uint32_t i;
-
- /* read key first */
- char *key;
- if (processStringObject(&key)) {
- e->key = key;
- } else {
- SHIFT_ERROR(offset, "Error reading entry key");
- return 0;
- }
-
- uint32_t length = 0;
- if (e->type == RDB_TYPE_LIST ||
- e->type == RDB_TYPE_SET ||
- e->type == RDB_TYPE_ZSET ||
- e->type == RDB_TYPE_HASH) {
- if ((length = loadLength(NULL)) == RDB_LENERR) {
- SHIFT_ERROR(offset, "Error reading %s length", types[e->type]);
- return 0;
- }
- }
-
- switch(e->type) {
- case RDB_TYPE_STRING:
- case RDB_TYPE_HASH_ZIPMAP:
- case RDB_TYPE_LIST_ZIPLIST:
- case RDB_TYPE_SET_INTSET:
- case RDB_TYPE_ZSET_ZIPLIST:
- case RDB_TYPE_HASH_ZIPLIST:
- if (!processStringObject(NULL)) {
- SHIFT_ERROR(offset, "Error reading entry value");
- return 0;
- }
- break;
- case RDB_TYPE_LIST:
- case RDB_TYPE_SET:
- for (i = 0; i < length; i++) {
- offset = CURR_OFFSET;
- if (!processStringObject(NULL)) {
- SHIFT_ERROR(offset, "Error reading element at index %d (length: %d)", i, length);
- return 0;
- }
- }
- break;
- case RDB_TYPE_ZSET:
- for (i = 0; i < length; i++) {
- offset = CURR_OFFSET;
- if (!processStringObject(NULL)) {
- SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length);
- return 0;
- }
- offset = CURR_OFFSET;
- if (!processDoubleValue(NULL)) {
- SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length);
- return 0;
- }
- }
- break;
- case RDB_TYPE_HASH:
- for (i = 0; i < length; i++) {
- offset = CURR_OFFSET;
- if (!processStringObject(NULL)) {
- SHIFT_ERROR(offset, "Error reading element key at index %d (length: %d)", i, length);
- return 0;
- }
- offset = CURR_OFFSET;
- if (!processStringObject(NULL)) {
- SHIFT_ERROR(offset, "Error reading element value at index %d (length: %d)", i, length);
- return 0;
- }
- }
- break;
- default:
- SHIFT_ERROR(offset, "Type not implemented");
- return 0;
+/* Check the specified RDB file. Return 0 if the RDB looks sane, otherwise
+ * 1. */
+int redis_check_rdb(char *rdbfilename) {
+ uint64_t dbid;
+ int type, rdbver;
+ char buf[1024];
+ long long expiretime, now = mstime();
+ FILE *fp;
+    static rio rdb; /* Pointed to by the global rdbstate struct. */
+
+ if ((fp = fopen(rdbfilename,"r")) == NULL) return 1;
+
+ rioInitWithFile(&rdb,fp);
+ rdbstate.rio = &rdb;
+ rdb.update_cksum = rdbLoadProgressCallback;
+ if (rioRead(&rdb,buf,9) == 0) goto eoferr;
+ buf[9] = '\0';
+ if (memcmp(buf,"REDIS",5) != 0) {
+ rdbCheckError("Wrong signature trying to load DB from file");
+ return 1;
}
- /* because we're done, we assume success */
- e->success = 1;
- return 1;
-}
-
-static entry loadEntry() {
- entry e = { NULL, -1, 0 };
- uint32_t length, offset[4];
-
- /* reset error container */
- errors.level = 0;
-
- offset[0] = CURR_OFFSET;
- if (!loadType(&e)) {
- return e;
+ rdbver = atoi(buf+5);
+ if (rdbver < 1 || rdbver > RDB_VERSION) {
+ rdbCheckError("Can't handle RDB format version %d",rdbver);
+ return 1;
}
- offset[1] = CURR_OFFSET;
- if (e.type == RDB_OPCODE_SELECTDB) {
- if ((length = loadLength(NULL)) == RDB_LENERR) {
- SHIFT_ERROR(offset[1], "Error reading database number");
- return e;
- }
- if (length > 63) {
- SHIFT_ERROR(offset[1], "Database number out of range (%d)", length);
- return e;
- }
- } else if (e.type == RDB_OPCODE_EOF) {
- if (positions[level].offset < positions[level].size) {
- SHIFT_ERROR(offset[0], "Unexpected EOF");
+ startLoading(fp);
+ while(1) {
+ robj *key, *val;
+ expiretime = -1;
+
+ /* Read type. */
+ rdbstate.doing = RDB_CHECK_DOING_READ_TYPE;
+ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+
+ /* Handle special types. */
+ if (type == RDB_OPCODE_EXPIRETIME) {
+ rdbstate.doing = RDB_CHECK_DOING_READ_EXPIRE;
+ /* EXPIRETIME: load an expire associated with the next key
+ * to load. Note that after loading an expire we need to
+ * load the actual type, and continue. */
+ if ((expiretime = rdbLoadTime(&rdb)) == -1) goto eoferr;
+ /* We read the time so we need to read the object type again. */
+ rdbstate.doing = RDB_CHECK_DOING_READ_TYPE;
+ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+ /* the EXPIRETIME opcode specifies time in seconds, so convert
+ * into milliseconds. */
+ expiretime *= 1000;
+ } else if (type == RDB_OPCODE_EXPIRETIME_MS) {
+ /* EXPIRETIME_MS: milliseconds precision expire times introduced
+             * with RDB v3. Like EXPIRETIME but with more precision. */
+ rdbstate.doing = RDB_CHECK_DOING_READ_EXPIRE;
+ if ((expiretime = rdbLoadMillisecondTime(&rdb)) == -1) goto eoferr;
+ /* We read the time so we need to read the object type again. */
+ rdbstate.doing = RDB_CHECK_DOING_READ_TYPE;
+ if ((type = rdbLoadType(&rdb)) == -1) goto eoferr;
+ } else if (type == RDB_OPCODE_EOF) {
+ /* EOF: End of file, exit the main loop. */
+ break;
+ } else if (type == RDB_OPCODE_SELECTDB) {
+ /* SELECTDB: Select the specified database. */
+ rdbstate.doing = RDB_CHECK_DOING_READ_LEN;
+ if ((dbid = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ goto eoferr;
+            rdbCheckInfo("Selecting DB ID %llu", (unsigned long long)dbid);
+ continue; /* Read type again. */
+ } else if (type == RDB_OPCODE_RESIZEDB) {
+ /* RESIZEDB: Hint about the size of the keys in the currently
+             * selected database, in order to avoid useless rehashing. */
+ uint64_t db_size, expires_size;
+ rdbstate.doing = RDB_CHECK_DOING_READ_LEN;
+ if ((db_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ goto eoferr;
+ if ((expires_size = rdbLoadLen(&rdb,NULL)) == RDB_LENERR)
+ goto eoferr;
+ continue; /* Read type again. */
+ } else if (type == RDB_OPCODE_AUX) {
+ /* AUX: generic string-string fields. Use to add state to RDB
+ * which is backward compatible. Implementations of RDB loading
+             * are required to skip AUX fields they don't understand.
+ *
+ * An AUX field is composed of two strings: key and value. */
+ robj *auxkey, *auxval;
+ rdbstate.doing = RDB_CHECK_DOING_READ_AUX;
+ if ((auxkey = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
+ if ((auxval = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
+
+ rdbCheckInfo("AUX FIELD %s = '%s'",
+ (char*)auxkey->ptr, (char*)auxval->ptr);
+ decrRefCount(auxkey);
+ decrRefCount(auxval);
+ continue; /* Read type again. */
} else {
- e.success = 1;
- }
- return e;
- } else {
- /* optionally consume expire */
- if (e.type == RDB_OPCODE_EXPIRETIME ||
- e.type == RDB_OPCODE_EXPIRETIME_MS) {
- if (!processTime(e.type)) return e;
- if (!loadType(&e)) return e;
- }
-
- offset[1] = CURR_OFFSET;
- if (!loadPair(&e)) {
- SHIFT_ERROR(offset[1], "Error for type %s", types[e.type]);
- return e;
- }
- }
-
- /* all entries are followed by a valid type:
- * e.g. a new entry, SELECTDB, EXPIRE, EOF */
- offset[2] = CURR_OFFSET;
- if (peekType() == -1) {
- SHIFT_ERROR(offset[2], "Followed by invalid type");
- SHIFT_ERROR(offset[0], "Error for type %s", types[e.type]);
- e.success = 0;
- } else {
- e.success = 1;
- }
-
- return e;
-}
-
-static void printCentered(int indent, int width, char* body) {
- char head[256], tail[256];
- memset(head, '\0', 256);
- memset(tail, '\0', 256);
-
- memset(head, '=', indent);
- memset(tail, '=', width - 2 - indent - strlen(body));
- serverLog(LL_WARNING, "%s %s %s", head, body, tail);
-}
-
-static void printValid(uint64_t ops, uint64_t bytes) {
- char body[80];
- sprintf(body, "Processed %llu valid opcodes (in %llu bytes)",
- (unsigned long long) ops, (unsigned long long) bytes);
- printCentered(4, 80, body);
-}
-
-static void printSkipped(uint64_t bytes, uint64_t offset) {
- char body[80];
- sprintf(body, "Skipped %llu bytes (resuming at 0x%08llx)",
- (unsigned long long) bytes, (unsigned long long) offset);
- printCentered(4, 80, body);
-}
-
-static void printErrorStack(entry *e) {
- unsigned int i;
- char body[64];
-
- if (e->type == -1) {
- sprintf(body, "Error trace");
- } else if (e->type >= 253) {
- sprintf(body, "Error trace (%s)", types[e->type]);
- } else if (!e->key) {
- sprintf(body, "Error trace (%s: (unknown))", types[e->type]);
- } else {
- char tmp[41];
- strncpy(tmp, e->key, 40);
-
- /* display truncation at the last 3 chars */
- if (strlen(e->key) > 40) {
- memset(&tmp[37], '.', 3);
- }
-
- /* display unprintable characters as ? */
- for (i = 0; i < strlen(tmp); i++) {
- if (tmp[i] <= 32) tmp[i] = '?';
- }
- sprintf(body, "Error trace (%s: %s)", types[e->type], tmp);
- }
-
- printCentered(4, 80, body);
-
- /* display error stack */
- for (i = 0; i < errors.level; i++) {
- serverLog(LL_WARNING, "0x%08lx - %s",
- (unsigned long) errors.offset[i], errors.error[i]);
- }
-}
-
-void process(void) {
- uint64_t num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0;
- entry entry = { NULL, -1, 0 };
- int dump_version = processHeader();
-
- /* Exclude the final checksum for RDB >= 5. Will be checked at the end. */
- if (dump_version >= 5) {
- if (positions[0].size < 8) {
- serverLog(LL_WARNING, "RDB version >= 5 but no room for checksum.");
- exit(1);
- }
- positions[0].size -= 8;
- }
-
- level = 1;
- while(positions[0].offset < positions[0].size) {
- positions[1] = positions[0];
-
- entry = loadEntry();
- if (!entry.success) {
- printValid(num_valid_ops, num_valid_bytes);
- printErrorStack(&entry);
- num_errors++;
- num_valid_ops = 0;
- num_valid_bytes = 0;
-
- /* search for next valid entry */
- uint64_t offset = positions[0].offset + 1;
- int i = 0;
-
- while (!entry.success && offset < positions[0].size) {
- positions[1].offset = offset;
-
- /* find 3 consecutive valid entries */
- for (i = 0; i < 3; i++) {
- entry = loadEntry();
- if (!entry.success) break;
- }
- /* check if we found 3 consecutive valid entries */
- if (i < 3) {
- offset++;
- }
+ if (!rdbIsObjectType(type)) {
+ rdbCheckError("Invalid object type: %d", type);
+ return 1;
}
-
- /* print how many bytes we have skipped to find a new valid opcode */
- if (offset < positions[0].size) {
- printSkipped(offset - positions[0].offset, offset);
- }
-
- positions[0].offset = offset;
- } else {
- num_valid_ops++;
- num_valid_bytes += positions[1].offset - positions[0].offset;
-
- /* advance position */
- positions[0] = positions[1];
+ rdbstate.key_type = type;
}
- zfree(entry.key);
- }
-
- /* because there is another potential error,
- * print how many valid ops we have processed */
- printValid(num_valid_ops, num_valid_bytes);
-
- /* expect an eof */
- if (entry.type != RDB_OPCODE_EOF) {
- /* last byte should be EOF, add error */
- errors.level = 0;
- SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s", types[entry.type]);
- /* this is an EOF error so reset type */
- entry.type = -1;
- printErrorStack(&entry);
-
- num_errors++;
- }
-
- /* Verify checksum */
- if (dump_version >= 5) {
- uint64_t crc = crc64(0,positions[0].data,positions[0].size);
- uint64_t crc2;
- unsigned char *p = (unsigned char*)positions[0].data+positions[0].size;
- crc2 = ((uint64_t)p[0] << 0) |
- ((uint64_t)p[1] << 8) |
- ((uint64_t)p[2] << 16) |
- ((uint64_t)p[3] << 24) |
- ((uint64_t)p[4] << 32) |
- ((uint64_t)p[5] << 40) |
- ((uint64_t)p[6] << 48) |
- ((uint64_t)p[7] << 56);
- if (crc != crc2) {
- SHIFT_ERROR(positions[0].offset, "RDB CRC64 does not match.");
+ /* Read key */
+ rdbstate.doing = RDB_CHECK_DOING_READ_KEY;
+ if ((key = rdbLoadStringObject(&rdb)) == NULL) goto eoferr;
+ rdbstate.key = key;
+ rdbstate.keys++;
+ /* Read value */
+ rdbstate.doing = RDB_CHECK_DOING_READ_OBJECT_VALUE;
+ if ((val = rdbLoadObject(type,&rdb)) == NULL) goto eoferr;
+ /* Check if the key already expired. This function is used when loading
+ * an RDB file from disk, either at startup, or when an RDB was
+ * received from the master. In the latter case, the master is
+         * responsible for key expiry. If we expired keys here, the
+ * snapshot taken by the master may not be reflected on the slave. */
+ if (server.masterhost == NULL && expiretime != -1 && expiretime < now)
+ rdbstate.already_expired++;
+ if (expiretime != -1) rdbstate.expires++;
+ rdbstate.key = NULL;
+ decrRefCount(key);
+ decrRefCount(val);
+ rdbstate.key_type = -1;
+ }
+ /* Verify the checksum if RDB version is >= 5 */
+ if (rdbver >= 5 && server.rdb_checksum) {
+ uint64_t cksum, expected = rdb.cksum;
+
+ rdbstate.doing = RDB_CHECK_DOING_CHECK_SUM;
+ if (rioRead(&rdb,&cksum,8) == 0) goto eoferr;
+ memrev64ifbe(&cksum);
+ if (cksum == 0) {
+ rdbCheckInfo("RDB file was saved with checksum disabled: no check performed.");
+ } else if (cksum != expected) {
+ rdbCheckError("RDB CRC error");
} else {
- serverLog(LL_WARNING, "CRC64 checksum is OK");
+ rdbCheckInfo("Checksum OK");
}
}
- /* print summary on errors */
- if (num_errors) {
- serverLog(LL_WARNING, "Total unprocessable opcodes: %llu",
- (unsigned long long) num_errors);
- }
-}
+ fclose(fp);
+ return 0;
-int redis_check_rdb(char *rdbfilename) {
- int fd;
- off_t size;
- struct stat stat;
- void *data;
-
- fd = open(rdbfilename, O_RDONLY);
- if (fd < 1) {
- ERROR("Cannot open file: %s", rdbfilename);
- }
- if (fstat(fd, &stat) == -1) {
- ERROR("Cannot stat: %s", rdbfilename);
+eoferr: /* unexpected end of file is handled here by returning an error */
+ if (rdbstate.error_set) {
+ rdbCheckError(rdbstate.error);
} else {
- size = stat.st_size;
- }
-
- if (sizeof(size_t) == sizeof(int32_t) && size >= INT_MAX) {
- ERROR("Cannot check dump files >2GB on a 32-bit platform");
- }
-
- data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
- if (data == MAP_FAILED) {
- ERROR("Cannot mmap: %s", rdbfilename);
+ rdbCheckError("Unexpected EOF reading RDB file");
}
-
- /* Initialize static vars */
- positions[0].data = data;
- positions[0].size = size;
- positions[0].offset = 0;
- errors.level = 0;
-
- /* Object types */
- sprintf(types[RDB_TYPE_STRING], "STRING");
- sprintf(types[RDB_TYPE_LIST], "LIST");
- sprintf(types[RDB_TYPE_SET], "SET");
- sprintf(types[RDB_TYPE_ZSET], "ZSET");
- sprintf(types[RDB_TYPE_HASH], "HASH");
-
- /* Object types only used for dumping to disk */
- sprintf(types[RDB_OPCODE_EXPIRETIME], "EXPIRETIME");
- sprintf(types[RDB_OPCODE_SELECTDB], "SELECTDB");
- sprintf(types[RDB_OPCODE_EOF], "EOF");
-
- process();
-
- munmap(data, size);
- close(fd);
- return 0;
+ return 1;
}
 /* RDB check main: called from redis.c when Redis is executed with the
- * redis-check-rdb alias. */
-int redis_check_rdb_main(char **argv, int argc) {
+ * redis-check-rdb alias.
+ *
+ * The function never returns, but exits with a status code indicating
+ * success (RDB is sane) or error (RDB is corrupted). */
+int redis_check_rdb_main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <rdb-file-name>\n", argv[0]);
exit(1);
}
- serverLog(LL_WARNING, "Checking RDB file %s", argv[1]);
- exit(redis_check_rdb(argv[1]));
- return 0;
+ /* In order to call the loading functions we need to create the shared
+     * integer objects. However, since this function may be called from
+ * an already initialized Redis instance, check if we really need to. */
+ if (shared.integers[0] == NULL)
+ createSharedObjects();
+ server.loading_process_events_interval_bytes = 0;
+ rdbCheckMode = 1;
+ rdbCheckInfo("Checking RDB file %s", argv[1]);
+ rdbCheckSetupSignals();
+ int retval = redis_check_rdb(argv[1]);
+ if (retval == 0) {
+ rdbCheckInfo("\\o/ RDB looks OK! \\o/");
+ rdbShowGenericInfo();
+ }
+ exit(retval);
}
diff --git a/src/redis-cli.c b/src/redis-cli.c
index cf939c8c9..ac4358220 100644
--- a/src/redis-cli.c
+++ b/src/redis-cli.c
@@ -161,7 +161,7 @@ static void cliRefreshPrompt(void) {
len = anetFormatAddr(config.prompt, sizeof(config.prompt),
config.hostip, config.hostport);
/* Add [dbnum] if needed */
- if (config.dbnum != 0 && config.last_cmd_type != REDIS_REPLY_ERROR)
+ if (config.dbnum != 0)
len += snprintf(config.prompt+len,sizeof(config.prompt)-len,"[%d]",
config.dbnum);
snprintf(config.prompt+len,sizeof(config.prompt)-len,"> ");
@@ -275,6 +275,10 @@ static void cliIntegrateHelp(void) {
* don't already match what we have. */
for (size_t j = 0; j < reply->elements; j++) {
redisReply *entry = reply->element[j];
+ if (entry->type != REDIS_REPLY_ARRAY || entry->elements < 4 ||
+ entry->element[0]->type != REDIS_REPLY_STRING ||
+ entry->element[1]->type != REDIS_REPLY_INTEGER ||
+ entry->element[3]->type != REDIS_REPLY_INTEGER) return;
char *cmdname = entry->element[0]->str;
int i;
@@ -284,7 +288,6 @@ static void cliIntegrateHelp(void) {
break;
}
if (i != helpEntriesLen) continue;
- printf("%s\n", cmdname);
helpEntriesLen++;
helpEntries = zrealloc(helpEntries,sizeof(helpEntry)*helpEntriesLen);
@@ -314,8 +317,6 @@ static void cliIntegrateHelp(void) {
new->org = ch;
}
freeReplyObject(reply);
-
- printf("%s\n", helpEntries[80].full);
}
/* Output command help to stdout. */
@@ -339,7 +340,7 @@ static void cliOutputGenericHelp(void) {
" \"help <tab>\" to get a list of possible help topics\n"
" \"quit\" to exit\n"
"\n"
- "To set redis-cli perferences:\n"
+ "To set redis-cli preferences:\n"
" \":set hints\" enable online hints\n"
" \":set nohints\" disable online hints\n"
"Set your preferences in ~/.redisclirc\n",
@@ -846,8 +847,10 @@ static int cliSendCommand(int argc, char **argv, int repeat) {
output_raw = 0;
if (!strcasecmp(command,"info") ||
(argc >= 2 && !strcasecmp(command,"debug") &&
- ((!strcasecmp(argv[1],"jemalloc") && !strcasecmp(argv[2],"info")) ||
- !strcasecmp(argv[1],"htstats"))) ||
+ !strcasecmp(argv[1],"htstats")) ||
+ (argc >= 2 && !strcasecmp(command,"memory") &&
+ (!strcasecmp(argv[1],"malloc-stats") ||
+ !strcasecmp(argv[1],"doctor"))) ||
(argc == 2 && !strcasecmp(command,"cluster") &&
(!strcasecmp(argv[1],"nodes") ||
!strcasecmp(argv[1],"info"))) ||
@@ -919,7 +922,7 @@ static int cliSendCommand(int argc, char **argv, int repeat) {
return REDIS_ERR;
} else {
/* Store database number when SELECT was successfully executed. */
- if (!strcasecmp(command,"select") && argc == 2) {
+ if (!strcasecmp(command,"select") && argc == 2 && config.last_cmd_type != REDIS_REPLY_ERROR) {
config.dbnum = atoi(argv[1]);
cliRefreshPrompt();
} else if (!strcasecmp(command,"auth") && argc == 2) {
@@ -1223,7 +1226,7 @@ static sds *cliSplitArgs(char *line, int *argc) {
}
}
-/* Set the CLI perferences. This function is invoked when an interactive
+/* Set the CLI preferences. This function is invoked when an interactive
* ":command" is called, or when reading ~/.redisclirc file, in order to
* set user preferences. */
void cliSetPreferences(char **argv, int argc, int interactive) {
@@ -1258,6 +1261,7 @@ void cliLoadPreferences(void) {
if (argc > 0) cliSetPreferences(argv,argc,0);
sdsfreesplitres(argv,argc);
}
+ fclose(fp);
}
sdsfree(rcfile);
}
@@ -2405,7 +2409,7 @@ long long powerLawRand(long long min, long long max, double alpha) {
/* Generates a key name among a set of lru_test_sample_size keys, using
* an 80-20 distribution. */
void LRUTestGenKey(char *buf, size_t buflen) {
- snprintf(buf, buflen, "lru:%lld\n",
+ snprintf(buf, buflen, "lru:%lld",
powerLawRand(1, config.lru_test_sample_size, 6.2));
}
@@ -2427,8 +2431,11 @@ static void LRUTestMode(void) {
while(mstime() - start_cycle < 1000) {
/* Write cycle. */
for (j = 0; j < LRU_CYCLE_PIPELINE_SIZE; j++) {
+ char val[6];
+ val[5] = '\0';
+ for (int i = 0; i < 5; i++) val[i] = 'A'+rand()%('z'-'A');
LRUTestGenKey(key,sizeof(key));
- redisAppendCommand(context, "SET %s val",key);
+ redisAppendCommand(context, "SET %s %s",key,val);
}
for (j = 0; j < LRU_CYCLE_PIPELINE_SIZE; j++)
redisGetReply(context, (void**)&reply);
@@ -2594,13 +2601,16 @@ int main(int argc, char **argv) {
else
config.output = OUTPUT_STANDARD;
config.mb_delim = sdsnew("\n");
- cliInitHelp();
- cliIntegrateHelp();
firstarg = parseOptions(argc,argv);
argc -= firstarg;
argv += firstarg;
+ /* Initialize the help and, if possible, use the COMMAND command in order
+ * to retrieve missing entries. */
+ cliInitHelp();
+ cliIntegrateHelp();
+
/* Latency mode */
if (config.latency_mode) {
if (cliConnect(0) == REDIS_ERR) exit(1);
diff --git a/src/redis-trib.rb b/src/redis-trib.rb
index 68d46bdf8..39db97947 100755
--- a/src/redis-trib.rb
+++ b/src/redis-trib.rb
@@ -1305,6 +1305,11 @@ class RedisTrib
sleep 1
wait_cluster_join
flush_nodes_config # Useful for the replicas
+          # Reset the node information, so that when check_cluster
+          # shows the final summary for the newly created cluster,
+          # all the nodes are properly listed as masters or slaves.
+ reset_nodes
+ load_cluster_info_from_node(argv[0])
check_cluster
end
@@ -1440,7 +1445,7 @@ class RedisTrib
xputs ">>> Importing data from #{source_addr} to cluster #{argv[1]}"
use_copy = opt['copy']
use_replace = opt['replace']
-
+
# Check the existing cluster.
load_cluster_info_from_node(argv[0])
check_cluster
@@ -1664,7 +1669,6 @@ ALLOWED_OPTIONS={
def show_help
puts "Usage: redis-trib <command> <options> <arguments ...>\n\n"
COMMANDS.each{|k,v|
- o = ""
puts " #{k.ljust(15)} #{v[2]}"
if ALLOWED_OPTIONS[k]
ALLOWED_OPTIONS[k].each{|optname,has_arg|
diff --git a/src/redismodule.h b/src/redismodule.h
index 80767ed46..186e284c0 100644
--- a/src/redismodule.h
+++ b/src/redismodule.h
@@ -28,6 +28,7 @@
#define REDISMODULE_KEYTYPE_HASH 3
#define REDISMODULE_KEYTYPE_SET 4
#define REDISMODULE_KEYTYPE_ZSET 5
+#define REDISMODULE_KEYTYPE_MODULE 6
/* Reply types. */
#define REDISMODULE_REPLY_UNKNOWN -1
@@ -67,6 +68,8 @@
#define REDISMODULE_POSITIVE_INFINITE (1.0/0.0)
#define REDISMODULE_NEGATIVE_INFINITE (-1.0/0.0)
+#define REDISMODULE_NOT_USED(V) ((void) V)
+
/* ------------------------- End of common defines ------------------------ */
#ifndef REDISMODULE_CORE
@@ -78,14 +81,42 @@ typedef struct RedisModuleCtx RedisModuleCtx;
typedef struct RedisModuleKey RedisModuleKey;
typedef struct RedisModuleString RedisModuleString;
typedef struct RedisModuleCallReply RedisModuleCallReply;
+typedef struct RedisModuleIO RedisModuleIO;
+typedef struct RedisModuleType RedisModuleType;
+typedef struct RedisModuleDigest RedisModuleDigest;
+typedef struct RedisModuleBlockedClient RedisModuleBlockedClient;
typedef int (*RedisModuleCmdFunc) (RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
+typedef void *(*RedisModuleTypeLoadFunc)(RedisModuleIO *rdb, int encver);
+typedef void (*RedisModuleTypeSaveFunc)(RedisModuleIO *rdb, void *value);
+typedef void (*RedisModuleTypeRewriteFunc)(RedisModuleIO *aof, RedisModuleString *key, void *value);
+typedef size_t (*RedisModuleTypeMemUsageFunc)(void *value);
+typedef void (*RedisModuleTypeDigestFunc)(RedisModuleDigest *digest, void *value);
+typedef void (*RedisModuleTypeFreeFunc)(void *value);
+
+#define REDISMODULE_TYPE_METHOD_VERSION 1
+typedef struct RedisModuleTypeMethods {
+ uint64_t version;
+ RedisModuleTypeLoadFunc rdb_load;
+ RedisModuleTypeSaveFunc rdb_save;
+ RedisModuleTypeRewriteFunc aof_rewrite;
+ RedisModuleTypeMemUsageFunc mem_usage;
+ RedisModuleTypeDigestFunc digest;
+ RedisModuleTypeFreeFunc free;
+} RedisModuleTypeMethods;
+
#define REDISMODULE_GET_API(name) \
RedisModule_GetApi("RedisModule_" #name, ((void **)&RedisModule_ ## name))
#define REDISMODULE_API_FUNC(x) (*x)
+
+void *REDISMODULE_API_FUNC(RedisModule_Alloc)(size_t bytes);
+void *REDISMODULE_API_FUNC(RedisModule_Realloc)(void *ptr, size_t bytes);
+void REDISMODULE_API_FUNC(RedisModule_Free)(void *ptr);
+void *REDISMODULE_API_FUNC(RedisModule_Calloc)(size_t nmemb, size_t size);
+char *REDISMODULE_API_FUNC(RedisModule_Strdup)(const char *str);
int REDISMODULE_API_FUNC(RedisModule_GetApi)(const char *, void *);
int REDISMODULE_API_FUNC(RedisModule_CreateCommand)(RedisModuleCtx *ctx, const char *name, RedisModuleCmdFunc cmdfunc, const char *strflags, int firstkey, int lastkey, int keystep);
int REDISMODULE_API_FUNC(RedisModule_SetModuleAttribs)(RedisModuleCtx *ctx, const char *name, int ver, int apiver);
@@ -108,8 +139,10 @@ size_t REDISMODULE_API_FUNC(RedisModule_CallReplyLength)(RedisModuleCallReply *r
RedisModuleCallReply *REDISMODULE_API_FUNC(RedisModule_CallReplyArrayElement)(RedisModuleCallReply *reply, size_t idx);
RedisModuleString *REDISMODULE_API_FUNC(RedisModule_CreateString)(RedisModuleCtx *ctx, const char *ptr, size_t len);
RedisModuleString *REDISMODULE_API_FUNC(RedisModule_CreateStringFromLongLong)(RedisModuleCtx *ctx, long long ll);
+RedisModuleString *REDISMODULE_API_FUNC(RedisModule_CreateStringFromString)(RedisModuleCtx *ctx, const RedisModuleString *str);
+RedisModuleString *REDISMODULE_API_FUNC(RedisModule_CreateStringPrintf)(RedisModuleCtx *ctx, const char *fmt, ...);
void REDISMODULE_API_FUNC(RedisModule_FreeString)(RedisModuleCtx *ctx, RedisModuleString *str);
-const char *REDISMODULE_API_FUNC(RedisModule_StringPtrLen)(RedisModuleString *str, size_t *len);
+const char *REDISMODULE_API_FUNC(RedisModule_StringPtrLen)(const RedisModuleString *str, size_t *len);
int REDISMODULE_API_FUNC(RedisModule_ReplyWithError)(RedisModuleCtx *ctx, const char *err);
int REDISMODULE_API_FUNC(RedisModule_ReplyWithSimpleString)(RedisModuleCtx *ctx, const char *msg);
int REDISMODULE_API_FUNC(RedisModule_ReplyWithArray)(RedisModuleCtx *ctx, long len);
@@ -119,8 +152,8 @@ int REDISMODULE_API_FUNC(RedisModule_ReplyWithString)(RedisModuleCtx *ctx, Redis
int REDISMODULE_API_FUNC(RedisModule_ReplyWithNull)(RedisModuleCtx *ctx);
int REDISMODULE_API_FUNC(RedisModule_ReplyWithDouble)(RedisModuleCtx *ctx, double d);
int REDISMODULE_API_FUNC(RedisModule_ReplyWithCallReply)(RedisModuleCtx *ctx, RedisModuleCallReply *reply);
-int REDISMODULE_API_FUNC(RedisModule_StringToLongLong)(RedisModuleString *str, long long *ll);
-int REDISMODULE_API_FUNC(RedisModule_StringToDouble)(RedisModuleString *str, double *d);
+int REDISMODULE_API_FUNC(RedisModule_StringToLongLong)(const RedisModuleString *str, long long *ll);
+int REDISMODULE_API_FUNC(RedisModule_StringToDouble)(const RedisModuleString *str, double *d);
void REDISMODULE_API_FUNC(RedisModule_AutoMemory)(RedisModuleCtx *ctx);
int REDISMODULE_API_FUNC(RedisModule_Replicate)(RedisModuleCtx *ctx, const char *cmdname, const char *fmt, ...);
int REDISMODULE_API_FUNC(RedisModule_ReplicateVerbatim)(RedisModuleCtx *ctx);
@@ -151,11 +184,47 @@ int REDISMODULE_API_FUNC(RedisModule_IsKeysPositionRequest)(RedisModuleCtx *ctx)
void REDISMODULE_API_FUNC(RedisModule_KeyAtPos)(RedisModuleCtx *ctx, int pos);
unsigned long long REDISMODULE_API_FUNC(RedisModule_GetClientId)(RedisModuleCtx *ctx);
void *REDISMODULE_API_FUNC(RedisModule_PoolAlloc)(RedisModuleCtx *ctx, size_t bytes);
+RedisModuleType *REDISMODULE_API_FUNC(RedisModule_CreateDataType)(RedisModuleCtx *ctx, const char *name, int encver, RedisModuleTypeMethods *typemethods);
+int REDISMODULE_API_FUNC(RedisModule_ModuleTypeSetValue)(RedisModuleKey *key, RedisModuleType *mt, void *value);
+RedisModuleType *REDISMODULE_API_FUNC(RedisModule_ModuleTypeGetType)(RedisModuleKey *key);
+void *REDISMODULE_API_FUNC(RedisModule_ModuleTypeGetValue)(RedisModuleKey *key);
+void REDISMODULE_API_FUNC(RedisModule_SaveUnsigned)(RedisModuleIO *io, uint64_t value);
+uint64_t REDISMODULE_API_FUNC(RedisModule_LoadUnsigned)(RedisModuleIO *io);
+void REDISMODULE_API_FUNC(RedisModule_SaveSigned)(RedisModuleIO *io, int64_t value);
+int64_t REDISMODULE_API_FUNC(RedisModule_LoadSigned)(RedisModuleIO *io);
+void REDISMODULE_API_FUNC(RedisModule_EmitAOF)(RedisModuleIO *io, const char *cmdname, const char *fmt, ...);
+void REDISMODULE_API_FUNC(RedisModule_SaveString)(RedisModuleIO *io, RedisModuleString *s);
+void REDISMODULE_API_FUNC(RedisModule_SaveStringBuffer)(RedisModuleIO *io, const char *str, size_t len);
+RedisModuleString *REDISMODULE_API_FUNC(RedisModule_LoadString)(RedisModuleIO *io);
+char *REDISMODULE_API_FUNC(RedisModule_LoadStringBuffer)(RedisModuleIO *io, size_t *lenptr);
+void REDISMODULE_API_FUNC(RedisModule_SaveDouble)(RedisModuleIO *io, double value);
+double REDISMODULE_API_FUNC(RedisModule_LoadDouble)(RedisModuleIO *io);
+void REDISMODULE_API_FUNC(RedisModule_SaveFloat)(RedisModuleIO *io, float value);
+float REDISMODULE_API_FUNC(RedisModule_LoadFloat)(RedisModuleIO *io);
+void REDISMODULE_API_FUNC(RedisModule_Log)(RedisModuleCtx *ctx, const char *level, const char *fmt, ...);
+void REDISMODULE_API_FUNC(RedisModule_LogIOError)(RedisModuleIO *io, const char *levelstr, const char *fmt, ...);
+int REDISMODULE_API_FUNC(RedisModule_StringAppendBuffer)(RedisModuleCtx *ctx, RedisModuleString *str, const char *buf, size_t len);
+void REDISMODULE_API_FUNC(RedisModule_RetainString)(RedisModuleCtx *ctx, RedisModuleString *str);
+int REDISMODULE_API_FUNC(RedisModule_StringCompare)(RedisModuleString *a, RedisModuleString *b);
+RedisModuleCtx *REDISMODULE_API_FUNC(RedisModule_GetContextFromIO)(RedisModuleIO *io);
+RedisModuleBlockedClient *REDISMODULE_API_FUNC(RedisModule_BlockClient)(RedisModuleCtx *ctx, RedisModuleCmdFunc reply_callback, RedisModuleCmdFunc timeout_callback, void (*free_privdata)(void*), long long timeout_ms);
+int REDISMODULE_API_FUNC(RedisModule_UnblockClient)(RedisModuleBlockedClient *bc, void *privdata);
+int REDISMODULE_API_FUNC(RedisModule_IsBlockedReplyRequest)(RedisModuleCtx *ctx);
+int REDISMODULE_API_FUNC(RedisModule_IsBlockedTimeoutRequest)(RedisModuleCtx *ctx);
+void *REDISMODULE_API_FUNC(RedisModule_GetBlockedClientPrivateData)(RedisModuleCtx *ctx);
+int REDISMODULE_API_FUNC(RedisModule_AbortBlock)(RedisModuleBlockedClient *bc);
+long long REDISMODULE_API_FUNC(RedisModule_Milliseconds)(void);
/* This is included inline inside each Redis module. */
+static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int apiver) __attribute__((unused));
static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int apiver) {
void *getapifuncptr = ((void**)ctx)[0];
RedisModule_GetApi = (int (*)(const char *, void *)) (unsigned long)getapifuncptr;
+ REDISMODULE_GET_API(Alloc);
+ REDISMODULE_GET_API(Calloc);
+ REDISMODULE_GET_API(Free);
+ REDISMODULE_GET_API(Realloc);
+ REDISMODULE_GET_API(Strdup);
REDISMODULE_GET_API(CreateCommand);
REDISMODULE_GET_API(SetModuleAttribs);
REDISMODULE_GET_API(WrongArity);
@@ -191,6 +260,8 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int
REDISMODULE_GET_API(CreateStringFromCallReply);
REDISMODULE_GET_API(CreateString);
REDISMODULE_GET_API(CreateStringFromLongLong);
+ REDISMODULE_GET_API(CreateStringFromString);
+ REDISMODULE_GET_API(CreateStringPrintf);
REDISMODULE_GET_API(FreeString);
REDISMODULE_GET_API(StringPtrLen);
REDISMODULE_GET_API(AutoMemory);
@@ -221,6 +292,36 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int
REDISMODULE_GET_API(KeyAtPos);
REDISMODULE_GET_API(GetClientId);
REDISMODULE_GET_API(PoolAlloc);
+ REDISMODULE_GET_API(CreateDataType);
+ REDISMODULE_GET_API(ModuleTypeSetValue);
+ REDISMODULE_GET_API(ModuleTypeGetType);
+ REDISMODULE_GET_API(ModuleTypeGetValue);
+ REDISMODULE_GET_API(SaveUnsigned);
+ REDISMODULE_GET_API(LoadUnsigned);
+ REDISMODULE_GET_API(SaveSigned);
+ REDISMODULE_GET_API(LoadSigned);
+ REDISMODULE_GET_API(SaveString);
+ REDISMODULE_GET_API(SaveStringBuffer);
+ REDISMODULE_GET_API(LoadString);
+ REDISMODULE_GET_API(LoadStringBuffer);
+ REDISMODULE_GET_API(SaveDouble);
+ REDISMODULE_GET_API(LoadDouble);
+ REDISMODULE_GET_API(SaveFloat);
+ REDISMODULE_GET_API(LoadFloat);
+ REDISMODULE_GET_API(EmitAOF);
+ REDISMODULE_GET_API(Log);
+ REDISMODULE_GET_API(LogIOError);
+ REDISMODULE_GET_API(StringAppendBuffer);
+ REDISMODULE_GET_API(RetainString);
+ REDISMODULE_GET_API(StringCompare);
+ REDISMODULE_GET_API(GetContextFromIO);
+ REDISMODULE_GET_API(BlockClient);
+ REDISMODULE_GET_API(UnblockClient);
+ REDISMODULE_GET_API(IsBlockedReplyRequest);
+ REDISMODULE_GET_API(IsBlockedTimeoutRequest);
+ REDISMODULE_GET_API(GetBlockedClientPrivateData);
+ REDISMODULE_GET_API(AbortBlock);
+ REDISMODULE_GET_API(Milliseconds);
RedisModule_SetModuleAttribs(ctx,name,ver,apiver);
return REDISMODULE_OK;
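To put the new declarations in context, here is a minimal, hedged sketch of a module registering a data type through the RedisModuleTypeMethods struct above. REDISMODULE_APIVER_1 is assumed from the standard header (it is outside this excerpt), the MyType* callbacks are assumed to be implemented elsewhere, and the 9-character type name follows the usual module type naming rule:

    #include "redismodule.h"

    /* Assumed to exist elsewhere in the module; signatures match the
     * typedefs introduced above. */
    void *MyTypeRdbLoad(RedisModuleIO *rdb, int encver);
    void MyTypeRdbSave(RedisModuleIO *rdb, void *value);
    void MyTypeAofRewrite(RedisModuleIO *aof, RedisModuleString *key, void *value);
    void MyTypeFree(void *value);

    static RedisModuleType *MyType;

    int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
        REDISMODULE_NOT_USED(argv);
        REDISMODULE_NOT_USED(argc);
        if (RedisModule_Init(ctx,"mymodule",1,REDISMODULE_APIVER_1) == REDISMODULE_ERR)
            return REDISMODULE_ERR;

        RedisModuleTypeMethods tm = {
            .version = REDISMODULE_TYPE_METHOD_VERSION,
            .rdb_load = MyTypeRdbLoad,
            .rdb_save = MyTypeRdbSave,
            .aof_rewrite = MyTypeAofRewrite,
            .mem_usage = NULL,          /* optional callback */
            .digest = NULL,             /* optional callback */
            .free = MyTypeFree
        };
        MyType = RedisModule_CreateDataType(ctx,"mytype-ex",0,&tm);
        if (MyType == NULL) return REDISMODULE_ERR;
        return REDISMODULE_OK;
    }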
diff --git a/src/replication.c b/src/replication.c
index 8aa0d807a..df2e23f3a 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -47,7 +47,7 @@ int cancelReplicationHandshake(void);
/* Return the pointer to a string representing the slave ip:listening_port
* pair. Mostly useful for logging, since we want to log a slave using its
- * IP address and it's listening port which is more clear for the user, for
+ * IP address and its listening port, which is clearer for the user, for
* example: "Closing connection with slave 10.1.2.3:6380". */
char *replicationGetSlaveName(client *c) {
static char buf[NET_PEER_ID_LEN];
@@ -55,7 +55,12 @@ char *replicationGetSlaveName(client *c) {
ip[0] = '\0';
buf[0] = '\0';
- if (anetPeerToString(c->fd,ip,sizeof(ip),NULL) != -1) {
+ if (c->slave_ip[0] != '\0' ||
+ anetPeerToString(c->fd,ip,sizeof(ip),NULL) != -1)
+ {
+ /* Note that the 'ip' buffer is always larger than 'c->slave_ip' */
+ if (c->slave_ip[0] != '\0') memcpy(ip,c->slave_ip,sizeof(c->slave_ip));
+
if (c->slave_listening_port)
anetFormatAddr(buf,sizeof(buf),ip,c->slave_listening_port);
else
@@ -74,11 +79,6 @@ void createReplicationBacklog(void) {
server.repl_backlog = zmalloc(server.repl_backlog_size);
server.repl_backlog_histlen = 0;
server.repl_backlog_idx = 0;
- /* When a new backlog buffer is created, we increment the replication
- * offset by one to make sure we'll not be able to PSYNC with any
- * previous slave. This is needed because we avoid incrementing the
- * master_repl_offset if no backlog exists nor slaves are attached. */
- server.master_repl_offset++;
/* We don't have any data inside our buffer, but virtually the first
* byte we have is the next byte that will be generated for the
@@ -165,12 +165,24 @@ void feedReplicationBacklogWithObject(robj *o) {
feedReplicationBacklog(p,len);
}
+/* Propagate write commands to slaves, and populate the replication backlog
+ * as well. This function is used if the instance is a master: we use
+ * the commands received by our clients in order to create the replication
+ * stream. If instead the instance is a slave with sub-slaves attached,
+ * we use replicationFeedSlavesFromMasterStream(). */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
listNode *ln;
listIter li;
int j, len;
char llstr[LONG_STR_SIZE];
+ /* If the instance is not a top level master, return ASAP: we'll just proxy
+ * the stream of data we receive from our master instead, in order to
+     * propagate an *identical* replication stream. In this way this slave can
+ * advertise the same replication ID as the master (since it shares the
+ * master replication history and has the same backlog and offsets). */
+ if (server.masterhost != NULL) return;
+
/* If there aren't slaves, and there is no backlog buffer to populate,
* we can return ASAP. */
if (server.repl_backlog == NULL && listLength(slaves) == 0) return;
@@ -239,7 +251,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
}
/* Write the command to every slave. */
- listRewind(server.slaves,&li);
+ listRewind(slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
@@ -260,6 +272,34 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
}
}
+/* This function is used in order to proxy what we receive from our master
+ * to our sub-slaves. */
+#include <ctype.h>
+void replicationFeedSlavesFromMasterStream(list *slaves, char *buf, size_t buflen) {
+ listNode *ln;
+ listIter li;
+
+ /* Debugging: this is handy to see the stream sent from master
+ * to slaves. Disabled with if(0). */
+ if (0) {
+ printf("%zu:",buflen);
+ for (size_t j = 0; j < buflen; j++) {
+ printf("%c", isprint(buf[j]) ? buf[j] : '.');
+ }
+ printf("\n");
+ }
+
+ if (server.repl_backlog) feedReplicationBacklog(buf,buflen);
+ listRewind(slaves,&li);
+ while((ln = listNext(&li))) {
+ client *slave = ln->value;
+
+ /* Don't feed slaves that are still waiting for BGSAVE to start */
+ if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) continue;
+ addReplyString(slave,buf,buflen);
+ }
+}
+
void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc) {
listNode *ln;
listIter li;
@@ -324,7 +364,7 @@ long long addReplyReplicationBacklog(client *c, long long offset) {
skip = offset - server.repl_backlog_off;
serverLog(LL_DEBUG, "[PSYNC] Skipping: %lld", skip);
- /* Point j to the oldest byte, that is actaully our
+ /* Point j to the oldest byte, that is actually our
* server.repl_backlog_off byte. */
j = (server.repl_backlog_idx +
(server.repl_backlog_size-server.repl_backlog_histlen)) %
@@ -356,18 +396,14 @@ long long addReplyReplicationBacklog(client *c, long long offset) {
* the BGSAVE process started and before executing any other command
* from clients. */
long long getPsyncInitialOffset(void) {
- long long psync_offset = server.master_repl_offset;
- /* Add 1 to psync_offset if it the replication backlog does not exists
- * as when it will be created later we'll increment the offset by one. */
- if (server.repl_backlog == NULL) psync_offset++;
- return psync_offset;
+ return server.master_repl_offset;
}
/* Send a FULLRESYNC reply in the specific case of a full resynchronization,
* as a side effect setup the slave for a full sync in different ways:
*
- * 1) Remember, into the slave client structure, the offset we sent
- * here, so that if new slaves will later attach to the same
+ * 1) Remember, into the slave client structure, the replication offset
+ * we sent here, so that if new slaves will later attach to the same
* background RDB saving process (by duplicating this client output
* buffer), we can get the right offset from this slave.
* 2) Set the replication state of the slave to WAIT_BGSAVE_END so that
@@ -387,14 +423,14 @@ int replicationSetupSlaveForFullResync(client *slave, long long offset) {
slave->replstate = SLAVE_STATE_WAIT_BGSAVE_END;
/* We are going to accumulate the incremental changes for this
* slave as well. Set slaveseldb to -1 in order to force to re-emit
- * a SLEECT statement in the replication stream. */
+ * a SELECT statement in the replication stream. */
server.slaveseldb = -1;
/* Don't send this reply to slaves that approached us with
* the old SYNC command. */
if (!(slave->flags & CLIENT_PRE_PSYNC)) {
buflen = snprintf(buf,sizeof(buf),"+FULLRESYNC %s %lld\r\n",
- server.runid,offset);
+ server.replid,offset);
if (write(slave->fd,buf,buflen) != buflen) {
freeClientAsync(slave);
return C_ERR;
@@ -410,19 +446,40 @@ int replicationSetupSlaveForFullResync(client *slave, long long offset) {
* with the usual full resync. */
int masterTryPartialResynchronization(client *c) {
long long psync_offset, psync_len;
- char *master_runid = c->argv[1]->ptr;
+ char *master_replid = c->argv[1]->ptr;
char buf[128];
int buflen;
- /* Is the runid of this master the same advertised by the wannabe slave
- * via PSYNC? If runid changed this master is a different instance and
- * there is no way to continue. */
- if (strcasecmp(master_runid, server.runid)) {
+ /* Parse the replication offset asked by the slave. Go to full sync
+ * on parse error: this should never happen but we try to handle
+ * it in a robust way compared to aborting. */
+ if (getLongLongFromObjectOrReply(c,c->argv[2],&psync_offset,NULL) !=
+ C_OK) goto need_full_resync;
+
+ /* Is the replication ID of this master the same advertised by the wannabe
+ * slave via PSYNC? If the replication ID changed this master has a
+ * different replication history, and there is no way to continue.
+ *
+ * Note that there are two potentially valid replication IDs: the ID1
+ * and the ID2. The ID2 however is only valid up to a specific offset. */
+ if (strcasecmp(master_replid, server.replid) &&
+ (strcasecmp(master_replid, server.replid2) ||
+ psync_offset > server.second_replid_offset))
+ {
/* Run id "?" is used by slaves that want to force a full resync. */
- if (master_runid[0] != '?') {
- serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
- "Runid mismatch (Client asked for runid '%s', my runid is '%s')",
- master_runid, server.runid);
+ if (master_replid[0] != '?') {
+ if (strcasecmp(master_replid, server.replid) &&
+ strcasecmp(master_replid, server.replid2))
+ {
+ serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
+ "Replication ID mismatch (Slave asked for '%s', my "
+ "replication IDs are '%s' and '%s')",
+ master_replid, server.replid, server.replid2);
+ } else {
+ serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
+ "Requested offset for second ID was %lld, but I can reply "
+ "up to %lld", psync_offset, server.second_replid_offset);
+ }
} else {
serverLog(LL_NOTICE,"Full resync requested by slave %s",
replicationGetSlaveName(c));
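Stripped of logging, the replication ID test introduced above reduces to a small predicate; a self-contained sketch (names follow the patch, and the backlog-range check further below still applies separately):

    #include <strings.h>   /* strcasecmp */

    /* A partial resync can be served when the slave's ID matches our
     * current replication ID, or matches the previous one (replid2) and
     * does not request data past the offset where that history ended. */
    static int replidAllowsPartialSync(const char *master_replid,
                                       const char *replid, const char *replid2,
                                       long long psync_offset,
                                       long long second_replid_offset) {
        return !strcasecmp(master_replid, replid) ||
               (!strcasecmp(master_replid, replid2) &&
                psync_offset <= second_replid_offset);
    }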
@@ -431,8 +488,6 @@ int masterTryPartialResynchronization(client *c) {
}
/* We still have the data our slave is asking for? */
- if (getLongLongFromObjectOrReply(c,c->argv[2],&psync_offset,NULL) !=
- C_OK) goto need_full_resync;
if (!server.repl_backlog ||
psync_offset < server.repl_backlog_off ||
psync_offset > (server.repl_backlog_off + server.repl_backlog_histlen))
@@ -458,7 +513,11 @@ int masterTryPartialResynchronization(client *c) {
/* We can't use the connection buffers since they are used to accumulate
* new commands at this stage. But we are sure the socket send buffer is
* empty so this write will never fail actually. */
- buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
+ if (c->slave_capa & SLAVE_CAPA_PSYNC2) {
+ buflen = snprintf(buf,sizeof(buf),"+CONTINUE %s\r\n", server.replid);
+ } else {
+ buflen = snprintf(buf,sizeof(buf),"+CONTINUE\r\n");
+ }
if (write(c->fd,buf,buflen) != buflen) {
freeClientAsync(c);
return C_OK;
@@ -510,10 +569,18 @@ int startBgsaveForReplication(int mincapa) {
serverLog(LL_NOTICE,"Starting BGSAVE for SYNC with target: %s",
socket_target ? "slaves sockets" : "disk");
+ rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
+ /* If we are saving for a chained slave (that is, if we are,
+ * in turn, a slave of another instance), make sure after
+ * loadig the RDB, our slaves select the right DB: we'll just
+ * send the replication stream we receive from our master, so
+ * no way to send SELECT commands. */
+ if (server.master) rsi.repl_stream_db = server.master->db->id;
+
if (socket_target)
- retval = rdbSaveToSlavesSockets();
+ retval = rdbSaveToSlavesSockets(&rsi);
else
- retval = rdbSaveBackground(server.rdb_filename);
+ retval = rdbSaveBackground(server.rdb_filename,&rsi);
/* If we failed to BGSAVE, remove the slaves waiting for a full
      * resynchronization from the list of slaves, inform them with
@@ -563,7 +630,7 @@ void syncCommand(client *c) {
/* Refuse SYNC requests if we are a slave but the link with our master
* is not ok... */
if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) {
- addReplyError(c,"Can't SYNC while not connected with my master");
+ addReplySds(c,sdsnew("-NOMASTERLINK Can't SYNC while not connected with my master\r\n"));
return;
}
@@ -584,22 +651,22 @@ void syncCommand(client *c) {
* when this happens masterTryPartialResynchronization() already
* replied with:
*
- * +FULLRESYNC <runid> <offset>
+ * +FULLRESYNC <replid> <offset>
*
- * So the slave knows the new runid and offset to try a PSYNC later
+ * So the slave knows the new replid and offset to try a PSYNC later
* if the connection with the master is lost. */
if (!strcasecmp(c->argv[0]->ptr,"psync")) {
if (masterTryPartialResynchronization(c) == C_OK) {
server.stat_sync_partial_ok++;
return; /* No full resync needed, return. */
} else {
- char *master_runid = c->argv[1]->ptr;
+ char *master_replid = c->argv[1]->ptr;
/* Increment stats for failed PSYNCs, but only if the
- * runid is not "?", as this is used by slaves to force a full
+ * replid is not "?", as this is used by slaves to force a full
             * resync on purpose when they are not able to partially
* resync. */
- if (master_runid[0] != '?') server.stat_sync_partial_err++;
+ if (master_replid[0] != '?') server.stat_sync_partial_err++;
}
} else {
/* If a slave uses SYNC, we are dealing with an old implementation
@@ -620,6 +687,16 @@ void syncCommand(client *c) {
c->flags |= CLIENT_SLAVE;
listAddNodeTail(server.slaves,c);
+ /* Create the replication backlog if needed. */
+ if (listLength(server.slaves) == 1 && server.repl_backlog == NULL) {
+ /* When we create the backlog from scratch, we always use a new
+ * replication ID and clear the ID2, since there is no valid
+ * past history. */
+ changeReplicationId();
+ clearReplicationId2();
+ createReplicationBacklog();
+ }
+
/* CASE 1: BGSAVE is in progress, with disk target. */
if (server.rdb_child_pid != -1 &&
server.rdb_child_type == RDB_CHILD_TYPE_DISK)
@@ -647,7 +724,7 @@ void syncCommand(client *c) {
} else {
/* No way, we need to wait for the next BGSAVE in order to
* register differences. */
- serverLog(LL_NOTICE,"Waiting for next BGSAVE for SYNC");
+ serverLog(LL_NOTICE,"Can't attach the slave to the current BGSAVE. Waiting for next BGSAVE for SYNC");
}
/* CASE 2: BGSAVE is in progress, with socket target. */
@@ -657,7 +734,7 @@ void syncCommand(client *c) {
/* There is an RDB child process but it is writing directly to
* children sockets. We need to wait for the next BGSAVE
* in order to synchronize. */
- serverLog(LL_NOTICE,"Waiting for next BGSAVE for SYNC");
+ serverLog(LL_NOTICE,"Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC");
/* CASE 3: There is no BGSAVE is progress. */
} else {
@@ -666,17 +743,20 @@ void syncCommand(client *c) {
* replicationCron() since we want to delay its start a
* few seconds to wait for more slaves to arrive. */
if (server.repl_diskless_sync_delay)
- serverLog(LL_NOTICE,"Delay next BGSAVE for SYNC");
+ serverLog(LL_NOTICE,"Delay next BGSAVE for diskless SYNC");
} else {
/* Target is disk (or the slave is not capable of supporting
* diskless replication) and we don't have a BGSAVE in progress,
* let's start one. */
- if (startBgsaveForReplication(c->slave_capa) != C_OK) return;
+ if (server.aof_child_pid == -1) {
+ startBgsaveForReplication(c->slave_capa);
+ } else {
+ serverLog(LL_NOTICE,
+ "No BGSAVE in progress, but an AOF rewrite is active. "
+ "BGSAVE for replication delayed");
+ }
}
}
-
- if (listLength(server.slaves) == 1 && server.repl_backlog == NULL)
- createReplicationBacklog();
return;
}
@@ -711,10 +791,21 @@ void replconfCommand(client *c) {
&port,NULL) != C_OK))
return;
c->slave_listening_port = port;
+ } else if (!strcasecmp(c->argv[j]->ptr,"ip-address")) {
+ sds ip = c->argv[j+1]->ptr;
+ if (sdslen(ip) < sizeof(c->slave_ip)) {
+ memcpy(c->slave_ip,ip,sdslen(ip)+1);
+ } else {
+ addReplyErrorFormat(c,"REPLCONF ip-address provided by "
+ "slave instance is too long: %zd bytes", sdslen(ip));
+ return;
+ }
} else if (!strcasecmp(c->argv[j]->ptr,"capa")) {
/* Ignore capabilities not understood by this master. */
if (!strcasecmp(c->argv[j+1]->ptr,"eof"))
c->slave_capa |= SLAVE_CAPA_EOF;
+ else if (!strcasecmp(c->argv[j+1]->ptr,"psync2"))
+ c->slave_capa |= SLAVE_CAPA_PSYNC2;
} else if (!strcasecmp(c->argv[j]->ptr,"ack")) {
/* REPLCONF ACK is used by slave to inform the master the amount
* of replication stream that it processed so far. It is an
@@ -738,7 +829,7 @@ void replconfCommand(client *c) {
/* REPLCONF GETACK is used in order to request an ACK ASAP
* to the slave. */
if (server.masterhost && server.master) replicationSendAck();
- /* Note: this command does not reply anything! */
+ return;
} else {
addReplyErrorFormat(c,"Unrecognized REPLCONF option: %s",
(char*)c->argv[j]->ptr);
@@ -908,6 +999,43 @@ void updateSlavesWaitingBgsave(int bgsaveerr, int type) {
if (startbgsave) startBgsaveForReplication(mincapa);
}
+/* Change the current instance replication ID with a new, random one.
+ * This will prevent successful PSYNCs between this master and other
+ * slaves, so this function should be called when something happens that
+ * alters the current history of the dataset. */
+void changeReplicationId(void) {
+ getRandomHexChars(server.replid,CONFIG_RUN_ID_SIZE);
+ server.replid[CONFIG_RUN_ID_SIZE] = '\0';
+}
+
+/* Clear (invalidate) the secondary replication ID. This happens, for
+ * example, after a full resynchronization, when we start a new replication
+ * history. */
+void clearReplicationId2(void) {
+ memset(server.replid2,'0',sizeof(server.replid));
+ server.replid2[CONFIG_RUN_ID_SIZE] = '\0';
+ server.second_replid_offset = -1;
+}
+
+/* Use the current replication ID / offset as secondary replication
+ * ID, and change the current one in order to start a new history.
+ * This should be used when an instance is switched from slave to master
+ * so that it can serve PSYNC requests performed using the master
+ * replication ID. */
+void shiftReplicationId(void) {
+ memcpy(server.replid2,server.replid,sizeof(server.replid));
+ /* We set the second replid offset to the master offset + 1, since
+ * the slave will ask for the first byte it has not yet received, so
+ * we need to add one to the offset: for example if, as a slave, we are
+ * sure we have the same history as the master for 50 bytes, after we
+ * are turned into a master, we can accept a PSYNC request with offset
+ * 51, since the slave asking has the same history up to the 50th
+ * byte, and is asking for the new bytes starting at offset 51. */
+ server.second_replid_offset = server.master_repl_offset+1;
+ changeReplicationId();
+ serverLog(LL_WARNING,"Setting secondary replication ID to %s, valid up to offset: %lld. New replication ID is %s", server.replid2, server.second_replid_offset, server.replid);
+}
+
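
The offset arithmetic in shiftReplicationId() is easier to see with a small standalone model. The sketch below is illustrative only: it uses plain C globals instead of the real server struct, a shortened ID size, and a hypothetical can_partial_sync() helper; the real acceptance test also checks the backlog range, as in masterTryPartialResynchronization() above.

    #include <stdio.h>
    #include <string.h>

    #define ID_SIZE 8  /* shortened from the real CONFIG_RUN_ID_SIZE (40) */

    /* Stand-in replication state, not the real data structures. */
    static char replid[ID_SIZE+1]  = "11111111";
    static char replid2[ID_SIZE+1] = "00000000";
    static long long master_repl_offset = 50;   /* history shared with the old master */
    static long long second_replid_offset = -1;

    /* Same idea as shiftReplicationId(): the old ID becomes the secondary ID,
     * valid up to the current offset + 1, and a new primary ID is installed. */
    static void shift_id(const char *new_id) {
        memcpy(replid2, replid, sizeof(replid));
        second_replid_offset = master_repl_offset + 1;
        memcpy(replid, new_id, ID_SIZE);
        replid[ID_SIZE] = '\0';
    }

    /* Would "PSYNC <id> <offset>" be acceptable? (backlog checks omitted) */
    static int can_partial_sync(const char *id, long long offset) {
        if (strcmp(id, replid) == 0) return 1;
        if (strcmp(id, replid2) == 0 && offset <= second_replid_offset) return 1;
        return 0;
    }

    int main(void) {
        shift_id("22222222");
        /* A sibling slave sharing history up to byte 50 asks for byte 51:
         * it matches replid2 at offset 51 == second_replid_offset, so it
         * can continue without a full resync. */
        printf("%s\n", can_partial_sync("11111111", 51) ? "+CONTINUE" : "+FULLRESYNC");
        return 0;
    }
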
/* ----------------------------------- SLAVE -------------------------------- */
/* Returns 1 if the given replication state is a handshake state,
@@ -945,18 +1073,30 @@ void replicationEmptyDbCallback(void *privdata) {
/* Once we have a link with the master and the synchronization was
* performed, this function materializes the master client we store
* at server.master, starting from the specified file descriptor. */
-void replicationCreateMasterClient(int fd) {
+void replicationCreateMasterClient(int fd, int dbid) {
server.master = createClient(fd);
server.master->flags |= CLIENT_MASTER;
server.master->authenticated = 1;
- server.repl_state = REPL_STATE_CONNECTED;
- server.master->reploff = server.repl_master_initial_offset;
- memcpy(server.master->replrunid, server.repl_master_runid,
- sizeof(server.repl_master_runid));
+ server.master->reploff = server.master_initial_offset;
+ memcpy(server.master->replid, server.master_replid,
+ sizeof(server.master_replid));
/* If master offset is set to -1, this master is old and is not
* PSYNC capable, so we flag it accordingly. */
if (server.master->reploff == -1)
server.master->flags |= CLIENT_PRE_PSYNC;
+ if (dbid != -1) selectDb(server.master,dbid);
+}
+
+void restartAOF() {
+ int retry = 10;
+ while (retry-- && startAppendOnly() == C_ERR) {
+ serverLog(LL_WARNING,"Failed enabling the AOF after successful master synchronization! Trying it again in one second.");
+ sleep(1);
+ }
+ if (!retry) {
+ serverLog(LL_WARNING,"FATAL: this slave instance finished the synchronization with its master, but the AOF can't be turned on. Exiting now.");
+ exit(1);
+ }
}
/* Asynchronously read the SYNC payload we receive from a master */
@@ -1100,12 +1240,17 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
}
if (eof_reached) {
+ int aof_is_enabled = server.aof_state != AOF_OFF;
+
if (rename(server.repl_transfer_tmpfile,server.rdb_filename) == -1) {
serverLog(LL_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
cancelReplicationHandshake();
return;
}
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Flushing old data");
+    /* We need to stop any AOFRW fork before flushing and parsing
+ * RDB, otherwise we'll create a copy-on-write disaster. */
+ if(aof_is_enabled) stopAppendOnly();
signalFlushedDb(-1);
emptyDb(
-1,
@@ -1117,34 +1262,38 @@ void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
* time for non blocking loading. */
aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory");
- if (rdbLoad(server.rdb_filename) != C_OK) {
+ rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
+ if (rdbLoad(server.rdb_filename,&rsi) != C_OK) {
serverLog(LL_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
cancelReplicationHandshake();
+ /* Re-enable the AOF if we disabled it earlier, in order to restore
+ * the original configuration. */
+ if (aof_is_enabled) restartAOF();
return;
}
/* Final setup of the connected slave <- master link */
zfree(server.repl_transfer_tmpfile);
close(server.repl_transfer_fd);
- replicationCreateMasterClient(server.repl_transfer_s);
+ replicationCreateMasterClient(server.repl_transfer_s,rsi.repl_stream_db);
+ server.repl_state = REPL_STATE_CONNECTED;
+    /* After a full resynchronization we use the replication ID and
+ * offset of the master. The secondary ID / offset are cleared since
+ * we are starting a new history. */
+ memcpy(server.replid,server.master->replid,sizeof(server.replid));
+ server.master_repl_offset = server.master->reploff;
+ clearReplicationId2();
+ /* Let's create the replication backlog if needed. Slaves need to
+ * accumulate the backlog regardless of the fact they have sub-slaves
+ * or not, in order to behave correctly if they are promoted to
+ * masters after a failover. */
+ if (server.repl_backlog == NULL) createReplicationBacklog();
+
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Finished with success");
/* Restart the AOF subsystem now that we finished the sync. This
* will trigger an AOF rewrite, and when done will start appending
* to the new file. */
- if (server.aof_state != AOF_OFF) {
- int retry = 10;
-
- stopAppendOnly();
- while (retry-- && startAppendOnly() == C_ERR) {
- serverLog(LL_WARNING,"Failed enabling the AOF after successful master synchronization! Trying it again in one second.");
- sleep(1);
- }
- if (!retry) {
- serverLog(LL_WARNING,"FATAL: this slave instance finished the synchronization with its master, but the AOF can't be turned on. Exiting now.");
- exit(1);
- }
- }
+ if (aof_is_enabled) restartAOF();
}
-
return;
error:
@@ -1243,14 +1392,15 @@ char *sendSynchronousCommand(int flags, int fd, ...) {
* offset is saved.
* PSYNC_NOT_SUPPORTED: If the server does not understand PSYNC at all and
* the caller should fall back to SYNC.
- * PSYNC_WRITE_ERR: There was an error writing the command to the socket.
+ * PSYNC_WRITE_ERROR: There was an error writing the command to the socket.
* PSYNC_WAIT_REPLY: Call again the function with read_reply set to 1.
+ * PSYNC_TRY_LATER: Master is currently in a transient error condition.
*
* Notable side effects:
*
* 1) As a side effect of the function call the function removes the readable
* event handler from "fd", unless the return value is PSYNC_WAIT_REPLY.
- * 2) server.repl_master_initial_offset is set to the right value according
+ * 2) server.master_initial_offset is set to the right value according
* to the master reply. This will be used to populate the 'server.master'
* structure replication offset.
*/
@@ -1260,32 +1410,33 @@ char *sendSynchronousCommand(int flags, int fd, ...) {
#define PSYNC_CONTINUE 2
#define PSYNC_FULLRESYNC 3
#define PSYNC_NOT_SUPPORTED 4
+#define PSYNC_TRY_LATER 5
int slaveTryPartialResynchronization(int fd, int read_reply) {
- char *psync_runid;
+ char *psync_replid;
char psync_offset[32];
sds reply;
/* Writing half */
if (!read_reply) {
- /* Initially set repl_master_initial_offset to -1 to mark the current
+ /* Initially set master_initial_offset to -1 to mark the current
* master run_id and offset as not valid. Later if we'll be able to do
* a FULL resync using the PSYNC command we'll set the offset at the
* right value, so that this information will be propagated to the
* client structure representing the master into server.master. */
- server.repl_master_initial_offset = -1;
+ server.master_initial_offset = -1;
if (server.cached_master) {
- psync_runid = server.cached_master->replrunid;
+ psync_replid = server.cached_master->replid;
snprintf(psync_offset,sizeof(psync_offset),"%lld", server.cached_master->reploff+1);
- serverLog(LL_NOTICE,"Trying a partial resynchronization (request %s:%s).", psync_runid, psync_offset);
+ serverLog(LL_NOTICE,"Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset);
} else {
serverLog(LL_NOTICE,"Partial resynchronization not possible (no cached master)");
- psync_runid = "?";
+ psync_replid = "?";
memcpy(psync_offset,"-1",3);
}
/* Issue the PSYNC command */
- reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_runid,psync_offset,NULL);
+ reply = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PSYNC",psync_replid,psync_offset,NULL);
if (reply != NULL) {
serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply);
sdsfree(reply);
@@ -1307,31 +1458,31 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
aeDeleteFileEvent(server.el,fd,AE_READABLE);
if (!strncmp(reply,"+FULLRESYNC",11)) {
- char *runid = NULL, *offset = NULL;
+ char *replid = NULL, *offset = NULL;
/* FULL RESYNC, parse the reply in order to extract the run id
* and the replication offset. */
- runid = strchr(reply,' ');
- if (runid) {
- runid++;
- offset = strchr(runid,' ');
+ replid = strchr(reply,' ');
+ if (replid) {
+ replid++;
+ offset = strchr(replid,' ');
if (offset) offset++;
}
- if (!runid || !offset || (offset-runid-1) != CONFIG_RUN_ID_SIZE) {
+ if (!replid || !offset || (offset-replid-1) != CONFIG_RUN_ID_SIZE) {
serverLog(LL_WARNING,
"Master replied with wrong +FULLRESYNC syntax.");
/* This is an unexpected condition, actually the +FULLRESYNC
* reply means that the master supports PSYNC, but the reply
* format seems wrong. To stay safe we blank the master
- * runid to make sure next PSYNCs will fail. */
- memset(server.repl_master_runid,0,CONFIG_RUN_ID_SIZE+1);
+ * replid to make sure next PSYNCs will fail. */
+ memset(server.master_replid,0,CONFIG_RUN_ID_SIZE+1);
} else {
- memcpy(server.repl_master_runid, runid, offset-runid-1);
- server.repl_master_runid[CONFIG_RUN_ID_SIZE] = '\0';
- server.repl_master_initial_offset = strtoll(offset,NULL,10);
+ memcpy(server.master_replid, replid, offset-replid-1);
+ server.master_replid[CONFIG_RUN_ID_SIZE] = '\0';
+ server.master_initial_offset = strtoll(offset,NULL,10);
serverLog(LL_NOTICE,"Full resync from master: %s:%lld",
- server.repl_master_runid,
- server.repl_master_initial_offset);
+ server.master_replid,
+ server.master_initial_offset);
}
/* We are going to full resync, discard the cached master structure. */
replicationDiscardCachedMaster();
@@ -1340,17 +1491,64 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
}
if (!strncmp(reply,"+CONTINUE",9)) {
- /* Partial resync was accepted, set the replication state accordingly */
+ /* Partial resync was accepted. */
serverLog(LL_NOTICE,
"Successful partial resynchronization with master.");
+
+ /* Check the new replication ID advertised by the master. If it
+     * changed, we need to set the new ID as primary ID, and set our
+ * secondary ID as the old master ID up to the current offset, so
+ * that our sub-slaves will be able to PSYNC with us after a
+ * disconnection. */
+ char *start = reply+10;
+ char *end = reply+9;
+ while(end[0] != '\r' && end[0] != '\n' && end[0] != '\0') end++;
+ if (end-start == CONFIG_RUN_ID_SIZE) {
+ char new[CONFIG_RUN_ID_SIZE+1];
+ memcpy(new,start,CONFIG_RUN_ID_SIZE);
+ new[CONFIG_RUN_ID_SIZE] = '\0';
+
+ if (strcmp(new,server.cached_master->replid)) {
+ /* Master ID changed. */
+ serverLog(LL_WARNING,"Master replication ID changed to %s",new);
+
+ /* Set the old ID as our ID2, up to the current offset+1. */
+ memcpy(server.replid2,server.cached_master->replid,
+ sizeof(server.replid2));
+ server.second_replid_offset = server.master_repl_offset+1;
+
+ /* Update the cached master ID and our own primary ID to the
+ * new one. */
+ memcpy(server.replid,new,sizeof(server.replid));
+ memcpy(server.cached_master->replid,new,sizeof(server.replid));
+
+ /* Disconnect all the sub-slaves: they need to be notified. */
+ disconnectSlaves();
+ }
+ }
+
+ /* Setup the replication to continue. */
sdsfree(reply);
replicationResurrectCachedMaster(fd);
return PSYNC_CONTINUE;
}
- /* If we reach this point we received either an error since the master does
- * not understand PSYNC, or an unexpected reply from the master.
- * Return PSYNC_NOT_SUPPORTED to the caller in both cases. */
+ /* If we reach this point we received either an error (since the master does
+ * not understand PSYNC or because it is in a special state and cannot
+ * serve our request), or an unexpected reply from the master.
+ *
+ * Return PSYNC_NOT_SUPPORTED on errors we don't understand, otherwise
+ * return PSYNC_TRY_LATER if we believe this is a transient error. */
+
+ if (!strncmp(reply,"-NOMASTERLINK",13) ||
+ !strncmp(reply,"-LOADING",8))
+ {
+ serverLog(LL_NOTICE,
+ "Master is currently unable to PSYNC "
+ "but should be in the future: %s", reply);
+ sdsfree(reply);
+ return PSYNC_TRY_LATER;
+ }
if (strncmp(reply,"-ERR",4)) {
/* If it's not an error, log the unexpected event. */
@@ -1366,6 +1564,8 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
return PSYNC_NOT_SUPPORTED;
}
+/* This handler fires when the non blocking connect was able to
+ * establish a connection with the master. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
char tmpfile[256], *err = NULL;
int dfd, maxtries = 5;
@@ -1382,7 +1582,8 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
return;
}
- /* Check for errors in the socket. */
+ /* Check for errors in the socket: after a non blocking connect() we
+ * may find that the socket is in error state. */
if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1)
sockerr = errno;
if (sockerr) {
@@ -1456,7 +1657,8 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
/* Set the slave port, so that Master's INFO command can list the
* slave listening port correctly. */
if (server.repl_state == REPL_STATE_SEND_PORT) {
- sds port = sdsfromlonglong(server.port);
+ sds port = sdsfromlonglong(server.slave_announce_port ?
+ server.slave_announce_port : server.port);
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"listening-port",port, NULL);
sdsfree(port);
@@ -1476,16 +1678,49 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
"REPLCONF listening-port: %s", err);
}
sdsfree(err);
+ server.repl_state = REPL_STATE_SEND_IP;
+ }
+
+ /* Skip REPLCONF ip-address if there is no slave-announce-ip option set. */
+ if (server.repl_state == REPL_STATE_SEND_IP &&
+ server.slave_announce_ip == NULL)
+ {
+ server.repl_state = REPL_STATE_SEND_CAPA;
+ }
+
+ /* Set the slave ip, so that Master's INFO command can list the
+     * slave IP address correctly in case of port forwarding or NAT. */
+ if (server.repl_state == REPL_STATE_SEND_IP) {
+ err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
+ "ip-address",server.slave_announce_ip, NULL);
+ if (err) goto write_error;
+ sdsfree(err);
+ server.repl_state = REPL_STATE_RECEIVE_IP;
+ return;
+ }
+
+ /* Receive REPLCONF ip-address reply. */
+ if (server.repl_state == REPL_STATE_RECEIVE_IP) {
+ err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
+ /* Ignore the error if any, not all the Redis versions support
+ * REPLCONF listening-port. */
+ if (err[0] == '-') {
+ serverLog(LL_NOTICE,"(Non critical) Master does not understand "
+ "REPLCONF ip-address: %s", err);
+ }
+ sdsfree(err);
server.repl_state = REPL_STATE_SEND_CAPA;
}
- /* Inform the master of our capabilities. While we currently send
- * just one capability, it is possible to chain new capabilities here
- * in the form of REPLCONF capa X capa Y capa Z ...
+ /* Inform the master of our (slave) capabilities.
+ *
+ * EOF: supports EOF-style RDB transfer for diskless replication.
+ * PSYNC2: supports PSYNC v2, so understands +CONTINUE <new repl ID>.
+ *
* The master will ignore capabilities it does not understand. */
if (server.repl_state == REPL_STATE_SEND_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
- "capa","eof",NULL);
+ "capa","eof","capa","psync2",NULL);
if (err) goto write_error;
sdsfree(err);
server.repl_state = REPL_STATE_RECEIVE_CAPA;
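
Seen from the slave side, the state machine above drives the following command sequence after the initial PING (and AUTH, when a master password is configured). This is only a reference listing of the commands, not code from the patch; the placeholders in angle brackets are illustrative.

    /* Handshake commands sent by a PSYNC2-capable slave, in state-machine order. */
    static const char *slave_handshake[] = {
        "REPLCONF listening-port <port>",   /* slave-announce-port, or server.port   */
        "REPLCONF ip-address <ip>",         /* only when slave-announce-ip is set    */
        "REPLCONF capa eof capa psync2",    /* unknown capabilities are ignored      */
        "PSYNC <replid> <offset>",          /* or "PSYNC ? -1" with no cached master */
    };
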
@@ -1530,6 +1765,12 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
+    /* If the master is in a transient error, we should try to PSYNC
+ * from scratch later, so go to the error path. This happens when
+ * the server is loading the dataset or is not connected with its
+ * master and so forth. */
+ if (psync_result == PSYNC_TRY_LATER) goto error;
+
/* Note: if PSYNC does not return WAIT_REPLY, it will take care of
* uninstalling the read handler from the file descriptor. */
@@ -1539,14 +1780,14 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
}
/* PSYNC failed or is not supported: we want our slaves to resync with us
- * as well, if we have any (chained replication case). The mater may
- * transfer us an entirely different data set and we have no way to
- * incrementally feed our slaves after that. */
+ * as well, if we have any sub-slaves. The master may transfer us an
+ * entirely different data set and we have no way to incrementally feed
+ * our slaves after that. */
disconnectSlaves(); /* Force our slaves to resync with us as well. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
/* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC
- * and the server.repl_master_runid and repl_master_initial_offset are
+ * and the server.master_replid and master_initial_offset are
* already populated. */
if (psync_result == PSYNC_NOT_SUPPORTED) {
serverLog(LL_NOTICE,"Retrying with SYNC...");
@@ -1675,17 +1916,24 @@ int cancelReplicationHandshake(void) {
/* Set replication to the specified master address and port. */
void replicationSetMaster(char *ip, int port) {
+ int was_master = server.masterhost == NULL;
+
sdsfree(server.masterhost);
server.masterhost = sdsnew(ip);
server.masterport = port;
- if (server.master) freeClient(server.master);
+ if (server.master) {
+ freeClient(server.master);
+ }
disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
- disconnectSlaves(); /* Force our slaves to resync with us as well. */
- replicationDiscardCachedMaster(); /* Don't try a PSYNC. */
- freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
+
+ /* Force our slaves to resync with us as well. They may hopefully be able
+     * to partially resync with us, and we can notify them of the replid change. */
+ disconnectSlaves();
cancelReplicationHandshake();
+ /* Before destroying our master state, create a cached master using
+ * our own parameters, to later PSYNC with the new master. */
+ if (was_master) replicationCacheMasterUsingMyself();
server.repl_state = REPL_STATE_CONNECT;
- server.master_repl_offset = 0;
server.repl_down_since = 0;
}
@@ -1694,20 +1942,26 @@ void replicationUnsetMaster(void) {
if (server.masterhost == NULL) return; /* Nothing to do. */
sdsfree(server.masterhost);
server.masterhost = NULL;
- if (server.master) {
- if (listLength(server.slaves) == 0) {
- /* If this instance is turned into a master and there are no
- * slaves, it inherits the replication offset from the master.
- * Under certain conditions this makes replicas comparable by
- * replication offset to understand what is the most updated. */
- server.master_repl_offset = server.master->reploff;
- freeReplicationBacklog();
- }
- freeClient(server.master);
- }
+ /* When a slave is turned into a master, the current replication ID
+ * (that was inherited from the master at synchronization time) is
+ * used as secondary ID up to the current offset, and a new replication
+ * ID is created to continue with a new replication history. */
+ shiftReplicationId();
+ if (server.master) freeClient(server.master);
replicationDiscardCachedMaster();
cancelReplicationHandshake();
+ /* Disconnecting all the slaves is required: we need to inform slaves
+ * of the replication ID change (see shiftReplicationId() call). However
+ * the slaves will be able to partially resync with us, so it will be
+ * a very fast reconnection. */
+ disconnectSlaves();
server.repl_state = REPL_STATE_NONE;
+
+ /* We need to make sure the new master will start the replication stream
+ * with a SELECT statement. This is forced after a full resync, but
+ * with PSYNC version 2, there is no need for full resync after a
+ * master switch. */
+ server.slaveseldb = -1;
}
/* This function is called when the slave loses the connection with the
@@ -1781,12 +2035,16 @@ void roleCommand(client *c) {
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
- char ip[NET_IP_STR_LEN];
+ char ip[NET_IP_STR_LEN], *slaveip = slave->slave_ip;
- if (anetPeerToString(slave->fd,ip,sizeof(ip),NULL) == -1) continue;
+ if (slaveip[0] == '\0') {
+ if (anetPeerToString(slave->fd,ip,sizeof(ip),NULL) == -1)
+ continue;
+ slaveip = ip;
+ }
if (slave->replstate != SLAVE_STATE_ONLINE) continue;
addReplyMultiBulkLen(c,3);
- addReplyBulkCString(c,ip);
+ addReplyBulkCString(c,slaveip);
addReplyBulkLongLong(c,slave->slave_listening_port);
addReplyBulkLongLong(c,slave->repl_ack_off);
slaves++;
@@ -1875,6 +2133,31 @@ void replicationCacheMaster(client *c) {
replicationHandleMasterDisconnection();
}
+/* This function is called when a master is turned into a slave, in order to
+ * create from scratch a cached master for the new client, that will allow
+ * us to PSYNC with the slave that was promoted as the new master after a
+ * failover.
+ *
+ * Assuming this instance was previously the master instance of the new master,
+ * the new master will accept its replication ID, and potentially also the
+ * current offset if no data was lost during the failover. So we use our
+ * current replication ID and offset in order to synthesize a cached master. */
+void replicationCacheMasterUsingMyself(void) {
+ /* The master client we create can be set to any DBID, because
+ * the new master will start its replication stream with SELECT. */
+ server.master_initial_offset = server.master_repl_offset;
+ replicationCreateMasterClient(-1,-1);
+
+ /* Use our own ID / offset. */
+ memcpy(server.master->replid, server.replid, sizeof(server.replid));
+
+ /* Set as cached master. */
+ unlinkClient(server.master);
+ server.cached_master = server.master;
+ server.master = NULL;
+ serverLog(LL_NOTICE,"Before turning into a slave, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer.");
+}
+
/* Free a cached master, called when there are no longer the conditions for
* a partial resync on reconnection. */
void replicationDiscardCachedMaster(void) {
@@ -2086,6 +2369,11 @@ void waitCommand(client *c) {
long numreplicas, ackreplicas;
long long offset = c->woff;
+ if (server.masterhost) {
+ addReplyError(c,"WAIT cannot be used with slave instances. Please also note that since Redis 4.0 if a slave is configured to be writable (which is not the default) writes to slaves are just local and are not propagated.");
+ return;
+ }
+
/* Argument parsing. */
if (getLongFromObjectOrReply(c,c->argv[1],&numreplicas,NULL) != C_OK)
return;
@@ -2234,7 +2522,9 @@ void replicationCron(void) {
robj *ping_argv[1];
/* First, send PING according to ping_slave_period. */
- if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
+ if ((replication_cron_loops % server.repl_ping_slave_period) == 0 &&
+ listLength(server.slaves))
+ {
ping_argv[0] = createStringObject("PING",4);
replicationFeedSlaves(server.slaves, server.slaveseldb,
ping_argv, 1);
@@ -2243,20 +2533,30 @@ void replicationCron(void) {
/* Second, send a newline to all the slaves in pre-synchronization
* stage, that is, slaves waiting for the master to create the RDB file.
+ *
+     * Also send a newline to all the chained slaves we have, if we lost
+     * connection from our master, to keep the slaves aware that their
+     * master is online. This is needed since sub-slaves only receive proxied
+     * data from top-level masters, so there is no explicit pinging in order
+     * to avoid altering the replication offsets. These special out of band
+     * pings (newlines) can be sent since they have no effect on the offset.
+ *
* The newline will be ignored by the slave but will refresh the
- * last-io timer preventing a timeout. In this case we ignore the
+ * last interaction timer preventing a timeout. In this case we ignore the
* ping period and refresh the connection once per second since certain
* timeouts are set at a few seconds (example: PSYNC response). */
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
- if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
+ int is_presync =
+ (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
(slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
- server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
- {
+ server.rdb_child_type != RDB_CHILD_TYPE_SOCKET));
+
+ if (is_presync) {
if (write(slave->fd, "\n", 1) == -1) {
- /* Don't worry, it's just a ping. */
+ /* Don't worry about socket errors, it's just a ping. */
}
}
}
@@ -2281,10 +2581,14 @@ void replicationCron(void) {
}
}
- /* If we have no attached slaves and there is a replication backlog
- * using memory, free it after some (configured) time. */
+ /* If this is a master without attached slaves and there is a replication
+ * backlog active, in order to reclaim memory we can free it after some
+ * (configured) time. Note that this cannot be done for slaves: slaves
+ * without sub-slaves attached should still accumulate data into the
+ * backlog, in order to reply to PSYNC queries if they are turned into
+ * masters after a failover. */
if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
- server.repl_backlog)
+ server.repl_backlog && server.masterhost == NULL)
{
time_t idle = server.unixtime - server.repl_no_slaves_since;
@@ -2307,13 +2611,12 @@ void replicationCron(void) {
replicationScriptCacheFlush();
}
- /* If we are using diskless replication and there are slaves waiting
- * in WAIT_BGSAVE_START state, check if enough seconds elapsed and
- * start a BGSAVE.
+ /* Start a BGSAVE good for replication if we have slaves in
+ * WAIT_BGSAVE_START state.
*
- * This code is also useful to trigger a BGSAVE if the diskless
- * replication was turned off with CONFIG SET, while there were already
- * slaves in WAIT_BGSAVE_START state. */
+ * In case of diskless replication, we make sure to wait the specified
+ * number of seconds (according to configuration) so that other slaves
+ * have the time to arrive before we start streaming. */
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
time_t idle, max_idle = 0;
int slaves_waiting = 0;
@@ -2333,9 +2636,13 @@ void replicationCron(void) {
}
}
- if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
- /* Start a BGSAVE. Usually with socket target, or with disk target
- * if there was a recent socket -> disk config change. */
+ if (slaves_waiting &&
+ (!server.repl_diskless_sync ||
+ max_idle > server.repl_diskless_sync_delay))
+ {
+ /* Start the BGSAVE. The called function may start a
+ * BGSAVE with socket target or disk target depending on the
+ * configuration and slaves capabilities. */
startBgsaveForReplication(mincapa);
}
}
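
To summarize how the slave reacts to each possible PSYNC outcome defined earlier in this file, here is a compressed, illustrative view. The psync_outcome() helper is hypothetical; the real control flow lives in syncWithMaster() above.

    /* Hypothetical helper mapping slaveTryPartialResynchronization() results
     * to the action taken by the caller, per the logic shown above. */
    static const char *psync_outcome(int psync_result) {
        switch (psync_result) {
        case PSYNC_CONTINUE:      return "resume streaming from the cached master";
        case PSYNC_FULLRESYNC:    return "wait for the RDB transfer (+FULLRESYNC)";
        case PSYNC_TRY_LATER:     return "abort now, retry from replicationCron()";
        case PSYNC_NOT_SUPPORTED: return "fall back to the old SYNC command";
        default:                  return "protocol error";
        }
    }
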
diff --git a/src/rio.h b/src/rio.h
index 711308ce6..6749723d2 100644
--- a/src/rio.h
+++ b/src/rio.h
@@ -135,6 +135,9 @@ size_t rioWriteBulkString(rio *r, const char *buf, size_t len);
size_t rioWriteBulkLongLong(rio *r, long long l);
size_t rioWriteBulkDouble(rio *r, double d);
+struct redisObject;
+int rioWriteBulkObject(rio *r, struct redisObject *obj);
+
void rioGenericUpdateChecksum(rio *r, const void *buf, size_t len);
void rioSetAutoSync(rio *r, off_t bytes);
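
The newly exported rioWriteBulkObject() is convenient whenever a caller already holds a robj. A minimal usage sketch, assuming an already initialized rio target and the other bulk helpers declared in this header; emit_set() is a hypothetical wrapper, not part of the patch:

    /* Sketch: write "SET <key> <value>" to an rio target using the new helper. */
    int emit_set(rio *target, struct redisObject *key, struct redisObject *val) {
        if (rioWriteBulkCount(target,'*',3) == 0) return 0;
        if (rioWriteBulkString(target,"SET",3) == 0) return 0;
        if (rioWriteBulkObject(target,key) == 0) return 0;
        if (rioWriteBulkObject(target,val) == 0) return 0;
        return 1;
    }
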
diff --git a/src/sds.c b/src/sds.c
index e3dd67352..eafa13c29 100644
--- a/src/sds.c
+++ b/src/sds.c
@@ -35,6 +35,7 @@
#include <string.h>
#include <ctype.h>
#include <assert.h>
+#include <limits.h>
#include "sds.h"
#include "sdsalloc.h"
@@ -55,14 +56,16 @@ static inline int sdsHdrSize(char type) {
}
static inline char sdsReqType(size_t string_size) {
- if (string_size < 32)
+ if (string_size < 1<<5)
return SDS_TYPE_5;
- if (string_size < 0xff)
+ if (string_size < 1<<8)
return SDS_TYPE_8;
- if (string_size < 0xffff)
+ if (string_size < 1<<16)
return SDS_TYPE_16;
- if (string_size < 0xffffffff)
+#if (LONG_MAX == LLONG_MAX)
+ if (string_size < 1ll<<32)
return SDS_TYPE_32;
+#endif
return SDS_TYPE_64;
}
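
Besides being easier to read, the new power-of-two comparisons fix the boundary cases (255 and 65535 previously fell through to a larger header) and compile out the 32-bit branch when long is narrower than long long. A standalone restatement of the mapping, for illustration only since sdsReqType() itself is static inside sds.c:

    #include <limits.h>
    #include <stddef.h>

    /* Requested length -> number of bits in the chosen sds length header. */
    static int sds_header_bits(size_t len) {
        if (len < 1<<5)  return 5;
        if (len < 1<<8)  return 8;      /* 255 now selects the 8-bit header    */
        if (len < 1<<16) return 16;     /* 65535 now selects the 16-bit header */
    #if (LONG_MAX == LLONG_MAX)
        if (len < 1ll<<32) return 32;   /* only reachable on 64-bit builds     */
    #endif
        return 64;
    }
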
diff --git a/src/sentinel.c b/src/sentinel.c
index 0d1eb78aa..1f47dd337 100644
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -1062,11 +1062,18 @@ int sentinelUpdateSentinelAddressInAllMasters(sentinelRedisInstance *ri) {
sentinelRedisInstance *master = dictGetVal(de), *match;
match = getSentinelRedisInstanceByAddrAndRunID(master->sentinels,
NULL,0,ri->runid);
- if (match->link->disconnected == 0) {
+ /* If there is no match, this master does not know about this
+ * Sentinel, try with the next one. */
+ if (match == NULL) continue;
+
+ /* Disconnect the old links if connected. */
+ if (match->link->cc != NULL)
instanceLinkCloseConnection(match->link,match->link->cc);
+ if (match->link->pc != NULL)
instanceLinkCloseConnection(match->link,match->link->pc);
- }
+
if (match == ri) continue; /* Address already updated for it. */
+
/* Update the address of the matching Sentinel by copying the address
* of the Sentinel object that received the address update. */
releaseSentinelAddr(match->addr);
@@ -1910,6 +1917,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) {
link->cc->errstr);
instanceLinkCloseConnection(link,link->cc);
} else {
+ link->pending_commands = 0;
link->cc_conn_time = mstime();
link->cc->data = link;
redisAeAttach(server.el,link->cc);
@@ -2571,9 +2579,15 @@ void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
/* If this is a slave of a master in O_DOWN condition we start sending
* it INFO every second, instead of the usual SENTINEL_INFO_PERIOD
* period. In this state we want to closely monitor slaves in case they
- * are turned into masters by another Sentinel, or by the sysadmin. */
+ * are turned into masters by another Sentinel, or by the sysadmin.
+ *
+     * Similarly we monitor the INFO output more often if the slave reports
+     * being disconnected from the master, so that we can have a fresh
+ * disconnection time figure. */
if ((ri->flags & SRI_SLAVE) &&
- (ri->master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) {
+ ((ri->master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS)) ||
+ (ri->master_link_down_time != 0)))
+ {
info_period = 1000;
} else {
info_period = SENTINEL_INFO_PERIOD;
@@ -3626,15 +3640,15 @@ struct sentinelLeader {
/* Helper function for sentinelGetLeader, increment the counter
* relative to the specified runid. */
int sentinelLeaderIncr(dict *counters, char *runid) {
- dictEntry *de = dictFind(counters,runid);
+ dictEntry *existing, *de;
uint64_t oldval;
- if (de) {
- oldval = dictGetUnsignedIntegerVal(de);
- dictSetUnsignedIntegerVal(de,oldval+1);
+ de = dictAddRaw(counters,runid,&existing);
+ if (existing) {
+ oldval = dictGetUnsignedIntegerVal(existing);
+ dictSetUnsignedIntegerVal(existing,oldval+1);
return oldval+1;
} else {
- de = dictAddRaw(counters,runid);
serverAssert(de != NULL);
dictSetUnsignedIntegerVal(de,1);
return 1;
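
The rewritten sentinelLeaderIncr() leans on the newer three-argument dictAddRaw(), which reports an already existing entry through its out parameter instead of requiring a separate dictFind(). As a sketch of that general idiom (counter_incr() is a hypothetical name; the calls match the signatures used above):

    #include <stdint.h>
    #include "dict.h"

    /* Insert-or-bump a counter with a single hash table lookup. */
    uint64_t counter_incr(dict *counters, char *key) {
        dictEntry *existing, *de = dictAddRaw(counters, key, &existing);
        if (existing) {
            uint64_t v = dictGetUnsignedIntegerVal(existing) + 1;
            dictSetUnsignedIntegerVal(existing, v);
            return v;
        }
        dictSetUnsignedIntegerVal(de, 1);   /* first occurrence of this key */
        return 1;
    }
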
@@ -3660,7 +3674,7 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) {
serverAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
counters = dictCreate(&leaderVotesDictType,NULL);
- voters = dictSize(master->sentinels)+1; /* All the other sentinels and me. */
+ voters = dictSize(master->sentinels)+1; /* All the other sentinels and me.*/
/* Count other sentinels votes */
di = dictGetIterator(master->sentinels);
@@ -3874,11 +3888,11 @@ int compareSlavesForPromotion(const void *a, const void *b) {
return (*sa)->slave_priority - (*sb)->slave_priority;
/* If priority is the same, select the slave with greater replication
- * offset (processed more data frmo the master). */
+ * offset (processed more data from the master). */
if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) {
return -1; /* a < b */
} else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) {
- return 1; /* b > a */
+ return 1; /* a > b */
}
/* If the replication offset is the same select the slave with that has
@@ -3996,7 +4010,7 @@ void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
/* We can't send the command to the promoted slave if it is now
* disconnected. Retry again and again with this state until the timeout
* is reached, then abort the failover. */
- if (ri->link->disconnected) {
+ if (ri->promoted_slave->link->disconnected) {
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(LL_WARNING,"-failover-abort-slave-timeout",ri,"%@");
sentinelAbortFailover(ri);
diff --git a/src/server.c b/src/server.c
index f070865cb..f6868832d 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -145,8 +145,8 @@ struct redisCommand redisCommandTable[] = {
{"mget",mgetCommand,-2,"r",0,NULL,1,-1,1,0,0},
{"rpush",rpushCommand,-3,"wmF",0,NULL,1,1,1,0,0},
{"lpush",lpushCommand,-3,"wmF",0,NULL,1,1,1,0,0},
- {"rpushx",rpushxCommand,3,"wmF",0,NULL,1,1,1,0,0},
- {"lpushx",lpushxCommand,3,"wmF",0,NULL,1,1,1,0,0},
+ {"rpushx",rpushxCommand,-3,"wmF",0,NULL,1,1,1,0,0},
+ {"lpushx",lpushxCommand,-3,"wmF",0,NULL,1,1,1,0,0},
{"linsert",linsertCommand,5,"wm",0,NULL,1,1,1,0,0},
{"rpop",rpopCommand,2,"wF",0,NULL,1,1,1,0,0},
{"lpop",lpopCommand,2,"wF",0,NULL,1,1,1,0,0},
@@ -165,7 +165,7 @@ struct redisCommand redisCommandTable[] = {
{"smove",smoveCommand,4,"wF",0,NULL,1,2,1,0,0},
{"sismember",sismemberCommand,3,"rF",0,NULL,1,1,1,0,0},
{"scard",scardCommand,2,"rF",0,NULL,1,1,1,0,0},
- {"spop",spopCommand,-2,"wRsF",0,NULL,1,1,1,0,0},
+ {"spop",spopCommand,-2,"wRF",0,NULL,1,1,1,0,0},
{"srandmember",srandmemberCommand,-2,"rR",0,NULL,1,1,1,0,0},
{"sinter",sinterCommand,-2,"rS",0,NULL,1,-1,1,0,0},
{"sinterstore",sinterstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0},
@@ -219,6 +219,7 @@ struct redisCommand redisCommandTable[] = {
{"msetnx",msetnxCommand,-3,"wm",0,NULL,1,-1,2,0,0},
{"randomkey",randomkeyCommand,1,"rR",0,NULL,0,0,0,0,0},
{"select",selectCommand,2,"lF",0,NULL,0,0,0,0,0},
+ {"swapdb",swapdbCommand,3,"wF",0,NULL,0,0,0,0,0},
{"move",moveCommand,3,"wF",0,NULL,1,1,1,0,0},
{"rename",renameCommand,3,"w",0,NULL,1,2,1,0,0},
{"renamenx",renamenxCommand,3,"wF",0,NULL,1,2,1,0,0},
@@ -233,7 +234,7 @@ struct redisCommand redisCommandTable[] = {
{"ping",pingCommand,-1,"tF",0,NULL,0,0,0,0,0},
{"echo",echoCommand,2,"F",0,NULL,0,0,0,0,0},
{"save",saveCommand,1,"as",0,NULL,0,0,0,0,0},
- {"bgsave",bgsaveCommand,1,"a",0,NULL,0,0,0,0,0},
+ {"bgsave",bgsaveCommand,-1,"a",0,NULL,0,0,0,0,0},
{"bgrewriteaof",bgrewriteaofCommand,1,"a",0,NULL,0,0,0,0,0},
{"shutdown",shutdownCommand,-1,"alt",0,NULL,0,0,0,0,0},
{"lastsave",lastsaveCommand,1,"RF",0,NULL,0,0,0,0,0},
@@ -250,6 +251,7 @@ struct redisCommand redisCommandTable[] = {
{"info",infoCommand,-1,"lt",0,NULL,0,0,0,0,0},
{"monitor",monitorCommand,1,"as",0,NULL,0,0,0,0,0},
{"ttl",ttlCommand,2,"rF",0,NULL,1,1,1,0,0},
+ {"touch",touchCommand,-2,"rF",0,NULL,1,1,1,0,0},
{"pttl",pttlCommand,2,"rF",0,NULL,1,1,1,0,0},
{"persist",persistCommand,2,"wF",0,NULL,1,1,1,0,0},
{"slaveof",slaveofCommand,3,"ast",0,NULL,0,0,0,0,0},
@@ -273,6 +275,7 @@ struct redisCommand redisCommandTable[] = {
{"readwrite",readwriteCommand,1,"F",0,NULL,0,0,0,0,0},
{"dump",dumpCommand,2,"r",0,NULL,1,1,1,0,0},
{"object",objectCommand,3,"r",0,NULL,2,2,2,0,0},
+ {"memory",memoryCommand,-2,"r",0,NULL,0,0,0,0,0},
{"client",clientCommand,-2,"as",0,NULL,0,0,0,0,0},
{"eval",evalCommand,-3,"s",0,evalGetKeys,0,0,0,0,0},
{"evalsha",evalShaCommand,-3,"s",0,evalGetKeys,0,0,0,0,0},
@@ -295,11 +298,11 @@ struct redisCommand redisCommandTable[] = {
{"pfcount",pfcountCommand,-2,"r",0,NULL,1,-1,1,0,0},
{"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0},
{"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0},
+ {"post",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0},
+ {"host:",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0},
{"latency",latencyCommand,-2,"aslt",0,NULL,0,0,0,0,0}
};
-struct evictionPoolEntry *evictionPoolAlloc(void);
-
/*============================ Utility functions ============================ */
/* Low level logging. To use only for very big messages, otherwise
@@ -688,7 +691,7 @@ int htNeedsResize(dict *dict) {
size = dictSlots(dict);
used = dictSize(dict);
- return (size && used && size > DICT_HT_INITIAL_SIZE &&
+ return (size > DICT_HT_INITIAL_SIZE &&
(used*100/size < HASHTABLE_MIN_FILL));
}
@@ -737,186 +740,6 @@ void updateDictResizePolicy(void) {
/* ======================= Cron: called every 100 ms ======================== */
-/* Helper function for the activeExpireCycle() function.
- * This function will try to expire the key that is stored in the hash table
- * entry 'de' of the 'expires' hash table of a Redis database.
- *
- * If the key is found to be expired, it is removed from the database and
- * 1 is returned. Otherwise no operation is performed and 0 is returned.
- *
- * When a key is expired, server.stat_expiredkeys is incremented.
- *
- * The parameter 'now' is the current time in milliseconds as is passed
- * to the function to avoid too many gettimeofday() syscalls. */
-int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) {
- long long t = dictGetSignedIntegerVal(de);
- if (now > t) {
- sds key = dictGetKey(de);
- robj *keyobj = createStringObject(key,sdslen(key));
-
- propagateExpire(db,keyobj,server.lazyfree_lazy_expire);
- if (server.lazyfree_lazy_expire)
- dbAsyncDelete(db,keyobj);
- else
- dbSyncDelete(db,keyobj);
- notifyKeyspaceEvent(NOTIFY_EXPIRED,
- "expired",keyobj,db->id);
- decrRefCount(keyobj);
- server.stat_expiredkeys++;
- return 1;
- } else {
- return 0;
- }
-}
-
-/* Try to expire a few timed out keys. The algorithm used is adaptive and
- * will use few CPU cycles if there are few expiring keys, otherwise
- * it will get more aggressive to avoid that too much memory is used by
- * keys that can be removed from the keyspace.
- *
- * No more than CRON_DBS_PER_CALL databases are tested at every
- * iteration.
- *
- * This kind of call is used when Redis detects that timelimit_exit is
- * true, so there is more work to do, and we do it more incrementally from
- * the beforeSleep() function of the event loop.
- *
- * Expire cycle type:
- *
- * If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a
- * "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION
- * microseconds, and is not repeated again before the same amount of time.
- *
- * If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is
- * executed, where the time limit is a percentage of the REDIS_HZ period
- * as specified by the REDIS_EXPIRELOOKUPS_TIME_PERC define. */
-
-void activeExpireCycle(int type) {
- /* This function has some global state in order to continue the work
- * incrementally across calls. */
- static unsigned int current_db = 0; /* Last DB tested. */
- static int timelimit_exit = 0; /* Time limit hit in previous call? */
- static long long last_fast_cycle = 0; /* When last fast cycle ran. */
-
- int j, iteration = 0;
- int dbs_per_call = CRON_DBS_PER_CALL;
- long long start = ustime(), timelimit;
-
- if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
- /* Don't start a fast cycle if the previous cycle did not exited
- * for time limt. Also don't repeat a fast cycle for the same period
- * as the fast cycle total duration itself. */
- if (!timelimit_exit) return;
- if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return;
- last_fast_cycle = start;
- }
-
- /* We usually should test CRON_DBS_PER_CALL per iteration, with
- * two exceptions:
- *
- * 1) Don't test more DBs than we have.
- * 2) If last time we hit the time limit, we want to scan all DBs
- * in this iteration, as there is work to do in some DB and we don't want
- * expired keys to use memory for too much time. */
- if (dbs_per_call > server.dbnum || timelimit_exit)
- dbs_per_call = server.dbnum;
-
- /* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time
- * per iteration. Since this function gets called with a frequency of
- * server.hz times per second, the following is the max amount of
- * microseconds we can spend in this function. */
- timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/server.hz/100;
- timelimit_exit = 0;
- if (timelimit <= 0) timelimit = 1;
-
- if (type == ACTIVE_EXPIRE_CYCLE_FAST)
- timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */
-
- for (j = 0; j < dbs_per_call; j++) {
- int expired;
- redisDb *db = server.db+(current_db % server.dbnum);
-
- /* Increment the DB now so we are sure if we run out of time
- * in the current DB we'll restart from the next. This allows to
- * distribute the time evenly across DBs. */
- current_db++;
-
- /* Continue to expire if at the end of the cycle more than 25%
- * of the keys were expired. */
- do {
- unsigned long num, slots;
- long long now, ttl_sum;
- int ttl_samples;
-
- /* If there is nothing to expire try next DB ASAP. */
- if ((num = dictSize(db->expires)) == 0) {
- db->avg_ttl = 0;
- break;
- }
- slots = dictSlots(db->expires);
- now = mstime();
-
- /* When there are less than 1% filled slots getting random
- * keys is expensive, so stop here waiting for better times...
- * The dictionary will be resized asap. */
- if (num && slots > DICT_HT_INITIAL_SIZE &&
- (num*100/slots < 1)) break;
-
- /* The main collection cycle. Sample random keys among keys
- * with an expire set, checking for expired ones. */
- expired = 0;
- ttl_sum = 0;
- ttl_samples = 0;
-
- if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP)
- num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP;
-
- while (num--) {
- dictEntry *de;
- long long ttl;
-
- if ((de = dictGetRandomKey(db->expires)) == NULL) break;
- ttl = dictGetSignedIntegerVal(de)-now;
- if (activeExpireCycleTryExpire(db,de,now)) expired++;
- if (ttl > 0) {
- /* We want the average TTL of keys yet not expired. */
- ttl_sum += ttl;
- ttl_samples++;
- }
- }
-
- /* Update the average TTL stats for this database. */
- if (ttl_samples) {
- long long avg_ttl = ttl_sum/ttl_samples;
-
- /* Do a simple running average with a few samples.
- * We just use the current estimate with a weight of 2%
- * and the previous estimate with a weight of 98%. */
- if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
- db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50);
- }
-
- /* We can't block forever here even if there are many keys to
- * expire. So after a given amount of milliseconds return to the
- * caller waiting for the other active expire cycle. */
- iteration++;
- if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */
- long long elapsed = ustime()-start;
-
- latencyAddSampleIfNeeded("expire-cycle",elapsed/1000);
- if (elapsed > timelimit) timelimit_exit = 1;
- }
- if (timelimit_exit) return;
- /* We don't repeat the cycle if there are less than 25% of keys
- * found expired in the current DB. */
- } while (expired > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP/4);
- }
-}
-
-unsigned int getLRUClock(void) {
- return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
-}
-
/* Add a sample to the operations per second array of samples. */
void trackInstantaneousMetric(int metric, long long current_reading) {
long long t = mstime() - server.inst_metric[metric].last_sample_time;
@@ -1047,8 +870,11 @@ void clientsCron(void) {
void databasesCron(void) {
/* Expire keys by random sampling. Not required for slaves
* as master will synthesize DELs for us. */
- if (server.active_expire_enabled && server.masterhost == NULL)
+ if (server.active_expire_enabled && server.masterhost == NULL) {
activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW);
+ } else if (server.masterhost != NULL) {
+ expireSlaveKeys();
+ }
/* Perform hash tables rehashing if needed, but only if there are no
* other processes saving the DB on disk. Otherwise rehashing is bad
@@ -1224,8 +1050,10 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
(int) server.aof_child_pid);
} else if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
+ if (!bysignal && exitcode == 0) receiveChildInfo();
} else if (pid == server.aof_child_pid) {
backgroundRewriteDoneHandler(exitcode,bysignal);
+ if (!bysignal && exitcode == 0) receiveChildInfo();
} else {
if (!ldbRemoveChild(pid)) {
serverLog(LL_WARNING,
@@ -1234,6 +1062,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
}
}
updateDictResizePolicy();
+ closeChildInfoPipe();
}
} else {
/* If there is not a background saving/rewrite in progress check if
@@ -1253,7 +1082,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
{
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
- rdbSaveBackground(server.rdb_filename);
+ rdbSaveBackground(server.rdb_filename,NULL);
break;
}
}
@@ -1292,10 +1121,10 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
freeClientsInAsyncFreeQueue();
/* Clear the paused clients flag if needed. */
- clientsArePaused(); /* Don't check return value, just use the side effect. */
+ clientsArePaused(); /* Don't check return value, just use the side effect.*/
- /* Replication cron function -- used to reconnect to master and
- * to detect transfer failures. */
+ /* Replication cron function -- used to reconnect to master,
+ * detect transfer failures, start background RDB transfers and so forth. */
run_with_period(1000) replicationCron();
/* Run the Redis Cluster cron. */
@@ -1313,6 +1142,22 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
migrateCloseTimedoutSockets();
}
+ /* Start a scheduled BGSAVE if the corresponding flag is set. This is
+ * useful when we are forced to postpone a BGSAVE because an AOF
+ * rewrite is in progress.
+ *
+     * Note: this code must stay after the replicationCron() call above, so
+     * make sure to keep this order when refactoring this file. This is useful
+     * because we want to give priority to RDB saves for replication. */
+ if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&
+ server.rdb_bgsave_scheduled &&
+ (server.unixtime-server.lastbgsave_try > CONFIG_BGSAVE_RETRY_DELAY ||
+ server.lastbgsave_status == C_OK))
+ {
+ if (rdbSaveBackground(server.rdb_filename,NULL) == C_OK)
+ server.rdb_bgsave_scheduled = 0;
+ }
+
server.cronloops++;
return 1000/server.hz;
}
@@ -1354,6 +1199,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
if (listLength(server.clients_waiting_acks))
processClientsWaitingReplicas();
+ /* Check if there are clients unblocked by modules that implement
+ * blocking commands. */
+ moduleHandleBlockedClients();
+
/* Try to process pending commands for clients that were just unblocked. */
if (listLength(server.unblocked_clients))
processUnblockedClients();
@@ -1463,10 +1312,12 @@ void initServerConfig(void) {
int j;
getRandomHexChars(server.runid,CONFIG_RUN_ID_SIZE);
+ server.runid[CONFIG_RUN_ID_SIZE] = '\0';
+ changeReplicationId();
+ clearReplicationId2();
server.configfile = NULL;
server.executable = NULL;
server.hz = CONFIG_DEFAULT_HZ;
- server.runid[CONFIG_RUN_ID_SIZE] = '\0';
server.arch_bits = (sizeof(long) == 8) ? 64 : 32;
server.port = CONFIG_DEFAULT_SERVER_PORT;
server.tcp_backlog = CONFIG_DEFAULT_TCP_BACKLOG;
@@ -1508,6 +1359,7 @@ void initServerConfig(void) {
server.aof_flush_postponed_start = 0;
server.aof_rewrite_incremental_fsync = CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC;
server.aof_load_truncated = CONFIG_DEFAULT_AOF_LOAD_TRUNCATED;
+ server.aof_use_rdb_preamble = CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE;
server.pidfile = NULL;
server.rdb_filename = zstrdup(CONFIG_DEFAULT_RDB_FILENAME);
server.aof_filename = zstrdup(CONFIG_DEFAULT_AOF_FILENAME);
@@ -1522,6 +1374,8 @@ void initServerConfig(void) {
server.maxmemory = CONFIG_DEFAULT_MAXMEMORY;
server.maxmemory_policy = CONFIG_DEFAULT_MAXMEMORY_POLICY;
server.maxmemory_samples = CONFIG_DEFAULT_MAXMEMORY_SAMPLES;
+ server.lfu_log_factor = CONFIG_DEFAULT_LFU_LOG_FACTOR;
+ server.lfu_decay_time = CONFIG_DEFAULT_LFU_DECAY_TIME;
server.hash_max_ziplist_entries = OBJ_HASH_MAX_ZIPLIST_ENTRIES;
server.hash_max_ziplist_value = OBJ_HASH_MAX_ZIPLIST_VALUE;
server.list_max_ziplist_size = OBJ_LIST_MAX_ZIPLIST_SIZE;
@@ -1546,6 +1400,7 @@ void initServerConfig(void) {
server.lazyfree_lazy_eviction = CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION;
server.lazyfree_lazy_expire = CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE;
server.lazyfree_lazy_server_del = CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL;
+ server.always_show_logo = CONFIG_DEFAULT_ALWAYS_SHOW_LOGO;
server.lruclock = getLRUClock();
resetServerSaveParams();
@@ -1560,7 +1415,7 @@ void initServerConfig(void) {
server.masterport = 6379;
server.master = NULL;
server.cached_master = NULL;
- server.repl_master_initial_offset = -1;
+ server.master_initial_offset = -1;
server.repl_state = REPL_STATE_NONE;
server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT;
server.repl_serve_stale_data = CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA;
@@ -1575,6 +1430,8 @@ void initServerConfig(void) {
server.repl_min_slaves_to_write = CONFIG_DEFAULT_MIN_SLAVES_TO_WRITE;
server.repl_min_slaves_max_lag = CONFIG_DEFAULT_MIN_SLAVES_MAX_LAG;
server.slave_priority = CONFIG_DEFAULT_SLAVE_PRIORITY;
+ server.slave_announce_ip = CONFIG_DEFAULT_SLAVE_ANNOUNCE_IP;
+ server.slave_announce_port = CONFIG_DEFAULT_SLAVE_ANNOUNCE_PORT;
server.master_repl_offset = 0;
/* Replication partial resync backlog */
@@ -1719,9 +1576,12 @@ void adjustOpenFilesLimit(void) {
if (bestlimit < oldlimit) bestlimit = oldlimit;
if (bestlimit < maxfiles) {
- int old_maxclients = server.maxclients;
+ unsigned int old_maxclients = server.maxclients;
server.maxclients = bestlimit-CONFIG_MIN_RESERVED_FDS;
- if (server.maxclients < 1) {
+ /* maxclients is unsigned so may overflow: in order
+ * to check if maxclients is now logically less than 1
+ * we test indirectly via bestlimit. */
+ if (bestlimit <= CONFIG_MIN_RESERVED_FDS) {
serverLog(LL_WARNING,"Your current 'ulimit -n' "
"of %llu is not enough for the server to start. "
"Please increase your open file limit to at least "
@@ -1795,6 +1655,7 @@ int listenToPort(int port, int *fds, int *count) {
if (server.bindaddr_count == 0) server.bindaddr[0] = NULL;
for (j = 0; j < server.bindaddr_count || j == 0; j++) {
if (server.bindaddr[j] == NULL) {
+ int unsupported = 0;
/* Bind * for both IPv6 and IPv4, we enter here only if
* server.bindaddr_count == 0. */
fds[*count] = anetTcp6Server(server.neterr,port,NULL,
@@ -1802,19 +1663,27 @@ int listenToPort(int port, int *fds, int *count) {
if (fds[*count] != ANET_ERR) {
anetNonBlock(NULL,fds[*count]);
(*count)++;
+ } else if (errno == EAFNOSUPPORT) {
+ unsupported++;
+                serverLog(LL_WARNING,"Not listening to IPv6: unsupported");
+ }
+ if (*count == 1 || unsupported) {
/* Bind the IPv4 address as well. */
fds[*count] = anetTcpServer(server.neterr,port,NULL,
server.tcp_backlog);
if (fds[*count] != ANET_ERR) {
anetNonBlock(NULL,fds[*count]);
(*count)++;
+ } else if (errno == EAFNOSUPPORT) {
+ unsupported++;
+                    serverLog(LL_WARNING,"Not listening to IPv4: unsupported");
}
}
/* Exit the loop if we were able to bind * on IPv4 and IPv6,
* otherwise fds[*count] will be ANET_ERR and we'll print an
* error and return to the caller with an error. */
- if (*count == 2) break;
+ if (*count + unsupported == 2) break;
} else if (strchr(server.bindaddr[j],':')) {
/* Bind IPv6 address. */
fds[*count] = anetTcp6Server(server.neterr,port,server.bindaddr[j],
@@ -1929,10 +1798,10 @@ void initServer(void) {
server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
server.db[j].ready_keys = dictCreate(&objectKeyPointerValueDictType,NULL);
server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
- server.db[j].eviction_pool = evictionPoolAlloc();
server.db[j].id = j;
server.db[j].avg_ttl = 0;
}
+ evictionPoolAlloc(); /* Initialize the LRU keys pool. */
server.pubsub_channels = dictCreate(&keylistDictType,NULL);
server.pubsub_patterns = listCreate();
listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
@@ -1941,6 +1810,10 @@ void initServer(void) {
server.rdb_child_pid = -1;
server.aof_child_pid = -1;
server.rdb_child_type = RDB_CHILD_TYPE_NONE;
+ server.rdb_bgsave_scheduled = 0;
+ server.child_info_pipe[0] = -1;
+ server.child_info_pipe[1] = -1;
+ server.child_info_data.magic = 0;
aofRewriteBufferReset();
server.aof_buf = sdsempty();
server.lastsave = time(NULL); /* At startup we consider the DB saved. */
@@ -1952,6 +1825,8 @@ void initServer(void) {
/* A few stats we don't want to reset: server startup time, and peak mem. */
server.stat_starttime = time(NULL);
server.stat_peak_memory = 0;
+ server.stat_rdb_cow_bytes = 0;
+ server.stat_aof_cow_bytes = 0;
server.resident_set_size = 0;
server.lastbgsave_status = C_OK;
server.aof_last_write_status = C_OK;
@@ -1959,8 +1834,9 @@ void initServer(void) {
server.repl_good_slaves_count = 0;
updateCachedTime();
- /* Create out timers, that's our main way to process background
- * operations. */
+ /* Create the timer callback: this is our way to process many background
+ * operations incrementally, like client timeouts, eviction of unaccessed
+ * expired keys, and so forth. */
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
serverPanic("Can't create event loop timers.");
exit(1);
@@ -2006,6 +1882,7 @@ void initServer(void) {
slowlogInit();
latencyMonitorInit();
bioInit();
+ server.initial_memory_usage = zmalloc_used_memory();
}
/* Populates the Redis Command Table starting from the hard coded list
@@ -2603,7 +2480,7 @@ int prepareForShutdown(int flags) {
if ((server.saveparamslen > 0 && !nosave) || save) {
serverLog(LL_NOTICE,"Saving the final RDB snapshot before exiting.");
/* Snapshotting. Perform a SYNC SAVE and exit */
- if (rdbSave(server.rdb_filename) != C_OK) {
+ if (rdbSave(server.rdb_filename,NULL) != C_OK) {
/* Ooops.. error saving! The best we can do is to continue
* operating. Note that if there was a background saving process,
* in the next cron() Redis will be notified that the background
@@ -2957,6 +2834,7 @@ sds genRedisInfoString(char *section) {
size_t total_system_mem = server.system_memory_size;
const char *evict_policy = evictPolicyToString();
long long memory_lua = (long long)lua_gc(server.lua,LUA_GCCOUNT,0)*1024;
+ struct redisMemOverhead *mh = getMemoryOverheadData();
/* Peak memory is updated from time to time by serverCron() so it
* may happen that the instantaneous value is slightly bigger than
@@ -2981,6 +2859,11 @@ sds genRedisInfoString(char *section) {
"used_memory_rss_human:%s\r\n"
"used_memory_peak:%zu\r\n"
"used_memory_peak_human:%s\r\n"
+ "used_memory_peak_perc:%.2f%%\r\n"
+ "used_memory_overhead:%zu\r\n"
+ "used_memory_startup:%zu\r\n"
+ "used_memory_dataset:%zu\r\n"
+ "used_memory_dataset_perc:%.2f%%\r\n"
"total_system_memory:%lu\r\n"
"total_system_memory_human:%s\r\n"
"used_memory_lua:%lld\r\n"
@@ -2997,6 +2880,11 @@ sds genRedisInfoString(char *section) {
used_memory_rss_hmem,
server.stat_peak_memory,
peak_hmem,
+ mh->peak_perc,
+ mh->overhead_total,
+ mh->startup_allocated,
+ mh->dataset,
+ mh->dataset_perc,
(unsigned long)total_system_mem,
total_system_hmem,
memory_lua,
@@ -3004,10 +2892,11 @@ sds genRedisInfoString(char *section) {
server.maxmemory,
maxmemory_hmem,
evict_policy,
- zmalloc_get_fragmentation_ratio(server.resident_set_size),
+ mh->fragmentation,
ZMALLOC_LIB,
lazyfreeGetPendingObjectsCount()
- );
+ );
+ freeMemoryOverheadData(mh);
}
/* Persistence */
@@ -3022,13 +2911,15 @@ sds genRedisInfoString(char *section) {
"rdb_last_bgsave_status:%s\r\n"
"rdb_last_bgsave_time_sec:%jd\r\n"
"rdb_current_bgsave_time_sec:%jd\r\n"
+ "rdb_last_cow_size:%zu\r\n"
"aof_enabled:%d\r\n"
"aof_rewrite_in_progress:%d\r\n"
"aof_rewrite_scheduled:%d\r\n"
"aof_last_rewrite_time_sec:%jd\r\n"
"aof_current_rewrite_time_sec:%jd\r\n"
"aof_last_bgrewrite_status:%s\r\n"
- "aof_last_write_status:%s\r\n",
+ "aof_last_write_status:%s\r\n"
+ "aof_last_cow_size:%zu\r\n",
server.loading,
server.dirty,
server.rdb_child_pid != -1,
@@ -3037,6 +2928,7 @@ sds genRedisInfoString(char *section) {
(intmax_t)server.rdb_save_time_last,
(intmax_t)((server.rdb_child_pid == -1) ?
-1 : time(NULL)-server.rdb_save_time_start),
+ server.stat_rdb_cow_bytes,
server.aof_state != AOF_OFF,
server.aof_child_pid != -1,
server.aof_rewrite_scheduled,
@@ -3044,7 +2936,8 @@ sds genRedisInfoString(char *section) {
(intmax_t)((server.aof_child_pid == -1) ?
-1 : time(NULL)-server.aof_rewrite_time_start),
(server.aof_lastbgrewrite_status == C_OK) ? "ok" : "err",
- (server.aof_last_write_status == C_OK) ? "ok" : "err");
+ (server.aof_last_write_status == C_OK) ? "ok" : "err",
+ server.stat_aof_cow_bytes);
if (server.aof_state != AOF_OFF) {
info = sdscatprintf(info,
@@ -3119,7 +3012,8 @@ sds genRedisInfoString(char *section) {
"pubsub_channels:%ld\r\n"
"pubsub_patterns:%lu\r\n"
"latest_fork_usec:%lld\r\n"
- "migrate_cached_sockets:%ld\r\n",
+ "migrate_cached_sockets:%ld\r\n"
+ "slave_expires_tracked_keys:%zu\r\n",
server.stat_numconnections,
server.stat_numcommands,
getInstantaneousMetric(STATS_METRIC_COMMAND),
@@ -3138,7 +3032,8 @@ sds genRedisInfoString(char *section) {
dictSize(server.pubsub_channels),
listLength(server.pubsub_patterns),
server.stat_fork_time,
- dictSize(server.migrate_cached_sockets));
+ dictSize(server.migrate_cached_sockets),
+ getSlaveKeyWithExpireCount());
}
/* Replication */
@@ -3217,11 +3112,15 @@ sds genRedisInfoString(char *section) {
while((ln = listNext(&li))) {
client *slave = listNodeValue(ln);
char *state = NULL;
- char ip[NET_IP_STR_LEN];
+ char ip[NET_IP_STR_LEN], *slaveip = slave->slave_ip;
int port;
long lag = 0;
- if (anetPeerToString(slave->fd,ip,sizeof(ip),&port) == -1) continue;
+ if (slaveip[0] == '\0') {
+ if (anetPeerToString(slave->fd,ip,sizeof(ip),&port) == -1)
+ continue;
+ slaveip = ip;
+ }
switch(slave->replstate) {
case SLAVE_STATE_WAIT_BGSAVE_START:
case SLAVE_STATE_WAIT_BGSAVE_END:
@@ -3241,18 +3140,24 @@ sds genRedisInfoString(char *section) {
info = sdscatprintf(info,
"slave%d:ip=%s,port=%d,state=%s,"
"offset=%lld,lag=%ld\r\n",
- slaveid,ip,slave->slave_listening_port,state,
+ slaveid,slaveip,slave->slave_listening_port,state,
slave->repl_ack_off, lag);
slaveid++;
}
}
info = sdscatprintf(info,
+ "master_replid:%s\r\n"
+ "master_replid2:%s\r\n"
"master_repl_offset:%lld\r\n"
+ "second_repl_offset:%lld\r\n"
"repl_backlog_active:%d\r\n"
"repl_backlog_size:%lld\r\n"
"repl_backlog_first_byte_offset:%lld\r\n"
"repl_backlog_histlen:%lld\r\n",
+ server.replid,
+ server.replid2,
server.master_repl_offset,
+ server.second_replid_offset,
server.repl_backlog != NULL,
server.repl_backlog_size,
server.repl_backlog_off,
@@ -3337,318 +3242,6 @@ void monitorCommand(client *c) {
addReply(c,shared.ok);
}
-/* ============================ Maxmemory directive ======================== */
-
-/* freeMemoryIfNeeded() gets called when 'maxmemory' is set on the config
- * file to limit the max memory used by the server, before processing a
- * command.
- *
- * The goal of the function is to free enough memory to keep Redis under the
- * configured memory limit.
- *
- * The function starts calculating how many bytes should be freed to keep
- * Redis under the limit, and enters a loop selecting the best keys to
- * evict accordingly to the configured policy.
- *
- * If all the bytes needed to return back under the limit were freed the
- * function returns C_OK, otherwise C_ERR is returned, and the caller
- * should block the execution of commands that will result in more memory
- * used by the server.
- *
- * ------------------------------------------------------------------------
- *
- * LRU approximation algorithm
- *
- * Redis uses an approximation of the LRU algorithm that runs in constant
- * memory. Every time there is a key to expire, we sample N keys (with
- * N very small, usually in around 5) to populate a pool of best keys to
- * evict of M keys (the pool size is defined by MAXMEMORY_EVICTION_POOL_SIZE).
- *
- * The N keys sampled are added in the pool of good keys to expire (the one
- * with an old access time) if they are better than one of the current keys
- * in the pool.
- *
- * After the pool is populated, the best key we have in the pool is expired.
- * However note that we don't remove keys from the pool when they are deleted
- * so the pool may contain keys that no longer exist.
- *
- * When we try to evict a key, and all the entries in the pool don't exist
- * we populate it again. This time we'll be sure that the pool has at least
- * one key that can be evicted, if there is at least one key that can be
- * evicted in the whole database. */
-
-/* Create a new eviction pool. */
-struct evictionPoolEntry *evictionPoolAlloc(void) {
- struct evictionPoolEntry *ep;
- int j;
-
- ep = zmalloc(sizeof(*ep)*MAXMEMORY_EVICTION_POOL_SIZE);
- for (j = 0; j < MAXMEMORY_EVICTION_POOL_SIZE; j++) {
- ep[j].idle = 0;
- ep[j].key = NULL;
- }
- return ep;
-}
-
-/* This is an helper function for freeMemoryIfNeeded(), it is used in order
- * to populate the evictionPool with a few entries every time we want to
- * expire a key. Keys with idle time smaller than one of the current
- * keys are added. Keys are always added if there are free entries.
- *
- * We insert keys on place in ascending order, so keys with the smaller
- * idle time are on the left, and keys with the higher idle time on the
- * right. */
-
-#define EVICTION_SAMPLES_ARRAY_SIZE 16
-void evictionPoolPopulate(dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
- int j, k, count;
- dictEntry *_samples[EVICTION_SAMPLES_ARRAY_SIZE];
- dictEntry **samples;
-
- /* Try to use a static buffer: this function is a big hit...
- * Note: it was actually measured that this helps. */
- if (server.maxmemory_samples <= EVICTION_SAMPLES_ARRAY_SIZE) {
- samples = _samples;
- } else {
- samples = zmalloc(sizeof(samples[0])*server.maxmemory_samples);
- }
-
- count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
- for (j = 0; j < count; j++) {
- unsigned long long idle;
- sds key;
- robj *o;
- dictEntry *de;
-
- de = samples[j];
- key = dictGetKey(de);
- /* If the dictionary we are sampling from is not the main
- * dictionary (but the expires one) we need to lookup the key
- * again in the key dictionary to obtain the value object. */
- if (sampledict != keydict) de = dictFind(keydict, key);
- o = dictGetVal(de);
- idle = estimateObjectIdleTime(o);
-
- /* Insert the element inside the pool.
- * First, find the first empty bucket or the first populated
- * bucket that has an idle time smaller than our idle time. */
- k = 0;
- while (k < MAXMEMORY_EVICTION_POOL_SIZE &&
- pool[k].key &&
- pool[k].idle < idle) k++;
- if (k == 0 && pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key != NULL) {
- /* Can't insert if the element is < the worst element we have
- * and there are no empty buckets. */
- continue;
- } else if (k < MAXMEMORY_EVICTION_POOL_SIZE && pool[k].key == NULL) {
- /* Inserting into empty position. No setup needed before insert. */
- } else {
- /* Inserting in the middle. Now k points to the first element
- * greater than the element to insert. */
- if (pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key == NULL) {
- /* Free space on the right? Insert at k shifting
- * all the elements from k to end to the right. */
- memmove(pool+k+1,pool+k,
- sizeof(pool[0])*(MAXMEMORY_EVICTION_POOL_SIZE-k-1));
- } else {
- /* No free space on right? Insert at k-1 */
- k--;
- /* Shift all elements on the left of k (included) to the
- * left, so we discard the element with smaller idle time. */
- sdsfree(pool[0].key);
- memmove(pool,pool+1,sizeof(pool[0])*k);
- }
- }
- pool[k].key = sdsdup(key);
- pool[k].idle = idle;
- }
- if (samples != _samples) zfree(samples);
-}
-
-int freeMemoryIfNeeded(void) {
- size_t mem_reported, mem_used, mem_tofree, mem_freed;
- int slaves = listLength(server.slaves);
- mstime_t latency, eviction_latency;
- long long delta;
-
- /* Check if we are over the memory usage limit. If we are not, no need
- * to subtract the slaves output buffers. We can just return ASAP. */
- mem_reported = zmalloc_used_memory();
- if (mem_reported <= server.maxmemory) return C_OK;
-
- /* Remove the size of slaves output buffers and AOF buffer from the
- * count of used memory. */
- mem_used = mem_reported;
- if (slaves) {
- listIter li;
- listNode *ln;
-
- listRewind(server.slaves,&li);
- while((ln = listNext(&li))) {
- client *slave = listNodeValue(ln);
- unsigned long obuf_bytes = getClientOutputBufferMemoryUsage(slave);
- if (obuf_bytes > mem_used)
- mem_used = 0;
- else
- mem_used -= obuf_bytes;
- }
- }
- if (server.aof_state != AOF_OFF) {
- mem_used -= sdslen(server.aof_buf);
- mem_used -= aofRewriteBufferSize();
- }
-
- /* Check if we are still over the memory limit. */
- if (mem_used <= server.maxmemory) return C_OK;
-
- /* Compute how much memory we need to free. */
- mem_tofree = mem_used - server.maxmemory;
- mem_freed = 0;
-
- if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION)
- goto cant_free; /* We need to free memory, but policy forbids. */
-
- latencyStartMonitor(latency);
- while (mem_freed < mem_tofree) {
- int j, k, keys_freed = 0;
-
- for (j = 0; j < server.dbnum; j++) {
- long bestval = 0; /* just to prevent warning */
- sds bestkey = NULL;
- dictEntry *de;
- redisDb *db = server.db+j;
- dict *dict;
-
- if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU ||
- server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM)
- {
- dict = server.db[j].dict;
- } else {
- dict = server.db[j].expires;
- }
- if (dictSize(dict) == 0) continue;
-
- /* volatile-random and allkeys-random policy */
- if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
- server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
- {
- de = dictGetRandomKey(dict);
- bestkey = dictGetKey(de);
- }
-
- /* volatile-lru and allkeys-lru policy */
- else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU ||
- server.maxmemory_policy == MAXMEMORY_VOLATILE_LRU)
- {
- struct evictionPoolEntry *pool = db->eviction_pool;
-
- while(bestkey == NULL) {
- evictionPoolPopulate(dict, db->dict, db->eviction_pool);
- /* Go backward from best to worst element to evict. */
- for (k = MAXMEMORY_EVICTION_POOL_SIZE-1; k >= 0; k--) {
- if (pool[k].key == NULL) continue;
- de = dictFind(dict,pool[k].key);
-
- /* Remove the entry from the pool. */
- sdsfree(pool[k].key);
- /* Shift all elements on its right to left. */
- memmove(pool+k,pool+k+1,
- sizeof(pool[0])*(MAXMEMORY_EVICTION_POOL_SIZE-k-1));
- /* Clear the element on the right which is empty
- * since we shifted one position to the left. */
- pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key = NULL;
- pool[MAXMEMORY_EVICTION_POOL_SIZE-1].idle = 0;
-
- /* If the key exists, is our pick. Otherwise it is
- * a ghost and we need to try the next element. */
- if (de) {
- bestkey = dictGetKey(de);
- break;
- } else {
- /* Ghost... */
- continue;
- }
- }
- }
- }
-
- /* volatile-ttl */
- else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
- for (k = 0; k < server.maxmemory_samples; k++) {
- sds thiskey;
- long thisval;
-
- de = dictGetRandomKey(dict);
- thiskey = dictGetKey(de);
- thisval = (long) dictGetVal(de);
-
- /* Expire sooner (minor expire unix timestamp) is better
- * candidate for deletion */
- if (bestkey == NULL || thisval < bestval) {
- bestkey = thiskey;
- bestval = thisval;
- }
- }
- }
-
- /* Finally remove the selected key. */
- if (bestkey) {
- robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
- propagateExpire(db,keyobj,server.lazyfree_lazy_eviction);
- /* We compute the amount of memory freed by db*Delete() alone.
- * It is possible that actually the memory needed to propagate
- * the DEL in AOF and replication link is greater than the one
- * we are freeing removing the key, but we can't account for
- * that otherwise we would never exit the loop.
- *
- * AOF and Output buffer memory will be freed eventually so
- * we only care about memory used by the key space. */
- delta = (long long) zmalloc_used_memory();
- latencyStartMonitor(eviction_latency);
- if (server.lazyfree_lazy_eviction)
- dbAsyncDelete(db,keyobj);
- else
- dbSyncDelete(db,keyobj);
- latencyEndMonitor(eviction_latency);
- latencyAddSampleIfNeeded("eviction-del",eviction_latency);
- latencyRemoveNestedEvent(latency,eviction_latency);
- delta -= (long long) zmalloc_used_memory();
- mem_freed += delta;
- server.stat_evictedkeys++;
- notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
- keyobj, db->id);
- decrRefCount(keyobj);
- keys_freed++;
-
- /* When the memory to free starts to be big enough, we may
- * start spending so much time here that is impossible to
- * deliver data to the slaves fast enough, so we force the
- * transmission here inside the loop. */
- if (slaves) flushSlavesOutputBuffers();
- }
- }
- if (!keys_freed) {
- latencyEndMonitor(latency);
- latencyAddSampleIfNeeded("eviction-cycle",latency);
- goto cant_free; /* nothing to free... */
- }
- }
- latencyEndMonitor(latency);
- latencyAddSampleIfNeeded("eviction-cycle",latency);
- return C_OK;
-
-cant_free:
- /* We are here if we are not able to reclaim memory. There is only one
- * last thing we can try: check if the lazyfree thread has jobs in queue
- * and wait... */
- while(bioPendingJobsOfType(BIO_LAZY_FREE)) {
- if (((mem_reported - zmalloc_used_memory()) + mem_freed) >= mem_tofree)
- break;
- usleep(1000);
- }
- return C_ERR;
-}
-
/* =================================== Main! ================================ */
#ifdef __linux__
@@ -3743,15 +3336,18 @@ void redisAsciiArt(void) {
else if (server.sentinel_mode) mode = "sentinel";
else mode = "standalone";
- if (server.syslog_enabled) {
+ /* Show the ASCII logo if: log file is stdout AND stdout is a
+ * tty AND syslog logging is disabled. Also show logo if the user
+ * forced us to do so via redis.conf. */
+ int show_logo = ((!server.syslog_enabled &&
+ server.logfile[0] == '\0' &&
+ isatty(fileno(stdout))) ||
+ server.always_show_logo);
+
+ if (!show_logo) {
serverLog(LL_NOTICE,
- "Redis %s (%s/%d) %s bit, %s mode, port %d, pid %ld ready to start.",
- REDIS_VERSION,
- redisGitSHA1(),
- strtol(redisGitDirty(),NULL,10) > 0,
- (sizeof(long) == 8) ? "64" : "32",
- mode, server.port,
- (long) getpid()
+ "Running mode=%s, port=%d.",
+ mode, server.port
);
} else {
snprintf(buf,1024*16,ascii_logo,
@@ -3823,7 +3419,7 @@ void setupSignalHandlers(void) {
void memtest(size_t megabytes, int passes);
/* Returns 1 if there is --sentinel among the arguments or if
- * argv[0] is exactly "redis-sentinel". */
+ * argv[0] contains "redis-sentinel". */
int checkForSentinelMode(int argc, char **argv) {
int j;
@@ -3840,9 +3436,20 @@ void loadDataFromDisk(void) {
if (loadAppendOnlyFile(server.aof_filename) == C_OK)
serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);
} else {
- if (rdbLoad(server.rdb_filename) == C_OK) {
+ rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
+ if (rdbLoad(server.rdb_filename,&rsi) == C_OK) {
serverLog(LL_NOTICE,"DB loaded from disk: %.3f seconds",
(float)(ustime()-start)/1000000);
+
+ /* Restore the replication ID / offset from the RDB file. */
+ if (rsi.repl_id_is_set && rsi.repl_offset != -1) {
+ memcpy(server.replid,rsi.repl_id,sizeof(server.replid));
+ server.master_repl_offset = rsi.repl_offset;
+ /* If we are a slave, create a cached master from this
+ * information, in order to allow partial resynchronizations
+ * with masters. */
+ if (server.masterhost) replicationCacheMasterUsingMyself();
+ }
} else if (errno != ENOENT) {
serverLog(LL_WARNING,"Fatal error loading the DB: %s. Exiting.",strerror(errno));
exit(1);
@@ -4031,7 +3638,7 @@ int main(int argc, char **argv) {
* the program main. However the program is part of the Redis executable
* so that we can easily execute an RDB check on loading errors. */
if (strstr(argv[0],"redis-check-rdb") != NULL)
- exit(redis_check_rdb_main(argv,argc));
+ redis_check_rdb_main(argc,argv);
if (argc >= 2) {
j = 1; /* First option to parse in argv[] */
@@ -4097,8 +3704,21 @@ int main(int argc, char **argv) {
resetServerSaveParams();
loadServerConfig(configfile,options);
sdsfree(options);
- } else {
+ }
+
+ serverLog(LL_WARNING, "oO0OoO0OoO0Oo Redis is starting oO0OoO0OoO0Oo");
+ serverLog(LL_WARNING,
+ "Redis version=%s, bits=%d, commit=%s, modified=%d, pid=%d, just started",
+ REDIS_VERSION,
+ (sizeof(long) == 8) ? 64 : 32,
+ redisGitSHA1(),
+ strtol(redisGitDirty(),NULL,10) > 0,
+ (int)getpid());
+
+ if (argc == 1) {
serverLog(LL_WARNING, "Warning: no config file specified, using the default config. In order to specify a config file use %s /path/to/%s.conf", argv[0], server.sentinel_mode ? "sentinel" : "redis");
+ } else {
+ serverLog(LL_WARNING, "Configuration loaded");
}
server.supervised = redisIsSupervised(server.supervised_mode);
@@ -4113,7 +3733,7 @@ int main(int argc, char **argv) {
if (!server.sentinel_mode) {
/* Things not needed when running in Sentinel mode. */
- serverLog(LL_WARNING,"Server started, Redis version " REDIS_VERSION);
+ serverLog(LL_WARNING,"Server initialized");
#ifdef __linux__
linuxMemoryWarnings();
#endif
@@ -4128,7 +3748,7 @@ int main(int argc, char **argv) {
}
}
if (server.ipfd_count > 0)
- serverLog(LL_NOTICE,"The server is now ready to accept connections on port %d", server.port);
+ serverLog(LL_NOTICE,"Ready to accept connections");
if (server.sofd > 0)
serverLog(LL_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket);
} else {
diff --git a/src/server.h b/src/server.h
index 28a8bebf6..140897c18 100644
--- a/src/server.h
+++ b/src/server.h
@@ -33,6 +33,7 @@
#include "fmacros.h"
#include "config.h"
#include "solarisfixes.h"
+#include "rio.h"
#include <stdio.h>
#include <stdlib.h>
@@ -92,6 +93,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define AOF_REWRITE_PERC 100
#define AOF_REWRITE_MIN_SIZE (64*1024*1024)
#define AOF_REWRITE_ITEMS_PER_CMD 64
+#define AOF_READ_DIFF_INTERVAL_BYTES (1024*10)
#define CONFIG_DEFAULT_SLOWLOG_LOG_SLOWER_THAN 10000
#define CONFIG_DEFAULT_SLOWLOG_MAX_LEN 128
#define CONFIG_DEFAULT_MAX_CLIENTS 10000
@@ -113,7 +115,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define CONFIG_DEFAULT_CLUSTER_ANNOUNCE_BUS_PORT 0 /* Use +10000 offset. */
#define CONFIG_DEFAULT_DAEMONIZE 0
#define CONFIG_DEFAULT_UNIX_SOCKET_PERM 0
-#define CONFIG_DEFAULT_TCP_KEEPALIVE 0
+#define CONFIG_DEFAULT_TCP_KEEPALIVE 300
#define CONFIG_DEFAULT_PROTECTED_MODE 1
#define CONFIG_DEFAULT_LOGFILE ""
#define CONFIG_DEFAULT_SYSLOG_ENABLED 0
@@ -125,12 +127,17 @@ typedef long long mstime_t; /* millisecond time type. */
#define CONFIG_DEFAULT_REPL_DISKLESS_SYNC_DELAY 5
#define CONFIG_DEFAULT_SLAVE_SERVE_STALE_DATA 1
#define CONFIG_DEFAULT_SLAVE_READ_ONLY 1
+#define CONFIG_DEFAULT_SLAVE_ANNOUNCE_IP NULL
+#define CONFIG_DEFAULT_SLAVE_ANNOUNCE_PORT 0
#define CONFIG_DEFAULT_REPL_DISABLE_TCP_NODELAY 0
#define CONFIG_DEFAULT_MAXMEMORY 0
#define CONFIG_DEFAULT_MAXMEMORY_SAMPLES 5
+#define CONFIG_DEFAULT_LFU_LOG_FACTOR 10
+#define CONFIG_DEFAULT_LFU_DECAY_TIME 1
#define CONFIG_DEFAULT_AOF_FILENAME "appendonly.aof"
#define CONFIG_DEFAULT_AOF_NO_FSYNC_ON_REWRITE 0
#define CONFIG_DEFAULT_AOF_LOAD_TRUNCATED 1
+#define CONFIG_DEFAULT_AOF_USE_RDB_PREAMBLE 0
#define CONFIG_DEFAULT_ACTIVE_REHASHING 1
#define CONFIG_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC 1
#define CONFIG_DEFAULT_MIN_SLAVES_TO_WRITE 0
@@ -144,6 +151,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION 0
#define CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE 0
#define CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL 0
+#define CONFIG_DEFAULT_ALWAYS_SHOW_LOGO 0
#define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Loopkups per loop. */
#define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */
@@ -164,7 +172,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define PROTO_REPLY_CHUNK_BYTES (16*1024) /* 16k output buffer */
#define PROTO_INLINE_MAX_SIZE (1024*64) /* Max size of inline reads */
#define PROTO_MBULK_BIG_ARG (1024*32)
-#define LONG_STR_SIZE 21 /* Bytes needed for long -> str */
+#define LONG_STR_SIZE 21 /* Bytes needed for long -> str + '\0' */
#define AOF_AUTOSYNC_BYTES (1024*1024*32) /* fdatasync every 32MB */
/* When configuring the server eventloop, we setup it so that the total number
@@ -195,33 +203,6 @@ typedef long long mstime_t; /* millisecond time type. */
#define CMD_MODULE_GETKEYS (1<<14) /* Use the modules getkeys interface. */
#define CMD_MODULE_NO_CLUSTER (1<<15) /* Deny on Redis Cluster. */
-/* Defines related to the dump file format. To store 32 bits lengths for short
- * keys requires a lot of space, so we check the most significant 2 bits of
- * the first byte to interpreter the length:
- *
- * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
- * 01|000000 00000000 => 01, the len is 14 byes, 6 bits + 8 bits of next byte
- * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
- * 11|000000 this means: specially encoded object will follow. The six bits
- * number specify the kind of object that follows.
- * See the RDB_ENC_* defines.
- *
- * Lengths up to 63 are stored using a single byte, most DB keys, and may
- * values, will fit inside. */
-#define RDB_6BITLEN 0
-#define RDB_14BITLEN 1
-#define RDB_32BITLEN 2
-#define RDB_ENCVAL 3
-#define RDB_LENERR UINT_MAX
-
-/* When a length of a string object stored on disk has the first two bits
- * set, the remaining two bits specify a special encoding for the object
- * accordingly to the following defines: */
-#define RDB_ENC_INT8 0 /* 8 bit signed integer */
-#define RDB_ENC_INT16 1 /* 16 bit signed integer */
-#define RDB_ENC_INT32 2 /* 32 bit signed integer */
-#define RDB_ENC_LZF 3 /* string compressed with FASTLZ */
-
/* AOF states */
#define AOF_OFF 0 /* AOF is off */
#define AOF_ON 1 /* AOF is on */
@@ -265,6 +246,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define BLOCKED_NONE 0 /* Not blocked, no CLIENT_BLOCKED flag set. */
#define BLOCKED_LIST 1 /* BLPOP & co. */
#define BLOCKED_WAIT 2 /* WAIT for synchronous replication. */
+#define BLOCKED_MODULE 3 /* Blocked by a loadable module. */
/* Client request types */
#define PROTO_REQ_INLINE 1
@@ -291,13 +273,15 @@ typedef long long mstime_t; /* millisecond time type. */
#define REPL_STATE_RECEIVE_AUTH 5 /* Wait for AUTH reply */
#define REPL_STATE_SEND_PORT 6 /* Send REPLCONF listening-port */
#define REPL_STATE_RECEIVE_PORT 7 /* Wait for REPLCONF reply */
-#define REPL_STATE_SEND_CAPA 8 /* Send REPLCONF capa */
-#define REPL_STATE_RECEIVE_CAPA 9 /* Wait for REPLCONF reply */
-#define REPL_STATE_SEND_PSYNC 10 /* Send PSYNC */
-#define REPL_STATE_RECEIVE_PSYNC 11 /* Wait for PSYNC reply */
+#define REPL_STATE_SEND_IP 8 /* Send REPLCONF ip-address */
+#define REPL_STATE_RECEIVE_IP 9 /* Wait for REPLCONF reply */
+#define REPL_STATE_SEND_CAPA 10 /* Send REPLCONF capa */
+#define REPL_STATE_RECEIVE_CAPA 11 /* Wait for REPLCONF reply */
+#define REPL_STATE_SEND_PSYNC 12 /* Send PSYNC */
+#define REPL_STATE_RECEIVE_PSYNC 13 /* Wait for PSYNC reply */
/* --- End of handshake states --- */
-#define REPL_STATE_TRANSFER 12 /* Receiving .rdb from master */
-#define REPL_STATE_CONNECTED 13 /* Connected to master */
+#define REPL_STATE_TRANSFER 14 /* Receiving .rdb from master */
+#define REPL_STATE_CONNECTED 15 /* Connected to master */
/* State of slaves from the POV of the master. Used in client->replstate.
* In SEND_BULK and ONLINE state the slave receives new updates
@@ -310,7 +294,8 @@ typedef long long mstime_t; /* millisecond time type. */
/* Slave capabilities. */
#define SLAVE_CAPA_NONE 0
-#define SLAVE_CAPA_EOF (1<<0) /* Can parse the RDB EOF streaming format. */
+#define SLAVE_CAPA_EOF (1<<0) /* Can parse the RDB EOF streaming format. */
+#define SLAVE_CAPA_PSYNC2 (1<<1) /* Supports PSYNC2 protocol. */
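/* Illustrative sketch (not part of this patch): capabilities are a bitwise OR
 * of the SLAVE_CAPA_* flags, so a master can test support for the new PSYNC2
 * protocol with a single AND. The helper name is hypothetical. */
static inline int capaSupportsPsync2(int slave_capa) {
    return (slave_capa & SLAVE_CAPA_PSYNC2) != 0;
}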
/* Synchronous read timeout - slave side */
#define CONFIG_REPL_SYNCIO_TIMEOUT 5
@@ -367,13 +352,24 @@ typedef long long mstime_t; /* millisecond time type. */
#define SET_OP_DIFF 1
#define SET_OP_INTER 2
-/* Redis maxmemory strategies */
-#define MAXMEMORY_VOLATILE_LRU 0
-#define MAXMEMORY_VOLATILE_TTL 1
-#define MAXMEMORY_VOLATILE_RANDOM 2
-#define MAXMEMORY_ALLKEYS_LRU 3
-#define MAXMEMORY_ALLKEYS_RANDOM 4
-#define MAXMEMORY_NO_EVICTION 5
+/* Redis maxmemory strategies. Instead of using just incremental number
+ * for this defines, we use a set of flags so that testing for certain
+ * properties common to multiple policies is faster. */
+#define MAXMEMORY_FLAG_LRU (1<<0)
+#define MAXMEMORY_FLAG_LFU (1<<1)
+#define MAXMEMORY_FLAG_ALLKEYS (1<<2)
+#define MAXMEMORY_FLAG_NO_SHARED_INTEGERS \
+ (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU)
+
+#define MAXMEMORY_VOLATILE_LRU ((0<<8)|MAXMEMORY_FLAG_LRU)
+#define MAXMEMORY_VOLATILE_LFU ((1<<8)|MAXMEMORY_FLAG_LFU)
+#define MAXMEMORY_VOLATILE_TTL (2<<8)
+#define MAXMEMORY_VOLATILE_RANDOM (3<<8)
+#define MAXMEMORY_ALLKEYS_LRU ((4<<8)|MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_ALLKEYS)
+#define MAXMEMORY_ALLKEYS_LFU ((5<<8)|MAXMEMORY_FLAG_LFU|MAXMEMORY_FLAG_ALLKEYS)
+#define MAXMEMORY_ALLKEYS_RANDOM ((6<<8)|MAXMEMORY_FLAG_ALLKEYS)
+#define MAXMEMORY_NO_EVICTION (7<<8)
+
#define CONFIG_DEFAULT_MAXMEMORY_POLICY MAXMEMORY_NO_EVICTION
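/* Illustrative sketch (not part of this patch) of the point made above:
 * thanks to the flag encoding, properties shared by several policies are
 * tested with one bitwise AND instead of comparing against every policy
 * constant. Helper names are hypothetical. */
static inline int policyUsesLru(int policy)       { return (policy & MAXMEMORY_FLAG_LRU) != 0; }
static inline int policyUsesLfu(int policy)       { return (policy & MAXMEMORY_FLAG_LFU) != 0; }
static inline int policyCoversAllKeys(int policy) { return (policy & MAXMEMORY_FLAG_ALLKEYS) != 0; }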
/* Scripting */
@@ -448,6 +444,95 @@ typedef long long mstime_t; /* millisecond time type. */
#define OBJ_ZSET 3
#define OBJ_HASH 4
+/* The "module" object type is a special one that signals that the object
+ * is one directly managed by a Redis module. In this case the value points
+ * to a moduleValue struct, which contains the object value (which is only
+ * handled by the module itself) and the RedisModuleType struct which lists
+ * function pointers in order to serialize, deserialize, AOF-rewrite and
+ * free the object.
+ *
+ * Inside the RDB file, module types are encoded as OBJ_MODULE followed
+ * by a 64 bit module type ID, which has a 54 bit module-specific signature
+ * (used to dispatch the loading to the right module) plus a 10 bit
+ * encoding version. */
+#define OBJ_MODULE 5
+
+/* Extract encver / signature from a module type ID. */
+#define REDISMODULE_TYPE_ENCVER_BITS 10
+#define REDISMODULE_TYPE_ENCVER_MASK ((1<<REDISMODULE_TYPE_ENCVER_BITS)-1)
+#define REDISMODULE_TYPE_ENCVER(id) (id & REDISMODULE_TYPE_ENCVER_MASK)
+#define REDISMODULE_TYPE_SIGN(id) ((id & ~((uint64_t)REDISMODULE_TYPE_ENCVER_MASK)) >>REDISMODULE_TYPE_ENCVER_BITS)
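/* Illustrative sketch (not part of this patch): how a 64 bit module type ID
 * splits under the macros above -- the low 10 bits carry the encoding
 * version, the high 54 bits the module-specific signature. Function name is
 * hypothetical. */
static inline void splitModuleTypeId(uint64_t id, uint64_t *sign, int *encver) {
    *sign = REDISMODULE_TYPE_SIGN(id);      /* High 54 bits. */
    *encver = REDISMODULE_TYPE_ENCVER(id);  /* Low 10 bits. */
}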
+
+struct RedisModule;
+struct RedisModuleIO;
+struct RedisModuleDigest;
+struct RedisModuleCtx;
+struct redisObject;
+
+/* Each module type implementation should export a set of methods in order
+ * to serialize and deserialize the value in the RDB file, rewrite the AOF
+ * log, create the digest for "DEBUG DIGEST", and free the value when a key
+ * is deleted. */
+typedef void *(*moduleTypeLoadFunc)(struct RedisModuleIO *io, int encver);
+typedef void (*moduleTypeSaveFunc)(struct RedisModuleIO *io, void *value);
+typedef void (*moduleTypeRewriteFunc)(struct RedisModuleIO *io, struct redisObject *key, void *value);
+typedef void (*moduleTypeDigestFunc)(struct RedisModuleDigest *digest, void *value);
+typedef size_t (*moduleTypeMemUsageFunc)(void *value);
+typedef void (*moduleTypeFreeFunc)(void *value);
+
+/* The module type, which is referenced in each value of a given type, defines
+ * the methods and links to the module exporting the type. */
+typedef struct RedisModuleType {
+ uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */
+ struct RedisModule *module;
+ moduleTypeLoadFunc rdb_load;
+ moduleTypeSaveFunc rdb_save;
+ moduleTypeRewriteFunc aof_rewrite;
+ moduleTypeMemUsageFunc mem_usage;
+ moduleTypeDigestFunc digest;
+ moduleTypeFreeFunc free;
+ char name[10]; /* 9 bytes name + null term. Charset: A-Z a-z 0-9 _- */
+} moduleType;
+
+/* In Redis objects 'robj' structures of type OBJ_MODULE, the value pointer
+ * is set to the following structure, referencing the moduleType structure
+ * in order to work with the value, and at the same time providing a raw
+ * pointer to the value, as created by the module commands operating with
+ * the module type.
+ *
+ * So for example in order to free such a value, it is possible to use
+ * the following code:
+ *
+ * if (robj->type == OBJ_MODULE) {
+ * moduleValue *mt = robj->ptr;
+ * mt->type->free(mt->value);
+ * zfree(mt); // We need to release this in-the-middle struct as well.
+ * }
+ */
+typedef struct moduleValue {
+ moduleType *type;
+ void *value;
+} moduleValue;
+
+/* This is a wrapper for the 'rio' streams used inside rdb.c in Redis, so that
+ * the user does not have to keep track of the total count of written bytes
+ * nor to care about error conditions. */
+typedef struct RedisModuleIO {
+ size_t bytes; /* Bytes read / written so far. */
+ rio *rio; /* Rio stream. */
+ moduleType *type; /* Module type doing the operation. */
+ int error; /* True if error condition happened. */
+ struct RedisModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/
+} RedisModuleIO;
+
+#define moduleInitIOContext(iovar,mtype,rioptr) do { \
+ iovar.rio = rioptr; \
+ iovar.type = mtype; \
+ iovar.bytes = 0; \
+ iovar.error = 0; \
+ iovar.ctx = NULL; \
+} while(0);
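/* Illustrative sketch (not part of this patch) of how rdb.c can drive a
 * module type save through the macro above. Function and variable names are
 * hypothetical; the real call sites live in rdb.c. */
static void saveModuleValueSketch(rio *rdb, moduleType *mt, void *value) {
    RedisModuleIO io;
    moduleInitIOContext(io,mt,rdb);  /* Resets counters, attaches the stream. */
    mt->rdb_save(&io,value);         /* The module serializes through 'io'. */
    if (io.error) {
        /* The caller is expected to abort the RDB save on error. */
    }
}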
+
/* Objects encoding. Some kind of objects like Strings and Hashes can be
* internally represented in multiple ways. The 'encoding' field of the object
* is set to one of this fields for this object. */
@@ -455,7 +540,7 @@ typedef long long mstime_t; /* millisecond time type. */
#define OBJ_ENCODING_INT 1 /* Encoded as integer */
#define OBJ_ENCODING_HT 2 /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
-#define OBJ_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */
+#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6 /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7 /* Encoded as skiplist */
@@ -470,7 +555,9 @@ typedef long long mstime_t; /* millisecond time type. */
typedef struct redisObject {
unsigned type:4;
unsigned encoding:4;
- unsigned lru:LRU_BITS; /* lru time (relative to server.lruclock) */
+ unsigned lru:LRU_BITS; /* LRU time (relative to server.lruclock) or
+ * LFU data (least significant 8 bits frequency
+ * and most significant 16 bits decrease time). */
int refcount;
void *ptr;
} robj;
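/* Illustrative sketch (not part of this patch, assumes LRU_BITS == 24 as in
 * the rest of this header): under an LFU policy the same 24 bit field holds
 * a 16 bit last-decrement time in minutes (high bits) plus an 8 bit
 * logarithmic access counter (low bits). Helper names are hypothetical. */
static inline unsigned lfuTimeOf(const robj *o)    { return o->lru >> 8;   }
static inline unsigned lfuCounterOf(const robj *o) { return o->lru & 0xFF; }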
@@ -478,7 +565,7 @@ typedef struct redisObject {
/* Macro used to obtain the current LRU clock.
* If the current resolution is lower than the frequency we refresh the
* LRU clock (as it should be in production servers) we return the
- * precomputed value, otherwise we need to resort to a function call. */
+ * precomputed value, otherwise we need to resort to a system call. */
#define LRU_CLOCK() ((1000/server.hz <= LRU_CLOCK_RESOLUTION) ? server.lruclock : getLRUClock())
/* Macro used to initialize a Redis object allocated on the stack.
@@ -492,18 +579,7 @@ typedef struct redisObject {
_var.ptr = _ptr; \
} while(0)
-/* To improve the quality of the LRU approximation we take a set of keys
- * that are good candidate for eviction across freeMemoryIfNeeded() calls.
- *
- * Entries inside the eviciton pool are taken ordered by idle time, putting
- * greater idle times to the right (ascending order).
- *
- * Empty entries have the key pointer set to NULL. */
-#define MAXMEMORY_EVICTION_POOL_SIZE 16
-struct evictionPoolEntry {
- unsigned long long idle; /* Object idle time. */
- sds key; /* Key name. */
-};
+struct evictionPoolEntry; /* Defined in evict.c */
/* Redis database representation. There are multiple databases identified
* by integers from 0 (the default database) up to the max configured
@@ -511,10 +587,9 @@ struct evictionPoolEntry {
typedef struct redisDb {
dict *dict; /* The keyspace for this DB */
dict *expires; /* Timeout of keys with a timeout set */
- dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
+ dict *blocking_keys; /* Keys with clients waiting for data (BLPOP)*/
dict *ready_keys; /* Blocked keys that received a PUSH */
dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
- struct evictionPoolEntry *eviction_pool; /* Eviction pool of keys */
int id; /* Database ID */
long long avg_ttl; /* Average TTL, just for stats */
} redisDb;
@@ -549,6 +624,11 @@ typedef struct blockingState {
/* BLOCKED_WAIT */
int numreplicas; /* Number of replicas we are waiting for ACK. */
long long reploffset; /* Replication offset to reach. */
+
+ /* BLOCKED_MODULE */
+ void *module_blocked_handle; /* RedisModuleBlockedClient structure,
+ which is opaque to the Redis core and
+ only handled in module.c. */
} blockingState;
/* The following structure represents a node in the server.ready_keys list,
@@ -603,8 +683,9 @@ typedef struct client {
long long psync_initial_offset; /* FULLRESYNC reply offset other slaves
copying this slave output buffer
should use. */
- char replrunid[CONFIG_RUN_ID_SIZE+1]; /* Master run id if is a master. */
+ char replid[CONFIG_RUN_ID_SIZE+1]; /* Master replication ID (if master). */
int slave_listening_port; /* As configured with: SLAVECONF listening-port */
+ char slave_ip[NET_IP_STR_LEN]; /* Optionally given by REPLCONF ip-address */
int slave_capa; /* Slave capabilities: SLAVE_CAPA_* bitwise OR. */
multiState mstate; /* MULTI/EXEC state */
int btype; /* Type of blocking op if CLIENT_BLOCKED. */
@@ -625,6 +706,12 @@ struct saveparam {
int changes;
};
+struct moduleLoadQueueEntry {
+ sds path;
+ int argc;
+ robj **argv;
+};
+
struct sharedObjectsStruct {
robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
*colon, *nullbulk, *nullmultibulk, *queued,
@@ -695,6 +782,51 @@ typedef struct redisOpArray {
int numops;
} redisOpArray;
+/* This structure is returned by the getMemoryOverheadData() function in
+ * order to return memory overhead information. */
+struct redisMemOverhead {
+ size_t peak_allocated;
+ size_t total_allocated;
+ size_t startup_allocated;
+ size_t repl_backlog;
+ size_t clients_slaves;
+ size_t clients_normal;
+ size_t aof_buffer;
+ size_t overhead_total;
+ size_t dataset;
+ size_t total_keys;
+ size_t bytes_per_key;
+ float dataset_perc;
+ float peak_perc;
+ float fragmentation;
+ size_t num_dbs;
+ struct {
+ size_t dbid;
+ size_t overhead_ht_main;
+ size_t overhead_ht_expires;
+ } *db;
+};
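/* Illustrative sketch (not part of this patch): typical consumption of the
 * struct above via getMemoryOverheadData()/freeMemoryOverheadData(), both
 * declared later in this header. Function name is hypothetical. */
static void logMemOverheadSketch(void) {
    struct redisMemOverhead *mh = getMemoryOverheadData();
    for (size_t j = 0; j < mh->num_dbs; j++) {
        serverLog(LL_NOTICE,"db%zu main hash table overhead: %zu bytes",
            mh->db[j].dbid, mh->db[j].overhead_ht_main);
    }
    freeMemoryOverheadData(mh);
}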
+
+/* This structure can be optionally passed to RDB save/load functions in
+ * order to implement additional functionalities by storing and loading
+ * metadata in the RDB file.
+ *
+ * Currently the only use is to select a DB at load time, useful in
+ * replication in order to make sure that chained slaves (slaves of slaves)
+ * select the correct DB and are able to accept the stream coming from the
+ * top-level master. */
+typedef struct rdbSaveInfo {
+ /* Used saving and loading. */
+ int repl_stream_db; /* DB to select in server.master client. */
+
+ /* Used only loading. */
+ int repl_id_is_set; /* True if repl_id field is set. */
+ char repl_id[CONFIG_RUN_ID_SIZE+1]; /* Replication ID. */
+ long long repl_offset; /* Replication offset. */
+} rdbSaveInfo;
+
+#define RDB_SAVE_INFO_INIT {-1,0,"000000000000000000000000000000",-1}
+
/*-----------------------------------------------------------------------------
* Global server state
*----------------------------------------------------------------------------*/
@@ -707,6 +839,10 @@ struct clusterState;
#undef hz
#endif
+#define CHILD_INFO_MAGIC 0xC17DDA7A12345678LL
+#define CHILD_INFO_TYPE_RDB 0
+#define CHILD_INFO_TYPE_AOF 1
+
struct redisServer {
/* General */
pid_t pid; /* Main process pid. */
@@ -727,6 +863,8 @@ struct redisServer {
int cronloops; /* Number of times the cron function run */
char runid[CONFIG_RUN_ID_SIZE+1]; /* ID always different at every exec. */
int sentinel_mode; /* True if this instance is a Sentinel. */
+ size_t initial_memory_usage; /* Bytes used after initialization. */
+ int always_show_logo; /* Show logo even for non-stdout logging. */
/* Modules */
dict *moduleapi; /* Exported APIs dictionary for modules. */
list *loadmodule_queue; /* List of modules to load at startup. */
@@ -784,6 +922,8 @@ struct redisServer {
size_t resident_set_size; /* RSS sampled in serverCron(). */
long long stat_net_input_bytes; /* Bytes read from network. */
long long stat_net_output_bytes; /* Bytes written to network. */
+ size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */
+ size_t stat_aof_cow_bytes; /* Copy on write bytes during AOF rewrite. */
/* The following two are used to track instantaneous metrics, like
* number of operations per second, network traffic. */
struct {
@@ -828,6 +968,7 @@ struct redisServer {
int aof_last_write_status; /* C_OK or C_ERR */
int aof_last_write_errno; /* Valid if aof_last_write_status is ERR */
int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */
+ int aof_use_rdb_preamble; /* Use RDB preamble on AOF rewrites. */
/* AOF pipes used to communicate between parent and child during rewrite. */
int aof_pipe_write_data_to_child;
int aof_pipe_read_data_from_parent;
@@ -851,11 +992,19 @@ struct redisServer {
time_t lastbgsave_try; /* Unix time of last attempted bgsave */
time_t rdb_save_time_last; /* Time used by last RDB save run. */
time_t rdb_save_time_start; /* Current RDB save start time. */
+ int rdb_bgsave_scheduled; /* BGSAVE when possible if true. */
int rdb_child_type; /* Type of save by active child. */
int lastbgsave_status; /* C_OK or C_ERR */
int stop_writes_on_bgsave_err; /* Don't allow writes if can't BGSAVE */
int rdb_pipe_write_result_to_parent; /* RDB pipes used to return the state */
int rdb_pipe_read_result_from_child; /* of each slave in diskless SYNC. */
+ /* Pipe and data structures for child -> parent info sharing. */
+ int child_info_pipe[2]; /* Pipe used to write the child_info_data. */
+ struct {
+ int process_type; /* AOF or RDB child? */
+ size_t cow_size; /* Copy on write size. */
+ unsigned long long magic; /* Magic value to make sure data is valid. */
+ } child_info_data;
/* Propagation of commands in AOF / replication */
redisOpArray also_propagate; /* Additional command to propagate. */
/* Logging */
@@ -864,15 +1013,19 @@ struct redisServer {
char *syslog_ident; /* Syslog ident */
int syslog_facility; /* Syslog facility */
/* Replication (master) */
+ char replid[CONFIG_RUN_ID_SIZE+1]; /* My current replication ID. */
+ char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master */
+ long long master_repl_offset; /* My current replication offset */
+ long long second_replid_offset; /* Accept offsets up to this for replid2. */
int slaveseldb; /* Last SELECTed DB in replication output */
- long long master_repl_offset; /* Global replication offset */
int repl_ping_slave_period; /* Master pings the slave every N seconds */
char *repl_backlog; /* Replication backlog for partial syncs */
long long repl_backlog_size; /* Backlog circular buffer size */
long long repl_backlog_histlen; /* Backlog actual data length */
- long long repl_backlog_idx; /* Backlog circular buffer current offset */
- long long repl_backlog_off; /* Replication offset of first byte in the
- backlog buffer. */
+ long long repl_backlog_idx; /* Backlog circular buffer current offset,
+ that is the next byte we'll write to. */
+ long long repl_backlog_off; /* Replication "master offset" of first
+ byte in the replication backlog buffer.*/
time_t repl_backlog_time_limit; /* Time without slaves after the backlog
gets released. */
time_t repl_no_slaves_since; /* We have no slaves since that time.
@@ -903,8 +1056,13 @@ struct redisServer {
time_t repl_down_since; /* Unix time at which link with master went down */
int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */
int slave_priority; /* Reported in INFO and used by Sentinel. */
- char repl_master_runid[CONFIG_RUN_ID_SIZE+1]; /* Master run id for PSYNC. */
- long long repl_master_initial_offset; /* Master PSYNC offset. */
+ int slave_announce_port; /* Give the master this listening port. */
+ char *slave_announce_ip; /* Give the master this ip address. */
+ /* The following two fields are where we store master PSYNC replid/offset
+ * while the PSYNC is in progress. At the end we'll copy the fields into
+ * the server->master client structure. */
+ char master_replid[CONFIG_RUN_ID_SIZE+1]; /* Master PSYNC runid. */
+ long long master_initial_offset; /* Master PSYNC offset. */
int repl_slave_lazy_flush; /* Lazy FLUSHALL before loading DB? */
/* Replication script cache. */
dict *repl_scriptcache_dict; /* SHA1 all slaves are aware of. */
@@ -918,6 +1076,8 @@ struct redisServer {
unsigned long long maxmemory; /* Max number of memory bytes to use */
int maxmemory_policy; /* Policy for key eviction */
int maxmemory_samples; /* Pricision of random sampling */
+ unsigned int lfu_log_factor; /* LFU logarithmic counter factor. */
+ unsigned int lfu_decay_time; /* LFU counter decay factor. */
/* Blocked clients */
unsigned int bpop_blocked_clients; /* Number of clients blocked by lists */
list *unblocked_clients; /* list of clients to unblock before next loop */
@@ -984,8 +1144,8 @@ struct redisServer {
long long latency_monitor_threshold;
dict *latency_events;
/* Assert & bug reporting */
- char *assert_failed;
- char *assert_file;
+ const char *assert_failed;
+ const char *assert_file;
int assert_line;
int bug_report_start; /* True if bug report header was already logged. */
int watchdog_period; /* Software watchdog period in ms. 0 = off */
@@ -1098,9 +1258,15 @@ extern dictType modulesDictType;
/* Modules */
void moduleInitModulesSystem(void);
-int moduleLoad(const char *path);
+int moduleLoad(const char *path, void **argv, int argc);
void moduleLoadFromQueue(void);
int *moduleGetCommandKeysViaAPI(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
+moduleType *moduleTypeLookupModuleByID(uint64_t id);
+void moduleTypeNameByID(char *name, uint64_t moduleid);
+void moduleFreeContext(struct RedisModuleCtx *ctx);
+void unblockClientFromModule(client *c);
+void moduleHandleBlockedClients(void);
+void moduleBlockedClientTimedOut(client *c);
/* Utils */
long long ustime(void);
@@ -1125,6 +1291,7 @@ void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void acceptTcpHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void acceptUnixHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReplyString(client *c, const char *s, size_t len);
void addReplyBulk(client *c, robj *obj);
void addReplyBulkCString(client *c, const char *s);
void addReplyBulkCBuffer(client *c, const void *p, size_t len);
@@ -1139,6 +1306,8 @@ void addReplyHumanLongDouble(client *c, long double d);
void addReplyLongLong(client *c, long long ll);
void addReplyMultiBulkLen(client *c, long length);
void copyClientOutputBuffer(client *dst, client *src);
+size_t sdsZmallocSize(sds s);
+size_t getStringObjectSdsUsedMemory(robj *o);
void *dupClientReplyValue(void *o);
void getClientsMaxBuffers(unsigned long *longest_output_list,
unsigned long *biggest_input_buffer);
@@ -1179,7 +1348,7 @@ void addReplyStatusFormat(client *c, const char *fmt, ...);
void listTypeTryConversion(robj *subject, robj *value);
void listTypePush(robj *subject, robj *value, int where);
robj *listTypePop(robj *subject, int where);
-unsigned long listTypeLength(robj *subject);
+unsigned long listTypeLength(const robj *subject);
listTypeIterator *listTypeInitIterator(robj *subject, long index, unsigned char direction);
void listTypeReleaseIterator(listTypeIterator *li);
int listTypeNext(listTypeIterator *li, listTypeEntry *entry);
@@ -1219,7 +1388,7 @@ robj *createObject(int type, void *ptr);
robj *createStringObject(const char *ptr, size_t len);
robj *createRawStringObject(const char *ptr, size_t len);
robj *createEmbeddedStringObject(const char *ptr, size_t len);
-robj *dupStringObject(robj *o);
+robj *dupStringObject(const robj *o);
int isSdsRepresentableAsLongLong(sds s, long long *llval);
int isObjectRepresentableAsLongLong(robj *o, long long *llongval);
robj *tryObjectEncoding(robj *o);
@@ -1234,11 +1403,12 @@ robj *createIntsetObject(void);
robj *createHashObject(void);
robj *createZsetObject(void);
robj *createZsetZiplistObject(void);
+robj *createModuleObject(moduleType *mt, void *value);
int getLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg);
int checkType(client *c, robj *o, int type);
int getLongLongFromObjectOrReply(client *c, robj *o, long long *target, const char *msg);
int getDoubleFromObjectOrReply(client *c, robj *o, double *target, const char *msg);
-int getDoubleFromObject(robj *o, double *target);
+int getDoubleFromObject(const robj *o, double *target);
int getLongLongFromObject(robj *o, long long *target);
int getLongDoubleFromObject(robj *o, long double *target);
int getLongDoubleFromObjectOrReply(client *c, robj *o, long double *target, const char *msg);
@@ -1256,6 +1426,7 @@ ssize_t syncReadLine(int fd, char *ptr, ssize_t size, long long timeout);
/* Replication */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
+void replicationFeedSlavesFromMasterStream(list *slaves, char *buf, size_t buflen);
void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, int argc);
void updateSlavesWaitingBgsave(int bgsaveerr, int type);
void replicationCron(void);
@@ -1277,6 +1448,10 @@ long long replicationGetSlaveOffset(void);
char *replicationGetSlaveName(client *c);
long long getPsyncInitialOffset(void);
int replicationSetupSlaveForFullResync(client *slave, long long offset);
+void changeReplicationId(void);
+void clearReplicationId2(void);
+void chopReplicationBacklog(void);
+void replicationCacheMasterUsingMyself(void);
/* Generic persistence functions */
void startLoading(FILE *fp);
@@ -1285,6 +1460,7 @@ void stopLoading(void);
/* RDB persistence */
#include "rdb.h"
+int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi);
/* AOF persistence */
void flushAppendOnlyFile(int force);
@@ -1297,6 +1473,13 @@ int startAppendOnly(void);
void backgroundRewriteDoneHandler(int exitcode, int bysignal);
void aofRewriteBufferReset(void);
unsigned long aofRewriteBufferSize(void);
+ssize_t aofReadDiffFromParent(void);
+
+/* Child info */
+void openChildInfoPipe(void);
+void closeChildInfoPipe(void);
+void sendChildInfo(int process_type);
+void receiveChildInfo(void);
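/* Illustrative sketch (not part of this patch) of the child -> parent
 * exchange the declarations above refer to: the child fills
 * server.child_info_data (magic, process type, copy-on-write size) and
 * writes it to the pipe; the parent validates the magic before trusting the
 * sample. The cow_size parameter and function name are hypothetical; the
 * real code is in childinfo.c. */
static void sendChildInfoSketch(int process_type, size_t cow_size) {
    if (server.child_info_pipe[1] == -1) return;         /* Pipe not open. */
    server.child_info_data.magic = CHILD_INFO_MAGIC;
    server.child_info_data.process_type = process_type;  /* RDB or AOF child. */
    server.child_info_data.cow_size = cow_size;
    ssize_t wlen = sizeof(server.child_info_data);
    if (write(server.child_info_pipe[1],&server.child_info_data,wlen) != wlen) {
        /* Nothing to do on failure: the parent simply misses this sample. */
    }
}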
/* Sorted sets data type */
@@ -1339,7 +1522,7 @@ void zzlNext(unsigned char *zl, unsigned char **eptr, unsigned char **sptr);
void zzlPrev(unsigned char *zl, unsigned char **eptr, unsigned char **sptr);
unsigned char *zzlFirstInRange(unsigned char *zl, zrangespec *range);
unsigned char *zzlLastInRange(unsigned char *zl, zrangespec *range);
-unsigned int zsetLength(robj *zobj);
+unsigned int zsetLength(const robj *zobj);
void zsetConvert(robj *zobj, int encoding);
void zsetConvertToZiplistIfNeeded(robj *zobj, size_t maxelelen);
int zsetScore(robj *zobj, sds member, double *score);
@@ -1387,7 +1570,6 @@ void serverLogFromHandler(int level, const char *msg);
void usage(void);
void updateDictResizePolicy(void);
int htNeedsResize(dict *dict);
-void oom(const char *msg);
void populateCommandTable(void);
void resetCommandTableStats(void);
void adjustOpenFilesLimit(void);
@@ -1396,6 +1578,8 @@ void updateCachedTime(void);
void resetServerStats(void);
unsigned int getLRUClock(void);
const char *evictPolicyToString(void);
+struct redisMemOverhead *getMemoryOverheadData(void);
+void freeMemoryOverheadData(struct redisMemOverhead *mh);
#define RESTART_SERVER_NONE 0
#define RESTART_SERVER_GRACEFULLY (1<<0) /* Do proper shutdown. */
@@ -1413,7 +1597,7 @@ int setTypeNext(setTypeIterator *si, sds *sdsele, int64_t *llele);
sds setTypeNextObject(setTypeIterator *si);
int setTypeRandomElement(robj *setobj, sds *sdsele, int64_t *llele);
unsigned long setTypeRandomElements(robj *set, unsigned long count, robj *aux_set);
-unsigned long setTypeSize(robj *subject);
+unsigned long setTypeSize(const robj *subject);
void setTypeConvert(robj *subject, int enc);
/* Hash data type */
@@ -1426,7 +1610,7 @@ void hashTypeTryConversion(robj *subject, robj **argv, int start, int end);
void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2);
int hashTypeExists(robj *o, sds key);
int hashTypeDelete(robj *o, sds key);
-unsigned long hashTypeLength(robj *o);
+unsigned long hashTypeLength(const robj *o);
hashTypeIterator *hashTypeInitIterator(robj *subject);
void hashTypeReleaseIterator(hashTypeIterator *hi);
int hashTypeNext(hashTypeIterator *hi);
@@ -1466,12 +1650,17 @@ int removeExpire(redisDb *db, robj *key);
void propagateExpire(redisDb *db, robj *key, int lazy);
int expireIfNeeded(redisDb *db, robj *key);
long long getExpire(redisDb *db, robj *key);
-void setExpire(redisDb *db, robj *key, long long when);
-robj *lookupKey(redisDb *db, robj *key);
+void setExpire(client *c, redisDb *db, robj *key, long long when);
+robj *lookupKey(redisDb *db, robj *key, int flags);
robj *lookupKeyRead(redisDb *db, robj *key);
robj *lookupKeyWrite(redisDb *db, robj *key);
robj *lookupKeyReadOrReply(client *c, robj *key, robj *reply);
robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply);
+robj *lookupKeyReadWithFlags(redisDb *db, robj *key, int flags);
+robj *objectCommandLookup(client *c, robj *key);
+robj *objectCommandLookupOrReply(client *c, robj *key, robj *reply);
+#define LOOKUP_NONE 0
+#define LOOKUP_NOTOUCH (1<<0)
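/* Illustrative sketch (not part of this patch): the LOOKUP_NOTOUCH flag above
 * lets read-only introspection paths fetch a value without refreshing its
 * LRU/LFU access information. Function name is hypothetical. */
static robj *lookupKeyNoTouch(client *c, robj *key) {
    return lookupKeyReadWithFlags(c->db,key,LOOKUP_NOTOUCH);
}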
void dbAdd(redisDb *db, robj *key, robj *val);
void dbOverwrite(redisDb *db, robj *key, robj *val);
void setKey(redisDb *db, robj *key, robj *val);
@@ -1528,7 +1717,7 @@ void sentinelIsRunning(void);
/* redis-check-rdb */
int redis_check_rdb(char *rdbfilename);
-int redis_check_rdb_main(char **argv, int argc);
+int redis_check_rdb_main(int argc, char **argv);
/* Scripting */
void scriptingInit(int setup);
@@ -1544,6 +1733,24 @@ void replyToBlockedClientTimedOut(client *c);
int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int unit);
void disconnectAllBlockedClients(void);
+/* expire.c -- Handling of expired keys */
+void activeExpireCycle(int type);
+void expireSlaveKeys(void);
+void rememberSlaveKeyWithExpire(redisDb *db, robj *key);
+void flushSlaveKeysWithExpireList(void);
+size_t getSlaveKeyWithExpireCount(void);
+
+/* evict.c -- maxmemory handling and LRU eviction. */
+void evictionPoolAlloc(void);
+#define LFU_INIT_VAL 5
+unsigned long LFUGetTimeInMinutes(void);
+uint8_t LFULogIncr(uint8_t value);
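/* Illustrative sketch (not part of this patch; the real implementation lives
 * in evict.c): LFULogIncr() is a probabilistic logarithmic counter. The
 * larger the current counter and the configured server.lfu_log_factor, the
 * less likely a further increment becomes, so 8 bits suffice to rank hot
 * keys. Function name is hypothetical. */
static uint8_t lfuLogIncrSketch(uint8_t counter) {
    if (counter == 255) return 255;              /* Saturated. */
    double r = (double)rand()/RAND_MAX;          /* Uniform in [0,1]. */
    double baseval = counter - LFU_INIT_VAL;
    if (baseval < 0) baseval = 0;
    double p = 1.0/(baseval*server.lfu_log_factor+1);
    if (r < p) counter++;                        /* Increasingly rare. */
    return counter;
}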
+
+/* Keys hashing / comparison functions for dict.c hash tables. */
+unsigned int dictSdsHash(const void *key);
+int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2);
+void dictSdsDestructor(void *privdata, void *val);
+
/* Git SHA1 */
char *redisGitSHA1(void);
char *redisGitDirty(void);
@@ -1573,6 +1780,7 @@ void incrbyCommand(client *c);
void decrbyCommand(client *c);
void incrbyfloatCommand(client *c);
void selectCommand(client *c);
+void swapdbCommand(client *c);
void randomkeyCommand(client *c);
void keysCommand(client *c);
void scanCommand(client *c);
@@ -1627,6 +1835,7 @@ void pexpireCommand(client *c);
void pexpireatCommand(client *c);
void getsetCommand(client *c);
void ttlCommand(client *c);
+void touchCommand(client *c);
void pttlCommand(client *c);
void persistCommand(client *c);
void slaveofCommand(client *c);
@@ -1695,6 +1904,7 @@ void readonlyCommand(client *c);
void readwriteCommand(client *c);
void dumpCommand(client *c);
void objectCommand(client *c);
+void memoryCommand(client *c);
void clientCommand(client *c);
void evalCommand(client *c);
void evalShaCommand(client *c);
@@ -1720,6 +1930,7 @@ void pfmergeCommand(client *c);
void pfdebugCommand(client *c);
void latencyCommand(client *c);
void moduleCommand(client *c);
+void securityWarningCommand(client *c);
#if defined(__GNUC__)
void *calloc(size_t count, size_t size) __attribute__ ((deprecated));
@@ -1729,11 +1940,11 @@ void *realloc(void *ptr, size_t size) __attribute__ ((deprecated));
#endif
/* Debugging stuff */
-void _serverAssertWithInfo(client *c, robj *o, char *estr, char *file, int line);
-void _serverAssert(char *estr, char *file, int line);
-void _serverPanic(char *msg, char *file, int line);
+void _serverAssertWithInfo(const client *c, const robj *o, const char *estr, const char *file, int line);
+void _serverAssert(const char *estr, const char *file, int line);
+void _serverPanic(const char *msg, const char *file, int line);
void bugReportStart(void);
-void serverLogObjectDebugInfo(robj *o);
+void serverLogObjectDebugInfo(const robj *o);
void sigsegvHandler(int sig, siginfo_t *info, void *secret);
sds genRedisInfoString(char *section);
void enableWatchdog(int period);
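
A self-contained sketch (not part of the patch) of the probabilistic logarithmic counter behind the LFU_INIT_VAL / LFULogIncr() declarations added above. It is modeled on the simulation code introduced later in this patch (utils/lru/lfu-simulation.c); the in-server implementation may differ in details such as a configurable log factor, so treat the constant 10 below as an assumption.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

#define LFU_INIT_VAL 5   /* Same initial value declared in server.h above. */

/* Increment an 8 bit counter with a probability that decreases as the
 * counter grows, so that a single byte can rank keys by access frequency. */
static uint8_t lfu_log_incr(uint8_t counter) {
    if (counter == 255) return counter;            /* Saturate at 8 bits. */
    double r = (double)rand()/RAND_MAX;
    double baseval = counter - LFU_INIT_VAL;
    if (baseval < 0) baseval = 0;
    double limit = 1.0/(baseval*10+1);             /* Higher counter, lower odds. */
    if (r < limit) counter++;
    return counter;
}

int main(void) {
    srand(time(NULL));
    uint8_t c = LFU_INIT_VAL;
    for (long i = 0; i < 1000000; i++) c = lfu_log_incr(c);
    /* Even one million simulated hits leave the counter well below 255. */
    printf("counter after 1M accesses: %u\n", (unsigned)c);
    return 0;
}
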
diff --git a/src/t_hash.c b/src/t_hash.c
index c75b391d7..a49559336 100644
--- a/src/t_hash.c
+++ b/src/t_hash.c
@@ -308,13 +308,13 @@ int hashTypeDelete(robj *o, sds field) {
}
/* Return the number of elements in a hash. */
-unsigned long hashTypeLength(robj *o) {
+unsigned long hashTypeLength(const robj *o) {
unsigned long length = ULONG_MAX;
if (o->encoding == OBJ_ENCODING_ZIPLIST) {
length = ziplistLen(o->ptr) / 2;
} else if (o->encoding == OBJ_ENCODING_HT) {
- length = dictSize((dict*)o->ptr);
+ length = dictSize((const dict*)o->ptr);
} else {
serverPanic("Unknown hash encoding");
}
diff --git a/src/t_list.c b/src/t_list.c
index 7d5be11af..a0a30998d 100644
--- a/src/t_list.c
+++ b/src/t_list.c
@@ -71,7 +71,7 @@ robj *listTypePop(robj *subject, int where) {
return value;
}
-unsigned long listTypeLength(robj *subject) {
+unsigned long listTypeLength(const robj *subject) {
if (subject->encoding == OBJ_ENCODING_QUICKLIST) {
return quicklistCount(subject->ptr);
} else {
@@ -195,7 +195,7 @@ void listTypeConvert(robj *subject, int enc) {
*----------------------------------------------------------------------------*/
void pushGenericCommand(client *c, int where) {
- int j, waiting = 0, pushed = 0;
+ int j, pushed = 0;
robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
if (lobj && lobj->type != OBJ_LIST) {
@@ -204,7 +204,6 @@ void pushGenericCommand(client *c, int where) {
}
for (j = 2; j < c->argc; j++) {
- c->argv[j] = tryObjectEncoding(c->argv[j]);
if (!lobj) {
lobj = createQuicklistObject();
quicklistSetOptions(lobj->ptr, server.list_max_ziplist_size,
@@ -214,7 +213,7 @@ void pushGenericCommand(client *c, int where) {
listTypePush(lobj,c->argv[j],where);
pushed++;
}
- addReplyLongLong(c, waiting + (lobj ? listTypeLength(lobj) : 0));
+ addReplyLongLong(c, (lobj ? listTypeLength(lobj) : 0));
if (pushed) {
char *event = (where == LIST_HEAD) ? "lpush" : "rpush";
@@ -232,68 +231,78 @@ void rpushCommand(client *c) {
pushGenericCommand(c,LIST_TAIL);
}
-void pushxGenericCommand(client *c, robj *refval, robj *val, int where) {
+void pushxGenericCommand(client *c, int where) {
+ int j, pushed = 0;
robj *subject;
- listTypeIterator *iter;
- listTypeEntry entry;
- int inserted = 0;
if ((subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
checkType(c,subject,OBJ_LIST)) return;
- if (refval != NULL) {
- /* Seek refval from head to tail */
- iter = listTypeInitIterator(subject,0,LIST_TAIL);
- while (listTypeNext(iter,&entry)) {
- if (listTypeEqual(&entry,refval)) {
- listTypeInsert(&entry,val,where);
- inserted = 1;
- break;
- }
- }
- listTypeReleaseIterator(iter);
+ for (j = 2; j < c->argc; j++) {
+ listTypePush(subject,c->argv[j],where);
+ pushed++;
+ }
- if (inserted) {
- signalModifiedKey(c->db,c->argv[1]);
- notifyKeyspaceEvent(NOTIFY_LIST,"linsert",
- c->argv[1],c->db->id);
- server.dirty++;
- } else {
- /* Notify client of a failed insert */
- addReply(c,shared.cnegone);
- return;
- }
- } else {
- char *event = (where == LIST_HEAD) ? "lpush" : "rpush";
+ addReplyLongLong(c,listTypeLength(subject));
- listTypePush(subject,val,where);
+ if (pushed) {
+ char *event = (where == LIST_HEAD) ? "lpush" : "rpush";
signalModifiedKey(c->db,c->argv[1]);
notifyKeyspaceEvent(NOTIFY_LIST,event,c->argv[1],c->db->id);
- server.dirty++;
}
-
- addReplyLongLong(c,listTypeLength(subject));
+ server.dirty += pushed;
}
void lpushxCommand(client *c) {
- c->argv[2] = tryObjectEncoding(c->argv[2]);
- pushxGenericCommand(c,NULL,c->argv[2],LIST_HEAD);
+ pushxGenericCommand(c,LIST_HEAD);
}
void rpushxCommand(client *c) {
- c->argv[2] = tryObjectEncoding(c->argv[2]);
- pushxGenericCommand(c,NULL,c->argv[2],LIST_TAIL);
+ pushxGenericCommand(c,LIST_TAIL);
}
void linsertCommand(client *c) {
- c->argv[4] = tryObjectEncoding(c->argv[4]);
+ int where;
+ robj *subject;
+ listTypeIterator *iter;
+ listTypeEntry entry;
+ int inserted = 0;
+
if (strcasecmp(c->argv[2]->ptr,"after") == 0) {
- pushxGenericCommand(c,c->argv[3],c->argv[4],LIST_TAIL);
+ where = LIST_TAIL;
} else if (strcasecmp(c->argv[2]->ptr,"before") == 0) {
- pushxGenericCommand(c,c->argv[3],c->argv[4],LIST_HEAD);
+ where = LIST_HEAD;
} else {
addReply(c,shared.syntaxerr);
+ return;
}
+
+ if ((subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+ checkType(c,subject,OBJ_LIST)) return;
+
+ /* Seek pivot from head to tail */
+ iter = listTypeInitIterator(subject,0,LIST_TAIL);
+ while (listTypeNext(iter,&entry)) {
+ if (listTypeEqual(&entry,c->argv[3])) {
+ listTypeInsert(&entry,c->argv[4],where);
+ inserted = 1;
+ break;
+ }
+ }
+ listTypeReleaseIterator(iter);
+
+ if (inserted) {
+ signalModifiedKey(c->db,c->argv[1]);
+ notifyKeyspaceEvent(NOTIFY_LIST,"linsert",
+ c->argv[1],c->db->id);
+ server.dirty++;
+ } else {
+ /* Notify client of a failed insert */
+ addReply(c,shared.cnegone);
+ return;
+ }
+
+ addReplyLongLong(c,listTypeLength(subject));
}
void llenCommand(client *c) {
diff --git a/src/t_set.c b/src/t_set.c
index 7a2a77ff6..d5a801e11 100644
--- a/src/t_set.c
+++ b/src/t_set.c
@@ -53,7 +53,7 @@ int setTypeAdd(robj *subject, sds value) {
long long llval;
if (subject->encoding == OBJ_ENCODING_HT) {
dict *ht = subject->ptr;
- dictEntry *de = dictAddRaw(ht,value);
+ dictEntry *de = dictAddRaw(ht,value,NULL);
if (de) {
dictSetKey(ht,de,sdsdup(value));
dictSetVal(ht,de,NULL);
@@ -219,11 +219,11 @@ int setTypeRandomElement(robj *setobj, sds *sdsele, int64_t *llele) {
return setobj->encoding;
}
-unsigned long setTypeSize(robj *subject) {
+unsigned long setTypeSize(const robj *subject) {
if (subject->encoding == OBJ_ENCODING_HT) {
- return dictSize((dict*)subject->ptr);
+ return dictSize((const dict*)subject->ptr);
} else if (subject->encoding == OBJ_ENCODING_INTSET) {
- return intsetLen((intset*)subject->ptr);
+ return intsetLen((const intset*)subject->ptr);
} else {
serverPanic("Unknown set encoding");
}
@@ -351,9 +351,6 @@ void smoveCommand(client *c) {
dbDelete(c->db,c->argv[1]);
notifyKeyspaceEvent(NOTIFY_GENERIC,"del",c->argv[1],c->db->id);
}
- signalModifiedKey(c->db,c->argv[1]);
- signalModifiedKey(c->db,c->argv[2]);
- server.dirty++;
/* Create the destination set when it doesn't exist */
if (!dstset) {
@@ -361,6 +358,10 @@ void smoveCommand(client *c) {
dbAdd(c->db,c->argv[2],dstset);
}
+ signalModifiedKey(c->db,c->argv[1]);
+ signalModifiedKey(c->db,c->argv[2]);
+ server.dirty++;
+
/* An extra key has changed when ele was successfully added to dstset */
if (setTypeAdd(dstset,ele->ptr)) {
server.dirty++;
@@ -547,6 +548,8 @@ void spopWithCountCommand(client *c) {
* the alsoPropagate() API. */
decrRefCount(propargv[0]);
preventCommandPropagation(c);
+ signalModifiedKey(c->db,c->argv[1]);
+ server.dirty++;
}
void spopCommand(client *c) {
diff --git a/src/t_string.c b/src/t_string.c
index 35eb9d7c1..75375f446 100644
--- a/src/t_string.c
+++ b/src/t_string.c
@@ -85,7 +85,7 @@ void setGenericCommand(client *c, int flags, robj *key, robj *val, robj *expire,
}
setKey(c->db,key,val);
server.dirty++;
- if (expire) setExpire(c->db,key,mstime()+milliseconds);
+ if (expire) setExpire(c,c->db,key,mstime()+milliseconds);
notifyKeyspaceEvent(NOTIFY_STRING,"set",key,c->db->id);
if (expire) notifyKeyspaceEvent(NOTIFY_GENERIC,
"expire",key,c->db->id);
@@ -263,6 +263,10 @@ void getrangeCommand(client *c) {
}
/* Convert negative indexes */
+ if (start < 0 && end < 0 && start > end) {
+ addReply(c,shared.emptybulk);
+ return;
+ }
if (start < 0) start = strlen+start;
if (end < 0) end = strlen+end;
if (start < 0) start = 0;
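
A standalone sketch of the negative-index handling that the GETRANGE guard above addresses: when both indexes are negative and start > end the result is empty, otherwise negative indexes count from the end of the string. Only the early return is visible in the hunk; the clamping below follows the usual GETRANGE semantics and is a reconstruction, not code copied from the patch.

#include <stdio.h>
#include <string.h>

/* Returns the number of selected characters and stores the start offset. */
static size_t str_range(const char *s, long start, long end, size_t *from) {
    long len = (long)strlen(s);
    *from = 0;
    if (start < 0 && end < 0 && start > end) return 0;   /* e.g. GETRANGE k -1 -5 */
    if (start < 0) start = len + start;
    if (end < 0) end = len + end;
    if (start < 0) start = 0;
    if (end >= len) end = len - 1;
    if (len == 0 || start > end || start >= len) return 0;
    *from = (size_t)start;
    return (size_t)(end - start + 1);
}

int main(void) {
    const char *s = "Hello World";
    size_t from, n;
    n = str_range(s, 0, 4, &from);   printf("[0,4]   -> %.*s\n", (int)n, s+from);
    n = str_range(s, -5, -1, &from); printf("[-5,-1] -> %.*s\n", (int)n, s+from);
    n = str_range(s, -1, -5, &from); printf("[-1,-5] -> <%zu chars>\n", n);
    return 0;
}
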
diff --git a/src/t_zset.c b/src/t_zset.c
index 86aa5510d..8d905be02 100644
--- a/src/t_zset.c
+++ b/src/t_zset.c
@@ -1100,12 +1100,12 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig
* Common sorted set API
*----------------------------------------------------------------------------*/
-unsigned int zsetLength(robj *zobj) {
+unsigned int zsetLength(const robj *zobj) {
int length = -1;
if (zobj->encoding == OBJ_ENCODING_ZIPLIST) {
length = zzlLength(zobj->ptr);
} else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) {
- length = ((zset*)zobj->ptr)->zsl->length;
+ length = ((const zset*)zobj->ptr)->zsl->length;
} else {
serverPanic("Unknown sorted set encoding");
}
@@ -1387,7 +1387,7 @@ int zsetDel(robj *zobj, sds ele) {
dictEntry *de;
double score;
- de = dictFind(zs->dict,ele);
+ de = dictUnlink(zs->dict,ele);
if (de != NULL) {
/* Get the score in order to delete from the skiplist later. */
score = *(double*)dictGetVal(de);
@@ -1397,12 +1397,11 @@ int zsetDel(robj *zobj, sds ele) {
* actually releases the SDS string representing the element,
* which is shared between the skiplist and the hash table, so
* we need to delete from the skiplist as the final step. */
- int retval1 = dictDelete(zs->dict,ele);
+ dictFreeUnlinkedEntry(zs->dict,de);
/* Delete from skiplist. */
- int retval2 = zslDelete(zs->zsl,score,ele,NULL);
-
- serverAssert(retval1 == DICT_OK && retval2);
+ int retval = zslDelete(zs->zsl,score,ele,NULL);
+ serverAssert(retval);
if (htNeedsResize(zs->dict)) dictResize(zs->dict);
return 1;
@@ -2262,7 +2261,7 @@ void zunionInterGenericCommand(client *c, robj *dstkey, int op) {
} else if (op == SET_OP_UNION) {
dict *accumulator = dictCreate(&setAccumulatorDictType,NULL);
dictIterator *di;
- dictEntry *de;
+ dictEntry *de, *existing;
double score;
if (setnum) {
@@ -2283,16 +2282,16 @@ void zunionInterGenericCommand(client *c, robj *dstkey, int op) {
if (isnan(score)) score = 0;
/* Search for this element in the accumulating dictionary. */
- de = dictFind(accumulator,zuiSdsFromValue(&zval));
+ de = dictAddRaw(accumulator,zuiSdsFromValue(&zval),&existing);
/* If we don't have it, we need to create a new entry. */
- if (de == NULL) {
+ if (!existing) {
tmp = zuiNewSdsFromValue(&zval);
/* Remember the longest single element encountered,
* to understand if it's possible to convert to ziplist
* at the end. */
if (sdslen(tmp) > maxelelen) maxelelen = sdslen(tmp);
- /* Add the element with its initial score. */
- de = dictAddRaw(accumulator,tmp);
+ /* Update the element with its initial score. */
+ dictSetKey(accumulator, de, tmp);
dictSetDoubleVal(de,score);
} else {
/* Update the score with the score of the new instance
@@ -2301,7 +2300,7 @@ void zunionInterGenericCommand(client *c, robj *dstkey, int op) {
* Here we access directly the dictEntry double
* value inside the union as it is a big speedup
* compared to using the getDouble/setDouble API. */
- zunionInterAggregate(&de->v.d,score,aggregate);
+ zunionInterAggregate(&existing->v.d,score,aggregate);
}
}
zuiClearIterator(&src[i]);
@@ -2327,16 +2326,13 @@ void zunionInterGenericCommand(client *c, robj *dstkey, int op) {
serverPanic("Unknown operator");
}
- if (dbDelete(c->db,dstkey)) {
- signalModifiedKey(c->db,dstkey);
+ if (dbDelete(c->db,dstkey))
touched = 1;
- server.dirty++;
- }
if (dstzset->zsl->length) {
zsetConvertToZiplistIfNeeded(dstobj,maxelelen);
dbAdd(c->db,dstkey,dstobj);
addReplyLongLong(c,zsetLength(dstobj));
- if (!touched) signalModifiedKey(c->db,dstkey);
+ signalModifiedKey(c->db,dstkey);
notifyKeyspaceEvent(NOTIFY_ZSET,
(op == SET_OP_UNION) ? "zunionstore" : "zinterstore",
dstkey,c->db->id);
@@ -2344,8 +2340,11 @@ void zunionInterGenericCommand(client *c, robj *dstkey, int op) {
} else {
decrRefCount(dstobj);
addReply(c,shared.czero);
- if (touched)
+ if (touched) {
+ signalModifiedKey(c->db,dstkey);
notifyKeyspaceEvent(NOTIFY_GENERIC,"del",dstkey,c->db->id);
+ server.dirty++;
+ }
}
zfree(src);
}
diff --git a/src/ziplist.c b/src/ziplist.c
index 7428d30e9..a0939f640 100644
--- a/src/ziplist.c
+++ b/src/ziplist.c
@@ -8,24 +8,31 @@
*
* ----------------------------------------------------------------------------
*
- * ZIPLIST OVERALL LAYOUT:
+ * ZIPLIST OVERALL LAYOUT
+ *
* The general layout of the ziplist is as follows:
- * <zlbytes><zltail><zllen><entry><entry><zlend>
*
- * <zlbytes> is an unsigned integer to hold the number of bytes that the
- * ziplist occupies. This value needs to be stored to be able to resize the
+ * <zlbytes> <zltail> <zllen> <entry> <entry> ... <entry> <zlend>
+ *
+ * All fields are stored in little endian.
+ *
+ * <uint32_t zlbytes> is an unsigned integer to hold the number of bytes that
+ * the ziplist occupies. This value needs to be stored to be able to resize the
* entire structure without the need to traverse it first.
*
- * <zltail> is the offset to the last entry in the list. This allows a pop
- * operation on the far side of the list without the need for full traversal.
+ * <uint32_t zltail> is the offset to the last entry in the list. This allows
+ * a pop operation on the far side of the list without the need for full
+ * traversal.
+ *
+ * <uint16_t zllen> is the number of entries. When this value is larger
+ * than 2^16-2, we need to traverse the entire list to know how many items it
+ * holds.
*
- * <zllen> is the number of entries.When this value is larger than 2**16-2,
- * we need to traverse the entire list to know how many items it holds.
+ * <uint8_t zlend> is a single byte special value, equal to 255, which
+ * indicates the end of the list.
*
- * <zlend> is a single byte special value, equal to 255, which indicates the
- * end of the list.
+ * ZIPLIST ENTRIES
*
- * ZIPLIST ENTRIES:
* Every entry in the ziplist is prefixed by a header that contains two pieces
* of information. First, the length of the previous entry is stored to be
* able to traverse the list from back to front. Second, the encoding with an
@@ -242,7 +249,7 @@ static unsigned int zipEncodeLength(unsigned char *p, unsigned char encoding, un
} else if ((encoding) == ZIP_STR_14B) { \
(lensize) = 2; \
(len) = (((ptr)[0] & 0x3f) << 8) | (ptr)[1]; \
- } else if (encoding == ZIP_STR_32B) { \
+ } else if ((encoding) == ZIP_STR_32B) { \
(lensize) = 5; \
(len) = ((ptr)[1] << 24) | \
((ptr)[2] << 16) | \
@@ -1029,7 +1036,7 @@ void ziplistRepr(unsigned char *zl) {
printf(
"{total bytes %d} "
- "{length %u}\n"
+ "{num entries %u}\n"
"{tail offset %u}\n",
intrev32ifbe(ZIPLIST_BYTES(zl)),
intrev16ifbe(ZIPLIST_LENGTH(zl)),
@@ -1038,16 +1045,15 @@ void ziplistRepr(unsigned char *zl) {
while(*p != ZIP_END) {
zipEntry(p, &entry);
printf(
- "{"
- "addr 0x%08lx, "
- "index %2d, "
- "offset %5ld, "
- "rl: %5u, "
- "hs %2u, "
- "pl: %5u, "
- "pls: %2u, "
- "payload %5u"
- "} ",
+ "{\n"
+ "\taddr 0x%08lx,\n"
+ "\tindex %2d,\n"
+ "\toffset %5ld,\n"
+ "\thdr+entry len: %5u,\n"
+ "\thdr len%2u,\n"
+ "\tprevrawlen: %5u,\n"
+ "\tprevrawlensize: %2u,\n"
+ "\tpayload %5u\n",
(long unsigned)p,
index,
(unsigned long) (p-zl),
@@ -1056,8 +1062,14 @@ void ziplistRepr(unsigned char *zl) {
entry.prevrawlen,
entry.prevrawlensize,
entry.len);
+ printf("\tbytes: ");
+ for (unsigned int i = 0; i < entry.headersize+entry.len; i++) {
+ printf("%02x|",p[i]);
+ }
+ printf("\n");
p += entry.headersize;
if (ZIP_IS_STR(entry.encoding)) {
+ printf("\t[str]");
if (entry.len > 40) {
if (fwrite(p,40,1,stdout) == 0) perror("fwrite");
printf("...");
@@ -1066,9 +1078,9 @@ void ziplistRepr(unsigned char *zl) {
fwrite(p,entry.len,1,stdout) == 0) perror("fwrite");
}
} else {
- printf("%lld", (long long) zipLoadInteger(p,entry.encoding));
+ printf("\t[int]%lld", (long long) zipLoadInteger(p,entry.encoding));
}
- printf("\n");
+ printf("\n}\n");
p += entry.len;
index++;
}
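
A standalone sketch that exercises the header layout documented in the rewritten comment above (<zlbytes> <zltail> <zllen> ... <zlend>, all little endian). It builds the 11-byte empty ziplist by hand and decodes the three header fields plus the 0xFF terminator; it does not use the real ziplist.c API, and the host is assumed little endian where noted.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t read_u32_le(const unsigned char *p) {
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static uint16_t read_u16_le(const unsigned char *p) {
    return (uint16_t)p[0] | ((uint16_t)p[1] << 8);
}

int main(void) {
    /* Empty ziplist: <zlbytes=11> <zltail=10> <zllen=0> <zlend=0xFF>. */
    unsigned char zl[11];
    uint32_t zlbytes = 11, zltail = 10;   /* zltail points just past the header. */
    uint16_t zllen = 0;
    memcpy(zl, &zlbytes, 4);              /* Assumes a little-endian host. */
    memcpy(zl+4, &zltail, 4);
    memcpy(zl+8, &zllen, 2);
    zl[10] = 0xFF;                        /* zlend terminator. */

    printf("zlbytes=%u zltail=%u zllen=%u end=0x%02X\n",
           read_u32_le(zl), read_u32_le(zl+4), (unsigned)read_u16_le(zl+8), zl[10]);
    return 0;
}
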
diff --git a/src/ziplist.h b/src/ziplist.h
index ae96823f9..964a47f6d 100644
--- a/src/ziplist.h
+++ b/src/ziplist.h
@@ -48,6 +48,7 @@ unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int sle
unsigned char *ziplistFind(unsigned char *p, unsigned char *vstr, unsigned int vlen, unsigned int skip);
unsigned int ziplistLen(unsigned char *zl);
size_t ziplistBlobLen(unsigned char *zl);
+void ziplistRepr(unsigned char *zl);
#ifdef REDIS_TEST
int ziplistTest(int argc, char *argv[]);
diff --git a/src/zmalloc.c b/src/zmalloc.c
index ab4af99e2..22bf84fce 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -304,14 +304,26 @@ float zmalloc_get_fragmentation_ratio(size_t rss) {
* /proc/self/smaps. The field must be specified with trailing ":" as it
 * appears in the smaps output.
*
- * Example: zmalloc_get_smap_bytes_by_field("Rss:");
+ * If a pid is specified, the information is extracted for that pid,
+ * otherwise if pid is -1 the information is reported about the
+ * current process.
+ *
+ * Example: zmalloc_get_smap_bytes_by_field("Rss:",-1);
*/
#if defined(HAVE_PROC_SMAPS)
-size_t zmalloc_get_smap_bytes_by_field(char *field) {
+size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) {
char line[1024];
size_t bytes = 0;
- FILE *fp = fopen("/proc/self/smaps","r");
int flen = strlen(field);
+ FILE *fp;
+
+ if (pid == -1) {
+ fp = fopen("/proc/self/smaps","r");
+ } else {
+ char filename[128];
+ snprintf(filename,sizeof(filename),"/proc/%ld/smaps",pid);
+ fp = fopen(filename,"r");
+ }
if (!fp) return 0;
while(fgets(line,sizeof(line),fp) != NULL) {
@@ -327,14 +339,15 @@ size_t zmalloc_get_smap_bytes_by_field(char *field) {
return bytes;
}
#else
-size_t zmalloc_get_smap_bytes_by_field(char *field) {
+size_t zmalloc_get_smap_bytes_by_field(char *field, long pid) {
((void) field);
+ ((void) pid);
return 0;
}
#endif
-size_t zmalloc_get_private_dirty(void) {
- return zmalloc_get_smap_bytes_by_field("Private_Dirty:");
+size_t zmalloc_get_private_dirty(long pid) {
+ return zmalloc_get_smap_bytes_by_field("Private_Dirty:",pid);
}
/* Returns the size of physical memory (RAM) in bytes.
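
A self-contained, Linux-only sketch of what the patched zmalloc_get_smap_bytes_by_field() does with its new pid argument: sum the kB values of a given field across /proc/<pid>/smaps, with pid == -1 meaning the current process. The body of the parsing loop is cut off by the hunk above, so the strncmp/strtoul logic below is a plausible reconstruction rather than the actual zmalloc.c code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static size_t smap_bytes_by_field(const char *field, long pid) {
    char path[128], line[1024];
    size_t bytes = 0, flen = strlen(field);

    if (pid == -1)
        snprintf(path, sizeof(path), "/proc/self/smaps");
    else
        snprintf(path, sizeof(path), "/proc/%ld/smaps", pid);

    FILE *fp = fopen(path, "r");
    if (!fp) return 0;
    while (fgets(line, sizeof(line), fp) != NULL) {
        /* Lines look like "Private_Dirty:        12 kB". */
        if (strncmp(line, field, flen) == 0)
            bytes += strtoul(line + flen, NULL, 10) * 1024;
    }
    fclose(fp);
    return bytes;
}

int main(void) {
    printf("Private_Dirty of self: %zu bytes\n",
           smap_bytes_by_field("Private_Dirty:", -1));
    return 0;
}
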
diff --git a/src/zmalloc.h b/src/zmalloc.h
index a47ea6ccf..9badf8f4c 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -75,8 +75,8 @@ void zmalloc_enable_thread_safeness(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
float zmalloc_get_fragmentation_ratio(size_t rss);
size_t zmalloc_get_rss(void);
-size_t zmalloc_get_private_dirty(void);
-size_t zmalloc_get_smap_bytes_by_field(char *field);
+size_t zmalloc_get_private_dirty(long pid);
+size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
size_t zmalloc_get_memory_size(void);
void zlibc_free(void *ptr);
diff --git a/tests/assets/default.conf b/tests/assets/default.conf
index 81f8470bc..d7b8a75c6 100644
--- a/tests/assets/default.conf
+++ b/tests/assets/default.conf
@@ -1,5 +1,6 @@
# Redis configuration for testing.
+always-show-logo yes
notify-keyspace-events KEA
daemonize no
pidfile /var/run/redis.pid
diff --git a/tests/integration/psync2.tcl b/tests/integration/psync2.tcl
new file mode 100644
index 000000000..d91969e3e
--- /dev/null
+++ b/tests/integration/psync2.tcl
@@ -0,0 +1,182 @@
+start_server {tags {"psync2"}} {
+start_server {} {
+start_server {} {
+start_server {} {
+start_server {} {
+ set master_id 0 ; # Current master
+ set start_time [clock seconds] ; # Test start time
+ set counter_value 0 ; # Current value of the Redis counter "x"
+
+ # Config
+ set debug_msg 0 ; # Enable additional debug messages
+
+    set no_exit 0 ; # Do not exit at end of the test
+
+ set duration 20 ; # Total test seconds
+
+ set genload 1 ; # Load master with writes at every cycle
+
+ set genload_time 5000 ; # Writes duration time in ms
+
+ set disconnect 1 ; # Break replication link between random
+ # master and slave instances while the
+ # master is loaded with writes.
+
+ set disconnect_period 1000 ; # Disconnect repl link every N ms.
+
+ for {set j 0} {$j < 5} {incr j} {
+ set R($j) [srv [expr 0-$j] client]
+ set R_host($j) [srv [expr 0-$j] host]
+ set R_port($j) [srv [expr 0-$j] port]
+ if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
+ }
+
+ set cycle 1
+ while {([clock seconds]-$start_time) < $duration} {
+ test "PSYNC2: --- CYCLE $cycle ---" {
+ incr cycle
+ }
+
+ # Create a random replication layout.
+ # Start with switching master (this simulates a failover).
+
+ # 1) Select the new master.
+ set master_id [randomInt 5]
+ set used [list $master_id]
+ test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
+ $R($master_id) slaveof no one
+ if {$counter_value == 0} {
+ $R($master_id) set x $counter_value
+ }
+ }
+
+ # 2) Attach all the slaves to a random instance
+ while {[llength $used] != 5} {
+ while 1 {
+ set slave_id [randomInt 5]
+ if {[lsearch -exact $used $slave_id] == -1} break
+ }
+ set rand [randomInt [llength $used]]
+ set mid [lindex $used $rand]
+ set master_host $R_host($mid)
+ set master_port $R_port($mid)
+
+ test "PSYNC2: Set #$slave_id to replicate from #$mid" {
+ $R($slave_id) slaveof $master_host $master_port
+ }
+ lappend used $slave_id
+ }
+
+ # 3) Increment the counter and wait for all the instances
+ # to converge.
+ test "PSYNC2: cluster is consistent after failover" {
+ $R($master_id) incr x; incr counter_value
+ for {set j 0} {$j < 5} {incr j} {
+ wait_for_condition 50 1000 {
+ [$R($j) get x] == $counter_value
+ } else {
+ fail "Instance #$j x variable is inconsistent"
+ }
+ }
+ }
+
+ # 4) Generate load while breaking the connection of random
+ # slave-master pairs.
+ test "PSYNC2: generate load while killing replication links" {
+ set t [clock milliseconds]
+ set next_break [expr {$t+$disconnect_period}]
+ while {[clock milliseconds]-$t < $genload_time} {
+ if {$genload} {
+ $R($master_id) incr x; incr counter_value
+ }
+ if {[clock milliseconds] == $next_break} {
+ set next_break \
+ [expr {[clock milliseconds]+$disconnect_period}]
+ set slave_id [randomInt 5]
+ if {$disconnect} {
+ $R($slave_id) client kill type master
+ if {$debug_msg} {
+ puts "+++ Breaking link for slave #$slave_id"
+ }
+ }
+ }
+ }
+ }
+
+    # 5) Wait for all the instances to converge after the write load.
+ set x [$R($master_id) get x]
+ test "PSYNC2: cluster is consistent after load (x = $x)" {
+ for {set j 0} {$j < 5} {incr j} {
+ wait_for_condition 50 1000 {
+ [$R($j) get x] == $counter_value
+ } else {
+ fail "Instance #$j x variable is inconsistent"
+ }
+ }
+ }
+
+    # Put down the old master so that it cannot generate more
+    # replication stream. This way, at the next master switch, the time at
+    # which we move the slaves away does not matter: every slave will have
+    # the full history (otherwise PINGs would give certain slaves more
+    # history, and sometimes a full resync would be needed).
+ $R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail.
+
+ if {$debug_msg} {
+ for {set j 0} {$j < 5} {incr j} {
+ puts "$j: sync_full: [status $R($j) sync_full]"
+ puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]"
+ puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]"
+ puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]"
+ puts "---"
+ }
+ }
+
+ test "PSYNC2: total sum of full synchronizations is exactly 4" {
+ set sum 0
+ for {set j 0} {$j < 5} {incr j} {
+ incr sum [status $R($j) sync_full]
+ }
+ assert {$sum == 4}
+ }
+ }
+
+ test "PSYNC2: Bring the master back again for next test" {
+ $R($master_id) slaveof no one
+ set master_host $R_host($master_id)
+ set master_port $R_port($master_id)
+ for {set j 0} {$j < 5} {incr j} {
+ if {$j == $master_id} continue
+ $R($j) slaveof $master_host $master_port
+ }
+
+ # Wait for slaves to sync
+ wait_for_condition 50 1000 {
+ [status $R($master_id) connected_slaves] == 4
+ } else {
+ fail "Slave not reconnecting"
+ }
+ }
+
+ test "PSYNC2: Partial resync after restart using RDB aux fields" {
+ # Pick a random slave
+ set slave_id [expr {($master_id+1)%5}]
+ set sync_count [status $R($master_id) sync_full]
+ catch {
+ $R($slave_id) config rewrite
+ $R($slave_id) debug restart
+ }
+ wait_for_condition 50 1000 {
+ [status $R($master_id) connected_slaves] == 4
+ } else {
+ fail "Slave not reconnecting"
+ }
+ set new_sync_count [status $R($master_id) sync_full]
+ assert {$sync_count == $new_sync_count}
+ }
+
+ if {$no_exit} {
+ while 1 { puts -nonewline .; flush stdout; after 1000}
+ }
+
+}}}}}
diff --git a/tests/integration/rdb.tcl b/tests/integration/rdb.tcl
index f2de3504d..2ed47cc58 100644
--- a/tests/integration/rdb.tcl
+++ b/tests/integration/rdb.tcl
@@ -89,7 +89,7 @@ close $fd
start_server_and_kill_it [list "dir" $server_path] {
test {Server should not start if RDB is corrupted} {
wait_for_condition 50 100 {
- [string match {*RDB checksum*} \
+ [string match {*CRC error*} \
[exec tail -n10 < [dict get $srv stdout]]]
} else {
fail "Server started even if RDB was corrupted!"
diff --git a/tests/integration/replication-3.tcl b/tests/integration/replication-3.tcl
index 0fcbad45b..50dcb9a9a 100644
--- a/tests/integration/replication-3.tcl
+++ b/tests/integration/replication-3.tcl
@@ -30,6 +30,18 @@ start_server {tags {"repl"}} {
}
assert_equal [r debug digest] [r -1 debug digest]
}
+
+ test {Slave is able to evict keys created in writable slaves} {
+ r -1 select 5
+ assert {[r -1 dbsize] == 0}
+ r -1 config set slave-read-only no
+ r -1 set key1 1 ex 5
+ r -1 set key2 2 ex 5
+ r -1 set key3 3 ex 5
+ assert {[r -1 dbsize] == 3}
+ after 6000
+ r -1 dbsize
+ } {0}
}
}
diff --git a/tests/sentinel/tests/05-manual.tcl b/tests/sentinel/tests/05-manual.tcl
index 1a60d814b..5214fdce1 100644
--- a/tests/sentinel/tests/05-manual.tcl
+++ b/tests/sentinel/tests/05-manual.tcl
@@ -6,7 +6,8 @@ test "Manual failover works" {
set old_port [RI $master_id tcp_port]
set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
assert {[lindex $addr 1] == $old_port}
- S 0 SENTINEL FAILOVER mymaster
+ catch {S 0 SENTINEL FAILOVER mymaster} reply
+ assert {$reply eq "OK"}
foreach_sentinel_id id {
wait_for_condition 1000 50 {
[lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
diff --git a/tests/sentinel/tests/07-down-conditions.tcl b/tests/sentinel/tests/07-down-conditions.tcl
new file mode 100644
index 000000000..a60656e59
--- /dev/null
+++ b/tests/sentinel/tests/07-down-conditions.tcl
@@ -0,0 +1,68 @@
+# Test conditions where an instance is considered to be down
+
+source "../tests/includes/init-tests.tcl"
+
+proc ensure_master_up {} {
+ wait_for_condition 1000 50 {
+ [dict get [S 4 sentinel master mymaster] flags] eq "master"
+ } else {
+ fail "Master flags are not just 'master'"
+ }
+}
+
+proc ensure_master_down {} {
+ wait_for_condition 1000 50 {
+ [string match *down* \
+ [dict get [S 4 sentinel master mymaster] flags]]
+ } else {
+ fail "Master is not flagged SDOWN"
+ }
+}
+
+test "Crash the majority of Sentinels to prevent failovers for this unit" {
+ for {set id 0} {$id < $quorum} {incr id} {
+ kill_instance sentinel $id
+ }
+}
+
+test "SDOWN is triggered by non-responding but not crashed instance" {
+ lassign [S 4 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] host port
+ ensure_master_up
+ exec ../../../src/redis-cli -h $host -p $port debug sleep 10 > /dev/null &
+ ensure_master_down
+ ensure_master_up
+}
+
+test "SDOWN is triggered by crashed instance" {
+ lassign [S 4 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] host port
+ ensure_master_up
+ kill_instance redis 0
+ ensure_master_down
+ restart_instance redis 0
+ ensure_master_up
+}
+
+test "SDOWN is triggered by masters advertising as slaves" {
+ ensure_master_up
+ R 0 slaveof 127.0.0.1 34567
+ ensure_master_down
+ R 0 slaveof no one
+ ensure_master_up
+}
+
+test "SDOWN is triggered by misconfigured instance repling with errors" {
+ ensure_master_up
+ set orig_dir [lindex [R 0 config get dir] 1]
+ set orig_save [lindex [R 0 config get save] 1]
+ # Set dir to / and filename to "tmp" to make sure it will fail.
+ R 0 config set dir /
+ R 0 config set dbfilename tmp
+ R 0 config set save "1000000 1000000"
+ R 0 bgsave
+ ensure_master_down
+ R 0 config set save $orig_save
+ R 0 config set dir $orig_dir
+ R 0 config set dbfilename dump.rdb
+ R 0 bgsave
+ ensure_master_up
+}
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 19d6c5152..c36b30775 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -278,7 +278,7 @@ proc start_server {options {code undefined}} {
while 1 {
# check that the server actually started and is ready for connections
- if {[exec grep "ready to accept" | wc -l < $stdout] > 0} {
+ if {[exec grep -i "Ready to accept" | wc -l < $stdout] > 0} {
break
}
after 10
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 45cecfdde..fdfe6a01b 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -41,18 +41,22 @@ set ::all_tests {
integration/rdb
integration/convert-zipmap-hash-on-load
integration/logging
+ integration/psync2
unit/pubsub
unit/slowlog
unit/scripting
unit/maxmemory
unit/introspection
+ unit/introspection-2
unit/limits
unit/obuf-limits
unit/bitops
unit/bitfield
+ unit/geo
unit/memefficiency
unit/hyperloglog
unit/lazyfree
+ unit/wait
}
# Index to the next test to run in the ::all_tests list.
set ::next_test 0
diff --git a/tests/unit/aofrw.tcl b/tests/unit/aofrw.tcl
index 4fdbdc6c6..c5430eedc 100644
--- a/tests/unit/aofrw.tcl
+++ b/tests/unit/aofrw.tcl
@@ -4,60 +4,63 @@ start_server {tags {"aofrw"}} {
r config set auto-aof-rewrite-percentage 0 ; # Disable auto-rewrite.
waitForBgrewriteaof r
- test {AOF rewrite during write load} {
- # Start a write load for 10 seconds
- set master [srv 0 client]
- set master_host [srv 0 host]
- set master_port [srv 0 port]
- set load_handle0 [start_write_load $master_host $master_port 10]
- set load_handle1 [start_write_load $master_host $master_port 10]
- set load_handle2 [start_write_load $master_host $master_port 10]
- set load_handle3 [start_write_load $master_host $master_port 10]
- set load_handle4 [start_write_load $master_host $master_port 10]
-
- # Make sure the instance is really receiving data
- wait_for_condition 50 100 {
- [r dbsize] > 0
- } else {
- fail "No write load detected."
- }
+ foreach rdbpre {yes no} {
+ r config set aof-use-rdb-preamble $rdbpre
+ test "AOF rewrite during write load: RDB preamble=$rdbpre" {
+ # Start a write load for 10 seconds
+ set master [srv 0 client]
+ set master_host [srv 0 host]
+ set master_port [srv 0 port]
+ set load_handle0 [start_write_load $master_host $master_port 10]
+ set load_handle1 [start_write_load $master_host $master_port 10]
+ set load_handle2 [start_write_load $master_host $master_port 10]
+ set load_handle3 [start_write_load $master_host $master_port 10]
+ set load_handle4 [start_write_load $master_host $master_port 10]
+
+ # Make sure the instance is really receiving data
+ wait_for_condition 50 100 {
+ [r dbsize] > 0
+ } else {
+ fail "No write load detected."
+ }
- # After 3 seconds, start a rewrite, while the write load is still
- # active.
- after 3000
- r bgrewriteaof
- waitForBgrewriteaof r
+ # After 3 seconds, start a rewrite, while the write load is still
+ # active.
+ after 3000
+ r bgrewriteaof
+ waitForBgrewriteaof r
+
+ # Let it run a bit more so that we'll append some data to the new
+ # AOF.
+ after 1000
+
+ # Stop the processes generating the load if they are still active
+ stop_write_load $load_handle0
+ stop_write_load $load_handle1
+ stop_write_load $load_handle2
+ stop_write_load $load_handle3
+ stop_write_load $load_handle4
+
+ # Make sure that we remain the only connected client.
+ # This step is needed to make sure there are no pending writes
+ # that will be processed between the two "debug digest" calls.
+ wait_for_condition 50 100 {
+ [llength [split [string trim [r client list]] "\n"]] == 1
+ } else {
+ puts [r client list]
+ fail "Clients generating loads are not disconnecting"
+ }
- # Let it run a bit more so that we'll append some data to the new
- # AOF.
- after 1000
+ # Get the data set digest
+ set d1 [r debug digest]
- # Stop the processes generating the load if they are still active
- stop_write_load $load_handle0
- stop_write_load $load_handle1
- stop_write_load $load_handle2
- stop_write_load $load_handle3
- stop_write_load $load_handle4
+ # Load the AOF
+ r debug loadaof
+ set d2 [r debug digest]
- # Make sure that we remain the only connected client.
- # This step is needed to make sure there are no pending writes
- # that will be processed between the two "debug digest" calls.
- wait_for_condition 50 100 {
- [llength [split [string trim [r client list]] "\n"]] == 1
- } else {
- puts [r client list]
- fail "Clients generating loads are not disconnecting"
+ # Make sure they are the same
+ assert {$d1 eq $d2}
}
-
- # Get the data set digest
- set d1 [r debug digest]
-
- # Load the AOF
- r debug loadaof
- set d2 [r debug digest]
-
- # Make sure they are the same
- assert {$d1 eq $d2}
}
}
diff --git a/tests/unit/bitfield.tcl b/tests/unit/bitfield.tcl
index 368ff9fc6..d76452b1b 100644
--- a/tests/unit/bitfield.tcl
+++ b/tests/unit/bitfield.tcl
@@ -184,4 +184,18 @@ start_server {tags {"bitops"}} {
}
}
}
+
+ test {BITFIELD regression for #3221} {
+ r set bits 1
+ r bitfield bits get u1 0
+ } {0}
+
+ test {BITFIELD regression for #3564} {
+ for {set j 0} {$j < 10} {incr j} {
+ r del mystring
+ set res [r BITFIELD mystring SET i8 0 10 SET i8 64 10 INCRBY i8 10 99900]
+ assert {$res eq {0 0 60}}
+ }
+ r del mystring
+ }
}
diff --git a/tests/unit/bitops.tcl b/tests/unit/bitops.tcl
index 30aa832c7..926f38295 100644
--- a/tests/unit/bitops.tcl
+++ b/tests/unit/bitops.tcl
@@ -43,6 +43,16 @@ start_server {tags {"bitops"}} {
r bitcount no-key
} 0
+ test {BITCOUNT returns 0 with out of range indexes} {
+ r set str "xxxx"
+ r bitcount str 4 10
+ } 0
+
+ test {BITCOUNT returns 0 with negative indexes where start > end} {
+ r set str "xxxx"
+ r bitcount str -6 -7
+ } 0
+
catch {unset num}
foreach vec [list "" "\xaa" "\x00\x00\xff" "foobar" "123"] {
incr num
diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl
index ff3dacb33..0a50dd31b 100644
--- a/tests/unit/expire.tcl
+++ b/tests/unit/expire.tcl
@@ -198,4 +198,10 @@ start_server {tags {"expire"}} {
r set foo b
lsort [r keys *]
} {a e foo s t}
+
+ test {EXPIRE with empty string as TTL should report an error} {
+ r set foo bar
+ catch {r expire foo ""} e
+ set e
+ } {*not an integer*}
}
diff --git a/tests/unit/geo.tcl b/tests/unit/geo.tcl
index 3aa06c99c..fdbfbf139 100644
--- a/tests/unit/geo.tcl
+++ b/tests/unit/geo.tcl
@@ -1,4 +1,4 @@
-# Helper functins to simulate search-in-radius in the Tcl side in order to
+# Helper functions to simulate search-in-radius in the Tcl side in order to
# verify the Redis implementation with a fuzzy test.
proc geo_degrad deg {expr {$deg*atan(1)*8/360}}
@@ -23,6 +23,44 @@ proc geo_random_point {lonvar latvar} {
set lat [expr {-70 + rand()*140}]
}
+# Return elements non common to both the lists.
+# This code is from http://wiki.tcl.tk/15489
+proc compare_lists {List1 List2} {
+ set DiffList {}
+ foreach Item $List1 {
+ if {[lsearch -exact $List2 $Item] == -1} {
+ lappend DiffList $Item
+ }
+ }
+ foreach Item $List2 {
+ if {[lsearch -exact $List1 $Item] == -1} {
+ if {[lsearch -exact $DiffList $Item] == -1} {
+ lappend DiffList $Item
+ }
+ }
+ }
+ return $DiffList
+}
+
+# The following list represents sets of random seed, search position
+# and radius that caused bugs in the past. It is used by the randomized
+# test later as a starting point. Once the regression vectors have been
+# scanned, the code reverts to using random data.
+#
+# The format is: seed km lon lat
+set regression_vectors {
+ {1412 156 149.29737817929004 15.95807862745508}
+ {441574 143 59.235461856813856 66.269555127373678}
+ {160645 187 -101.88575239939883 49.061997951502917}
+ {750269 154 -90.187939661642517 66.615930412251487}
+ {342880 145 163.03472387745728 64.012747720821181}
+ {729955 143 137.86663517256579 63.986745399416776}
+ {939895 151 59.149620271823181 65.204186651485145}
+ {1412 156 149.29737817929004 15.95807862745508}
+ {564862 149 84.062063109158544 -65.685403922426232}
+}
+set rv_idx 0
+
start_server {tags {"geo"}} {
test {GEOADD create} {
r geoadd nyc -73.9454966 40.747533 "lic market"
@@ -183,35 +221,83 @@ start_server {tags {"geo"}} {
}
test {GEOADD + GEORANGE randomized test} {
- set attempt 10
+ set attempt 30
while {[incr attempt -1]} {
+ set rv [lindex $regression_vectors $rv_idx]
+ incr rv_idx
+
unset -nocomplain debuginfo
- set srand_seed [randomInt 1000000]
+ set srand_seed [clock milliseconds]
+ if {$rv ne {}} {set srand_seed [lindex $rv 0]}
lappend debuginfo "srand_seed is $srand_seed"
expr {srand($srand_seed)} ; # If you need a reproducible run
r del mypoints
- set radius_km [expr {[randomInt 200]+10}]
+
+ if {[randomInt 10] == 0} {
+ # From time to time use very big radiuses
+ set radius_km [expr {[randomInt 50000]+10}]
+ } else {
+                # Normally use radiuses from ~10 to ~200 km to stress
+                # test the code the most in edge cases.
+ set radius_km [expr {[randomInt 200]+10}]
+ }
+ if {$rv ne {}} {set radius_km [lindex $rv 1]}
set radius_m [expr {$radius_km*1000}]
geo_random_point search_lon search_lat
+ if {$rv ne {}} {
+ set search_lon [lindex $rv 2]
+ set search_lat [lindex $rv 3]
+ }
lappend debuginfo "Search area: $search_lon,$search_lat $radius_km km"
set tcl_result {}
set argv {}
for {set j 0} {$j < 20000} {incr j} {
geo_random_point lon lat
lappend argv $lon $lat "place:$j"
- if {[geo_distance $lon $lat $search_lon $search_lat] < $radius_m} {
+ set distance [geo_distance $lon $lat $search_lon $search_lat]
+ if {$distance < $radius_m} {
lappend tcl_result "place:$j"
- lappend debuginfo "place:$j $lon $lat [expr {[geo_distance $lon $lat $search_lon $search_lat]/1000}] km"
}
+ lappend debuginfo "place:$j $lon $lat [expr {$distance/1000}] km"
}
r geoadd mypoints {*}$argv
set res [lsort [r georadius mypoints $search_lon $search_lat $radius_km km]]
set res2 [lsort $tcl_result]
set test_result OK
+
if {$res != $res2} {
+ set rounding_errors 0
+ set diff [compare_lists $res $res2]
+ foreach place $diff {
+ set mydist [geo_distance $lon $lat $search_lon $search_lat]
+ set mydist [expr $mydist/1000]
+ if {($mydist / $radius_km) > 0.999} {incr rounding_errors}
+ }
+            # Make sure this is a real error and not a rounding issue.
+ if {[llength $diff] == $rounding_errors} {
+ set res $res2; # Error silenced
+ }
+ }
+
+ if {$res != $res2} {
+ set diff [compare_lists $res $res2]
+ puts "*** Possible problem in GEO radius query ***"
puts "Redis: $res"
puts "Tcl : $res2"
+ puts "Diff : $diff"
puts [join $debuginfo "\n"]
+ foreach place $diff {
+ if {[lsearch -exact $res2 $place] != -1} {
+ set where "(only in Tcl)"
+ } else {
+ set where "(only in Redis)"
+ }
+ lassign [lindex [r geopos mypoints $place] 0] lon lat
+ set mydist [geo_distance $lon $lat $search_lon $search_lat]
+ set mydist [expr $mydist/1000]
+ puts "$place -> [r geopos mypoints $place] $mydist $where"
+ if {($mydist / $radius_km) > 0.999} {incr rounding_errors}
+ }
set test_result FAIL
}
unset -nocomplain debuginfo
diff --git a/tests/unit/introspection-2.tcl b/tests/unit/introspection-2.tcl
new file mode 100644
index 000000000..350a8a016
--- /dev/null
+++ b/tests/unit/introspection-2.tcl
@@ -0,0 +1,23 @@
+start_server {tags {"introspection"}} {
+    test {TTL and TYPE do not alter the last access time of a key} {
+ r set foo bar
+ after 3000
+ r ttl foo
+ r type foo
+ assert {[r object idletime foo] >= 2}
+ }
+
+ test {TOUCH alters the last access time of a key} {
+ r set foo bar
+ after 3000
+ r touch foo
+ assert {[r object idletime foo] < 2}
+ }
+
+ test {TOUCH returns the number of existing keys specified} {
+ r flushdb
+ r set key1 1
+ r set key2 2
+ r touch key0 key1 key2 key3
+ } 2
+}
diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl
index 2f5773930..1d21b561a 100644
--- a/tests/unit/other.tcl
+++ b/tests/unit/other.tcl
@@ -52,7 +52,7 @@ start_server {tags {"other"}} {
test {SELECT an out of range DB} {
catch {r select 1000000} err
set _ $err
- } {*invalid*}
+ } {*index is out of range*}
tags {consistency} {
if {![catch {package require sha1}]} {
diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl
index 825a73ed3..be82e1559 100644
--- a/tests/unit/scripting.tcl
+++ b/tests/unit/scripting.tcl
@@ -142,7 +142,7 @@ start_server {tags {"scripting"}} {
test {EVAL - Scripts can't run certain commands} {
set e {}
- catch {r eval {return redis.pcall('spop','x')} 0} e
+ catch {r eval {return redis.pcall('blpop','x',0)} 0} e
set e
} {*not allowed*}
diff --git a/tests/unit/type/list-3.tcl b/tests/unit/type/list-3.tcl
index ece6ea2d5..b5bd48cb0 100644
--- a/tests/unit/type/list-3.tcl
+++ b/tests/unit/type/list-3.tcl
@@ -13,6 +13,50 @@ start_server {
assert_equal [r lindex l 1] [lindex $mylist 1]
}
+ test {Regression for quicklist #3343 bug} {
+ r del mylist
+ r lpush mylist 401
+ r lpush mylist 392
+ r rpush mylist [string repeat x 5105]"799"
+ r lset mylist -1 [string repeat x 1014]"702"
+ r lpop mylist
+ r lset mylist -1 [string repeat x 4149]"852"
+ r linsert mylist before 401 [string repeat x 9927]"12"
+ r lrange mylist 0 -1
+ r ping ; # It's enough if the server is still alive
+ } {PONG}
+
+ test {Stress tester for #3343-alike bugs} {
+ r del key
+ for {set j 0} {$j < 10000} {incr j} {
+ set op [randomInt 6]
+ set small_signed_count [expr 5-[randomInt 10]]
+ if {[randomInt 2] == 0} {
+ set ele [randomInt 1000]
+ } else {
+ set ele [string repeat x [randomInt 10000]][randomInt 1000]
+ }
+ switch $op {
+ 0 {r lpush key $ele}
+ 1 {r rpush key $ele}
+ 2 {r lpop key}
+ 3 {r rpop key}
+ 4 {
+ catch {r lset key $small_signed_count $ele}
+ }
+ 5 {
+ set otherele [randomInt 1000]
+ if {[randomInt 2] == 0} {
+ set where before
+ } else {
+ set where after
+ }
+ r linsert key $where $otherele $ele
+ }
+ }
+ }
+ }
+
tags {slow} {
test {ziplist implementation: value encoding and backlink} {
if {$::accurate} {set iterations 100} else {set iterations 10}
diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl
index e4d568cf1..1557082a2 100644
--- a/tests/unit/type/list.tcl
+++ b/tests/unit/type/list.tcl
@@ -507,7 +507,9 @@ start_server {
create_list xlist "$large c"
assert_equal 3 [r rpushx xlist d]
assert_equal 4 [r lpushx xlist a]
- assert_equal "a $large c d" [r lrange xlist 0 -1]
+ assert_equal 6 [r rpushx xlist 42 x]
+ assert_equal 9 [r lpushx xlist y3 y2 y1]
+ assert_equal "y1 y2 y3 a $large c d 42 x" [r lrange xlist 0 -1]
}
test "LINSERT - $type" {
diff --git a/tests/unit/wait.tcl b/tests/unit/wait.tcl
new file mode 100644
index 000000000..e2f5d2942
--- /dev/null
+++ b/tests/unit/wait.tcl
@@ -0,0 +1,42 @@
+start_server {tags {"wait"}} {
+start_server {} {
+ set slave [srv 0 client]
+ set slave_host [srv 0 host]
+ set slave_port [srv 0 port]
+ set master [srv -1 client]
+ set master_host [srv -1 host]
+ set master_port [srv -1 port]
+
+ test {Setup slave} {
+ $slave slaveof $master_host $master_port
+ wait_for_condition 50 100 {
+ [s 0 master_link_status] eq {up}
+ } else {
+ fail "Replication not started."
+ }
+ }
+
+ test {WAIT should acknowledge 1 additional copy of the data} {
+ $master set foo 0
+ $master incr foo
+ $master incr foo
+ $master incr foo
+ assert {[$master wait 1 5000] == 1}
+ assert {[$slave get foo] == 3}
+ }
+
+ test {WAIT should not acknowledge 2 additional copies of the data} {
+ $master incr foo
+ assert {[$master wait 2 1000] <= 1}
+ }
+
+ test {WAIT should not acknowledge 1 additional copy if slave is blocked} {
+ exec src/redis-cli -h $slave_host -p $slave_port debug sleep 5 > /dev/null 2> /dev/null &
+ after 1000 ;# Give redis-cli the time to execute the command.
+ $master set foo 0
+ $master incr foo
+ $master incr foo
+ $master incr foo
+ assert {[$master wait 1 3000] == 0}
+ }
+}}
diff --git a/utils/corrupt_rdb.c b/utils/corrupt_rdb.c
new file mode 100644
index 000000000..7ba9caeee
--- /dev/null
+++ b/utils/corrupt_rdb.c
@@ -0,0 +1,44 @@
+/* Trivial program to corrupt an RDB file in order to check the RDB check
+ * program behavior and effectiveness.
+ *
+ * Copyright (C) 2016 Salvatore Sanfilippo.
+ * This software is released in the 3-clause BSD license. */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+
+int main(int argc, char **argv) {
+ struct stat stat;
+ int fd, cycles;
+
+ if (argc != 3) {
+ fprintf(stderr,"Usage: <filename> <cycles>\n");
+ exit(1);
+ }
+
+ srand(time(NULL));
+ cycles = atoi(argv[2]);
+    fd = open(argv[1],O_RDWR);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+ fstat(fd,&stat);
+
+ while(cycles--) {
+ unsigned char buf[32];
+ unsigned long offset = rand()%stat.st_size;
+ int writelen = 1+rand()%31;
+ int j;
+
+ for (j = 0; j < writelen; j++) buf[j] = (char)rand();
+ lseek(fd,offset,SEEK_SET);
+ printf("Writing %d bytes at offset %lu\n", writelen, offset);
+ write(fd,buf,writelen);
+ }
+ return 0;
+}
diff --git a/utils/hyperloglog/hll-gnuplot-graph.rb b/utils/hyperloglog/hll-gnuplot-graph.rb
index 745baddcf..6c7596d17 100644
--- a/utils/hyperloglog/hll-gnuplot-graph.rb
+++ b/utils/hyperloglog/hll-gnuplot-graph.rb
@@ -30,7 +30,7 @@ def run_experiment(r,seed,max,step)
elements << ele
i += 1
}
- r.pfadd('hll',*elements)
+ r.pfadd('hll',elements)
approx = r.pfcount('hll')
err = approx-i
rel_err = 100.to_f*err/i
diff --git a/utils/install_server.sh b/utils/install_server.sh
index 98e047e3d..3d920a125 100755
--- a/utils/install_server.sh
+++ b/utils/install_server.sh
@@ -25,9 +25,25 @@
#
################################################################################
#
-# Interactive service installer for redis server
-# this generates a redis config file and an /etc/init.d script, and installs them
-# this scripts should be run as root
+# Service installer for redis server, runs interactively by default.
+#
+# To run this script non-interactively (for automation/provisioning purposes),
+# feed the variables into the script. Any missing variables will be prompted!
+# Tip: Environment variables also support command substitution (see REDIS_EXECUTABLE)
+#
+# Example:
+#
+# sudo REDIS_PORT=1234 \
+# REDIS_CONFIG_FILE=/etc/redis/1234.conf \
+# REDIS_LOG_FILE=/var/log/redis_1234.log \
+# REDIS_DATA_DIR=/var/lib/redis/1234 \
+# REDIS_EXECUTABLE=`command -v redis-server` ./utils/install_server.sh
+#
+# This generates a redis config file and an /etc/init.d script, and installs them.
+#
+# /!\ This script should be run as root
+#
+################################################################################
die () {
echo "ERROR: $1. Aborting!"
@@ -42,6 +58,7 @@ SCRIPTPATH=$(dirname $SCRIPT)
#Initial defaults
_REDIS_PORT=6379
+_MANUAL_EXECUTION=false
echo "Welcome to the redis service installer"
echo "This script will help you easily set up a running redis server"
@@ -53,47 +70,61 @@ if [ "$(id -u)" -ne 0 ] ; then
exit 1
fi
-#Read the redis port
-read -p "Please select the redis port for this instance: [$_REDIS_PORT] " REDIS_PORT
if ! echo $REDIS_PORT | egrep -q '^[0-9]+$' ; then
- echo "Selecting default: $_REDIS_PORT"
- REDIS_PORT=$_REDIS_PORT
+ _MANUAL_EXECUTION=true
+ #Read the redis port
+ read -p "Please select the redis port for this instance: [$_REDIS_PORT] " REDIS_PORT
+ if ! echo $REDIS_PORT | egrep -q '^[0-9]+$' ; then
+ echo "Selecting default: $_REDIS_PORT"
+ REDIS_PORT=$_REDIS_PORT
+ fi
fi
-#read the redis config file
-_REDIS_CONFIG_FILE="/etc/redis/$REDIS_PORT.conf"
-read -p "Please select the redis config file name [$_REDIS_CONFIG_FILE] " REDIS_CONFIG_FILE
if [ -z "$REDIS_CONFIG_FILE" ] ; then
- REDIS_CONFIG_FILE=$_REDIS_CONFIG_FILE
- echo "Selected default - $REDIS_CONFIG_FILE"
+ _MANUAL_EXECUTION=true
+ #read the redis config file
+ _REDIS_CONFIG_FILE="/etc/redis/$REDIS_PORT.conf"
+ read -p "Please select the redis config file name [$_REDIS_CONFIG_FILE] " REDIS_CONFIG_FILE
+ if [ -z "$REDIS_CONFIG_FILE" ] ; then
+ REDIS_CONFIG_FILE=$_REDIS_CONFIG_FILE
+ echo "Selected default - $REDIS_CONFIG_FILE"
+ fi
fi
-#read the redis log file path
-_REDIS_LOG_FILE="/var/log/redis_$REDIS_PORT.log"
-read -p "Please select the redis log file name [$_REDIS_LOG_FILE] " REDIS_LOG_FILE
if [ -z "$REDIS_LOG_FILE" ] ; then
- REDIS_LOG_FILE=$_REDIS_LOG_FILE
- echo "Selected default - $REDIS_LOG_FILE"
+ _MANUAL_EXECUTION=true
+ #read the redis log file path
+ _REDIS_LOG_FILE="/var/log/redis_$REDIS_PORT.log"
+ read -p "Please select the redis log file name [$_REDIS_LOG_FILE] " REDIS_LOG_FILE
+ if [ -z "$REDIS_LOG_FILE" ] ; then
+ REDIS_LOG_FILE=$_REDIS_LOG_FILE
+ echo "Selected default - $REDIS_LOG_FILE"
+ fi
fi
-
-#get the redis data directory
-_REDIS_DATA_DIR="/var/lib/redis/$REDIS_PORT"
-read -p "Please select the data directory for this instance [$_REDIS_DATA_DIR] " REDIS_DATA_DIR
if [ -z "$REDIS_DATA_DIR" ] ; then
- REDIS_DATA_DIR=$_REDIS_DATA_DIR
- echo "Selected default - $REDIS_DATA_DIR"
+ _MANUAL_EXECUTION=true
+ #get the redis data directory
+ _REDIS_DATA_DIR="/var/lib/redis/$REDIS_PORT"
+ read -p "Please select the data directory for this instance [$_REDIS_DATA_DIR] " REDIS_DATA_DIR
+ if [ -z "$REDIS_DATA_DIR" ] ; then
+ REDIS_DATA_DIR=$_REDIS_DATA_DIR
+ echo "Selected default - $REDIS_DATA_DIR"
+ fi
fi
-#get the redis executable path
-_REDIS_EXECUTABLE=`command -v redis-server`
-read -p "Please select the redis executable path [$_REDIS_EXECUTABLE] " REDIS_EXECUTABLE
if [ ! -x "$REDIS_EXECUTABLE" ] ; then
- REDIS_EXECUTABLE=$_REDIS_EXECUTABLE
-
+ _MANUAL_EXECUTION=true
+ #get the redis executable path
+ _REDIS_EXECUTABLE=`command -v redis-server`
+ read -p "Please select the redis executable path [$_REDIS_EXECUTABLE] " REDIS_EXECUTABLE
if [ ! -x "$REDIS_EXECUTABLE" ] ; then
- echo "Mmmmm... it seems like you don't have a redis executable. Did you run make install yet?"
- exit 1
+ REDIS_EXECUTABLE=$_REDIS_EXECUTABLE
+
+ if [ ! -x "$REDIS_EXECUTABLE" ] ; then
+ echo "Mmmmm... it seems like you don't have a redis executable. Did you run make install yet?"
+ exit 1
+ fi
fi
fi
@@ -112,7 +143,9 @@ echo "Data dir : $REDIS_DATA_DIR"
echo "Executable : $REDIS_EXECUTABLE"
echo "Cli Executable : $CLI_EXEC"
-read -p "Is this ok? Then press ENTER to go on or Ctrl-C to abort." _UNUSED_
+if $_MANUAL_EXECUTION == true ; then
+ read -p "Is this ok? Then press ENTER to go on or Ctrl-C to abort." _UNUSED_
+fi
mkdir -p `dirname "$REDIS_CONFIG_FILE"` || die "Could not create redis config directory"
mkdir -p `dirname "$REDIS_LOG_FILE"` || die "Could not create redis log dir"
diff --git a/utils/lru/README b/utils/lru/README
index 288189e3e..f043b2979 100644
--- a/utils/lru/README
+++ b/utils/lru/README
@@ -3,11 +3,17 @@ Redis approximated LRU algorithm against the theoretical output of true
LRU algorithm.
In order to use the program you need to recompile Redis setting the define
-REDIS_LRU_CLOCK_RESOLUTION to 1, by editing redis.h.
+REDIS_LRU_CLOCK_RESOLUTION to 1, by editing the file server.h.
This allows the program to execute quickly, since the 1 ms resolution is
enough for all the objects to have sufficiently different time stamps
during the test.
The program is executed like this:
- ruby test-lru.rb > /tmp/lru.html
+ ruby test-lru.rb /tmp/lru.html
+
+You can optionally specify how many times to run the test by adding an
+additional argument; the program will then output the average of the
+different runs. For instance, to run the test 10 times use:
+
+ ruby test-lru.rb /tmp/lru.html 10
diff --git a/utils/lru/lfu-simulation.c b/utils/lru/lfu-simulation.c
new file mode 100644
index 000000000..6aa5911ac
--- /dev/null
+++ b/utils/lru/lfu-simulation.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+int decr_every = 1;
+int keyspace_size = 1000000;
+time_t switch_after = 30; /* Switch access pattern after N seconds. */
+
+struct entry {
+ /* Field that the LFU Redis implementation will have (we have
+ * 24 bits of total space in the object->lru field). */
+ uint8_t counter; /* Logarithmic counter. */
+ uint16_t decrtime; /* (Reduced precision) time of last decrement. */
+
+ /* Fields only useful for visualization. */
+ uint64_t hits; /* Number of real accesses. */
+ time_t ctime; /* Key creation time. */
+};
+
+#define to_16bit_minutes(x) ((x/60) & 65535)
+#define COUNTER_INIT_VAL 5
+
+/* Compute the difference in minutes between two 16 bit minutes times
+ * obtained with to_16bit_minutes(). Since they can wrap around, if we
+ * detect the overflow we account for it as if the counter wrapped a
+ * single time. */
+uint16_t minutes_diff(uint16_t now, uint16_t prev) {
+ if (now >= prev) return now-prev;
+ return 65535-prev+now;
+}
+
+/* Increment a counter logarithmically: the greater its value, the less
+ * likely it is that the counter is really incremented.
+ * The maximum value of the counter is saturated at 255. */
+uint8_t log_incr(uint8_t counter) {
+ if (counter == 255) return counter;
+ double r = (double)rand()/RAND_MAX;
+ double baseval = counter-COUNTER_INIT_VAL;
+ if (baseval < 0) baseval = 0;
+ double limit = 1.0/(baseval*10+1);
+ if (r < limit) counter++;
+ return counter;
+}
+
+/* Simulate an access to an entry. */
+void access_entry(struct entry *e) {
+ e->counter = log_incr(e->counter);
+ e->hits++;
+}
+
+/* Return the entry LFU value and as a side effect decrement the
+ * entry value if the decrement time was reached. */
+uint8_t scan_entry(struct entry *e) {
+ if (minutes_diff(to_16bit_minutes(time(NULL)),e->decrtime)
+ >= decr_every)
+ {
+ if (e->counter) {
+ if (e->counter > COUNTER_INIT_VAL*2) {
+ e->counter /= 2;
+ } else {
+ e->counter--;
+ }
+ }
+ e->decrtime = to_16bit_minutes(time(NULL));
+ }
+ return e->counter;
+}
+
+/* Print the entry info. */
+void show_entry(long pos, struct entry *e) {
+ char *tag = "normal ";
+
+ if (pos >= 10 && pos <= 14) tag = "new no access";
+ if (pos >= 15 && pos <= 19) tag = "new accessed ";
+ if (pos >= keyspace_size -5) tag= "old no access";
+
+ printf("%ld] <%s> frequency:%d decrtime:%d [%lu hits | age:%ld sec]\n",
+ pos, tag, e->counter, e->decrtime, (unsigned long)e->hits,
+ time(NULL) - e->ctime);
+}
+
+int main(void) {
+ time_t start = time(NULL);
+ time_t new_entry_time = start;
+ time_t display_time = start;
+ struct entry *entries = malloc(sizeof(*entries)*keyspace_size);
+ long j;
+
+ /* Initialize. */
+ for (j = 0; j < keyspace_size; j++) {
+ entries[j].counter = COUNTER_INIT_VAL;
+ entries[j].decrtime = to_16bit_minutes(start);
+ entries[j].hits = 0;
+ entries[j].ctime = time(NULL);
+ }
+
+ while(1) {
+ time_t now = time(NULL);
+ long idx;
+
+ /* Scan N random entries (simulates the eviction under maxmemory). */
+ for (j = 0; j < 3; j++) {
+ scan_entry(entries+(rand()%keyspace_size));
+ }
+
+ /* Access a random entry: use a power-law access pattern up to
+ * 'switch_after' seconds. Then revert to flat access pattern. */
+ if (now-start < switch_after) {
+ /* Power law. */
+ idx = 1;
+ while((rand() % 21) != 0 && idx < keyspace_size) idx *= 2;
+ if (idx > keyspace_size) idx = keyspace_size;
+ idx = rand() % idx;
+ } else {
+ /* Flat. */
+ idx = rand() % keyspace_size;
+ }
+
+ /* Never access entries between position 10 and 14, so that
+ * we simulate what happens to new entries that are never
+ * accessed VS new entries which are accessed in positions
+ * 15-19.
+ *
+         * Also never access the last 5 entries, so that we have keys which
+ * are never recreated (old), and never accessed. */
+ if ((idx < 10 || idx > 14) && (idx < keyspace_size-5))
+ access_entry(entries+idx);
+
+ /* Simulate the addition of new entries at positions between
+ * 10 and 19, a random one every 10 seconds. */
+ if (new_entry_time <= now) {
+ idx = 10+(rand()%10);
+ entries[idx].counter = COUNTER_INIT_VAL;
+ entries[idx].decrtime = to_16bit_minutes(time(NULL));
+ entries[idx].hits = 0;
+ entries[idx].ctime = time(NULL);
+ new_entry_time = now+10;
+ }
+
+ /* Show the first 20 entries and the last 20 entries. */
+ if (display_time != now) {
+ printf("=============================\n");
+ printf("Current minutes time: %d\n", (int)to_16bit_minutes(now));
+ printf("Access method: %s\n",
+ (now-start < switch_after) ? "power-law" : "flat");
+
+ for (j = 0; j < 20; j++)
+ show_entry(j,entries+j);
+
+ for (j = keyspace_size-20; j < keyspace_size; j++)
+ show_entry(j,entries+j);
+ display_time = now;
+ }
+ }
+ return 0;
+}
+
diff --git a/utils/lru/test-lru.rb b/utils/lru/test-lru.rb
index ee0527ef4..d511e206f 100644
--- a/utils/lru/test-lru.rb
+++ b/utils/lru/test-lru.rb
@@ -1,112 +1,223 @@
require 'rubygems'
require 'redis'
-r = Redis.new
-r.config("SET","maxmemory","2000000")
-r.config("SET","maxmemory-policy","allkeys-lru")
-r.config("SET","maxmemory-samples",5)
-r.config("RESETSTAT")
-r.flushall
-
-puts <<EOF
-<html>
-<body>
-<style>
-.box {
- width:5px;
- height:5px;
- float:left;
- margin: 1px;
-}
-
-.old {
- border: 1px black solid;
-}
-
-.new {
- border: 1px green solid;
-}
-
-.ex {
- background-color: #666;
-}
-</style>
-<pre>
+$runs = []; # Remember the error rate of each run for average purposes.
+$o = {}; # Options set parsing arguments
+
+def testit(filename)
+ r = Redis.new
+ r.config("SET","maxmemory","2000000")
+ if $o[:ttl]
+ r.config("SET","maxmemory-policy","volatile-ttl")
+ else
+ r.config("SET","maxmemory-policy","allkeys-lru")
+ end
+ r.config("SET","maxmemory-samples",5)
+ r.config("RESETSTAT")
+ r.flushall
+
+ html = ""
+ html << <<EOF
+ <html>
+ <body>
+ <style>
+ .box {
+ width:5px;
+ height:5px;
+ float:left;
+ margin: 1px;
+ }
+
+ .old {
+ border: 1px black solid;
+ }
+
+ .new {
+ border: 1px green solid;
+ }
+
+ .otherdb {
+ border: 1px red solid;
+ }
+
+ .ex {
+ background-color: #666;
+ }
+ </style>
+ <pre>
EOF
-# Fill
-oldsize = r.dbsize
-id = 0
-while true
- id += 1
- r.set(id,"foo")
- newsize = r.dbsize
- break if newsize == oldsize
- oldsize = newsize
-end
+ # Fill the DB up to the first eviction.
+ oldsize = r.dbsize
+ id = 0
+ while true
+ id += 1
+ begin
+ r.set(id,"foo")
+ rescue
+ break
+ end
+ newsize = r.dbsize
+ break if newsize == oldsize # A key was evicted? Stop.
+ oldsize = newsize
+ end
-inserted = r.dbsize
-first_set_max_id = id
-puts "#{r.dbsize} keys inserted"
+ inserted = r.dbsize
+ first_set_max_id = id
+ html << "#{r.dbsize} keys inserted.\n"
-# Access keys sequentially
+ # Access keys sequentially, so that under perfect LRU the first part of the
+ # keyspace will be evicted first and the latter part will survive.
-puts "Access keys sequentially"
-(1..first_set_max_id).each{|id|
- r.get(id)
-# sleep 0.001
-}
+ if $o[:ttl]
+ STDERR.puts "Set increasing expire value"
+ (1..first_set_max_id).each{|id|
+ r.expire(id,1000+id)
+ STDERR.print(".") if (id % 150) == 0
+ }
+ else
+ STDERR.puts "Access keys sequentially"
+ (1..first_set_max_id).each{|id|
+ r.get(id)
+ sleep 0.001
+ STDERR.print(".") if (id % 150) == 0
+ }
+ end
+ STDERR.puts
+
+ # Insert 50% more keys. We expect that the new keys will rarely be evicted
+ # since their last access time is recent compared to the others.
+ #
+ # Note that we insert the first 100 keys of the new set into DB1 instead
+ # of DB0, so that we can check how cross-DB eviction works.
+ half = inserted/2
+ html << "Insert enough keys to evict half the keys we inserted.\n"
+ add = 0
+
+ otherdb_start_idx = id+1
+ otherdb_end_idx = id+100
+ while true
+ add += 1
+ id += 1
+ if id >= otherdb_start_idx && id <= otherdb_end_idx
+ r.select(1)
+ r.set(id,"foo")
+ r.select(0)
+ else
+ r.set(id,"foo")
+ end
+ break if r.info['evicted_keys'].to_i >= half
+ end
+
+ html << "#{add} additional keys added.\n"
+ html << "#{r.dbsize} keys in DB.\n"
+
+ # Check whether the evicted keys respect LRU.
+ # We consider errors progressively more serious the more they violate the
+ # access pattern: old keys that survive and recent keys that get evicted are
+ # weighted by their distance from the midpoint of the first set.
+
+ errors = 0
+ error_per_key = 100000.0/first_set_max_id
+ half_set_size = first_set_max_id/2
+ maxerr = 0
+ (1..first_set_max_id).each{|id|
+ if id >= otherdb_start_idx && id <= otherdb_end_idx
+ r.select(1)
+ exists = r.exists(id)
+ r.select(0)
+ else
+ exists = r.exists(id)
+ end
+ if id < first_set_max_id/2
+ thiserr = error_per_key * ((half_set_size-id).to_f/half_set_size)
+ maxerr += thiserr
+ errors += thiserr if exists
+ elsif id >= first_set_max_id/2
+ thiserr = error_per_key * ((id-half_set_size).to_f/half_set_size)
+ maxerr += thiserr
+ errors += thiserr if !exists
+ end
+ }
+ errors = errors*100/maxerr
+
+ STDERR.puts "Test finished with #{errors}% error! Generating HTML on stdout."
+
+ html << "#{errors}% error!\n"
+ html << "</pre>"
+ $runs << errors
+
+ # Generate the graphical representation
+ (1..id).each{|id|
+ # Mark first set and added items in a different way.
+ c = "box"
+ if id >= otherdb_start_idx && id <= otherdb_end_idx
+ c << " otherdb"
+ elsif id <= first_set_max_id
+ c << " old"
+ else
+ c << " new"
+ end
+
+ # Add the "ex" class if the key still exists in the DB.
+ if id >= otherdb_start_idx && id <= otherdb_end_idx
+ r.select(1)
+ exists = r.exists(id)
+ r.select(0)
+ else
+ exists = r.exists(id)
+ end
+
+ c << " ex" if exists
+ html << "<div title=\"#{id}\" class=\"#{c}\"></div>"
+ }
+
+ # Close HTML page
+
+ html << <<EOF
+ </body>
+ </html>
+EOF
-# Insert more 50% keys. We expect that the new keys
-half = inserted/2
-puts "Insert enough keys to evict half the keys we inserted"
-add = 0
-while true
- add += 1
- id += 1
- r.set(id,"foo")
- break if r.info['evicted_keys'].to_i >= half
+ f = File.open(filename,"w")
+ f.write(html)
+ f.close
end
-puts "#{add} additional keys added."
-puts "#{r.dbsize} keys in DB"
-
-# Check if evicted keys respect LRU
-# We consider errors from 1 to N progressively more serious as they violate
-# more the access pattern.
-
-errors = 0
-e = 1
-edecr = 1.0/(first_set_max_id/2)
-(1..(first_set_max_id/2)).each{|id|
- e -= edecr if e > 0
- e = 0 if e < 0
- if r.exists(id)
- errors += e
- end
-}
+def print_avg
+ avg = ($runs.reduce {|a,b| a+b}) / $runs.length
+ puts "#{$runs.length} runs, AVG is #{avg}"
+end
-puts "#{errors} errors!"
-puts "</pre>"
+if ARGV.length < 1
+ STDERR.puts "Usage: ruby test-lru.rb <html-output-filename> [--runs <count>] [--ttl]"
+ STDERR.puts "Options:"
+ STDERR.puts " --runs <count> Execute the test <count> times."
+ STDERR.puts " --ttl Set keys with increasing TTL values"
+ STDERR.puts " (starting from 1000 seconds) in order to"
+ STDERR.puts " test the volatile-lru policy."
+ exit 1
+end
-# Generate the graphical representation
-(1..id).each{|id|
- # Mark first set and added items in a different way.
- c = "box"
- if id <= first_set_max_id
- c << " old"
+filename = ARGV[0]
+$o[:numruns] = 1
+
+# Options parsing
+i = 1
+while i < ARGV.length
+ if ARGV[i] == '--runs'
+ $o[:numruns] = ARGV[i+1].to_i
+ i += 1
+ elsif ARGV[i] == '--ttl'
+ $o[:ttl] = true
else
- c << " new"
+ STDERR.puts "Unknown option #{ARGV[i]}"
+ exit 1
end
+ i += 1
+end
- # Add class if exists
- c << " ex" if r.exists(id)
- puts "<div class=\"#{c}\"></div>"
+$o[:numruns].times {
+ testit(filename)
+ print_avg if $o[:numruns] != 1
}
-
-# Close HTML page
-
-puts <<EOF
-</body>
-</html>
-EOF
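
The error metric computed by testit() weights each misplaced key by how badly
it violates the expected LRU ordering: keys near the start of the first set
should certainly be evicted and keys near its end should certainly survive,
while keys around the midpoint count for almost nothing, and the accumulated
error is reported as a percentage of the worst possible score. Below is a
standalone C sketch of the same computation, not part of the patch; the
exists[] array stands in for the per-key EXISTS replies the Ruby script
collects from Redis.

    /* lru-error-sketch.c -- standalone illustration (not part of the patch) of
     * the weighted error percentage computed by testit() in test-lru.rb. */
    #include <stdio.h>
    #include <stdlib.h>

    /* Keys 1..n were accessed in increasing order of recency; exists[id] is
     * nonzero if key 'id' survived eviction. Returns the error percentage. */
    double lru_error_percentage(const int *exists, long n) {
        double error_per_key = 100000.0 / n;
        long half = n / 2;
        double errors = 0, maxerr = 0;

        for (long id = 1; id <= n; id++) {
            double thiserr;
            if (id < half) {
                /* Old keys: should have been evicted, error if still there. */
                thiserr = error_per_key * ((double)(half - id) / half);
                maxerr += thiserr;
                if (exists[id]) errors += thiserr;
            } else {
                /* Recent keys: should have survived, error if evicted. */
                thiserr = error_per_key * ((double)(id - half) / half);
                maxerr += thiserr;
                if (!exists[id]) errors += thiserr;
            }
        }
        return errors * 100 / maxerr;
    }

    int main(void) {
        long n = 1000;
        int *exists = calloc(n + 1, sizeof(int));

        /* Perfect LRU: only the most recently accessed half survives. */
        for (long id = n/2 + 1; id <= n; id++) exists[id] = 1;
        printf("perfect LRU: %.2f%% error\n", lru_error_percentage(exists, n));

        /* Inverted outcome: only the oldest half survives (worst case). */
        for (long id = 1; id <= n; id++) exists[id] = (id <= n/2);
        printf("inverted:    %.2f%% error\n", lru_error_percentage(exists, n));

        free(exists);
        return 0;
    }

Against a perfect LRU outcome the sketch reports 0% and against the fully
inverted outcome it reports 100%, which is the scale on which the script's
per-run values and the averaged result printed by print_avg should be read.
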
diff --git a/utils/releasetools/changelog.tcl b/utils/releasetools/changelog.tcl
index bf0ad999e..4b5424ce2 100755
--- a/utils/releasetools/changelog.tcl
+++ b/utils/releasetools/changelog.tcl
@@ -21,6 +21,10 @@ append template "\n\n"
set date [clock format [clock seconds]]
set template [string map [list %ver% $ver %date% $date] $template]
-append template [exec git log $branch~30..$branch "--format=format:+-------------------------------------------------------------------------------%n| %s%n| By %an, %ai%n+--------------------------------------------------------------------------------%nhttps://github.com/antirez/redis/commit/%H%n%n%b" --stat]
+append template [exec git log $branch~30..$branch "--format=format:%an in commit %h:%n %s" --shortstat]
+
+# Older, more verbose version.
+#
+#append template [exec git log $branch~30..$branch "--format=format:+-------------------------------------------------------------------------------%n| %s%n| By %an, %ai%n+--------------------------------------------------------------------------------%nhttps://github.com/antirez/redis/commit/%H%n%n%b" --stat]
puts $template