summaryrefslogtreecommitdiff
path: root/src/rdb.c
diff options
context:
space:
mode:
authorWander Hillen <wjw.hillen@gmail.com>2019-10-25 10:18:26 +0200
committerGitHub <noreply@github.com>2019-10-25 10:18:26 +0200
commitfb1f4f4e7e4a7a1fd5366afe5581ac6613047faf (patch)
tree7a8736f75b6c5c96d53d256161e86f0645d8840f /src/rdb.c
parentdda8cc182117cd5b164a524a1246e8be33d08f7e (diff)
parent6e98214f740eb04c9761ccb983ba51988ee35bc2 (diff)
downloadredis-fb1f4f4e7e4a7a1fd5366afe5581ac6613047faf.tar.gz
Merge branch 'unstable' into minor-typos
Diffstat (limited to 'src/rdb.c')
-rw-r--r--src/rdb.c945
1 files changed, 653 insertions, 292 deletions
diff --git a/src/rdb.c b/src/rdb.c
index 35448b624..c92b9f4a1 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -42,30 +42,41 @@
#include <sys/stat.h>
#include <sys/param.h>
-#define rdbExitReportCorruptRDB(...) rdbCheckThenExit(__LINE__,__VA_ARGS__)
+/* This macro is called when the internal RDB stracture is corrupt */
+#define rdbExitReportCorruptRDB(...) rdbReportError(1, __LINE__,__VA_ARGS__)
+/* This macro is called when RDB read failed (possibly a short read) */
+#define rdbReportReadError(...) rdbReportError(0, __LINE__,__VA_ARGS__)
+char* rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */
extern int rdbCheckMode;
void rdbCheckError(const char *fmt, ...);
void rdbCheckSetError(const char *fmt, ...);
-void rdbCheckThenExit(int linenum, char *reason, ...) {
+void rdbReportError(int corruption_error, int linenum, char *reason, ...) {
va_list ap;
char msg[1024];
int len;
len = snprintf(msg,sizeof(msg),
- "Internal error in RDB reading function at rdb.c:%d -> ", linenum);
+ "Internal error in RDB reading offset %llu, function at rdb.c:%d -> ",
+ (unsigned long long)server.loading_loaded_bytes, linenum);
va_start(ap,reason);
vsnprintf(msg+len,sizeof(msg)-len,reason,ap);
va_end(ap);
if (!rdbCheckMode) {
- serverLog(LL_WARNING, "%s", msg);
- char *argv[2] = {"",server.rdb_filename};
- redis_check_rdb_main(2,argv,NULL);
+ if (rdbFileBeingLoaded || corruption_error) {
+ serverLog(LL_WARNING, "%s", msg);
+ char *argv[2] = {"",rdbFileBeingLoaded};
+ redis_check_rdb_main(2,argv,NULL);
+ } else {
+ serverLog(LL_WARNING, "%s. Failure loading rdb format from socket, assuming connection error, resuming operation.", msg);
+ return;
+ }
} else {
rdbCheckError("%s",msg);
}
+ serverLog(LL_WARNING, "Terminating server after rdb file reading failure.");
exit(1);
}
@@ -88,6 +99,11 @@ int rdbLoadType(rio *rdb) {
return type;
}
+/* This is only used to load old databases stored with the RDB_OPCODE_EXPIRETIME
+ * opcode. New versions of Redis store using the RDB_OPCODE_EXPIRETIME_MS
+ * opcode. On error -1 is returned, however this could be a valid time, so
+ * to check for loading errors the caller should call rioGetReadError() after
+ * calling this function. */
time_t rdbLoadTime(rio *rdb) {
int32_t t32;
if (rioRead(rdb,&t32,4) == 0) return -1;
@@ -96,12 +112,30 @@ time_t rdbLoadTime(rio *rdb) {
int rdbSaveMillisecondTime(rio *rdb, long long t) {
int64_t t64 = (int64_t) t;
+ memrev64ifbe(&t64); /* Store in little endian. */
return rdbWriteRaw(rdb,&t64,8);
}
-long long rdbLoadMillisecondTime(rio *rdb) {
+/* This function loads a time from the RDB file. It gets the version of the
+ * RDB because, unfortunately, before Redis 5 (RDB version 9), the function
+ * failed to convert data to/from little endian, so RDB files with keys having
+ * expires could not be shared between big endian and little endian systems
+ * (because the expire time will be totally wrong). The fix for this is just
+ * to call memrev64ifbe(), however if we fix this for all the RDB versions,
+ * this call will introduce an incompatibility for big endian systems:
+ * after upgrading to Redis version 5 they will no longer be able to load their
+ * own old RDB files. Because of that, we instead fix the function only for new
+ * RDB versions, and load older RDB versions as we used to do in the past,
+ * allowing big endian systems to load their own old RDB files.
+ *
+ * On I/O error the function returns LLONG_MAX, however if this is also a
+ * valid stored value, the caller should use rioGetReadError() to check for
+ * errors after calling this function. */
+long long rdbLoadMillisecondTime(rio *rdb, int rdbver) {
int64_t t64;
- if (rioRead(rdb,&t64,8) == 0) return -1;
+ if (rioRead(rdb,&t64,8) == 0) return LLONG_MAX;
+ if (rdbver >= 9) /* Check the top comment of this function. */
+ memrev64ifbe(&t64); /* Convert in big endian if the system is BE. */
return (long long)t64;
}
@@ -226,7 +260,7 @@ int rdbEncodeInteger(long long value, unsigned char *enc) {
/* Loads an integer-encoded object with the specified encoding type "enctype".
* The returned value changes according to the flags, see
- * rdbGenerincLoadStringObject() for more info. */
+ * rdbGenericLoadStringObject() for more info. */
void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) {
int plain = flags & RDB_LOAD_PLAIN;
int sds = flags & RDB_LOAD_SDS;
@@ -248,8 +282,8 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) {
v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
val = (int32_t)v;
} else {
- val = 0; /* anti-warning */
rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype);
+ return NULL; /* Never reached. */
}
if (plain || sds) {
char buf[LONG_STR_SIZE], *p;
@@ -259,7 +293,7 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) {
memcpy(p,buf,len);
return p;
} else if (encode) {
- return createStringObjectFromLongLong(val);
+ return createStringObjectFromLongLongForValue(val);
} else {
return createObject(OBJ_STRING,sdsfromlonglong(val));
}
@@ -352,8 +386,7 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) {
/* Load the compressed representation and uncompress it to target. */
if (rioRead(rdb,c,clen) == 0) goto err;
if (lzf_decompress(c,clen,val,len) == 0) {
- if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string");
- goto err;
+ rdbExitReportCorruptRDB("Invalid LZF compressed string");
}
zfree(c);
@@ -467,6 +500,7 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) {
return rdbLoadLzfStringObject(rdb,flags,lenptr);
default:
rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len);
+ return NULL; /* Never reached. */
}
}
@@ -642,8 +676,87 @@ int rdbLoadObjectType(rio *rdb) {
return type;
}
-/* Save a Redis object. Returns -1 on error, number of bytes written on success. */
-ssize_t rdbSaveObject(rio *rdb, robj *o) {
+/* This helper function serializes a consumer group Pending Entries List (PEL)
+ * into the RDB file. The 'nacks' argument tells the function if also persist
+ * the informations about the not acknowledged message, or if to persist
+ * just the IDs: this is useful because for the global consumer group PEL
+ * we serialized the NACKs as well, but when serializing the local consumer
+ * PELs we just add the ID, that will be resolved inside the global PEL to
+ * put a reference to the same structure. */
+ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, int nacks) {
+ ssize_t n, nwritten = 0;
+
+ /* Number of entries in the PEL. */
+ if ((n = rdbSaveLen(rdb,raxSize(pel))) == -1) return -1;
+ nwritten += n;
+
+ /* Save each entry. */
+ raxIterator ri;
+ raxStart(&ri,pel);
+ raxSeek(&ri,"^",NULL,0);
+ while(raxNext(&ri)) {
+ /* We store IDs in raw form as 128 big big endian numbers, like
+ * they are inside the radix tree key. */
+ if ((n = rdbWriteRaw(rdb,ri.key,sizeof(streamID))) == -1) return -1;
+ nwritten += n;
+
+ if (nacks) {
+ streamNACK *nack = ri.data;
+ if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1)
+ return -1;
+ nwritten += n;
+ if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) return -1;
+ nwritten += n;
+ /* We don't save the consumer name: we'll save the pending IDs
+ * for each consumer in the consumer PEL, and resolve the consumer
+ * at loading time. */
+ }
+ }
+ raxStop(&ri);
+ return nwritten;
+}
+
+/* Serialize the consumers of a stream consumer group into the RDB. Helper
+ * function for the stream data type serialization. What we do here is to
+ * persist the consumer metadata, and it's PEL, for each consumer. */
+size_t rdbSaveStreamConsumers(rio *rdb, streamCG *cg) {
+ ssize_t n, nwritten = 0;
+
+ /* Number of consumers in this consumer group. */
+ if ((n = rdbSaveLen(rdb,raxSize(cg->consumers))) == -1) return -1;
+ nwritten += n;
+
+ /* Save each consumer. */
+ raxIterator ri;
+ raxStart(&ri,cg->consumers);
+ raxSeek(&ri,"^",NULL,0);
+ while(raxNext(&ri)) {
+ streamConsumer *consumer = ri.data;
+
+ /* Consumer name. */
+ if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) return -1;
+ nwritten += n;
+
+ /* Last seen time. */
+ if ((n = rdbSaveMillisecondTime(rdb,consumer->seen_time)) == -1)
+ return -1;
+ nwritten += n;
+
+ /* Consumer PEL, without the ACKs (see last parameter of the function
+ * passed with value of 0), at loading time we'll lookup the ID
+ * in the consumer group global PEL and will put a reference in the
+ * consumer local PEL. */
+ if ((n = rdbSaveStreamPEL(rdb,consumer->pel,0)) == -1)
+ return -1;
+ nwritten += n;
+ }
+ raxStop(&ri);
+ return nwritten;
+}
+
+/* Save a Redis object.
+ * Returns -1 on error, number of bytes written on success. */
+ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key) {
ssize_t n = 0, nwritten = 0;
if (o->type == OBJ_STRING) {
@@ -681,13 +794,20 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
dictIterator *di = dictGetIterator(set);
dictEntry *de;
- if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) return -1;
+ if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) {
+ dictReleaseIterator(di);
+ return -1;
+ }
nwritten += n;
while((de = dictNext(di)) != NULL) {
sds ele = dictGetKey(de);
if ((n = rdbSaveRawString(rdb,(unsigned char*)ele,sdslen(ele)))
- == -1) return -1;
+ == -1)
+ {
+ dictReleaseIterator(di);
+ return -1;
+ }
nwritten += n;
}
dictReleaseIterator(di);
@@ -747,7 +867,10 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
dictIterator *di = dictGetIterator(o->ptr);
dictEntry *de;
- if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) return -1;
+ if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) {
+ dictReleaseIterator(di);
+ return -1;
+ }
nwritten += n;
while((de = dictNext(di)) != NULL) {
@@ -755,10 +878,18 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
sds value = dictGetVal(de);
if ((n = rdbSaveRawString(rdb,(unsigned char*)field,
- sdslen(field))) == -1) return -1;
+ sdslen(field))) == -1)
+ {
+ dictReleaseIterator(di);
+ return -1;
+ }
nwritten += n;
if ((n = rdbSaveRawString(rdb,(unsigned char*)value,
- sdslen(value))) == -1) return -1;
+ sdslen(value))) == -1)
+ {
+ dictReleaseIterator(di);
+ return -1;
+ }
nwritten += n;
}
dictReleaseIterator(di);
@@ -798,12 +929,48 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
nwritten += n;
if ((n = rdbSaveLen(rdb,s->last_id.seq)) == -1) return -1;
nwritten += n;
+
+ /* The consumer groups and their clients are part of the stream
+ * type, so serialize every consumer group. */
+
+ /* Save the number of groups. */
+ size_t num_cgroups = s->cgroups ? raxSize(s->cgroups) : 0;
+ if ((n = rdbSaveLen(rdb,num_cgroups)) == -1) return -1;
+ nwritten += n;
+
+ if (num_cgroups) {
+ /* Serialize each consumer group. */
+ raxStart(&ri,s->cgroups);
+ raxSeek(&ri,"^",NULL,0);
+ while(raxNext(&ri)) {
+ streamCG *cg = ri.data;
+
+ /* Save the group name. */
+ if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1)
+ return -1;
+ nwritten += n;
+
+ /* Last ID. */
+ if ((n = rdbSaveLen(rdb,cg->last_id.ms)) == -1) return -1;
+ nwritten += n;
+ if ((n = rdbSaveLen(rdb,cg->last_id.seq)) == -1) return -1;
+ nwritten += n;
+
+ /* Save the global PEL. */
+ if ((n = rdbSaveStreamPEL(rdb,cg->pel,1)) == -1) return -1;
+ nwritten += n;
+
+ /* Save the consumers of this group. */
+ if ((n = rdbSaveStreamConsumers(rdb,cg)) == -1) return -1;
+ nwritten += n;
+ }
+ raxStop(&ri);
+ }
} else if (o->type == OBJ_MODULE) {
/* Save a module-specific value. */
RedisModuleIO io;
moduleValue *mv = o->ptr;
moduleType *mt = mv->type;
- moduleInitIOContext(io,mt,rdb);
/* Write the "module" identifier as prefix, so that we'll be able
* to call the right module during loading. */
@@ -812,10 +979,13 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
io.bytes += retval;
/* Then write the module-specific representation + EOF marker. */
+ moduleInitIOContext(io,mt,rdb,key);
mt->rdb_save(&io,mv->value);
retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF);
- if (retval == -1) return -1;
- io.bytes += retval;
+ if (retval == -1)
+ io.error = 1;
+ else
+ io.bytes += retval;
if (io.ctx) {
moduleFreeContext(io.ctx);
@@ -833,7 +1003,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) {
* this length with very little changes to the code. In the future
* we could switch to a faster solution. */
size_t rdbSavedObjectLen(robj *o) {
- ssize_t len = rdbSaveObject(NULL,o);
+ ssize_t len = rdbSaveObject(NULL,o,NULL);
serverAssertWithInfo(NULL,o,len != -1);
return len;
}
@@ -842,21 +1012,45 @@ size_t rdbSavedObjectLen(robj *o) {
* On error -1 is returned.
* On success if the key was actually saved 1 is returned, otherwise 0
* is returned (the key was already expired). */
-int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val,
- long long expiretime, long long now)
-{
+int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) {
+ int savelru = server.maxmemory_policy & MAXMEMORY_FLAG_LRU;
+ int savelfu = server.maxmemory_policy & MAXMEMORY_FLAG_LFU;
+
/* Save the expire time */
if (expiretime != -1) {
- /* If this key is already expired skip it */
- if (expiretime < now) return 0;
if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1;
if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1;
}
+ /* Save the LRU info. */
+ if (savelru) {
+ uint64_t idletime = estimateObjectIdleTime(val);
+ idletime /= 1000; /* Using seconds is enough and requires less space.*/
+ if (rdbSaveType(rdb,RDB_OPCODE_IDLE) == -1) return -1;
+ if (rdbSaveLen(rdb,idletime) == -1) return -1;
+ }
+
+ /* Save the LFU info. */
+ if (savelfu) {
+ uint8_t buf[1];
+ buf[0] = LFUDecrAndReturn(val);
+ /* We can encode this in exactly two bytes: the opcode and an 8
+ * bit counter, since the frequency is logarithmic with a 0-255 range.
+ * Note that we do not store the halving time because to reset it
+ * a single time when loading does not affect the frequency much. */
+ if (rdbSaveType(rdb,RDB_OPCODE_FREQ) == -1) return -1;
+ if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
+ }
+
/* Save type, key, value */
if (rdbSaveObjectType(rdb,val) == -1) return -1;
if (rdbSaveStringObject(rdb,key) == -1) return -1;
- if (rdbSaveObject(rdb,val) == -1) return -1;
+ if (rdbSaveObject(rdb,val,key) == -1) return -1;
+
+ /* Delay return if required (for testing) */
+ if (server.rdb_key_save_delay)
+ usleep(server.rdb_key_save_delay);
+
return 1;
}
@@ -909,6 +1103,45 @@ int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) {
return 1;
}
+ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) {
+ /* Save a module-specific aux value. */
+ RedisModuleIO io;
+ int retval = rdbSaveType(rdb, RDB_OPCODE_MODULE_AUX);
+
+ /* Write the "module" identifier as prefix, so that we'll be able
+ * to call the right module during loading. */
+ retval = rdbSaveLen(rdb,mt->id);
+ if (retval == -1) return -1;
+ io.bytes += retval;
+
+ /* write the 'when' so that we can provide it on loading. add a UINT opcode
+ * for backwards compatibility, everything after the MT needs to be prefixed
+ * by an opcode. */
+ retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_UINT);
+ if (retval == -1) return -1;
+ io.bytes += retval;
+ retval = rdbSaveLen(rdb,when);
+ if (retval == -1) return -1;
+ io.bytes += retval;
+
+ /* Then write the module-specific representation + EOF marker. */
+ moduleInitIOContext(io,mt,rdb,NULL);
+ mt->aux_save(&io,when);
+ retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF);
+ if (retval == -1)
+ io.error = 1;
+ else
+ io.bytes += retval;
+
+ if (io.ctx) {
+ moduleFreeContext(io.ctx);
+ zfree(io.ctx);
+ }
+ if (io.error)
+ return -1;
+ return io.bytes;
+}
+
/* Produces a dump of the database in RDB format sending it to the specified
* Redis I/O channel. On success C_OK is returned, otherwise C_ERR
* is returned and part of the output, or all the output, can be
@@ -922,7 +1155,6 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
dictEntry *de;
char magic[10];
int j;
- long long now = mstime();
uint64_t cksum;
size_t processed = 0;
@@ -931,13 +1163,13 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr;
+ if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr;
for (j = 0; j < server.dbnum; j++) {
redisDb *db = server.db+j;
dict *d = db->dict;
if (dictSize(d) == 0) continue;
di = dictGetSafeIterator(d);
- if (!di) return C_ERR;
/* Write the SELECT DB opcode */
if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;
@@ -947,13 +1179,9 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
* is currently the largest type we are able to represent in RDB sizes.
* However this does not limit the actual size of the DB to load since
* these sizes are just hints to resize the hash tables. */
- uint32_t db_size, expires_size;
- db_size = (dictSize(db->dict) <= UINT32_MAX) ?
- dictSize(db->dict) :
- UINT32_MAX;
- expires_size = (dictSize(db->expires) <= UINT32_MAX) ?
- dictSize(db->expires) :
- UINT32_MAX;
+ uint64_t db_size, expires_size;
+ db_size = dictSize(db->dict);
+ expires_size = dictSize(db->expires);
if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;
if (rdbSaveLen(rdb,db_size) == -1) goto werr;
if (rdbSaveLen(rdb,expires_size) == -1) goto werr;
@@ -966,7 +1194,7 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
initStaticStringObject(key,keystr);
expire = getExpire(db,&key);
- if (rdbSaveKeyValuePair(rdb,&key,o,expire,now) == -1) goto werr;
+ if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr;
/* When this RDB is produced as part of an AOF rewrite, move
* accumulated diff from parent to child while rewriting in
@@ -979,8 +1207,8 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
}
}
dictReleaseIterator(di);
+ di = NULL; /* So that we don't release it again on error. */
}
- di = NULL; /* So that we don't release it again on error. */
/* If we are storing the replication information on disk, persist
* the script cache as well: on successful PSYNC after a restart, we need
@@ -994,8 +1222,11 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
goto werr;
}
dictReleaseIterator(di);
+ di = NULL; /* So that we don't release it again on error. */
}
+ if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr;
+
/* EOF opcode */
if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;
@@ -1060,6 +1291,10 @@ int rdbSave(char *filename, rdbSaveInfo *rsi) {
}
rioInitWithFile(&rdb,fp);
+
+ if (server.rdb_save_incremental_fsync)
+ rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);
+
if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) {
errno = error;
goto werr;
@@ -1100,40 +1335,25 @@ werr:
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
pid_t childpid;
- long long start;
- if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
+ if (hasActiveChildProcess()) return C_ERR;
server.dirty_before_bgsave = server.dirty;
server.lastbgsave_try = time(NULL);
openChildInfoPipe();
- start = ustime();
- if ((childpid = fork()) == 0) {
+ if ((childpid = redisFork()) == 0) {
int retval;
/* Child */
- closeListeningSockets(0);
redisSetProcTitle("redis-rdb-bgsave");
retval = rdbSave(filename,rsi);
if (retval == C_OK) {
- size_t private_dirty = zmalloc_get_private_dirty(-1);
-
- if (private_dirty) {
- serverLog(LL_NOTICE,
- "RDB: %zu MB of memory used by copy-on-write",
- private_dirty/(1024*1024));
- }
-
- server.child_info_data.cow_size = private_dirty;
- sendChildInfo(CHILD_INFO_TYPE_RDB);
+ sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB");
}
exitFromChild((retval == C_OK) ? 0 : 1);
} else {
/* Parent */
- server.stat_fork_time = ustime()-start;
- server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
- latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
if (childpid == -1) {
closeChildInfoPipe();
server.lastbgsave_status = C_ERR;
@@ -1145,7 +1365,6 @@ int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
server.rdb_save_time_start = time(NULL);
server.rdb_child_pid = childpid;
server.rdb_child_type = RDB_CHILD_TYPE_DISK;
- updateDictResizePolicy();
return C_OK;
}
return C_OK; /* unreached */
@@ -1199,7 +1418,7 @@ robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename) {
/* Load a Redis object of the specified type from the specified file.
* On success a newly allocated object is returned, otherwise NULL. */
-robj *rdbLoadObject(int rdbtype, rio *rdb) {
+robj *rdbLoadObject(int rdbtype, rio *rdb, robj *key) {
robj *o = NULL, *ele, *dec;
uint64_t len;
unsigned int i;
@@ -1276,6 +1495,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
o = createZsetObject();
zs = o->ptr;
+ if (zsetlen > DICT_HT_INITIAL_SIZE)
+ dictExpand(zs->dict,zsetlen);
+
/* Load every single element of the sorted set. */
while(zsetlen--) {
sds sdsele;
@@ -1344,6 +1566,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
sdsfree(value);
}
+ if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE)
+ dictExpand(o->ptr,len);
+
/* Load remaining fields and values into the hash table */
while (o->encoding == OBJ_ENCODING_HT && len > 0) {
len--;
@@ -1445,6 +1670,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
hashTypeConvert(o, OBJ_ENCODING_HT);
break;
default:
+ /* totally unreachable */
rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
break;
}
@@ -1452,12 +1678,22 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
o = createStreamObject();
stream *s = o->ptr;
uint64_t listpacks = rdbLoadLen(rdb,NULL);
+ if (listpacks == RDB_LENERR) {
+ rdbReportReadError("Stream listpacks len loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
while(listpacks--) {
/* Get the master ID, the one we'll use as key of the radix tree
* node: the entries inside the listpack itself are delta-encoded
* relatively to this ID. */
sds nodekey = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
+ if (nodekey == NULL) {
+ rdbReportReadError("Stream master ID loading failed: invalid encoding or I/O error.");
+ decrRefCount(o);
+ return NULL;
+ }
if (sdslen(nodekey) != sizeof(streamID)) {
rdbExitReportCorruptRDB("Stream node key entry is not the "
"size of a stream ID");
@@ -1466,12 +1702,17 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
/* Load the listpack. */
unsigned char *lp =
rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
- if (lp == NULL) return NULL;
+ if (lp == NULL) {
+ rdbReportReadError("Stream listpacks loading failed.");
+ sdsfree(nodekey);
+ decrRefCount(o);
+ return NULL;
+ }
unsigned char *first = lpFirst(lp);
if (first == NULL) {
/* Serialized listpacks should never be empty, since on
* deletion we should remove the radix tree key if the
- * resulting listpack is emtpy. */
+ * resulting listpack is empty. */
rdbExitReportCorruptRDB("Empty listpack inside stream");
}
@@ -1484,16 +1725,153 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
}
/* Load total number of items inside the stream. */
s->length = rdbLoadLen(rdb,NULL);
+
/* Load the last entry ID. */
s->last_id.ms = rdbLoadLen(rdb,NULL);
s->last_id.seq = rdbLoadLen(rdb,NULL);
+
+ if (rioGetReadError(rdb)) {
+ rdbReportReadError("Stream object metadata loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+
+ /* Consumer groups loading */
+ uint64_t cgroups_count = rdbLoadLen(rdb,NULL);
+ if (cgroups_count == RDB_LENERR) {
+ rdbReportReadError("Stream cgroup count loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+ while(cgroups_count--) {
+ /* Get the consumer group name and ID. We can then create the
+ * consumer group ASAP and populate its structure as
+ * we read more data. */
+ streamID cg_id;
+ sds cgname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
+ if (cgname == NULL) {
+ rdbReportReadError(
+ "Error reading the consumer group name from Stream");
+ decrRefCount(o);
+ return NULL;
+ }
+
+ cg_id.ms = rdbLoadLen(rdb,NULL);
+ cg_id.seq = rdbLoadLen(rdb,NULL);
+ if (rioGetReadError(rdb)) {
+ rdbReportReadError("Stream cgroup ID loading failed.");
+ sdsfree(cgname);
+ decrRefCount(o);
+ return NULL;
+ }
+
+ streamCG *cgroup = streamCreateCG(s,cgname,sdslen(cgname),&cg_id);
+ if (cgroup == NULL)
+ rdbExitReportCorruptRDB("Duplicated consumer group name %s",
+ cgname);
+ sdsfree(cgname);
+
+ /* Load the global PEL for this consumer group, however we'll
+ * not yet populate the NACK structures with the message
+ * owner, since consumers for this group and their messages will
+ * be read as a next step. So for now leave them not resolved
+ * and later populate it. */
+ uint64_t pel_size = rdbLoadLen(rdb,NULL);
+ if (pel_size == RDB_LENERR) {
+ rdbReportReadError("Stream PEL size loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+ while(pel_size--) {
+ unsigned char rawid[sizeof(streamID)];
+ if (rioRead(rdb,rawid,sizeof(rawid)) == 0) {
+ rdbReportReadError("Stream PEL ID loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+ streamNACK *nack = streamCreateNACK(NULL);
+ nack->delivery_time = rdbLoadMillisecondTime(rdb,RDB_VERSION);
+ nack->delivery_count = rdbLoadLen(rdb,NULL);
+ if (rioGetReadError(rdb)) {
+ rdbReportReadError("Stream PEL NACK loading failed.");
+ decrRefCount(o);
+ streamFreeNACK(nack);
+ return NULL;
+ }
+ if (!raxInsert(cgroup->pel,rawid,sizeof(rawid),nack,NULL))
+ rdbExitReportCorruptRDB("Duplicated gobal PEL entry "
+ "loading stream consumer group");
+ }
+
+ /* Now that we loaded our global PEL, we need to load the
+ * consumers and their local PELs. */
+ uint64_t consumers_num = rdbLoadLen(rdb,NULL);
+ if (consumers_num == RDB_LENERR) {
+ rdbReportReadError("Stream consumers num loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+ while(consumers_num--) {
+ sds cname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
+ if (cname == NULL) {
+ rdbReportReadError(
+ "Error reading the consumer name from Stream group.");
+ decrRefCount(o);
+ return NULL;
+ }
+ streamConsumer *consumer = streamLookupConsumer(cgroup,cname,
+ 1);
+ sdsfree(cname);
+ consumer->seen_time = rdbLoadMillisecondTime(rdb,RDB_VERSION);
+ if (rioGetReadError(rdb)) {
+ rdbReportReadError("Stream short read reading seen time.");
+ decrRefCount(o);
+ return NULL;
+ }
+
+ /* Load the PEL about entries owned by this specific
+ * consumer. */
+ pel_size = rdbLoadLen(rdb,NULL);
+ if (pel_size == RDB_LENERR) {
+ rdbReportReadError(
+ "Stream consumer PEL num loading failed.");
+ decrRefCount(o);
+ return NULL;
+ }
+ while(pel_size--) {
+ unsigned char rawid[sizeof(streamID)];
+ if (rioRead(rdb,rawid,sizeof(rawid)) == 0) {
+ rdbReportReadError(
+ "Stream short read reading PEL streamID.");
+ decrRefCount(o);
+ return NULL;
+ }
+ streamNACK *nack = raxFind(cgroup->pel,rawid,sizeof(rawid));
+ if (nack == raxNotFound)
+ rdbExitReportCorruptRDB("Consumer entry not found in "
+ "group global PEL");
+
+ /* Set the NACK consumer, that was left to NULL when
+ * loading the global PEL. Then set the same shared
+ * NACK structure also in the consumer-specific PEL. */
+ nack->consumer = consumer;
+ if (!raxInsert(consumer->pel,rawid,sizeof(rawid),nack,NULL))
+ rdbExitReportCorruptRDB("Duplicated consumer PEL entry "
+ " loading a stream consumer "
+ "group");
+ }
+ }
+ }
} else if (rdbtype == RDB_TYPE_MODULE || rdbtype == RDB_TYPE_MODULE_2) {
uint64_t moduleid = rdbLoadLen(rdb,NULL);
+ if (rioGetReadError(rdb)) return NULL;
moduleType *mt = moduleTypeLookupModuleByID(moduleid);
char name[10];
- if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2)
+ if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2) {
+ moduleTypeNameByID(name,moduleid);
return rdbLoadCheckModuleValue(rdb,name);
+ }
if (mt == NULL) {
moduleTypeNameByID(name,moduleid);
@@ -1501,7 +1879,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
exit(1);
}
RedisModuleIO io;
- moduleInitIOContext(io,mt,rdb);
+ moduleInitIOContext(io,mt,rdb,key);
io.ver = (rdbtype == RDB_TYPE_MODULE) ? 1 : 2;
/* Call the rdb_load method of the module providing the 10 bit
* encoding version in the lower 10 bits of the module ID. */
@@ -1514,6 +1892,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
/* Module v2 serialization has an EOF mark at the end. */
if (io.ver == 2) {
uint64_t eof = rdbLoadLen(rdb,NULL);
+ if (eof == RDB_LENERR) {
+ o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */
+ decrRefCount(o);
+ return NULL;
+ }
if (eof != RDB_MODULE_OPCODE_EOF) {
serverLog(LL_WARNING,"The RDB file contains module data for the module '%s' that is not terminated by the proper module value EOF marker", name);
exit(1);
@@ -1527,25 +1910,31 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) {
}
o = createModuleObject(mt,ptr);
} else {
- rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
+ rdbReportReadError("Unknown RDB encoding type %d",rdbtype);
+ return NULL;
}
return o;
}
/* Mark that we are loading in the global state and setup the fields
* needed to provide loading stats. */
-void startLoading(FILE *fp) {
- struct stat sb;
-
+void startLoading(size_t size) {
/* Load the DB */
server.loading = 1;
server.loading_start_time = time(NULL);
server.loading_loaded_bytes = 0;
- if (fstat(fileno(fp), &sb) == -1) {
- server.loading_total_bytes = 0;
- } else {
- server.loading_total_bytes = sb.st_size;
- }
+ server.loading_total_bytes = size;
+}
+
+/* Mark that we are loading in the global state and setup the fields
+ * needed to provide loading stats.
+ * 'filename' is optional and used for rdb-check on error */
+void startLoadingFile(FILE *fp, char* filename) {
+ struct stat sb;
+ if (fstat(fileno(fp), &sb) == -1)
+ sb.st_size = 0;
+ rdbFileBeingLoaded = filename;
+ startLoading(sb.st_size);
}
/* Refresh the loading progress info */
@@ -1558,6 +1947,7 @@ void loadingProgress(off_t pos) {
/* Loading finished */
void stopLoading(void) {
server.loading = 0;
+ rdbFileBeingLoaded = NULL;
}
/* Track loading progress in order to serve client's from time to time
@@ -1581,12 +1971,11 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
/* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned,
* otherwise C_ERR is returned and 'errno' is set accordingly. */
-int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
+int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) {
uint64_t dbid;
int type, rdbver;
redisDb *db = server.db+0;
char buf[1024];
- long long expiretime, now = mstime();
rdb->update_cksum = rdbLoadProgressCallback;
rdb->max_processing_chunk = server.loading_process_events_interval_bytes;
@@ -1604,9 +1993,12 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
return C_ERR;
}
+ /* Key-specific attributes, set by opcodes before the key type. */
+ long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now = mstime();
+ long long lru_clock = LRU_CLOCK();
+
while(1) {
robj *key, *val;
- expiretime = -1;
/* Read type. */
if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
@@ -1616,25 +2008,34 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
/* EXPIRETIME: load an expire associated with the next key
* to load. Note that after loading an expire we need to
* load the actual type, and continue. */
- if ((expiretime = rdbLoadTime(rdb)) == -1) goto eoferr;
- /* We read the time so we need to read the object type again. */
- if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
- /* the EXPIRETIME opcode specifies time in seconds, so convert
- * into milliseconds. */
+ expiretime = rdbLoadTime(rdb);
expiretime *= 1000;
+ if (rioGetReadError(rdb)) goto eoferr;
+ continue; /* Read next opcode. */
} else if (type == RDB_OPCODE_EXPIRETIME_MS) {
/* EXPIRETIME_MS: milliseconds precision expire times introduced
* with RDB v3. Like EXPIRETIME but no with more precision. */
- if ((expiretime = rdbLoadMillisecondTime(rdb)) == -1) goto eoferr;
- /* We read the time so we need to read the object type again. */
- if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
+ expiretime = rdbLoadMillisecondTime(rdb,rdbver);
+ if (rioGetReadError(rdb)) goto eoferr;
+ continue; /* Read next opcode. */
+ } else if (type == RDB_OPCODE_FREQ) {
+ /* FREQ: LFU frequency. */
+ uint8_t byte;
+ if (rioRead(rdb,&byte,1) == 0) goto eoferr;
+ lfu_freq = byte;
+ continue; /* Read next opcode. */
+ } else if (type == RDB_OPCODE_IDLE) {
+ /* IDLE: LRU idle time. */
+ uint64_t qword;
+ if ((qword = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr;
+ lru_idle = qword;
+ continue; /* Read next opcode. */
} else if (type == RDB_OPCODE_EOF) {
/* EOF: End of file, exit the main loop. */
break;
} else if (type == RDB_OPCODE_SELECTDB) {
/* SELECTDB: Select the specified database. */
- if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
- goto eoferr;
+ if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr;
if (dbid >= (unsigned)server.dbnum) {
serverLog(LL_WARNING,
"FATAL: Data file was created with a Redis "
@@ -1643,7 +2044,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
exit(1);
}
db = server.db+dbid;
- continue; /* Read type again. */
+ continue; /* Read next opcode. */
} else if (type == RDB_OPCODE_RESIZEDB) {
/* RESIZEDB: Hint about the size of the keys in the currently
* selected data base, in order to avoid useless rehashing. */
@@ -1654,7 +2055,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
goto eoferr;
dictExpand(db->dict,db_size);
dictExpand(db->expires,expires_size);
- continue; /* Read type again. */
+ continue; /* Read next opcode. */
} else if (type == RDB_OPCODE_AUX) {
/* AUX: generic string-string fields. Use to add state to RDB
* which is backward compatible. Implementations of RDB loading
@@ -1688,6 +2089,23 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
"Can't load Lua script from RDB file! "
"BODY: %s", auxval->ptr);
}
+ } else if (!strcasecmp(auxkey->ptr,"redis-ver")) {
+ serverLog(LL_NOTICE,"Loading RDB produced by version %s",
+ (char*)auxval->ptr);
+ } else if (!strcasecmp(auxkey->ptr,"ctime")) {
+ time_t age = time(NULL)-strtol(auxval->ptr,NULL,10);
+ if (age < 0) age = 0;
+ serverLog(LL_NOTICE,"RDB age %ld seconds",
+ (unsigned long) age);
+ } else if (!strcasecmp(auxkey->ptr,"used-mem")) {
+ long long usedmem = strtoll(auxval->ptr,NULL,10);
+ serverLog(LL_NOTICE,"RDB memory usage when created %.2f Mb",
+ (double) usedmem / (1024*1024));
+ } else if (!strcasecmp(auxkey->ptr,"aof-preamble")) {
+ long long haspreamble = strtoll(auxval->ptr,NULL,10);
+ if (haspreamble) serverLog(LL_NOTICE,"RDB has an AOF tail");
+ } else if (!strcasecmp(auxkey->ptr,"redis-bits")) {
+ /* Just ignored. */
} else {
/* We ignore fields we don't understand, as by AUX field
* contract. */
@@ -1698,49 +2116,120 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
decrRefCount(auxkey);
decrRefCount(auxval);
continue; /* Read type again. */
+ } else if (type == RDB_OPCODE_MODULE_AUX) {
+ /* Load module data that is not related to the Redis key space.
+ * Such data can be potentially be stored both before and after the
+ * RDB keys-values section. */
+ uint64_t moduleid = rdbLoadLen(rdb,NULL);
+ int when_opcode = rdbLoadLen(rdb,NULL);
+ int when = rdbLoadLen(rdb,NULL);
+ if (rioGetReadError(rdb)) goto eoferr;
+ if (when_opcode != RDB_MODULE_OPCODE_UINT)
+ rdbReportReadError("bad when_opcode");
+ moduleType *mt = moduleTypeLookupModuleByID(moduleid);
+ char name[10];
+ moduleTypeNameByID(name,moduleid);
+
+ if (!rdbCheckMode && mt == NULL) {
+ /* Unknown module. */
+ serverLog(LL_WARNING,"The RDB file contains AUX module data I can't load: no matching module '%s'", name);
+ exit(1);
+ } else if (!rdbCheckMode && mt != NULL) {
+ if (!mt->aux_load) {
+ /* Module doesn't support AUX. */
+ serverLog(LL_WARNING,"The RDB file contains module AUX data, but the module '%s' doesn't seem to support it.", name);
+ exit(1);
+ }
+
+ RedisModuleIO io;
+ moduleInitIOContext(io,mt,rdb,NULL);
+ io.ver = 2;
+ /* Call the rdb_load method of the module providing the 10 bit
+ * encoding version in the lower 10 bits of the module ID. */
+ if (mt->aux_load(&io,moduleid&1023, when) || io.error) {
+ moduleTypeNameByID(name,moduleid);
+ serverLog(LL_WARNING,"The RDB file contains module AUX data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name);
+ exit(1);
+ }
+ if (io.ctx) {
+ moduleFreeContext(io.ctx);
+ zfree(io.ctx);
+ }
+ uint64_t eof = rdbLoadLen(rdb,NULL);
+ if (eof != RDB_MODULE_OPCODE_EOF) {
+ serverLog(LL_WARNING,"The RDB file contains module AUX data for the module '%s' that is not terminated by the proper module value EOF marker", name);
+ exit(1);
+ }
+ continue;
+ } else {
+ /* RDB check mode. */
+ robj *aux = rdbLoadCheckModuleValue(rdb,name);
+ decrRefCount(aux);
+ continue; /* Read next opcode. */
+ }
}
/* Read key */
if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
/* Read value */
- if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr;
+ if ((val = rdbLoadObject(type,rdb,key)) == NULL) goto eoferr;
/* Check if the key already expired. This function is used when loading
* an RDB file from disk, either at startup, or when an RDB was
* received from the master. In the latter case, the master is
* responsible for key expiry. If we would expire keys here, the
* snapshot taken by the master may not be reflected on the slave. */
- if (server.masterhost == NULL && expiretime != -1 && expiretime < now) {
+ if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) {
decrRefCount(key);
decrRefCount(val);
- continue;
- }
- /* Add the new object in the hash table */
- dbAdd(db,key,val);
+ } else {
+ /* Add the new object in the hash table */
+ dbAdd(db,key,val);
+
+ /* Set the expire time if needed */
+ if (expiretime != -1) setExpire(NULL,db,key,expiretime);
+
+ /* Set usage information (for eviction). */
+ objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock);
- /* Set the expire time if needed */
- if (expiretime != -1) setExpire(NULL,db,key,expiretime);
+ /* Decrement the key refcount since dbAdd() will take its
+ * own reference. */
+ decrRefCount(key);
+ }
+ if (server.key_load_delay)
+ usleep(server.key_load_delay);
- decrRefCount(key);
+ /* Reset the state that is key-specified and is populated by
+ * opcodes before the key, so that we start from scratch again. */
+ expiretime = -1;
+ lfu_freq = -1;
+ lru_idle = -1;
}
/* Verify the checksum if RDB version is >= 5 */
- if (rdbver >= 5 && server.rdb_checksum) {
+ if (rdbver >= 5) {
uint64_t cksum, expected = rdb->cksum;
if (rioRead(rdb,&cksum,8) == 0) goto eoferr;
- memrev64ifbe(&cksum);
- if (cksum == 0) {
- serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed.");
- } else if (cksum != expected) {
- serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now.");
- rdbExitReportCorruptRDB("RDB CRC error");
+ if (server.rdb_checksum) {
+ memrev64ifbe(&cksum);
+ if (cksum == 0) {
+ serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed.");
+ } else if (cksum != expected) {
+ serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now.");
+ rdbExitReportCorruptRDB("RDB CRC error");
+ }
}
}
return C_OK;
-eoferr: /* unexpected end of file is handled here with a fatal exit */
- serverLog(LL_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
- rdbExitReportCorruptRDB("Unexpected EOF reading RDB file");
- return C_ERR; /* Just to avoid warning */
+ /* Unexpected end of file is handled here calling rdbReportReadError():
+ * this will in turn either abort Redis in most cases, or if we are loading
+ * the RDB file from a socket during initial SYNC (diskless replica mode),
+ * we'll report the error to the caller, so that we can retry. */
+eoferr:
+ serverLog(LL_WARNING,
+ "Short read or OOM loading DB. Unrecoverable error, aborting now.");
+ rdbReportReadError("Unexpected EOF reading RDB file");
+ return C_ERR;
}
/* Like rdbLoadRio() but takes a filename instead of a rio stream. The
@@ -1756,9 +2245,9 @@ int rdbLoad(char *filename, rdbSaveInfo *rsi) {
int retval;
if ((fp = fopen(filename,"r")) == NULL) return C_ERR;
- startLoading(fp);
+ startLoadingFile(fp, filename);
rioInitWithFile(&rdb,fp);
- retval = rdbLoadRio(&rdb,rsi);
+ retval = rdbLoadRio(&rdb,rsi,0);
fclose(fp);
stopLoading();
return retval;
@@ -1786,7 +2275,7 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
/* SIGUSR1 is whitelisted, so we have a way to kill a child without
- * tirggering an error conditon. */
+ * tirggering an error condition. */
if (bysignal != SIGUSR1)
server.lastbgsave_status = C_ERR;
}
@@ -1803,8 +2292,6 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
* This function covers the case of RDB -> Slaves socket transfers for
* diskless replication. */
void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
- uint64_t *ok_slaves;
-
if (!bysignal && exitcode == 0) {
serverLog(LL_NOTICE,
"Background RDB transfer terminated with success");
@@ -1818,79 +2305,6 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
server.rdb_child_type = RDB_CHILD_TYPE_NONE;
server.rdb_save_time_start = -1;
- /* If the child returns an OK exit code, read the set of slave client
- * IDs and the associated status code. We'll terminate all the slaves
- * in error state.
- *
- * If the process returned an error, consider the list of slaves that
- * can continue to be emtpy, so that it's just a special case of the
- * normal code path. */
- ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */
- ok_slaves[0] = 0;
- if (!bysignal && exitcode == 0) {
- int readlen = sizeof(uint64_t);
-
- if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) ==
- readlen)
- {
- readlen = ok_slaves[0]*sizeof(uint64_t)*2;
-
- /* Make space for enough elements as specified by the first
- * uint64_t element in the array. */
- ok_slaves = zrealloc(ok_slaves,sizeof(uint64_t)+readlen);
- if (readlen &&
- read(server.rdb_pipe_read_result_from_child, ok_slaves+1,
- readlen) != readlen)
- {
- ok_slaves[0] = 0;
- }
- }
- }
-
- close(server.rdb_pipe_read_result_from_child);
- close(server.rdb_pipe_write_result_to_parent);
-
- /* We can continue the replication process with all the slaves that
- * correctly received the full payload. Others are terminated. */
- listNode *ln;
- listIter li;
-
- listRewind(server.slaves,&li);
- while((ln = listNext(&li))) {
- client *slave = ln->value;
-
- if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
- uint64_t j;
- int errorcode = 0;
-
- /* Search for the slave ID in the reply. In order for a slave to
- * continue the replication process, we need to find it in the list,
- * and it must have an error code set to 0 (which means success). */
- for (j = 0; j < ok_slaves[0]; j++) {
- if (slave->id == ok_slaves[2*j+1]) {
- errorcode = ok_slaves[2*j+2];
- break; /* Found in slaves list. */
- }
- }
- if (j == ok_slaves[0] || errorcode != 0) {
- serverLog(LL_WARNING,
- "Closing slave %s: child->slave RDB transfer failed: %s",
- replicationGetSlaveName(slave),
- (errorcode == 0) ? "RDB transfer child aborted"
- : strerror(errorcode));
- freeClient(slave);
- } else {
- serverLog(LL_WARNING,
- "Slave %s correctly received the streamed RDB file.",
- replicationGetSlaveName(slave));
- /* Restore the socket as non-blocking. */
- anetNonBlock(NULL,slave->fd);
- anetSendTimeout(NULL,slave->fd,0);
- }
- }
- }
- zfree(ok_slaves);
-
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET);
}
@@ -1909,123 +2323,74 @@ void backgroundSaveDoneHandler(int exitcode, int bysignal) {
}
}
+/* Kill the RDB saving child using SIGUSR1 (so that the parent will know
+ * the child did not exit for an error, but because we wanted), and performs
+ * the cleanup needed. */
+void killRDBChild(void) {
+ kill(server.rdb_child_pid,SIGUSR1);
+ rdbRemoveTempFile(server.rdb_child_pid);
+ closeChildInfoPipe();
+ updateDictResizePolicy();
+}
+
/* Spawn an RDB child that writes the RDB to the sockets of the slaves
* that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */
int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
- int *fds;
- uint64_t *clientids;
- int numfds;
listNode *ln;
listIter li;
pid_t childpid;
- long long start;
int pipefds[2];
- if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
+ if (hasActiveChildProcess()) return C_ERR;
+
+ /* Even if the previous fork child exited, don't start a new one until we
+ * drained the pipe. */
+ if (server.rdb_pipe_conns) return C_ERR;
- /* Before to fork, create a pipe that will be used in order to
- * send back to the parent the IDs of the slaves that successfully
- * received all the writes. */
+ /* Before to fork, create a pipe that is used to transfer the rdb bytes to
+ * the parent, we can't let it write directly to the sockets, since in case
+ * of TLS we must let the parent handle a continuous TLS state when the
+ * child terminates and parent takes over. */
if (pipe(pipefds) == -1) return C_ERR;
- server.rdb_pipe_read_result_from_child = pipefds[0];
- server.rdb_pipe_write_result_to_parent = pipefds[1];
+ server.rdb_pipe_read = pipefds[0];
+ server.rdb_pipe_write = pipefds[1];
+ anetNonBlock(NULL, server.rdb_pipe_read);
- /* Collect the file descriptors of the slaves we want to transfer
+ /* Collect the connections of the replicas we want to transfer
* the RDB to, which are i WAIT_BGSAVE_START state. */
- fds = zmalloc(sizeof(int)*listLength(server.slaves));
- /* We also allocate an array of corresponding client IDs. This will
- * be useful for the child process in order to build the report
- * (sent via unix pipe) that will be sent to the parent. */
- clientids = zmalloc(sizeof(uint64_t)*listLength(server.slaves));
- numfds = 0;
-
+ server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves));
+ server.rdb_pipe_numconns = 0;
+ server.rdb_pipe_numconns_writing = 0;
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
-
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
- clientids[numfds] = slave->id;
- fds[numfds++] = slave->fd;
+ server.rdb_pipe_conns[server.rdb_pipe_numconns++] = slave->conn;
replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset());
- /* Put the socket in blocking mode to simplify RDB transfer.
- * We'll restore it when the children returns (since duped socket
- * will share the O_NONBLOCK attribute with the parent). */
- anetBlock(NULL,slave->fd);
- anetSendTimeout(NULL,slave->fd,server.repl_timeout*1000);
}
}
/* Create the child process. */
openChildInfoPipe();
- start = ustime();
- if ((childpid = fork()) == 0) {
+ if ((childpid = redisFork()) == 0) {
/* Child */
int retval;
- rio slave_sockets;
+ rio rdb;
- rioInitWithFdset(&slave_sockets,fds,numfds);
- zfree(fds);
+ rioInitWithFd(&rdb,server.rdb_pipe_write);
- closeListeningSockets(0);
redisSetProcTitle("redis-rdb-to-slaves");
- retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi);
- if (retval == C_OK && rioFlush(&slave_sockets) == 0)
+ retval = rdbSaveRioWithEOFMark(&rdb,NULL,rsi);
+ if (retval == C_OK && rioFlush(&rdb) == 0)
retval = C_ERR;
if (retval == C_OK) {
- size_t private_dirty = zmalloc_get_private_dirty(-1);
-
- if (private_dirty) {
- serverLog(LL_NOTICE,
- "RDB: %zu MB of memory used by copy-on-write",
- private_dirty/(1024*1024));
- }
-
- server.child_info_data.cow_size = private_dirty;
- sendChildInfo(CHILD_INFO_TYPE_RDB);
-
- /* If we are returning OK, at least one slave was served
- * with the RDB file as expected, so we need to send a report
- * to the parent via the pipe. The format of the message is:
- *
- * <len> <slave[0].id> <slave[0].error> ...
- *
- * len, slave IDs, and slave errors, are all uint64_t integers,
- * so basically the reply is composed of 64 bits for the len field
- * plus 2 additional 64 bit integers for each entry, for a total
- * of 'len' entries.
- *
- * The 'id' represents the slave's client ID, so that the master
- * can match the report with a specific slave, and 'error' is
- * set to 0 if the replication process terminated with a success
- * or the error code if an error occurred. */
- void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds));
- uint64_t *len = msg;
- uint64_t *ids = len+1;
- int j, msglen;
-
- *len = numfds;
- for (j = 0; j < numfds; j++) {
- *ids++ = clientids[j];
- *ids++ = slave_sockets.io.fdset.state[j];
- }
-
- /* Write the message to the parent. If we have no good slaves or
- * we are unable to transfer the message to the parent, we exit
- * with an error so that the parent will abort the replication
- * process with all the childre that were waiting. */
- msglen = sizeof(uint64_t)*(1+2*numfds);
- if (*len == 0 ||
- write(server.rdb_pipe_write_result_to_parent,msg,msglen)
- != msglen)
- {
- retval = C_ERR;
- }
- zfree(msg);
+ sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB");
}
- zfree(clientids);
- rioFreeFdset(&slave_sockets);
+
+ rioFreeFd(&rdb);
+ close(server.rdb_pipe_write); /* wake up the reader, tell it we're done. */
exitFromChild((retval == C_OK) ? 0 : 1);
} else {
/* Parent */
@@ -2039,32 +2404,28 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
- int j;
-
- for (j = 0; j < numfds; j++) {
- if (slave->id == clientids[j]) {
- slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
- break;
- }
+ if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
+ slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
}
}
- close(pipefds[0]);
- close(pipefds[1]);
+ close(server.rdb_pipe_write);
+ close(server.rdb_pipe_read);
+ zfree(server.rdb_pipe_conns);
+ server.rdb_pipe_conns = NULL;
+ server.rdb_pipe_numconns = 0;
+ server.rdb_pipe_numconns_writing = 0;
closeChildInfoPipe();
} else {
- server.stat_fork_time = ustime()-start;
- server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
- latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
-
serverLog(LL_NOTICE,"Background RDB transfer started by pid %d",
childpid);
server.rdb_save_time_start = time(NULL);
server.rdb_child_pid = childpid;
server.rdb_child_type = RDB_CHILD_TYPE_SOCKET;
- updateDictResizePolicy();
+ close(server.rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */
+ if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) {
+ serverPanic("Unrecoverable error creating server.rdb_pipe_read file event.");
+ }
}
- zfree(clientids);
- zfree(fds);
return (childpid == -1) ? C_ERR : C_OK;
}
return C_OK; /* Unreached. */
@@ -2104,15 +2465,15 @@ void bgsaveCommand(client *c) {
if (server.rdb_child_pid != -1) {
addReplyError(c,"Background save already in progress");
- } else if (server.aof_child_pid != -1) {
+ } else if (hasActiveChildProcess()) {
if (schedule) {
server.rdb_bgsave_scheduled = 1;
addReplyStatus(c,"Background saving scheduled");
} else {
addReplyError(c,
- "An AOF log rewriting in progress: can't BGSAVE right now. "
- "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
- "possible.");
+ "Another child process is active (AOF?): can't BGSAVE right now. "
+ "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
+ "possible.");
}
} else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {
addReplyStatus(c,"Background saving started");