diff options
author | Wander Hillen <wjw.hillen@gmail.com> | 2019-10-25 10:18:26 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-25 10:18:26 +0200 |
commit | fb1f4f4e7e4a7a1fd5366afe5581ac6613047faf (patch) | |
tree | 7a8736f75b6c5c96d53d256161e86f0645d8840f /src/rdb.c | |
parent | dda8cc182117cd5b164a524a1246e8be33d08f7e (diff) | |
parent | 6e98214f740eb04c9761ccb983ba51988ee35bc2 (diff) | |
download | redis-fb1f4f4e7e4a7a1fd5366afe5581ac6613047faf.tar.gz |
Merge branch 'unstable' into minor-typos
Diffstat (limited to 'src/rdb.c')
-rw-r--r-- | src/rdb.c | 945 |
1 files changed, 653 insertions, 292 deletions
@@ -42,30 +42,41 @@ #include <sys/stat.h> #include <sys/param.h> -#define rdbExitReportCorruptRDB(...) rdbCheckThenExit(__LINE__,__VA_ARGS__) +/* This macro is called when the internal RDB stracture is corrupt */ +#define rdbExitReportCorruptRDB(...) rdbReportError(1, __LINE__,__VA_ARGS__) +/* This macro is called when RDB read failed (possibly a short read) */ +#define rdbReportReadError(...) rdbReportError(0, __LINE__,__VA_ARGS__) +char* rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */ extern int rdbCheckMode; void rdbCheckError(const char *fmt, ...); void rdbCheckSetError(const char *fmt, ...); -void rdbCheckThenExit(int linenum, char *reason, ...) { +void rdbReportError(int corruption_error, int linenum, char *reason, ...) { va_list ap; char msg[1024]; int len; len = snprintf(msg,sizeof(msg), - "Internal error in RDB reading function at rdb.c:%d -> ", linenum); + "Internal error in RDB reading offset %llu, function at rdb.c:%d -> ", + (unsigned long long)server.loading_loaded_bytes, linenum); va_start(ap,reason); vsnprintf(msg+len,sizeof(msg)-len,reason,ap); va_end(ap); if (!rdbCheckMode) { - serverLog(LL_WARNING, "%s", msg); - char *argv[2] = {"",server.rdb_filename}; - redis_check_rdb_main(2,argv,NULL); + if (rdbFileBeingLoaded || corruption_error) { + serverLog(LL_WARNING, "%s", msg); + char *argv[2] = {"",rdbFileBeingLoaded}; + redis_check_rdb_main(2,argv,NULL); + } else { + serverLog(LL_WARNING, "%s. Failure loading rdb format from socket, assuming connection error, resuming operation.", msg); + return; + } } else { rdbCheckError("%s",msg); } + serverLog(LL_WARNING, "Terminating server after rdb file reading failure."); exit(1); } @@ -88,6 +99,11 @@ int rdbLoadType(rio *rdb) { return type; } +/* This is only used to load old databases stored with the RDB_OPCODE_EXPIRETIME + * opcode. New versions of Redis store using the RDB_OPCODE_EXPIRETIME_MS + * opcode. On error -1 is returned, however this could be a valid time, so + * to check for loading errors the caller should call rioGetReadError() after + * calling this function. */ time_t rdbLoadTime(rio *rdb) { int32_t t32; if (rioRead(rdb,&t32,4) == 0) return -1; @@ -96,12 +112,30 @@ time_t rdbLoadTime(rio *rdb) { int rdbSaveMillisecondTime(rio *rdb, long long t) { int64_t t64 = (int64_t) t; + memrev64ifbe(&t64); /* Store in little endian. */ return rdbWriteRaw(rdb,&t64,8); } -long long rdbLoadMillisecondTime(rio *rdb) { +/* This function loads a time from the RDB file. It gets the version of the + * RDB because, unfortunately, before Redis 5 (RDB version 9), the function + * failed to convert data to/from little endian, so RDB files with keys having + * expires could not be shared between big endian and little endian systems + * (because the expire time will be totally wrong). The fix for this is just + * to call memrev64ifbe(), however if we fix this for all the RDB versions, + * this call will introduce an incompatibility for big endian systems: + * after upgrading to Redis version 5 they will no longer be able to load their + * own old RDB files. Because of that, we instead fix the function only for new + * RDB versions, and load older RDB versions as we used to do in the past, + * allowing big endian systems to load their own old RDB files. + * + * On I/O error the function returns LLONG_MAX, however if this is also a + * valid stored value, the caller should use rioGetReadError() to check for + * errors after calling this function. */ +long long rdbLoadMillisecondTime(rio *rdb, int rdbver) { int64_t t64; - if (rioRead(rdb,&t64,8) == 0) return -1; + if (rioRead(rdb,&t64,8) == 0) return LLONG_MAX; + if (rdbver >= 9) /* Check the top comment of this function. */ + memrev64ifbe(&t64); /* Convert in big endian if the system is BE. */ return (long long)t64; } @@ -226,7 +260,7 @@ int rdbEncodeInteger(long long value, unsigned char *enc) { /* Loads an integer-encoded object with the specified encoding type "enctype". * The returned value changes according to the flags, see - * rdbGenerincLoadStringObject() for more info. */ + * rdbGenericLoadStringObject() for more info. */ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { int plain = flags & RDB_LOAD_PLAIN; int sds = flags & RDB_LOAD_SDS; @@ -248,8 +282,8 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); val = (int32_t)v; } else { - val = 0; /* anti-warning */ rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); + return NULL; /* Never reached. */ } if (plain || sds) { char buf[LONG_STR_SIZE], *p; @@ -259,7 +293,7 @@ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { memcpy(p,buf,len); return p; } else if (encode) { - return createStringObjectFromLongLong(val); + return createStringObjectFromLongLongForValue(val); } else { return createObject(OBJ_STRING,sdsfromlonglong(val)); } @@ -352,8 +386,7 @@ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { /* Load the compressed representation and uncompress it to target. */ if (rioRead(rdb,c,clen) == 0) goto err; if (lzf_decompress(c,clen,val,len) == 0) { - if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string"); - goto err; + rdbExitReportCorruptRDB("Invalid LZF compressed string"); } zfree(c); @@ -467,6 +500,7 @@ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { return rdbLoadLzfStringObject(rdb,flags,lenptr); default: rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len); + return NULL; /* Never reached. */ } } @@ -642,8 +676,87 @@ int rdbLoadObjectType(rio *rdb) { return type; } -/* Save a Redis object. Returns -1 on error, number of bytes written on success. */ -ssize_t rdbSaveObject(rio *rdb, robj *o) { +/* This helper function serializes a consumer group Pending Entries List (PEL) + * into the RDB file. The 'nacks' argument tells the function if also persist + * the informations about the not acknowledged message, or if to persist + * just the IDs: this is useful because for the global consumer group PEL + * we serialized the NACKs as well, but when serializing the local consumer + * PELs we just add the ID, that will be resolved inside the global PEL to + * put a reference to the same structure. */ +ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, int nacks) { + ssize_t n, nwritten = 0; + + /* Number of entries in the PEL. */ + if ((n = rdbSaveLen(rdb,raxSize(pel))) == -1) return -1; + nwritten += n; + + /* Save each entry. */ + raxIterator ri; + raxStart(&ri,pel); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + /* We store IDs in raw form as 128 big big endian numbers, like + * they are inside the radix tree key. */ + if ((n = rdbWriteRaw(rdb,ri.key,sizeof(streamID))) == -1) return -1; + nwritten += n; + + if (nacks) { + streamNACK *nack = ri.data; + if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) + return -1; + nwritten += n; + if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) return -1; + nwritten += n; + /* We don't save the consumer name: we'll save the pending IDs + * for each consumer in the consumer PEL, and resolve the consumer + * at loading time. */ + } + } + raxStop(&ri); + return nwritten; +} + +/* Serialize the consumers of a stream consumer group into the RDB. Helper + * function for the stream data type serialization. What we do here is to + * persist the consumer metadata, and it's PEL, for each consumer. */ +size_t rdbSaveStreamConsumers(rio *rdb, streamCG *cg) { + ssize_t n, nwritten = 0; + + /* Number of consumers in this consumer group. */ + if ((n = rdbSaveLen(rdb,raxSize(cg->consumers))) == -1) return -1; + nwritten += n; + + /* Save each consumer. */ + raxIterator ri; + raxStart(&ri,cg->consumers); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + streamConsumer *consumer = ri.data; + + /* Consumer name. */ + if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) return -1; + nwritten += n; + + /* Last seen time. */ + if ((n = rdbSaveMillisecondTime(rdb,consumer->seen_time)) == -1) + return -1; + nwritten += n; + + /* Consumer PEL, without the ACKs (see last parameter of the function + * passed with value of 0), at loading time we'll lookup the ID + * in the consumer group global PEL and will put a reference in the + * consumer local PEL. */ + if ((n = rdbSaveStreamPEL(rdb,consumer->pel,0)) == -1) + return -1; + nwritten += n; + } + raxStop(&ri); + return nwritten; +} + +/* Save a Redis object. + * Returns -1 on error, number of bytes written on success. */ +ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key) { ssize_t n = 0, nwritten = 0; if (o->type == OBJ_STRING) { @@ -681,13 +794,20 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { dictIterator *di = dictGetIterator(set); dictEntry *de; - if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) return -1; + if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) { + dictReleaseIterator(di); + return -1; + } nwritten += n; while((de = dictNext(di)) != NULL) { sds ele = dictGetKey(de); if ((n = rdbSaveRawString(rdb,(unsigned char*)ele,sdslen(ele))) - == -1) return -1; + == -1) + { + dictReleaseIterator(di); + return -1; + } nwritten += n; } dictReleaseIterator(di); @@ -747,7 +867,10 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { dictIterator *di = dictGetIterator(o->ptr); dictEntry *de; - if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) return -1; + if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) { + dictReleaseIterator(di); + return -1; + } nwritten += n; while((de = dictNext(di)) != NULL) { @@ -755,10 +878,18 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { sds value = dictGetVal(de); if ((n = rdbSaveRawString(rdb,(unsigned char*)field, - sdslen(field))) == -1) return -1; + sdslen(field))) == -1) + { + dictReleaseIterator(di); + return -1; + } nwritten += n; if ((n = rdbSaveRawString(rdb,(unsigned char*)value, - sdslen(value))) == -1) return -1; + sdslen(value))) == -1) + { + dictReleaseIterator(di); + return -1; + } nwritten += n; } dictReleaseIterator(di); @@ -798,12 +929,48 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { nwritten += n; if ((n = rdbSaveLen(rdb,s->last_id.seq)) == -1) return -1; nwritten += n; + + /* The consumer groups and their clients are part of the stream + * type, so serialize every consumer group. */ + + /* Save the number of groups. */ + size_t num_cgroups = s->cgroups ? raxSize(s->cgroups) : 0; + if ((n = rdbSaveLen(rdb,num_cgroups)) == -1) return -1; + nwritten += n; + + if (num_cgroups) { + /* Serialize each consumer group. */ + raxStart(&ri,s->cgroups); + raxSeek(&ri,"^",NULL,0); + while(raxNext(&ri)) { + streamCG *cg = ri.data; + + /* Save the group name. */ + if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) + return -1; + nwritten += n; + + /* Last ID. */ + if ((n = rdbSaveLen(rdb,cg->last_id.ms)) == -1) return -1; + nwritten += n; + if ((n = rdbSaveLen(rdb,cg->last_id.seq)) == -1) return -1; + nwritten += n; + + /* Save the global PEL. */ + if ((n = rdbSaveStreamPEL(rdb,cg->pel,1)) == -1) return -1; + nwritten += n; + + /* Save the consumers of this group. */ + if ((n = rdbSaveStreamConsumers(rdb,cg)) == -1) return -1; + nwritten += n; + } + raxStop(&ri); + } } else if (o->type == OBJ_MODULE) { /* Save a module-specific value. */ RedisModuleIO io; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitIOContext(io,mt,rdb); /* Write the "module" identifier as prefix, so that we'll be able * to call the right module during loading. */ @@ -812,10 +979,13 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { io.bytes += retval; /* Then write the module-specific representation + EOF marker. */ + moduleInitIOContext(io,mt,rdb,key); mt->rdb_save(&io,mv->value); retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF); - if (retval == -1) return -1; - io.bytes += retval; + if (retval == -1) + io.error = 1; + else + io.bytes += retval; if (io.ctx) { moduleFreeContext(io.ctx); @@ -833,7 +1003,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { * this length with very little changes to the code. In the future * we could switch to a faster solution. */ size_t rdbSavedObjectLen(robj *o) { - ssize_t len = rdbSaveObject(NULL,o); + ssize_t len = rdbSaveObject(NULL,o,NULL); serverAssertWithInfo(NULL,o,len != -1); return len; } @@ -842,21 +1012,45 @@ size_t rdbSavedObjectLen(robj *o) { * On error -1 is returned. * On success if the key was actually saved 1 is returned, otherwise 0 * is returned (the key was already expired). */ -int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, - long long expiretime, long long now) -{ +int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) { + int savelru = server.maxmemory_policy & MAXMEMORY_FLAG_LRU; + int savelfu = server.maxmemory_policy & MAXMEMORY_FLAG_LFU; + /* Save the expire time */ if (expiretime != -1) { - /* If this key is already expired skip it */ - if (expiretime < now) return 0; if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1; if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1; } + /* Save the LRU info. */ + if (savelru) { + uint64_t idletime = estimateObjectIdleTime(val); + idletime /= 1000; /* Using seconds is enough and requires less space.*/ + if (rdbSaveType(rdb,RDB_OPCODE_IDLE) == -1) return -1; + if (rdbSaveLen(rdb,idletime) == -1) return -1; + } + + /* Save the LFU info. */ + if (savelfu) { + uint8_t buf[1]; + buf[0] = LFUDecrAndReturn(val); + /* We can encode this in exactly two bytes: the opcode and an 8 + * bit counter, since the frequency is logarithmic with a 0-255 range. + * Note that we do not store the halving time because to reset it + * a single time when loading does not affect the frequency much. */ + if (rdbSaveType(rdb,RDB_OPCODE_FREQ) == -1) return -1; + if (rdbWriteRaw(rdb,buf,1) == -1) return -1; + } + /* Save type, key, value */ if (rdbSaveObjectType(rdb,val) == -1) return -1; if (rdbSaveStringObject(rdb,key) == -1) return -1; - if (rdbSaveObject(rdb,val) == -1) return -1; + if (rdbSaveObject(rdb,val,key) == -1) return -1; + + /* Delay return if required (for testing) */ + if (server.rdb_key_save_delay) + usleep(server.rdb_key_save_delay); + return 1; } @@ -909,6 +1103,45 @@ int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) { return 1; } +ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) { + /* Save a module-specific aux value. */ + RedisModuleIO io; + int retval = rdbSaveType(rdb, RDB_OPCODE_MODULE_AUX); + + /* Write the "module" identifier as prefix, so that we'll be able + * to call the right module during loading. */ + retval = rdbSaveLen(rdb,mt->id); + if (retval == -1) return -1; + io.bytes += retval; + + /* write the 'when' so that we can provide it on loading. add a UINT opcode + * for backwards compatibility, everything after the MT needs to be prefixed + * by an opcode. */ + retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_UINT); + if (retval == -1) return -1; + io.bytes += retval; + retval = rdbSaveLen(rdb,when); + if (retval == -1) return -1; + io.bytes += retval; + + /* Then write the module-specific representation + EOF marker. */ + moduleInitIOContext(io,mt,rdb,NULL); + mt->aux_save(&io,when); + retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF); + if (retval == -1) + io.error = 1; + else + io.bytes += retval; + + if (io.ctx) { + moduleFreeContext(io.ctx); + zfree(io.ctx); + } + if (io.error) + return -1; + return io.bytes; +} + /* Produces a dump of the database in RDB format sending it to the specified * Redis I/O channel. On success C_OK is returned, otherwise C_ERR * is returned and part of the output, or all the output, can be @@ -922,7 +1155,6 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { dictEntry *de; char magic[10]; int j; - long long now = mstime(); uint64_t cksum; size_t processed = 0; @@ -931,13 +1163,13 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION); if (rdbWriteRaw(rdb,magic,9) == -1) goto werr; if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr; + if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr; for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; di = dictGetSafeIterator(d); - if (!di) return C_ERR; /* Write the SELECT DB opcode */ if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr; @@ -947,13 +1179,9 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { * is currently the largest type we are able to represent in RDB sizes. * However this does not limit the actual size of the DB to load since * these sizes are just hints to resize the hash tables. */ - uint32_t db_size, expires_size; - db_size = (dictSize(db->dict) <= UINT32_MAX) ? - dictSize(db->dict) : - UINT32_MAX; - expires_size = (dictSize(db->expires) <= UINT32_MAX) ? - dictSize(db->expires) : - UINT32_MAX; + uint64_t db_size, expires_size; + db_size = dictSize(db->dict); + expires_size = dictSize(db->expires); if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr; if (rdbSaveLen(rdb,db_size) == -1) goto werr; if (rdbSaveLen(rdb,expires_size) == -1) goto werr; @@ -966,7 +1194,7 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { initStaticStringObject(key,keystr); expire = getExpire(db,&key); - if (rdbSaveKeyValuePair(rdb,&key,o,expire,now) == -1) goto werr; + if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr; /* When this RDB is produced as part of an AOF rewrite, move * accumulated diff from parent to child while rewriting in @@ -979,8 +1207,8 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { } } dictReleaseIterator(di); + di = NULL; /* So that we don't release it again on error. */ } - di = NULL; /* So that we don't release it again on error. */ /* If we are storing the replication information on disk, persist * the script cache as well: on successful PSYNC after a restart, we need @@ -994,8 +1222,11 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { goto werr; } dictReleaseIterator(di); + di = NULL; /* So that we don't release it again on error. */ } + if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr; + /* EOF opcode */ if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr; @@ -1060,6 +1291,10 @@ int rdbSave(char *filename, rdbSaveInfo *rsi) { } rioInitWithFile(&rdb,fp); + + if (server.rdb_save_incremental_fsync) + rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES); + if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) { errno = error; goto werr; @@ -1100,40 +1335,25 @@ werr: int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) { pid_t childpid; - long long start; - if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR; + if (hasActiveChildProcess()) return C_ERR; server.dirty_before_bgsave = server.dirty; server.lastbgsave_try = time(NULL); openChildInfoPipe(); - start = ustime(); - if ((childpid = fork()) == 0) { + if ((childpid = redisFork()) == 0) { int retval; /* Child */ - closeListeningSockets(0); redisSetProcTitle("redis-rdb-bgsave"); retval = rdbSave(filename,rsi); if (retval == C_OK) { - size_t private_dirty = zmalloc_get_private_dirty(-1); - - if (private_dirty) { - serverLog(LL_NOTICE, - "RDB: %zu MB of memory used by copy-on-write", - private_dirty/(1024*1024)); - } - - server.child_info_data.cow_size = private_dirty; - sendChildInfo(CHILD_INFO_TYPE_RDB); + sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB"); } exitFromChild((retval == C_OK) ? 0 : 1); } else { /* Parent */ - server.stat_fork_time = ustime()-start; - server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */ - latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000); if (childpid == -1) { closeChildInfoPipe(); server.lastbgsave_status = C_ERR; @@ -1145,7 +1365,6 @@ int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) { server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_DISK; - updateDictResizePolicy(); return C_OK; } return C_OK; /* unreached */ @@ -1199,7 +1418,7 @@ robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename) { /* Load a Redis object of the specified type from the specified file. * On success a newly allocated object is returned, otherwise NULL. */ -robj *rdbLoadObject(int rdbtype, rio *rdb) { +robj *rdbLoadObject(int rdbtype, rio *rdb, robj *key) { robj *o = NULL, *ele, *dec; uint64_t len; unsigned int i; @@ -1276,6 +1495,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { o = createZsetObject(); zs = o->ptr; + if (zsetlen > DICT_HT_INITIAL_SIZE) + dictExpand(zs->dict,zsetlen); + /* Load every single element of the sorted set. */ while(zsetlen--) { sds sdsele; @@ -1344,6 +1566,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { sdsfree(value); } + if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE) + dictExpand(o->ptr,len); + /* Load remaining fields and values into the hash table */ while (o->encoding == OBJ_ENCODING_HT && len > 0) { len--; @@ -1445,6 +1670,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { hashTypeConvert(o, OBJ_ENCODING_HT); break; default: + /* totally unreachable */ rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype); break; } @@ -1452,12 +1678,22 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { o = createStreamObject(); stream *s = o->ptr; uint64_t listpacks = rdbLoadLen(rdb,NULL); + if (listpacks == RDB_LENERR) { + rdbReportReadError("Stream listpacks len loading failed."); + decrRefCount(o); + return NULL; + } while(listpacks--) { /* Get the master ID, the one we'll use as key of the radix tree * node: the entries inside the listpack itself are delta-encoded * relatively to this ID. */ sds nodekey = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL); + if (nodekey == NULL) { + rdbReportReadError("Stream master ID loading failed: invalid encoding or I/O error."); + decrRefCount(o); + return NULL; + } if (sdslen(nodekey) != sizeof(streamID)) { rdbExitReportCorruptRDB("Stream node key entry is not the " "size of a stream ID"); @@ -1466,12 +1702,17 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* Load the listpack. */ unsigned char *lp = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL); - if (lp == NULL) return NULL; + if (lp == NULL) { + rdbReportReadError("Stream listpacks loading failed."); + sdsfree(nodekey); + decrRefCount(o); + return NULL; + } unsigned char *first = lpFirst(lp); if (first == NULL) { /* Serialized listpacks should never be empty, since on * deletion we should remove the radix tree key if the - * resulting listpack is emtpy. */ + * resulting listpack is empty. */ rdbExitReportCorruptRDB("Empty listpack inside stream"); } @@ -1484,16 +1725,153 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { } /* Load total number of items inside the stream. */ s->length = rdbLoadLen(rdb,NULL); + /* Load the last entry ID. */ s->last_id.ms = rdbLoadLen(rdb,NULL); s->last_id.seq = rdbLoadLen(rdb,NULL); + + if (rioGetReadError(rdb)) { + rdbReportReadError("Stream object metadata loading failed."); + decrRefCount(o); + return NULL; + } + + /* Consumer groups loading */ + uint64_t cgroups_count = rdbLoadLen(rdb,NULL); + if (cgroups_count == RDB_LENERR) { + rdbReportReadError("Stream cgroup count loading failed."); + decrRefCount(o); + return NULL; + } + while(cgroups_count--) { + /* Get the consumer group name and ID. We can then create the + * consumer group ASAP and populate its structure as + * we read more data. */ + streamID cg_id; + sds cgname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL); + if (cgname == NULL) { + rdbReportReadError( + "Error reading the consumer group name from Stream"); + decrRefCount(o); + return NULL; + } + + cg_id.ms = rdbLoadLen(rdb,NULL); + cg_id.seq = rdbLoadLen(rdb,NULL); + if (rioGetReadError(rdb)) { + rdbReportReadError("Stream cgroup ID loading failed."); + sdsfree(cgname); + decrRefCount(o); + return NULL; + } + + streamCG *cgroup = streamCreateCG(s,cgname,sdslen(cgname),&cg_id); + if (cgroup == NULL) + rdbExitReportCorruptRDB("Duplicated consumer group name %s", + cgname); + sdsfree(cgname); + + /* Load the global PEL for this consumer group, however we'll + * not yet populate the NACK structures with the message + * owner, since consumers for this group and their messages will + * be read as a next step. So for now leave them not resolved + * and later populate it. */ + uint64_t pel_size = rdbLoadLen(rdb,NULL); + if (pel_size == RDB_LENERR) { + rdbReportReadError("Stream PEL size loading failed."); + decrRefCount(o); + return NULL; + } + while(pel_size--) { + unsigned char rawid[sizeof(streamID)]; + if (rioRead(rdb,rawid,sizeof(rawid)) == 0) { + rdbReportReadError("Stream PEL ID loading failed."); + decrRefCount(o); + return NULL; + } + streamNACK *nack = streamCreateNACK(NULL); + nack->delivery_time = rdbLoadMillisecondTime(rdb,RDB_VERSION); + nack->delivery_count = rdbLoadLen(rdb,NULL); + if (rioGetReadError(rdb)) { + rdbReportReadError("Stream PEL NACK loading failed."); + decrRefCount(o); + streamFreeNACK(nack); + return NULL; + } + if (!raxInsert(cgroup->pel,rawid,sizeof(rawid),nack,NULL)) + rdbExitReportCorruptRDB("Duplicated gobal PEL entry " + "loading stream consumer group"); + } + + /* Now that we loaded our global PEL, we need to load the + * consumers and their local PELs. */ + uint64_t consumers_num = rdbLoadLen(rdb,NULL); + if (consumers_num == RDB_LENERR) { + rdbReportReadError("Stream consumers num loading failed."); + decrRefCount(o); + return NULL; + } + while(consumers_num--) { + sds cname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL); + if (cname == NULL) { + rdbReportReadError( + "Error reading the consumer name from Stream group."); + decrRefCount(o); + return NULL; + } + streamConsumer *consumer = streamLookupConsumer(cgroup,cname, + 1); + sdsfree(cname); + consumer->seen_time = rdbLoadMillisecondTime(rdb,RDB_VERSION); + if (rioGetReadError(rdb)) { + rdbReportReadError("Stream short read reading seen time."); + decrRefCount(o); + return NULL; + } + + /* Load the PEL about entries owned by this specific + * consumer. */ + pel_size = rdbLoadLen(rdb,NULL); + if (pel_size == RDB_LENERR) { + rdbReportReadError( + "Stream consumer PEL num loading failed."); + decrRefCount(o); + return NULL; + } + while(pel_size--) { + unsigned char rawid[sizeof(streamID)]; + if (rioRead(rdb,rawid,sizeof(rawid)) == 0) { + rdbReportReadError( + "Stream short read reading PEL streamID."); + decrRefCount(o); + return NULL; + } + streamNACK *nack = raxFind(cgroup->pel,rawid,sizeof(rawid)); + if (nack == raxNotFound) + rdbExitReportCorruptRDB("Consumer entry not found in " + "group global PEL"); + + /* Set the NACK consumer, that was left to NULL when + * loading the global PEL. Then set the same shared + * NACK structure also in the consumer-specific PEL. */ + nack->consumer = consumer; + if (!raxInsert(consumer->pel,rawid,sizeof(rawid),nack,NULL)) + rdbExitReportCorruptRDB("Duplicated consumer PEL entry " + " loading a stream consumer " + "group"); + } + } + } } else if (rdbtype == RDB_TYPE_MODULE || rdbtype == RDB_TYPE_MODULE_2) { uint64_t moduleid = rdbLoadLen(rdb,NULL); + if (rioGetReadError(rdb)) return NULL; moduleType *mt = moduleTypeLookupModuleByID(moduleid); char name[10]; - if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2) + if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2) { + moduleTypeNameByID(name,moduleid); return rdbLoadCheckModuleValue(rdb,name); + } if (mt == NULL) { moduleTypeNameByID(name,moduleid); @@ -1501,7 +1879,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { exit(1); } RedisModuleIO io; - moduleInitIOContext(io,mt,rdb); + moduleInitIOContext(io,mt,rdb,key); io.ver = (rdbtype == RDB_TYPE_MODULE) ? 1 : 2; /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. */ @@ -1514,6 +1892,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { /* Module v2 serialization has an EOF mark at the end. */ if (io.ver == 2) { uint64_t eof = rdbLoadLen(rdb,NULL); + if (eof == RDB_LENERR) { + o = createModuleObject(mt,ptr); /* creating just in order to easily destroy */ + decrRefCount(o); + return NULL; + } if (eof != RDB_MODULE_OPCODE_EOF) { serverLog(LL_WARNING,"The RDB file contains module data for the module '%s' that is not terminated by the proper module value EOF marker", name); exit(1); @@ -1527,25 +1910,31 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { } o = createModuleObject(mt,ptr); } else { - rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype); + rdbReportReadError("Unknown RDB encoding type %d",rdbtype); + return NULL; } return o; } /* Mark that we are loading in the global state and setup the fields * needed to provide loading stats. */ -void startLoading(FILE *fp) { - struct stat sb; - +void startLoading(size_t size) { /* Load the DB */ server.loading = 1; server.loading_start_time = time(NULL); server.loading_loaded_bytes = 0; - if (fstat(fileno(fp), &sb) == -1) { - server.loading_total_bytes = 0; - } else { - server.loading_total_bytes = sb.st_size; - } + server.loading_total_bytes = size; +} + +/* Mark that we are loading in the global state and setup the fields + * needed to provide loading stats. + * 'filename' is optional and used for rdb-check on error */ +void startLoadingFile(FILE *fp, char* filename) { + struct stat sb; + if (fstat(fileno(fp), &sb) == -1) + sb.st_size = 0; + rdbFileBeingLoaded = filename; + startLoading(sb.st_size); } /* Refresh the loading progress info */ @@ -1558,6 +1947,7 @@ void loadingProgress(off_t pos) { /* Loading finished */ void stopLoading(void) { server.loading = 0; + rdbFileBeingLoaded = NULL; } /* Track loading progress in order to serve client's from time to time @@ -1581,12 +1971,11 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned, * otherwise C_ERR is returned and 'errno' is set accordingly. */ -int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { +int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) { uint64_t dbid; int type, rdbver; redisDb *db = server.db+0; char buf[1024]; - long long expiretime, now = mstime(); rdb->update_cksum = rdbLoadProgressCallback; rdb->max_processing_chunk = server.loading_process_events_interval_bytes; @@ -1604,9 +1993,12 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { return C_ERR; } + /* Key-specific attributes, set by opcodes before the key type. */ + long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now = mstime(); + long long lru_clock = LRU_CLOCK(); + while(1) { robj *key, *val; - expiretime = -1; /* Read type. */ if ((type = rdbLoadType(rdb)) == -1) goto eoferr; @@ -1616,25 +2008,34 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { /* EXPIRETIME: load an expire associated with the next key * to load. Note that after loading an expire we need to * load the actual type, and continue. */ - if ((expiretime = rdbLoadTime(rdb)) == -1) goto eoferr; - /* We read the time so we need to read the object type again. */ - if ((type = rdbLoadType(rdb)) == -1) goto eoferr; - /* the EXPIRETIME opcode specifies time in seconds, so convert - * into milliseconds. */ + expiretime = rdbLoadTime(rdb); expiretime *= 1000; + if (rioGetReadError(rdb)) goto eoferr; + continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_EXPIRETIME_MS) { /* EXPIRETIME_MS: milliseconds precision expire times introduced * with RDB v3. Like EXPIRETIME but no with more precision. */ - if ((expiretime = rdbLoadMillisecondTime(rdb)) == -1) goto eoferr; - /* We read the time so we need to read the object type again. */ - if ((type = rdbLoadType(rdb)) == -1) goto eoferr; + expiretime = rdbLoadMillisecondTime(rdb,rdbver); + if (rioGetReadError(rdb)) goto eoferr; + continue; /* Read next opcode. */ + } else if (type == RDB_OPCODE_FREQ) { + /* FREQ: LFU frequency. */ + uint8_t byte; + if (rioRead(rdb,&byte,1) == 0) goto eoferr; + lfu_freq = byte; + continue; /* Read next opcode. */ + } else if (type == RDB_OPCODE_IDLE) { + /* IDLE: LRU idle time. */ + uint64_t qword; + if ((qword = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; + lru_idle = qword; + continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_EOF) { /* EOF: End of file, exit the main loop. */ break; } else if (type == RDB_OPCODE_SELECTDB) { /* SELECTDB: Select the specified database. */ - if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR) - goto eoferr; + if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr; if (dbid >= (unsigned)server.dbnum) { serverLog(LL_WARNING, "FATAL: Data file was created with a Redis " @@ -1643,7 +2044,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { exit(1); } db = server.db+dbid; - continue; /* Read type again. */ + continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_RESIZEDB) { /* RESIZEDB: Hint about the size of the keys in the currently * selected data base, in order to avoid useless rehashing. */ @@ -1654,7 +2055,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { goto eoferr; dictExpand(db->dict,db_size); dictExpand(db->expires,expires_size); - continue; /* Read type again. */ + continue; /* Read next opcode. */ } else if (type == RDB_OPCODE_AUX) { /* AUX: generic string-string fields. Use to add state to RDB * which is backward compatible. Implementations of RDB loading @@ -1688,6 +2089,23 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { "Can't load Lua script from RDB file! " "BODY: %s", auxval->ptr); } + } else if (!strcasecmp(auxkey->ptr,"redis-ver")) { + serverLog(LL_NOTICE,"Loading RDB produced by version %s", + (char*)auxval->ptr); + } else if (!strcasecmp(auxkey->ptr,"ctime")) { + time_t age = time(NULL)-strtol(auxval->ptr,NULL,10); + if (age < 0) age = 0; + serverLog(LL_NOTICE,"RDB age %ld seconds", + (unsigned long) age); + } else if (!strcasecmp(auxkey->ptr,"used-mem")) { + long long usedmem = strtoll(auxval->ptr,NULL,10); + serverLog(LL_NOTICE,"RDB memory usage when created %.2f Mb", + (double) usedmem / (1024*1024)); + } else if (!strcasecmp(auxkey->ptr,"aof-preamble")) { + long long haspreamble = strtoll(auxval->ptr,NULL,10); + if (haspreamble) serverLog(LL_NOTICE,"RDB has an AOF tail"); + } else if (!strcasecmp(auxkey->ptr,"redis-bits")) { + /* Just ignored. */ } else { /* We ignore fields we don't understand, as by AUX field * contract. */ @@ -1698,49 +2116,120 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { decrRefCount(auxkey); decrRefCount(auxval); continue; /* Read type again. */ + } else if (type == RDB_OPCODE_MODULE_AUX) { + /* Load module data that is not related to the Redis key space. + * Such data can be potentially be stored both before and after the + * RDB keys-values section. */ + uint64_t moduleid = rdbLoadLen(rdb,NULL); + int when_opcode = rdbLoadLen(rdb,NULL); + int when = rdbLoadLen(rdb,NULL); + if (rioGetReadError(rdb)) goto eoferr; + if (when_opcode != RDB_MODULE_OPCODE_UINT) + rdbReportReadError("bad when_opcode"); + moduleType *mt = moduleTypeLookupModuleByID(moduleid); + char name[10]; + moduleTypeNameByID(name,moduleid); + + if (!rdbCheckMode && mt == NULL) { + /* Unknown module. */ + serverLog(LL_WARNING,"The RDB file contains AUX module data I can't load: no matching module '%s'", name); + exit(1); + } else if (!rdbCheckMode && mt != NULL) { + if (!mt->aux_load) { + /* Module doesn't support AUX. */ + serverLog(LL_WARNING,"The RDB file contains module AUX data, but the module '%s' doesn't seem to support it.", name); + exit(1); + } + + RedisModuleIO io; + moduleInitIOContext(io,mt,rdb,NULL); + io.ver = 2; + /* Call the rdb_load method of the module providing the 10 bit + * encoding version in the lower 10 bits of the module ID. */ + if (mt->aux_load(&io,moduleid&1023, when) || io.error) { + moduleTypeNameByID(name,moduleid); + serverLog(LL_WARNING,"The RDB file contains module AUX data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name); + exit(1); + } + if (io.ctx) { + moduleFreeContext(io.ctx); + zfree(io.ctx); + } + uint64_t eof = rdbLoadLen(rdb,NULL); + if (eof != RDB_MODULE_OPCODE_EOF) { + serverLog(LL_WARNING,"The RDB file contains module AUX data for the module '%s' that is not terminated by the proper module value EOF marker", name); + exit(1); + } + continue; + } else { + /* RDB check mode. */ + robj *aux = rdbLoadCheckModuleValue(rdb,name); + decrRefCount(aux); + continue; /* Read next opcode. */ + } } /* Read key */ if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr; /* Read value */ - if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr; + if ((val = rdbLoadObject(type,rdb,key)) == NULL) goto eoferr; /* Check if the key already expired. This function is used when loading * an RDB file from disk, either at startup, or when an RDB was * received from the master. In the latter case, the master is * responsible for key expiry. If we would expire keys here, the * snapshot taken by the master may not be reflected on the slave. */ - if (server.masterhost == NULL && expiretime != -1 && expiretime < now) { + if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) { decrRefCount(key); decrRefCount(val); - continue; - } - /* Add the new object in the hash table */ - dbAdd(db,key,val); + } else { + /* Add the new object in the hash table */ + dbAdd(db,key,val); + + /* Set the expire time if needed */ + if (expiretime != -1) setExpire(NULL,db,key,expiretime); + + /* Set usage information (for eviction). */ + objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock); - /* Set the expire time if needed */ - if (expiretime != -1) setExpire(NULL,db,key,expiretime); + /* Decrement the key refcount since dbAdd() will take its + * own reference. */ + decrRefCount(key); + } + if (server.key_load_delay) + usleep(server.key_load_delay); - decrRefCount(key); + /* Reset the state that is key-specified and is populated by + * opcodes before the key, so that we start from scratch again. */ + expiretime = -1; + lfu_freq = -1; + lru_idle = -1; } /* Verify the checksum if RDB version is >= 5 */ - if (rdbver >= 5 && server.rdb_checksum) { + if (rdbver >= 5) { uint64_t cksum, expected = rdb->cksum; if (rioRead(rdb,&cksum,8) == 0) goto eoferr; - memrev64ifbe(&cksum); - if (cksum == 0) { - serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed."); - } else if (cksum != expected) { - serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now."); - rdbExitReportCorruptRDB("RDB CRC error"); + if (server.rdb_checksum) { + memrev64ifbe(&cksum); + if (cksum == 0) { + serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed."); + } else if (cksum != expected) { + serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now."); + rdbExitReportCorruptRDB("RDB CRC error"); + } } } return C_OK; -eoferr: /* unexpected end of file is handled here with a fatal exit */ - serverLog(LL_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now."); - rdbExitReportCorruptRDB("Unexpected EOF reading RDB file"); - return C_ERR; /* Just to avoid warning */ + /* Unexpected end of file is handled here calling rdbReportReadError(): + * this will in turn either abort Redis in most cases, or if we are loading + * the RDB file from a socket during initial SYNC (diskless replica mode), + * we'll report the error to the caller, so that we can retry. */ +eoferr: + serverLog(LL_WARNING, + "Short read or OOM loading DB. Unrecoverable error, aborting now."); + rdbReportReadError("Unexpected EOF reading RDB file"); + return C_ERR; } /* Like rdbLoadRio() but takes a filename instead of a rio stream. The @@ -1756,9 +2245,9 @@ int rdbLoad(char *filename, rdbSaveInfo *rsi) { int retval; if ((fp = fopen(filename,"r")) == NULL) return C_ERR; - startLoading(fp); + startLoadingFile(fp, filename); rioInitWithFile(&rdb,fp); - retval = rdbLoadRio(&rdb,rsi); + retval = rdbLoadRio(&rdb,rsi,0); fclose(fp); stopLoading(); return retval; @@ -1786,7 +2275,7 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) { latencyEndMonitor(latency); latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency); /* SIGUSR1 is whitelisted, so we have a way to kill a child without - * tirggering an error conditon. */ + * tirggering an error condition. */ if (bysignal != SIGUSR1) server.lastbgsave_status = C_ERR; } @@ -1803,8 +2292,6 @@ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) { * This function covers the case of RDB -> Slaves socket transfers for * diskless replication. */ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { - uint64_t *ok_slaves; - if (!bysignal && exitcode == 0) { serverLog(LL_NOTICE, "Background RDB transfer terminated with success"); @@ -1818,79 +2305,6 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) { server.rdb_child_type = RDB_CHILD_TYPE_NONE; server.rdb_save_time_start = -1; - /* If the child returns an OK exit code, read the set of slave client - * IDs and the associated status code. We'll terminate all the slaves - * in error state. - * - * If the process returned an error, consider the list of slaves that - * can continue to be emtpy, so that it's just a special case of the - * normal code path. */ - ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */ - ok_slaves[0] = 0; - if (!bysignal && exitcode == 0) { - int readlen = sizeof(uint64_t); - - if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) == - readlen) - { - readlen = ok_slaves[0]*sizeof(uint64_t)*2; - - /* Make space for enough elements as specified by the first - * uint64_t element in the array. */ - ok_slaves = zrealloc(ok_slaves,sizeof(uint64_t)+readlen); - if (readlen && - read(server.rdb_pipe_read_result_from_child, ok_slaves+1, - readlen) != readlen) - { - ok_slaves[0] = 0; - } - } - } - - close(server.rdb_pipe_read_result_from_child); - close(server.rdb_pipe_write_result_to_parent); - - /* We can continue the replication process with all the slaves that - * correctly received the full payload. Others are terminated. */ - listNode *ln; - listIter li; - - listRewind(server.slaves,&li); - while((ln = listNext(&li))) { - client *slave = ln->value; - - if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { - uint64_t j; - int errorcode = 0; - - /* Search for the slave ID in the reply. In order for a slave to - * continue the replication process, we need to find it in the list, - * and it must have an error code set to 0 (which means success). */ - for (j = 0; j < ok_slaves[0]; j++) { - if (slave->id == ok_slaves[2*j+1]) { - errorcode = ok_slaves[2*j+2]; - break; /* Found in slaves list. */ - } - } - if (j == ok_slaves[0] || errorcode != 0) { - serverLog(LL_WARNING, - "Closing slave %s: child->slave RDB transfer failed: %s", - replicationGetSlaveName(slave), - (errorcode == 0) ? "RDB transfer child aborted" - : strerror(errorcode)); - freeClient(slave); - } else { - serverLog(LL_WARNING, - "Slave %s correctly received the streamed RDB file.", - replicationGetSlaveName(slave)); - /* Restore the socket as non-blocking. */ - anetNonBlock(NULL,slave->fd); - anetSendTimeout(NULL,slave->fd,0); - } - } - } - zfree(ok_slaves); - updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET); } @@ -1909,123 +2323,74 @@ void backgroundSaveDoneHandler(int exitcode, int bysignal) { } } +/* Kill the RDB saving child using SIGUSR1 (so that the parent will know + * the child did not exit for an error, but because we wanted), and performs + * the cleanup needed. */ +void killRDBChild(void) { + kill(server.rdb_child_pid,SIGUSR1); + rdbRemoveTempFile(server.rdb_child_pid); + closeChildInfoPipe(); + updateDictResizePolicy(); +} + /* Spawn an RDB child that writes the RDB to the sockets of the slaves * that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { - int *fds; - uint64_t *clientids; - int numfds; listNode *ln; listIter li; pid_t childpid; - long long start; int pipefds[2]; - if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR; + if (hasActiveChildProcess()) return C_ERR; + + /* Even if the previous fork child exited, don't start a new one until we + * drained the pipe. */ + if (server.rdb_pipe_conns) return C_ERR; - /* Before to fork, create a pipe that will be used in order to - * send back to the parent the IDs of the slaves that successfully - * received all the writes. */ + /* Before to fork, create a pipe that is used to transfer the rdb bytes to + * the parent, we can't let it write directly to the sockets, since in case + * of TLS we must let the parent handle a continuous TLS state when the + * child terminates and parent takes over. */ if (pipe(pipefds) == -1) return C_ERR; - server.rdb_pipe_read_result_from_child = pipefds[0]; - server.rdb_pipe_write_result_to_parent = pipefds[1]; + server.rdb_pipe_read = pipefds[0]; + server.rdb_pipe_write = pipefds[1]; + anetNonBlock(NULL, server.rdb_pipe_read); - /* Collect the file descriptors of the slaves we want to transfer + /* Collect the connections of the replicas we want to transfer * the RDB to, which are i WAIT_BGSAVE_START state. */ - fds = zmalloc(sizeof(int)*listLength(server.slaves)); - /* We also allocate an array of corresponding client IDs. This will - * be useful for the child process in order to build the report - * (sent via unix pipe) that will be sent to the parent. */ - clientids = zmalloc(sizeof(uint64_t)*listLength(server.slaves)); - numfds = 0; - + server.rdb_pipe_conns = zmalloc(sizeof(connection *)*listLength(server.slaves)); + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = ln->value; - if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) { - clientids[numfds] = slave->id; - fds[numfds++] = slave->fd; + server.rdb_pipe_conns[server.rdb_pipe_numconns++] = slave->conn; replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset()); - /* Put the socket in blocking mode to simplify RDB transfer. - * We'll restore it when the children returns (since duped socket - * will share the O_NONBLOCK attribute with the parent). */ - anetBlock(NULL,slave->fd); - anetSendTimeout(NULL,slave->fd,server.repl_timeout*1000); } } /* Create the child process. */ openChildInfoPipe(); - start = ustime(); - if ((childpid = fork()) == 0) { + if ((childpid = redisFork()) == 0) { /* Child */ int retval; - rio slave_sockets; + rio rdb; - rioInitWithFdset(&slave_sockets,fds,numfds); - zfree(fds); + rioInitWithFd(&rdb,server.rdb_pipe_write); - closeListeningSockets(0); redisSetProcTitle("redis-rdb-to-slaves"); - retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi); - if (retval == C_OK && rioFlush(&slave_sockets) == 0) + retval = rdbSaveRioWithEOFMark(&rdb,NULL,rsi); + if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR; if (retval == C_OK) { - size_t private_dirty = zmalloc_get_private_dirty(-1); - - if (private_dirty) { - serverLog(LL_NOTICE, - "RDB: %zu MB of memory used by copy-on-write", - private_dirty/(1024*1024)); - } - - server.child_info_data.cow_size = private_dirty; - sendChildInfo(CHILD_INFO_TYPE_RDB); - - /* If we are returning OK, at least one slave was served - * with the RDB file as expected, so we need to send a report - * to the parent via the pipe. The format of the message is: - * - * <len> <slave[0].id> <slave[0].error> ... - * - * len, slave IDs, and slave errors, are all uint64_t integers, - * so basically the reply is composed of 64 bits for the len field - * plus 2 additional 64 bit integers for each entry, for a total - * of 'len' entries. - * - * The 'id' represents the slave's client ID, so that the master - * can match the report with a specific slave, and 'error' is - * set to 0 if the replication process terminated with a success - * or the error code if an error occurred. */ - void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds)); - uint64_t *len = msg; - uint64_t *ids = len+1; - int j, msglen; - - *len = numfds; - for (j = 0; j < numfds; j++) { - *ids++ = clientids[j]; - *ids++ = slave_sockets.io.fdset.state[j]; - } - - /* Write the message to the parent. If we have no good slaves or - * we are unable to transfer the message to the parent, we exit - * with an error so that the parent will abort the replication - * process with all the childre that were waiting. */ - msglen = sizeof(uint64_t)*(1+2*numfds); - if (*len == 0 || - write(server.rdb_pipe_write_result_to_parent,msg,msglen) - != msglen) - { - retval = C_ERR; - } - zfree(msg); + sendChildCOWInfo(CHILD_INFO_TYPE_RDB, "RDB"); } - zfree(clientids); - rioFreeFdset(&slave_sockets); + + rioFreeFd(&rdb); + close(server.rdb_pipe_write); /* wake up the reader, tell it we're done. */ exitFromChild((retval == C_OK) ? 0 : 1); } else { /* Parent */ @@ -2039,32 +2404,28 @@ int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) { listRewind(server.slaves,&li); while((ln = listNext(&li))) { client *slave = ln->value; - int j; - - for (j = 0; j < numfds; j++) { - if (slave->id == clientids[j]) { - slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; - break; - } + if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) { + slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START; } } - close(pipefds[0]); - close(pipefds[1]); + close(server.rdb_pipe_write); + close(server.rdb_pipe_read); + zfree(server.rdb_pipe_conns); + server.rdb_pipe_conns = NULL; + server.rdb_pipe_numconns = 0; + server.rdb_pipe_numconns_writing = 0; closeChildInfoPipe(); } else { - server.stat_fork_time = ustime()-start; - server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */ - latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000); - serverLog(LL_NOTICE,"Background RDB transfer started by pid %d", childpid); server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; - updateDictResizePolicy(); + close(server.rdb_pipe_write); /* close write in parent so that it can detect the close on the child. */ + if (aeCreateFileEvent(server.el, server.rdb_pipe_read, AE_READABLE, rdbPipeReadHandler,NULL) == AE_ERR) { + serverPanic("Unrecoverable error creating server.rdb_pipe_read file event."); + } } - zfree(clientids); - zfree(fds); return (childpid == -1) ? C_ERR : C_OK; } return C_OK; /* Unreached. */ @@ -2104,15 +2465,15 @@ void bgsaveCommand(client *c) { if (server.rdb_child_pid != -1) { addReplyError(c,"Background save already in progress"); - } else if (server.aof_child_pid != -1) { + } else if (hasActiveChildProcess()) { if (schedule) { server.rdb_bgsave_scheduled = 1; addReplyStatus(c,"Background saving scheduled"); } else { addReplyError(c, - "An AOF log rewriting in progress: can't BGSAVE right now. " - "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever " - "possible."); + "Another child process is active (AOF?): can't BGSAVE right now. " + "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever " + "possible."); } } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) { addReplyStatus(c,"Background saving started"); |