/*
 * Copyright (c) 2009-2012, Salvatore Sanfilippo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
#include "lzf.h"    /* LZF compression library */
#include "zipmap.h"
#include "endianconv.h"

/* NOTE(review): the eight system includes below had lost their header names
 * in this copy of the file. They are restored from what the code in this
 * file uses (isnan/isfinite, htonl/ntohl, MAXPATHLEN, pid_t/fork, ...).
 * Confirm the exact list against the upstream file. */
#include <math.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <sys/param.h>

/* Report a corrupt RDB file, tagging the message with the source line of the
 * call site. */
#define rdbExitReportCorruptRDB(...) rdbCheckThenExit(__LINE__,__VA_ARGS__)

extern int rdbCheckMode;
void rdbCheckError(const char *fmt, ...);
void rdbCheckSetError(const char *fmt, ...);

/* Build an error message made of a fixed prefix carrying 'linenum' followed
 * by the printf-style 'reason', then terminate the process with exit(1).
 *
 * When not running in RDB-check mode the message is logged and
 * redis_check_rdb_main() is invoked on server.rdb_filename before exiting;
 * in RDB-check mode the message is handed to rdbCheckError() instead. */
void rdbCheckThenExit(int linenum, char *reason, ...) {
    va_list ap;
    char msg[1024];
    int len;

    len = snprintf(msg,sizeof(msg),
        "Internal error in RDB reading function at rdb.c:%d -> ", linenum);
    va_start(ap,reason);
    vsnprintf(msg+len,sizeof(msg)-len,reason,ap);
    va_end(ap);

    if (!rdbCheckMode) {
        serverLog(LL_WARNING, "%s", msg);
        char *argv[2] = {"",server.rdb_filename};
        redis_check_rdb_main(2,argv,NULL);
    } else {
        rdbCheckError("%s",msg);
    }
    exit(1);
}

/* Write 'len' bytes from 'p' to 'rdb'. Returns 'len' on success and -1 when
 * rioWrite() reports failure. When 'rdb' is NULL nothing is written and 'len'
 * is returned, which lets callers compute serialized sizes without I/O. */
static int rdbWriteRaw(rio *rdb, void *p, size_t len) {
    if (rdb && rioWrite(rdb,p,len) == 0)
        return -1;
    return len;
}

/* Write the one byte 'type'. Returns 1 on success, -1 on write error. */
int rdbSaveType(rio *rdb, unsigned char type) {
    return rdbWriteRaw(rdb,&type,1);
}

/* Load a "type" in RDB format, that is a one byte unsigned integer.
 * This function is not only used to load object types, but also special
 * "types" like the end-of-file type, the EXPIRE type, and so forth.
 *
 * Returns the byte read, or -1 if the read fails. */
int rdbLoadType(rio *rdb) {
    unsigned char type;
    if (rioRead(rdb,&type,1) == 0) return -1;
    return type;
}

/* Read a 4 byte signed time value. Returns -1 if the read fails. */
time_t rdbLoadTime(rio *rdb) {
    int32_t t32;
    if (rioRead(rdb,&t32,4) == 0) return -1;
    return (time_t)t32;
}

/* Write 't' as an 8 byte integer, copied as it is laid out in memory.
 * Returns 8 on success, -1 on write error. */
int rdbSaveMillisecondTime(rio *rdb, long long t) {
    int64_t t64 = (int64_t) t;
    return rdbWriteRaw(rdb,&t64,8);
}

/* Read an 8 byte millisecond time written by rdbSaveMillisecondTime().
 * Returns -1 if the read fails. */
long long rdbLoadMillisecondTime(rio *rdb) {
    int64_t t64;
    if (rioRead(rdb,&t64,8) == 0) return -1;
    return (long long)t64;
}

/* Saves an encoded length. The first two bits in the first byte are used to
 * hold the encoding type. See the RDB_* definitions for more information
 * on the types of encoding.
*/ int rdbSaveLen(rio *rdb, uint64_t len) { unsigned char buf[2]; size_t nwritten; if (len < (1<<6)) { /* Save a 6 bit len */ buf[0] = (len&0xFF)|(RDB_6BITLEN<<6); if (rdbWriteRaw(rdb,buf,1) == -1) return -1; nwritten = 1; } else if (len < (1<<14)) { /* Save a 14 bit len */ buf[0] = ((len>>8)&0xFF)|(RDB_14BITLEN<<6); buf[1] = len&0xFF; if (rdbWriteRaw(rdb,buf,2) == -1) return -1; nwritten = 2; } else if (len <= UINT32_MAX) { /* Save a 32 bit len */ buf[0] = RDB_32BITLEN; if (rdbWriteRaw(rdb,buf,1) == -1) return -1; uint32_t len32 = htonl(len); if (rdbWriteRaw(rdb,&len32,4) == -1) return -1; nwritten = 1+4; } else { /* Save a 64 bit len */ buf[0] = RDB_64BITLEN; if (rdbWriteRaw(rdb,buf,1) == -1) return -1; len = htonu64(len); if (rdbWriteRaw(rdb,&len,8) == -1) return -1; nwritten = 1+8; } return nwritten; } /* Load an encoded length. If the loaded length is a normal length as stored * with rdbSaveLen(), the read length is set to '*lenptr'. If instead the * loaded length describes a special encoding that follows, then '*isencoded' * is set to 1 and the encoding format is stored at '*lenptr'. * * See the RDB_ENC_* definitions in rdb.h for more information on special * encodings. * * The function returns -1 on error, 0 on success. */ int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr) { unsigned char buf[2]; int type; if (isencoded) *isencoded = 0; if (rioRead(rdb,buf,1) == 0) return -1; type = (buf[0]&0xC0)>>6; if (type == RDB_ENCVAL) { /* Read a 6 bit encoding type. */ if (isencoded) *isencoded = 1; *lenptr = buf[0]&0x3F; } else if (type == RDB_6BITLEN) { /* Read a 6 bit len. */ *lenptr = buf[0]&0x3F; } else if (type == RDB_14BITLEN) { /* Read a 14 bit len. */ if (rioRead(rdb,buf+1,1) == 0) return -1; *lenptr = ((buf[0]&0x3F)<<8)|buf[1]; } else if (buf[0] == RDB_32BITLEN) { /* Read a 32 bit len. */ uint32_t len; if (rioRead(rdb,&len,4) == 0) return -1; *lenptr = ntohl(len); } else if (buf[0] == RDB_64BITLEN) { /* Read a 64 bit len. 
*/ uint64_t len; if (rioRead(rdb,&len,8) == 0) return -1; *lenptr = ntohu64(len); } else { rdbExitReportCorruptRDB( "Unknown length encoding %d in rdbLoadLen()",type); return -1; /* Never reached. */ } return 0; } /* This is like rdbLoadLenByRef() but directly returns the value read * from the RDB stream, signaling an error by returning RDB_LENERR * (since it is a too large count to be applicable in any Redis data * structure). */ uint64_t rdbLoadLen(rio *rdb, int *isencoded) { uint64_t len; if (rdbLoadLenByRef(rdb,isencoded,&len) == -1) return RDB_LENERR; return len; } /* Encodes the "value" argument as integer when it fits in the supported ranges * for encoded types. If the function successfully encodes the integer, the * representation is stored in the buffer pointer to by "enc" and the string * length is returned. Otherwise 0 is returned. */ int rdbEncodeInteger(long long value, unsigned char *enc) { if (value >= -(1<<7) && value <= (1<<7)-1) { enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT8; enc[1] = value&0xFF; return 2; } else if (value >= -(1<<15) && value <= (1<<15)-1) { enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT16; enc[1] = value&0xFF; enc[2] = (value>>8)&0xFF; return 3; } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) { enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT32; enc[1] = value&0xFF; enc[2] = (value>>8)&0xFF; enc[3] = (value>>16)&0xFF; enc[4] = (value>>24)&0xFF; return 5; } else { return 0; } } /* Loads an integer-encoded object with the specified encoding type "enctype". * The returned value changes according to the flags, see * rdbGenerincLoadStringObject() for more info. 
*/ void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) { int plain = flags & RDB_LOAD_PLAIN; int sds = flags & RDB_LOAD_SDS; int encode = flags & RDB_LOAD_ENC; unsigned char enc[4]; long long val; if (enctype == RDB_ENC_INT8) { if (rioRead(rdb,enc,1) == 0) return NULL; val = (signed char)enc[0]; } else if (enctype == RDB_ENC_INT16) { uint16_t v; if (rioRead(rdb,enc,2) == 0) return NULL; v = enc[0]|(enc[1]<<8); val = (int16_t)v; } else if (enctype == RDB_ENC_INT32) { uint32_t v; if (rioRead(rdb,enc,4) == 0) return NULL; v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24); val = (int32_t)v; } else { val = 0; /* anti-warning */ rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype); } if (plain || sds) { char buf[LONG_STR_SIZE], *p; int len = ll2string(buf,sizeof(buf),val); if (lenptr) *lenptr = len; p = plain ? zmalloc(len) : sdsnewlen(NULL,len); memcpy(p,buf,len); return p; } else if (encode) { return createStringObjectFromLongLong(val); } else { return createObject(OBJ_STRING,sdsfromlonglong(val)); } } /* String objects in the form "2391" "-100" without any space and with a * range of values that can fit in an 8, 16 or 32 bit signed value can be * encoded as integers to save space */ int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) { long long value; char *endptr, buf[32]; /* Check if it's possible to encode this value as a number */ value = strtoll(s, &endptr, 10); if (endptr[0] != '\0') return 0; ll2string(buf,32,value); /* If the number converted back into a string is not identical * then it's not possible to encode the string as integer */ if (strlen(buf) != len || memcmp(buf,s,len)) return 0; return rdbEncodeInteger(value,enc); } ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, size_t original_len) { unsigned char byte; ssize_t n, nwritten = 0; /* Data compressed! 
Let's save it on disk */ byte = (RDB_ENCVAL<<6)|RDB_ENC_LZF; if ((n = rdbWriteRaw(rdb,&byte,1)) == -1) goto writeerr; nwritten += n; if ((n = rdbSaveLen(rdb,compress_len)) == -1) goto writeerr; nwritten += n; if ((n = rdbSaveLen(rdb,original_len)) == -1) goto writeerr; nwritten += n; if ((n = rdbWriteRaw(rdb,data,compress_len)) == -1) goto writeerr; nwritten += n; return nwritten; writeerr: return -1; } ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { size_t comprlen, outlen; void *out; /* We require at least four bytes compression for this to be worth it */ if (len <= 4) return 0; outlen = len-4; if ((out = zmalloc(outlen+1)) == NULL) return 0; comprlen = lzf_compress(s, len, out, outlen); if (comprlen == 0) { zfree(out); return 0; } ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); zfree(out); return nwritten; } /* Load an LZF compressed string in RDB format. The returned value * changes according to 'flags'. For more info check the * rdbGenericLoadStringObject() function. */ void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) { int plain = flags & RDB_LOAD_PLAIN; int sds = flags & RDB_LOAD_SDS; uint64_t len, clen; unsigned char *c = NULL; char *val = NULL; if ((clen = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; if ((c = zmalloc(clen)) == NULL) goto err; /* Allocate our target according to the uncompressed size. */ if (plain) { val = zmalloc(len); if (lenptr) *lenptr = len; } else { val = sdsnewlen(NULL,len); } /* Load the compressed representation and uncompress it to target. */ if (rioRead(rdb,c,clen) == 0) goto err; if (lzf_decompress(c,clen,val,len) == 0) { if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string"); goto err; } zfree(c); if (plain || sds) { return val; } else { return createObject(OBJ_STRING,val); } err: zfree(c); if (plain) zfree(val); else sdsfree(val); return NULL; } /* Save a string object as [len][data] on disk. 
If the object is a string * representation of an integer value we try to save it in a special form */ ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len) { int enclen; ssize_t n, nwritten = 0; /* Try integer encoding */ if (len <= 11) { unsigned char buf[5]; if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) { if (rdbWriteRaw(rdb,buf,enclen) == -1) return -1; return enclen; } } /* Try LZF compression - under 20 bytes it's unable to compress even * aaaaaaaaaaaaaaaaaa so skip it */ if (server.rdb_compression && len > 20) { n = rdbSaveLzfStringObject(rdb,s,len); if (n == -1) return -1; if (n > 0) return n; /* Return value of 0 means data can't be compressed, save the old way */ } /* Store verbatim */ if ((n = rdbSaveLen(rdb,len)) == -1) return -1; nwritten += n; if (len > 0) { if (rdbWriteRaw(rdb,s,len) == -1) return -1; nwritten += len; } return nwritten; } /* Save a long long value as either an encoded string or a string. */ ssize_t rdbSaveLongLongAsStringObject(rio *rdb, long long value) { unsigned char buf[32]; ssize_t n, nwritten = 0; int enclen = rdbEncodeInteger(value,buf); if (enclen > 0) { return rdbWriteRaw(rdb,buf,enclen); } else { /* Encode as string */ enclen = ll2string((char*)buf,32,value); serverAssert(enclen < 32); if ((n = rdbSaveLen(rdb,enclen)) == -1) return -1; nwritten += n; if ((n = rdbWriteRaw(rdb,buf,enclen)) == -1) return -1; nwritten += n; } return nwritten; } /* Like rdbSaveRawString() gets a Redis object instead. */ int rdbSaveStringObject(rio *rdb, robj *obj) { /* Avoid to decode the object, then encode it again, if the * object is already integer encoded. */ if (obj->encoding == OBJ_ENCODING_INT) { return rdbSaveLongLongAsStringObject(rdb,(long)obj->ptr); } else { serverAssertWithInfo(NULL,obj,sdsEncodedObject(obj)); return rdbSaveRawString(rdb,obj->ptr,sdslen(obj->ptr)); } } /* Load a string object from an RDB file according to flags: * * RDB_LOAD_NONE (no flags): load an RDB object, unencoded. 
* RDB_LOAD_ENC: If the returned type is a Redis object, try to * encode it in a special way to be more memory * efficient. When this flag is passed the function * no longer guarantees that obj->ptr is an SDS string. * RDB_LOAD_PLAIN: Return a plain string allocated with zmalloc() * instead of a Redis object with an sds in it. * RDB_LOAD_SDS: Return an SDS string instead of a Redis object. * * On I/O error NULL is returned. */ void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) { int encode = flags & RDB_LOAD_ENC; int plain = flags & RDB_LOAD_PLAIN; int sds = flags & RDB_LOAD_SDS; int isencoded; uint64_t len; len = rdbLoadLen(rdb,&isencoded); if (isencoded) { switch(len) { case RDB_ENC_INT8: case RDB_ENC_INT16: case RDB_ENC_INT32: return rdbLoadIntegerObject(rdb,len,flags,lenptr); case RDB_ENC_LZF: return rdbLoadLzfStringObject(rdb,flags,lenptr); default: rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len); } } if (len == RDB_LENERR) return NULL; if (plain || sds) { void *buf = plain ? zmalloc(len) : sdsnewlen(NULL,len); if (lenptr) *lenptr = len; if (len && rioRead(rdb,buf,len) == 0) { if (plain) zfree(buf); else sdsfree(buf); return NULL; } return buf; } else { robj *o = encode ? createStringObject(NULL,len) : createRawStringObject(NULL,len); if (len && rioRead(rdb,o->ptr,len) == 0) { decrRefCount(o); return NULL; } return o; } } robj *rdbLoadStringObject(rio *rdb) { return rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE,NULL); } robj *rdbLoadEncodedStringObject(rio *rdb) { return rdbGenericLoadStringObject(rdb,RDB_LOAD_ENC,NULL); } /* Save a double value. Doubles are saved as strings prefixed by an unsigned * 8 bit integer specifying the length of the representation. 
* This 8 bit integer has special values in order to specify the following * conditions: * 253: not a number * 254: + inf * 255: - inf */ int rdbSaveDoubleValue(rio *rdb, double val) { unsigned char buf[128]; int len; if (isnan(val)) { buf[0] = 253; len = 1; } else if (!isfinite(val)) { len = 1; buf[0] = (val < 0) ? 255 : 254; } else { #if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL) /* Check if the float is in a safe range to be casted into a * long long. We are assuming that long long is 64 bit here. * Also we are assuming that there are no implementations around where * double has precision < 52 bit. * * Under this assumptions we test if a double is inside an interval * where casting to long long is safe. Then using two castings we * make sure the decimal part is zero. If all this is true we use * integer printing function that is much faster. */ double min = -4503599627370495; /* (2^52)-1 */ double max = 4503599627370496; /* -(2^52) */ if (val > min && val < max && val == ((double)((long long)val))) ll2string((char*)buf+1,sizeof(buf)-1,(long long)val); else #endif snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val); buf[0] = strlen((char*)buf+1); len = buf[0]+1; } return rdbWriteRaw(rdb,buf,len); } /* For information about double serialization check rdbSaveDoubleValue() */ int rdbLoadDoubleValue(rio *rdb, double *val) { char buf[256]; unsigned char len; if (rioRead(rdb,&len,1) == 0) return -1; switch(len) { case 255: *val = R_NegInf; return 0; case 254: *val = R_PosInf; return 0; case 253: *val = R_Nan; return 0; default: if (rioRead(rdb,buf,len) == 0) return -1; buf[len] = '\0'; sscanf(buf, "%lg", val); return 0; } } /* Saves a double for RDB 8 or greater, where IE754 binary64 format is assumed. * We just make sure the integer is always stored in little endian, otherwise * the value is copied verbatim from memory to disk. * * Return -1 on error, the size of the serialized value on success. 
*/ int rdbSaveBinaryDoubleValue(rio *rdb, double val) { memrev64ifbe(&val); return rdbWriteRaw(rdb,&val,sizeof(val)); } /* Loads a double from RDB 8 or greater. See rdbSaveBinaryDoubleValue() for * more info. On error -1 is returned, otherwise 0. */ int rdbLoadBinaryDoubleValue(rio *rdb, double *val) { if (rioRead(rdb,val,sizeof(*val)) == 0) return -1; memrev64ifbe(val); return 0; } /* Like rdbSaveBinaryDoubleValue() but single precision. */ int rdbSaveBinaryFloatValue(rio *rdb, float val) { memrev32ifbe(&val); return rdbWriteRaw(rdb,&val,sizeof(val)); } /* Like rdbLoadBinaryDoubleValue() but single precision. */ int rdbLoadBinaryFloatValue(rio *rdb, float *val) { if (rioRead(rdb,val,sizeof(*val)) == 0) return -1; memrev32ifbe(val); return 0; } /* Save the object type of object "o". */ int rdbSaveObjectType(rio *rdb, robj *o) { switch (o->type) { case OBJ_STRING: return rdbSaveType(rdb,RDB_TYPE_STRING); case OBJ_LIST: if (o->encoding == OBJ_ENCODING_QUICKLIST) return rdbSaveType(rdb,RDB_TYPE_LIST_QUICKLIST); else serverPanic("Unknown list encoding"); case OBJ_SET: if (o->encoding == OBJ_ENCODING_INTSET) return rdbSaveType(rdb,RDB_TYPE_SET_INTSET); else if (o->encoding == OBJ_ENCODING_HT) return rdbSaveType(rdb,RDB_TYPE_SET); else serverPanic("Unknown set encoding"); case OBJ_ZSET: if (o->encoding == OBJ_ENCODING_ZIPLIST) return rdbSaveType(rdb,RDB_TYPE_ZSET_ZIPLIST); else if (o->encoding == OBJ_ENCODING_SKIPLIST) return rdbSaveType(rdb,RDB_TYPE_ZSET_2); else serverPanic("Unknown sorted set encoding"); case OBJ_HASH: if (o->encoding == OBJ_ENCODING_ZIPLIST) return rdbSaveType(rdb,RDB_TYPE_HASH_ZIPLIST); else if (o->encoding == OBJ_ENCODING_HT) return rdbSaveType(rdb,RDB_TYPE_HASH); else serverPanic("Unknown hash encoding"); case OBJ_MODULE: return rdbSaveType(rdb,RDB_TYPE_MODULE_2); default: serverPanic("Unknown object type"); } return -1; /* avoid warning */ } /* Use rdbLoadType() to load a TYPE in RDB format, but returns -1 if the * type is not specifically a 
valid Object Type. */ int rdbLoadObjectType(rio *rdb) { int type; if ((type = rdbLoadType(rdb)) == -1) return -1; if (!rdbIsObjectType(type)) return -1; return type; } /* Save a Redis object. Returns -1 on error, number of bytes written on success. */ ssize_t rdbSaveObject(rio *rdb, robj *o) { ssize_t n = 0, nwritten = 0; if (o->type == OBJ_STRING) { /* Save a string value */ if ((n = rdbSaveStringObject(rdb,o)) == -1) return -1; nwritten += n; } else if (o->type == OBJ_LIST) { /* Save a list value */ if (o->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = o->ptr; quicklistNode *node = ql->head; if ((n = rdbSaveLen(rdb,ql->len)) == -1) return -1; nwritten += n; while(node) { if (quicklistNodeIsCompressed(node)) { void *data; size_t compress_len = quicklistGetLzf(node, &data); if ((n = rdbSaveLzfBlob(rdb,data,compress_len,node->sz)) == -1) return -1; nwritten += n; } else { if ((n = rdbSaveRawString(rdb,node->zl,node->sz)) == -1) return -1; nwritten += n; } node = node->next; } } else { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { /* Save a set value */ if (o->encoding == OBJ_ENCODING_HT) { dict *set = o->ptr; dictIterator *di = dictGetIterator(set); dictEntry *de; if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) return -1; nwritten += n; while((de = dictNext(di)) != NULL) { sds ele = dictGetKey(de); if ((n = rdbSaveRawString(rdb,(unsigned char*)ele,sdslen(ele))) == -1) return -1; nwritten += n; } dictReleaseIterator(di); } else if (o->encoding == OBJ_ENCODING_INTSET) { size_t l = intsetBlobLen((intset*)o->ptr); if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1; nwritten += n; } else { serverPanic("Unknown set encoding"); } } else if (o->type == OBJ_ZSET) { /* Save a sorted set value */ if (o->encoding == OBJ_ENCODING_ZIPLIST) { size_t l = ziplistBlobLen((unsigned char*)o->ptr); if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1; nwritten += n; } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; 
zskiplist *zsl = zs->zsl; if ((n = rdbSaveLen(rdb,zsl->length)) == -1) return -1; nwritten += n; /* We save the skiplist elements from the greatest to the smallest * (that's trivial since the elements are already ordered in the * skiplist): this improves the load process, since the next loaded * element will always be the smaller, so adding to the skiplist * will always immediately stop at the head, making the insertion * O(1) instead of O(log(N)). */ zskiplistNode *zn = zsl->tail; while (zn != NULL) { if ((n = rdbSaveRawString(rdb, (unsigned char*)zn->ele,sdslen(zn->ele))) == -1) { return -1; } nwritten += n; if ((n = rdbSaveBinaryDoubleValue(rdb,zn->score)) == -1) return -1; nwritten += n; zn = zn->backward; } } else { serverPanic("Unknown sorted set encoding"); } } else if (o->type == OBJ_HASH) { /* Save a hash value */ if (o->encoding == OBJ_ENCODING_ZIPLIST) { size_t l = ziplistBlobLen((unsigned char*)o->ptr); if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1; nwritten += n; } else if (o->encoding == OBJ_ENCODING_HT) { dictIterator *di = dictGetIterator(o->ptr); dictEntry *de; if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) return -1; nwritten += n; while((de = dictNext(di)) != NULL) { sds field = dictGetKey(de); sds value = dictGetVal(de); if ((n = rdbSaveRawString(rdb,(unsigned char*)field, sdslen(field))) == -1) return -1; nwritten += n; if ((n = rdbSaveRawString(rdb,(unsigned char*)value, sdslen(value))) == -1) return -1; nwritten += n; } dictReleaseIterator(di); } else { serverPanic("Unknown hash encoding"); } } else if (o->type == OBJ_MODULE) { /* Save a module-specific value. */ RedisModuleIO io; moduleValue *mv = o->ptr; moduleType *mt = mv->type; moduleInitIOContext(io,mt,rdb); /* Write the "module" identifier as prefix, so that we'll be able * to call the right module during loading. */ int retval = rdbSaveLen(rdb,mt->id); if (retval == -1) return -1; io.bytes += retval; /* Then write the module-specific representation + EOF marker. 
*/ mt->rdb_save(&io,mv->value); retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF); if (retval == -1) return -1; io.bytes += retval; if (io.ctx) { moduleFreeContext(io.ctx); zfree(io.ctx); } return io.error ? -1 : (ssize_t)io.bytes; } else { serverPanic("Unknown object type"); } return nwritten; } /* Return the length the object will have on disk if saved with * the rdbSaveObject() function. Currently we use a trick to get * this length with very little changes to the code. In the future * we could switch to a faster solution. */ size_t rdbSavedObjectLen(robj *o) { ssize_t len = rdbSaveObject(NULL,o); serverAssertWithInfo(NULL,o,len != -1); return len; } /* Save a key-value pair, with expire time, type, key, value. * On error -1 is returned. * On success if the key was actually saved 1 is returned, otherwise 0 * is returned (the key was already expired). */ int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime, long long now) { /* Save the expire time */ if (expiretime != -1) { /* If this key is already expired skip it */ if (expiretime < now) return 0; if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1; if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1; } /* Save type, key, value */ if (rdbSaveObjectType(rdb,val) == -1) return -1; if (rdbSaveStringObject(rdb,key) == -1) return -1; if (rdbSaveObject(rdb,val) == -1) return -1; return 1; } /* Save an AUX field. */ int rdbSaveAuxField(rio *rdb, void *key, size_t keylen, void *val, size_t vallen) { if (rdbSaveType(rdb,RDB_OPCODE_AUX) == -1) return -1; if (rdbSaveRawString(rdb,key,keylen) == -1) return -1; if (rdbSaveRawString(rdb,val,vallen) == -1) return -1; return 1; } /* Wrapper for rdbSaveAuxField() used when key/val length can be obtained * with strlen(). */ int rdbSaveAuxFieldStrStr(rio *rdb, char *key, char *val) { return rdbSaveAuxField(rdb,key,strlen(key),val,strlen(val)); } /* Wrapper for strlen(key) + integer type (up to long long range). 
*/
int rdbSaveAuxFieldStrInt(rio *rdb, char *key, long long val) {
    char digits[LONG_STR_SIZE];
    int ndigits = ll2string(digits,sizeof(digits),val);

    return rdbSaveAuxField(rdb,key,strlen(key),digits,ndigits);
}

/* Save a few default AUX fields with information about the RDB generated.
 * Returns 1 on success, -1 as soon as one of the fields fails to be written
 * (the remaining fields are not attempted). */
int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) {
    int redis_bits = 32;
    int aof_preamble = (flags & RDB_SAVE_AOF_PREAMBLE) ? 1 : 0;

    if (sizeof(void*) == 8) redis_bits = 64;

    /* Add a few fields about the state when the RDB was created. */
    if (rdbSaveAuxFieldStrStr(rdb,"redis-ver",REDIS_VERSION) == -1 ||
        rdbSaveAuxFieldStrInt(rdb,"redis-bits",redis_bits) == -1 ||
        rdbSaveAuxFieldStrInt(rdb,"ctime",time(NULL)) == -1 ||
        rdbSaveAuxFieldStrInt(rdb,"used-mem",zmalloc_used_memory()) == -1)
    {
        return -1;
    }

    /* Handle saving options that generate aux fields. */
    if (rsi) {
        if (rdbSaveAuxFieldStrInt(rdb,"repl-stream-db",rsi->repl_stream_db)
                == -1 ||
            rdbSaveAuxFieldStrStr(rdb,"repl-id",server.replid) == -1 ||
            rdbSaveAuxFieldStrInt(rdb,"repl-offset",server.master_repl_offset)
                == -1)
        {
            return -1;
        }
    }

    if (rdbSaveAuxFieldStrInt(rdb,"aof-preamble",aof_preamble) == -1)
        return -1;
    return 1;
}

/* Produces a dump of the database in RDB format sending it to the specified
 * Redis I/O channel. On success C_OK is returned, otherwise C_ERR
 * is returned and part of the output, or all the output, can be
 * missing because of I/O errors.
 *
 * When the function returns C_ERR and if 'error' is not NULL, the
 * integer pointed by 'error' is set to the value of errno just after the I/O
 * error.
*/ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { dictIterator *di = NULL; dictEntry *de; char magic[10]; int j; long long now = mstime(); uint64_t cksum; size_t processed = 0; if (server.rdb_checksum) rdb->update_cksum = rioGenericUpdateChecksum; snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION); if (rdbWriteRaw(rdb,magic,9) == -1) goto werr; if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr; for (j = 0; j < server.dbnum; j++) { redisDb *db = server.db+j; dict *d = db->dict; if (dictSize(d) == 0) continue; di = dictGetSafeIterator(d); if (!di) return C_ERR; /* Write the SELECT DB opcode */ if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr; if (rdbSaveLen(rdb,j) == -1) goto werr; /* Write the RESIZE DB opcode. We trim the size to UINT32_MAX, which * is currently the largest type we are able to represent in RDB sizes. * However this does not limit the actual size of the DB to load since * these sizes are just hints to resize the hash tables. */ uint32_t db_size, expires_size; db_size = (dictSize(db->dict) <= UINT32_MAX) ? dictSize(db->dict) : UINT32_MAX; expires_size = (dictSize(db->expires) <= UINT32_MAX) ? dictSize(db->expires) : UINT32_MAX; if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr; if (rdbSaveLen(rdb,db_size) == -1) goto werr; if (rdbSaveLen(rdb,expires_size) == -1) goto werr; /* Iterate this DB writing every entry */ while((de = dictNext(di)) != NULL) { sds keystr = dictGetKey(de); robj key, *o = dictGetVal(de); long long expire; initStaticStringObject(key,keystr); expire = getExpire(db,&key); if (rdbSaveKeyValuePair(rdb,&key,o,expire,now) == -1) goto werr; /* When this RDB is produced as part of an AOF rewrite, move * accumulated diff from parent to child while rewriting in * order to have a smaller final write. 
*/ if (flags & RDB_SAVE_AOF_PREAMBLE && rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES) { processed = rdb->processed_bytes; aofReadDiffFromParent(); } } dictReleaseIterator(di); } di = NULL; /* So that we don't release it again on error. */ /* EOF opcode */ if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr; /* CRC64 checksum. It will be zero if checksum computation is disabled, the * loading code skips the check in this case. */ cksum = rdb->cksum; memrev64ifbe(&cksum); if (rioWrite(rdb,&cksum,8) == 0) goto werr; return C_OK; werr: if (error) *error = errno; if (di) dictReleaseIterator(di); return C_ERR; } /* This is just a wrapper to rdbSaveRio() that additionally adds a prefix * and a suffix to the generated RDB dump. The prefix is: * * $EOF:<40 bytes unguessable hex string>\r\n * * While the suffix is the 40 bytes hex string we announced in the prefix. * This way processes receiving the payload can understand when it ends * without doing any processing of the content. */ int rdbSaveRioWithEOFMark(rio *rdb, int *error, rdbSaveInfo *rsi) { char eofmark[RDB_EOF_MARK_SIZE]; getRandomHexChars(eofmark,RDB_EOF_MARK_SIZE); if (error) *error = 0; if (rioWrite(rdb,"$EOF:",5) == 0) goto werr; if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr; if (rioWrite(rdb,"\r\n",2) == 0) goto werr; if (rdbSaveRio(rdb,error,RDB_SAVE_NONE,rsi) == C_ERR) goto werr; if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr; return C_OK; werr: /* Write error. */ /* Set 'error' only if not already set by rdbSaveRio() call. */ if (error && *error == 0) *error = errno; return C_ERR; } /* Save the DB on disk. Return C_ERR on error, C_OK on success. */ int rdbSave(char *filename, rdbSaveInfo *rsi) { char tmpfile[256]; char cwd[MAXPATHLEN]; /* Current working dir path for error messages. 
*/ FILE *fp; rio rdb; int error = 0; snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid()); fp = fopen(tmpfile,"w"); if (!fp) { char *cwdp = getcwd(cwd,MAXPATHLEN); serverLog(LL_WARNING, "Failed opening the RDB file %s (in server root dir %s) " "for saving: %s", filename, cwdp ? cwdp : "unknown", strerror(errno)); return C_ERR; } rioInitWithFile(&rdb,fp); if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) { errno = error; goto werr; } /* Make sure data will not remain on the OS's output buffers */ if (fflush(fp) == EOF) goto werr; if (fsync(fileno(fp)) == -1) goto werr; if (fclose(fp) == EOF) goto werr; /* Use RENAME to make sure the DB file is changed atomically only * if the generate DB file is ok. */ if (rename(tmpfile,filename) == -1) { char *cwdp = getcwd(cwd,MAXPATHLEN); serverLog(LL_WARNING, "Error moving temp DB file %s on the final " "destination %s (in server root dir %s): %s", tmpfile, filename, cwdp ? cwdp : "unknown", strerror(errno)); unlink(tmpfile); return C_ERR; } serverLog(LL_NOTICE,"DB saved on disk"); server.dirty = 0; server.lastsave = time(NULL); server.lastbgsave_status = C_OK; return C_OK; werr: serverLog(LL_WARNING,"Write error saving DB on disk: %s", strerror(errno)); fclose(fp); unlink(tmpfile); return C_ERR; } int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) { pid_t childpid; long long start; if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR; server.dirty_before_bgsave = server.dirty; server.lastbgsave_try = time(NULL); openChildInfoPipe(); start = ustime(); if ((childpid = fork()) == 0) { int retval; /* Child */ closeListeningSockets(0); redisSetProcTitle("redis-rdb-bgsave"); retval = rdbSave(filename,rsi); if (retval == C_OK) { size_t private_dirty = zmalloc_get_private_dirty(-1); if (private_dirty) { serverLog(LL_NOTICE, "RDB: %zu MB of memory used by copy-on-write", private_dirty/(1024*1024)); } server.child_info_data.cow_size = private_dirty; sendChildInfo(CHILD_INFO_TYPE_RDB); } 
exitFromChild((retval == C_OK) ? 0 : 1); } else { /* Parent */ server.stat_fork_time = ustime()-start; server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */ latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000); if (childpid == -1) { closeChildInfoPipe(); server.lastbgsave_status = C_ERR; serverLog(LL_WARNING,"Can't save in background: fork: %s", strerror(errno)); return C_ERR; } serverLog(LL_NOTICE,"Background saving started by pid %d",childpid); server.rdb_save_time_start = time(NULL); server.rdb_child_pid = childpid; server.rdb_child_type = RDB_CHILD_TYPE_DISK; updateDictResizePolicy(); return C_OK; } return C_OK; /* unreached */ } void rdbRemoveTempFile(pid_t childpid) { char tmpfile[256]; snprintf(tmpfile,sizeof(tmpfile),"temp-%d.rdb", (int) childpid); unlink(tmpfile); } /* This function is called by rdbLoadObject() when the code is in RDB-check * mode and we find a module value of type 2 that can be parsed without * the need of the actual module. The value is parsed for errors, finally * a dummy redis object is returned just to conform to the API. 
*/
/* Parameters:
 *   rdb        - stream positioned right after the module ID.
 *   modulename - printable module type name, used only in error messages.
 *
 * Every opcode read from the stream is followed by its payload, which is
 * parsed and discarded, until RDB_MODULE_OPCODE_EOF is found. Any read
 * failure, including a failure to read the opcode itself, is reported via
 * rdbExitReportCorruptRDB(), which terminates the process. On success a
 * newly created dummy string object is returned. */
robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename) {
    uint64_t opcode;

    while((opcode = rdbLoadLen(rdb,NULL)) != RDB_MODULE_OPCODE_EOF) {
        if (opcode == RDB_LENERR) {
            /* rdbLoadLen() could not read the opcode (truncated file).
             * Without this check the loop would keep retrying the same
             * failing read forever, since RDB_LENERR matches neither the
             * EOF marker nor any known opcode. */
            rdbExitReportCorruptRDB(
                "Error reading opcode from module %s value", modulename);
        } else if (opcode == RDB_MODULE_OPCODE_SINT ||
                   opcode == RDB_MODULE_OPCODE_UINT)
        {
            uint64_t len;
            if (rdbLoadLenByRef(rdb,NULL,&len) == -1) {
                rdbExitReportCorruptRDB(
                    "Error reading integer from module %s value", modulename);
            }
        } else if (opcode == RDB_MODULE_OPCODE_STRING) {
            robj *o = rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE,NULL);
            if (o == NULL) {
                rdbExitReportCorruptRDB(
                    "Error reading string from module %s value", modulename);
            }
            /* The string is only parsed for validity, not retained. */
            decrRefCount(o);
        } else if (opcode == RDB_MODULE_OPCODE_FLOAT) {
            float val;
            if (rdbLoadBinaryFloatValue(rdb,&val) == -1) {
                rdbExitReportCorruptRDB(
                    "Error reading float from module %s value", modulename);
            }
        } else if (opcode == RDB_MODULE_OPCODE_DOUBLE) {
            double val;
            if (rdbLoadBinaryDoubleValue(rdb,&val) == -1) {
                rdbExitReportCorruptRDB(
                    "Error reading double from module %s value", modulename);
            }
        }
    }
    return createStringObject("module-dummy-value",18);
}

/* Load a Redis object of the specified type from the specified file.
 * On success a newly allocated object is returned, otherwise NULL.
*/ robj *rdbLoadObject(int rdbtype, rio *rdb) { robj *o = NULL, *ele, *dec; uint64_t len; unsigned int i; if (rdbtype == RDB_TYPE_STRING) { /* Read string value */ if ((o = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL; o = tryObjectEncoding(o); } else if (rdbtype == RDB_TYPE_LIST) { /* Read list value */ if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; o = createQuicklistObject(); quicklistSetOptions(o->ptr, server.list_max_ziplist_size, server.list_compress_depth); /* Load every single element of the list */ while(len--) { if ((ele = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL; dec = getDecodedObject(ele); size_t len = sdslen(dec->ptr); quicklistPushTail(o->ptr, dec->ptr, len); decrRefCount(dec); decrRefCount(ele); } } else if (rdbtype == RDB_TYPE_SET) { /* Read Set value */ if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; /* Use a regular set when there are too many entries. */ if (len > server.set_max_intset_entries) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ if (len > DICT_HT_INITIAL_SIZE) dictExpand(o->ptr,len); } else { o = createIntsetObject(); } /* Load every single element of the set */ for (i = 0; i < len; i++) { long long llval; sds sdsele; if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; if (o->encoding == OBJ_ENCODING_INTSET) { /* Fetch integer value from element. */ if (isSdsRepresentableAsLongLong(sdsele,&llval) == C_OK) { o->ptr = intsetAdd(o->ptr,llval,NULL); } else { setTypeConvert(o,OBJ_ENCODING_HT); dictExpand(o->ptr,len); } } /* This will also be called when the set was just converted * to a regular hash table encoded set. */ if (o->encoding == OBJ_ENCODING_HT) { dictAdd((dict*)o->ptr,sdsele,NULL); } else { sdsfree(sdsele); } } } else if (rdbtype == RDB_TYPE_ZSET_2 || rdbtype == RDB_TYPE_ZSET) { /* Read list/set value. 
*/ uint64_t zsetlen; size_t maxelelen = 0; zset *zs; if ((zsetlen = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; o = createZsetObject(); zs = o->ptr; /* Load every single element of the sorted set. */ while(zsetlen--) { sds sdsele; double score; zskiplistNode *znode; if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; if (rdbtype == RDB_TYPE_ZSET_2) { if (rdbLoadBinaryDoubleValue(rdb,&score) == -1) return NULL; } else { if (rdbLoadDoubleValue(rdb,&score) == -1) return NULL; } /* Don't care about integer-encoded strings. */ if (sdslen(sdsele) > maxelelen) maxelelen = sdslen(sdsele); znode = zslInsert(zs->zsl,score,sdsele); dictAdd(zs->dict,sdsele,&znode->score); } /* Convert *after* loading, since sorted sets are not stored ordered. */ if (zsetLength(o) <= server.zset_max_ziplist_entries && maxelelen <= server.zset_max_ziplist_value) zsetConvert(o,OBJ_ENCODING_ZIPLIST); } else if (rdbtype == RDB_TYPE_HASH) { uint64_t len; int ret; sds field, value; len = rdbLoadLen(rdb, NULL); if (len == RDB_LENERR) return NULL; o = createHashObject(); /* Too many entries? Use a hash table. 
*/ if (len > server.hash_max_ziplist_entries) hashTypeConvert(o, OBJ_ENCODING_HT); /* Load every field and value into the ziplist */ while (o->encoding == OBJ_ENCODING_ZIPLIST && len > 0) { len--; /* Load raw strings */ if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; /* Add pair to ziplist */ o->ptr = ziplistPush(o->ptr, (unsigned char*)field, sdslen(field), ZIPLIST_TAIL); o->ptr = ziplistPush(o->ptr, (unsigned char*)value, sdslen(value), ZIPLIST_TAIL); /* Convert to hash table if size threshold is exceeded */ if (sdslen(field) > server.hash_max_ziplist_value || sdslen(value) > server.hash_max_ziplist_value) { sdsfree(field); sdsfree(value); hashTypeConvert(o, OBJ_ENCODING_HT); break; } sdsfree(field); sdsfree(value); } /* Load remaining fields and values into the hash table */ while (o->encoding == OBJ_ENCODING_HT && len > 0) { len--; /* Load encoded strings */ if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL)) == NULL) return NULL; /* Add pair to hash table */ ret = dictAdd((dict*)o->ptr, field, value); if (ret == DICT_ERR) { rdbExitReportCorruptRDB("Duplicate keys detected"); } } /* All pairs should be read by now */ serverAssert(len == 0); } else if (rdbtype == RDB_TYPE_LIST_QUICKLIST) { if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL; o = createQuicklistObject(); quicklistSetOptions(o->ptr, server.list_max_ziplist_size, server.list_compress_depth); while (len--) { unsigned char *zl = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL); if (zl == NULL) return NULL; quicklistAppendZiplist(o->ptr, zl); } } else if (rdbtype == RDB_TYPE_HASH_ZIPMAP || rdbtype == RDB_TYPE_LIST_ZIPLIST || rdbtype == RDB_TYPE_SET_INTSET || rdbtype == RDB_TYPE_ZSET_ZIPLIST || rdbtype == RDB_TYPE_HASH_ZIPLIST) { unsigned char *encoded = 
rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL); if (encoded == NULL) return NULL; o = createObject(OBJ_STRING,encoded); /* Obj type fixed below. */ /* Fix the object encoding, and make sure to convert the encoded * data type into the base type if accordingly to the current * configuration there are too many elements in the encoded data * type. Note that we only check the length and not max element * size as this is an O(N) scan. Eventually everything will get * converted. */ switch(rdbtype) { case RDB_TYPE_HASH_ZIPMAP: /* Convert to ziplist encoded hash. This must be deprecated * when loading dumps created by Redis 2.4 gets deprecated. */ { unsigned char *zl = ziplistNew(); unsigned char *zi = zipmapRewind(o->ptr); unsigned char *fstr, *vstr; unsigned int flen, vlen; unsigned int maxlen = 0; while ((zi = zipmapNext(zi, &fstr, &flen, &vstr, &vlen)) != NULL) { if (flen > maxlen) maxlen = flen; if (vlen > maxlen) maxlen = vlen; zl = ziplistPush(zl, fstr, flen, ZIPLIST_TAIL); zl = ziplistPush(zl, vstr, vlen, ZIPLIST_TAIL); } zfree(o->ptr); o->ptr = zl; o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_ZIPLIST; if (hashTypeLength(o) > server.hash_max_ziplist_entries || maxlen > server.hash_max_ziplist_value) { hashTypeConvert(o, OBJ_ENCODING_HT); } } break; case RDB_TYPE_LIST_ZIPLIST: o->type = OBJ_LIST; o->encoding = OBJ_ENCODING_ZIPLIST; listTypeConvert(o,OBJ_ENCODING_QUICKLIST); break; case RDB_TYPE_SET_INTSET: o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o,OBJ_ENCODING_HT); break; case RDB_TYPE_ZSET_ZIPLIST: o->type = OBJ_ZSET; o->encoding = OBJ_ENCODING_ZIPLIST; if (zsetLength(o) > server.zset_max_ziplist_entries) zsetConvert(o,OBJ_ENCODING_SKIPLIST); break; case RDB_TYPE_HASH_ZIPLIST: o->type = OBJ_HASH; o->encoding = OBJ_ENCODING_ZIPLIST; if (hashTypeLength(o) > server.hash_max_ziplist_entries) hashTypeConvert(o, OBJ_ENCODING_HT); break; default: rdbExitReportCorruptRDB("Unknown RDB 
encoding type %d",rdbtype); break; } } else if (rdbtype == RDB_TYPE_MODULE || rdbtype == RDB_TYPE_MODULE_2) { uint64_t moduleid = rdbLoadLen(rdb,NULL); moduleType *mt = moduleTypeLookupModuleByID(moduleid); char name[10]; if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2) return rdbLoadCheckModuleValue(rdb,name); if (mt == NULL) { moduleTypeNameByID(name,moduleid); serverLog(LL_WARNING,"The RDB file contains module data I can't load: no matching module '%s'", name); exit(1); } RedisModuleIO io; moduleInitIOContext(io,mt,rdb); io.ver = (rdbtype == RDB_TYPE_MODULE) ? 1 : 2; /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. */ void *ptr = mt->rdb_load(&io,moduleid&1023); if (io.ctx) { moduleFreeContext(io.ctx); zfree(io.ctx); } /* Module v2 serialization has an EOF mark at the end. */ if (io.ver == 2) { uint64_t eof = rdbLoadLen(rdb,NULL); if (eof != RDB_MODULE_OPCODE_EOF) { serverLog(LL_WARNING,"The RDB file contains module data for the module '%s' that is not terminated by the proper module value EOF marker", name); exit(1); } } if (ptr == NULL) { moduleTypeNameByID(name,moduleid); serverLog(LL_WARNING,"The RDB file contains module data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name); exit(1); } o = createModuleObject(mt,ptr); } else { rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype); } return o; } /* Mark that we are loading in the global state and setup the fields * needed to provide loading stats. 
*/ void startLoading(FILE *fp) { struct stat sb; /* Load the DB */ server.loading = 1; server.loading_start_time = time(NULL); server.loading_loaded_bytes = 0; if (fstat(fileno(fp), &sb) == -1) { server.loading_total_bytes = 0; } else { server.loading_total_bytes = sb.st_size; } } /* Refresh the loading progress info */ void loadingProgress(off_t pos) { server.loading_loaded_bytes = pos; if (server.stat_peak_memory < zmalloc_used_memory()) server.stat_peak_memory = zmalloc_used_memory(); } /* Loading finished */ void stopLoading(void) { server.loading = 0; } /* Track loading progress in order to serve client's from time to time and if needed calculate rdb checksum */ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.rdb_checksum) rioGenericUpdateChecksum(r, buf, len); if (server.loading_process_events_interval_bytes && (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes) { /* The DB can take some non trivial amount of time to load. Update * our cached time since it is used to create and update the last * interaction time with clients and for other important things. */ updateCachedTime(); if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToMaster(); loadingProgress(r->processed_bytes); processEventsWhileBlocked(); } } /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned, * otherwise C_ERR is returned and 'errno' is set accordingly. 
*/
/* The stream must start with the 9 byte header "REDIS" + version digits.
 * After that the function loops reading a type byte: special opcodes
 * (expire times, SELECTDB, RESIZEDB, AUX, EOF) are handled in place, any
 * other type is followed by a key and a value that are added to the
 * currently selected database.
 *
 * 'rsi' may be NULL; when it is not, the replication related AUX fields
 * found in the file are stored into it.
 *
 * A wrong signature or an unsupported version returns C_ERR with errno
 * set to EINVAL. Short reads and checksum mismatches do not return: they
 * end up in rdbExitReportCorruptRDB(). */
int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) {
    uint64_t dbid;
    int type, rdbver;
    redisDb *db = server.db+0;
    char buf[1024];
    long long expiretime, now = mstime();

    /* Install the progress callback so that checksum and event processing
     * happen while reading. */
    rdb->update_cksum = rdbLoadProgressCallback;
    rdb->max_processing_chunk = server.loading_process_events_interval_bytes;
    if (rioRead(rdb,buf,9) == 0) goto eoferr;
    buf[9] = '\0';
    if (memcmp(buf,"REDIS",5) != 0) {
        serverLog(LL_WARNING,"Wrong signature trying to load DB from file");
        errno = EINVAL;
        return C_ERR;
    }
    /* The version is the decimal number following the signature. */
    rdbver = atoi(buf+5);
    if (rdbver < 1 || rdbver > RDB_VERSION) {
        serverLog(LL_WARNING,"Can't handle RDB format version %d",rdbver);
        errno = EINVAL;
        return C_ERR;
    }

    while(1) {
        robj *key, *val;
        expiretime = -1;

        /* Read type. */
        if ((type = rdbLoadType(rdb)) == -1) goto eoferr;

        /* Handle special types. */
        if (type == RDB_OPCODE_EXPIRETIME) {
            /* EXPIRETIME: load an expire associated with the next key
             * to load. Note that after loading an expire we need to
             * load the actual type, and continue. */
            if ((expiretime = rdbLoadTime(rdb)) == -1) goto eoferr;
            /* We read the time so we need to read the object type again. */
            if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
            /* the EXPIRETIME opcode specifies time in seconds, so convert
             * into milliseconds. */
            expiretime *= 1000;
        } else if (type == RDB_OPCODE_EXPIRETIME_MS) {
            /* EXPIRETIME_MS: milliseconds precision expire times introduced
             * with RDB v3. Like EXPIRETIME but with more precision. */
            if ((expiretime = rdbLoadMillisecondTime(rdb)) == -1) goto eoferr;
            /* We read the time so we need to read the object type again. */
            if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
        } else if (type == RDB_OPCODE_EOF) {
            /* EOF: End of file, exit the main loop. */
            break;
        } else if (type == RDB_OPCODE_SELECTDB) {
            /* SELECTDB: Select the specified database. */
            if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
                goto eoferr;
            if (dbid >= (unsigned)server.dbnum) {
                serverLog(LL_WARNING,
                    "FATAL: Data file was created with a Redis "
                    "server configured to handle more than %d "
                    "databases. Exiting\n", server.dbnum);
                exit(1);
            }
            db = server.db+dbid;
            continue; /* Read type again. */
        } else if (type == RDB_OPCODE_RESIZEDB) {
            /* RESIZEDB: Hint about the size of the keys in the currently
             * selected data base, in order to avoid useless rehashing. */
            uint64_t db_size, expires_size;
            if ((db_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
                goto eoferr;
            if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
                goto eoferr;
            dictExpand(db->dict,db_size);
            dictExpand(db->expires,expires_size);
            continue; /* Read type again. */
        } else if (type == RDB_OPCODE_AUX) {
            /* AUX: generic string-string fields. Use to add state to RDB
             * which is backward compatible. Implementations of RDB loading
             * are required to skip AUX fields they don't understand.
             *
             * An AUX field is composed of two strings: key and value. */
            robj *auxkey, *auxval;
            if ((auxkey = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
            if ((auxval = rdbLoadStringObject(rdb)) == NULL) goto eoferr;

            if (((char*)auxkey->ptr)[0] == '%') {
                /* All the fields with a name starting with '%' are considered
                 * information fields and are logged at startup with a log
                 * level of NOTICE. */
                serverLog(LL_NOTICE,"RDB '%s': %s",
                    (char*)auxkey->ptr,
                    (char*)auxval->ptr);
            } else if (!strcasecmp(auxkey->ptr,"repl-stream-db")) {
                if (rsi) rsi->repl_stream_db = atoi(auxval->ptr);
            } else if (!strcasecmp(auxkey->ptr,"repl-id")) {
                /* The ID is accepted only when it has exactly the expected
                 * length; the copy includes the terminating byte. */
                if (rsi && sdslen(auxval->ptr) == CONFIG_RUN_ID_SIZE) {
                    memcpy(rsi->repl_id,auxval->ptr,CONFIG_RUN_ID_SIZE+1);
                    rsi->repl_id_is_set = 1;
                }
            } else if (!strcasecmp(auxkey->ptr,"repl-offset")) {
                if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10);
            } else {
                /* We ignore fields we don't understand, as by AUX field
                 * contract. */
                serverLog(LL_DEBUG,"Unrecognized RDB AUX field: '%s'",
                    (char*)auxkey->ptr);
            }

            decrRefCount(auxkey);
            decrRefCount(auxval);
            continue; /* Read type again. */
        }

        /* Read key */
        if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
        /* Read value */
        if ((val = rdbLoadObject(type,rdb)) == NULL) goto eoferr;
        /* Check if the key already expired. This function is used when loading
         * an RDB file from disk, either at startup, or when an RDB was
         * received from the master. In the latter case, the master is
         * responsible for key expiry. If we would expire keys here, the
         * snapshot taken by the master may not be reflected on the slave. */
        if (server.masterhost == NULL && expiretime != -1 && expiretime < now) {
            decrRefCount(key);
            decrRefCount(val);
            continue;
        }
        /* Add the new object in the hash table */
        dbAdd(db,key,val);

        /* Set the expire time if needed */
        if (expiretime != -1) setExpire(NULL,db,key,expiretime);

        /* Only the key reference is released here; 'val' is not. */
        decrRefCount(key);
    }
    /* Verify the checksum if RDB version is >= 5 */
    if (rdbver >= 5 && server.rdb_checksum) {
        /* 'expected' must be captured before reading the stored checksum,
         * since it is taken from the stream state at this point. */
        uint64_t cksum, expected = rdb->cksum;

        if (rioRead(rdb,&cksum,8) == 0) goto eoferr;
        memrev64ifbe(&cksum);
        if (cksum == 0) {
            serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed.");
        } else if (cksum != expected) {
            serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now.");
            rdbExitReportCorruptRDB("RDB CRC error");
        }
    }
    return C_OK;

eoferr: /* unexpected end of file is handled here with a fatal exit */
    serverLog(LL_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
    rdbExitReportCorruptRDB("Unexpected EOF reading RDB file");
    return C_ERR; /* Just to avoid warning */
}

/* Like rdbLoadRio() but takes a filename instead of a rio stream. The
 * filename is open for reading and a rio stream object created in order
 * to do the actual loading. Moreover the ETA displayed in the INFO
 * output is initialized and finalized.
* * If you pass an 'rsi' structure initialied with RDB_SAVE_OPTION_INIT, the * loading code will fiil the information fields in the structure. */ int rdbLoad(char *filename, rdbSaveInfo *rsi) { FILE *fp; rio rdb; int retval; if ((fp = fopen(filename,"r")) == NULL) return C_ERR; startLoading(fp); rioInitWithFile(&rdb,fp); retval = rdbLoadRio(&rdb,rsi); fclose(fp); stopLoading(); return retval; } /* A background saving child (BGSAVE) terminated its work. Handle this. * This function covers the case of actual BGSAVEs. */ void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) { if (!bysignal && exitcode == 0) { serverLog(LL_NOTICE, "Background saving terminated with success"); server.dirty = server.dirty - server.dirty_before_bgsave; server.lastsave = time(NULL); server.lastbgsave_status = C_OK; } else if (!bysignal && exitcode != 0) { serverLog(LL_WARNING, "Background saving error"); server.lastbgsave_status = C_ERR; } else { mstime_t latency; serverLog(LL_WARNING, "Background saving terminated by signal %d", bysignal); latencyStartMonitor(latency); rdbRemoveTempFile(server.rdb_child_pid); latencyEndMonitor(latency); latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency); /* SIGUSR1 is whitelisted, so we have a way to kill a child without * tirggering an error conditon. */ if (bysignal != SIGUSR1) server.lastbgsave_status = C_ERR; } server.rdb_child_pid = -1; server.rdb_child_type = RDB_CHILD_TYPE_NONE; server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start; server.rdb_save_time_start = -1; /* Possibly there are slaves waiting for a BGSAVE in order to be served * (the first stage of SYNC is a bulk transfer of dump.rdb) */ updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK); } /* A background saving child (BGSAVE) terminated its work. Handle this. * This function covers the case of RDB -> Salves socket transfers for * diskless replication. 
*/
/* 'exitcode' is meaningful only when 'bysignal' is zero. The report
 * written by the child on the result pipe is read here, every slave in
 * SLAVE_STATE_WAIT_BGSAVE_END state that is missing from the report or
 * has a non-zero error code is freed, and both ends of the pipe are
 * closed. */
void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
    uint64_t *ok_slaves;

    if (!bysignal && exitcode == 0) {
        serverLog(LL_NOTICE,
            "Background RDB transfer terminated with success");
    } else if (!bysignal && exitcode != 0) {
        serverLog(LL_WARNING, "Background transfer error");
    } else {
        serverLog(LL_WARNING,
            "Background transfer terminated by signal %d", bysignal);
    }
    server.rdb_child_pid = -1;
    server.rdb_child_type = RDB_CHILD_TYPE_NONE;
    server.rdb_save_time_start = -1;

    /* If the child returns an OK exit code, read the set of slave client
     * IDs and the associated status code. We'll terminate all the slaves
     * in error state.
     *
     * If the process returned an error, consider the list of slaves that
     * can continue to be empty, so that it's just a special case of the
     * normal code path. */
    ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */
    ok_slaves[0] = 0;
    if (!bysignal && exitcode == 0) {
        int readlen = sizeof(uint64_t);

        /* First read just the count, then the (id, error) pairs. */
        if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) ==
                 readlen)
        {
            /* NOTE(review): 'readlen' is an int computed from a 64 bit
             * count received from the pipe - presumably always small since
             * the sender is our own child; confirm. */
            readlen = ok_slaves[0]*sizeof(uint64_t)*2;

            /* Make space for enough elements as specified by the first
             * uint64_t element in the array. */
            ok_slaves = zrealloc(ok_slaves,sizeof(uint64_t)+readlen);
            if (readlen &&
                read(server.rdb_pipe_read_result_from_child, ok_slaves+1,
                     readlen) != readlen)
            {
                /* An incomplete report is treated as an empty one. */
                ok_slaves[0] = 0;
            }
        }
    }

    close(server.rdb_pipe_read_result_from_child);
    close(server.rdb_pipe_write_result_to_parent);

    /* We can continue the replication process with all the slaves that
     * correctly received the full payload. Others are terminated. */
    listNode *ln;
    listIter li;

    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        client *slave = ln->value;

        if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
            uint64_t j;
            int errorcode = 0;

            /* Search for the slave ID in the reply. In order for a slave to
             * continue the replication process, we need to find it in the list,
             * and it must have an error code set to 0 (which means success). */
            for (j = 0; j < ok_slaves[0]; j++) {
                if (slave->id == ok_slaves[2*j+1]) {
                    errorcode = ok_slaves[2*j+2];
                    break; /* Found in slaves list. */
                }
            }
            /* j == ok_slaves[0] means the loop ended without a match. */
            if (j == ok_slaves[0] || errorcode != 0) {
                serverLog(LL_WARNING,
                "Closing slave %s: child->slave RDB transfer failed: %s",
                    replicationGetSlaveName(slave),
                    (errorcode == 0) ? "RDB transfer child aborted"
                                     : strerror(errorcode));
                freeClient(slave);
            } else {
                serverLog(LL_WARNING,
                "Slave %s correctly received the streamed RDB file.",
                    replicationGetSlaveName(slave));
                /* Restore the socket as non-blocking. */
                anetNonBlock(NULL,slave->fd);
                anetSendTimeout(NULL,slave->fd,0);
            }
        }
    }
    zfree(ok_slaves);

    updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET);
}

/* When a background RDB saving/transfer terminates, call the right handler. */
void backgroundSaveDoneHandler(int exitcode, int bysignal) {
    /* Dispatch on the kind of child recorded when it was started. */
    switch(server.rdb_child_type) {
    case RDB_CHILD_TYPE_DISK:
        backgroundSaveDoneHandlerDisk(exitcode,bysignal);
        break;
    case RDB_CHILD_TYPE_SOCKET:
        backgroundSaveDoneHandlerSocket(exitcode,bysignal);
        break;
    default:
        serverPanic("Unknown RDB child type.");
        break;
    }
}

/* Spawn an RDB child that writes the RDB to the sockets of the slaves
 * that are currently in SLAVE_STATE_WAIT_BGSAVE_START state.
 *
 * Returns C_ERR when another AOF/RDB child is running, when the result
 * pipe can't be created, or when fork() fails; C_OK in the parent once
 * the child has been started. */
int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
    int *fds;
    uint64_t *clientids;
    int numfds;
    listNode *ln;
    listIter li;
    pid_t childpid;
    long long start;
    int pipefds[2];

    if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;

    /* Before to fork, create a pipe that will be used in order to
     * send back to the parent the IDs of the slaves that successfully
     * received all the writes. */
    if (pipe(pipefds) == -1) return C_ERR;
    server.rdb_pipe_read_result_from_child = pipefds[0];
    server.rdb_pipe_write_result_to_parent = pipefds[1];

    /* Collect the file descriptors of the slaves we want to transfer
     * the RDB to, which are in WAIT_BGSAVE_START state. */
    fds = zmalloc(sizeof(int)*listLength(server.slaves));
    /* We also allocate an array of corresponding client IDs. This will
     * be useful for the child process in order to build the report
     * (sent via unix pipe) that will be sent to the parent. */
    clientids = zmalloc(sizeof(uint64_t)*listLength(server.slaves));
    numfds = 0;

    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        client *slave = ln->value;

        if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
            /* fds[k] and clientids[k] always refer to the same slave. */
            clientids[numfds] = slave->id;
            fds[numfds++] = slave->fd;
            replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset());
            /* Put the socket in blocking mode to simplify RDB transfer.
             * We'll restore it when the children returns (since duped socket
             * will share the O_NONBLOCK attribute with the parent). */
            anetBlock(NULL,slave->fd);
            anetSendTimeout(NULL,slave->fd,server.repl_timeout*1000);
        }
    }

    /* Create the child process. */
    openChildInfoPipe();
    start = ustime();
    if ((childpid = fork()) == 0) {
        /* Child */
        int retval;
        rio slave_sockets;

        rioInitWithFdset(&slave_sockets,fds,numfds);
        zfree(fds);

        closeListeningSockets(0);
        redisSetProcTitle("redis-rdb-to-slaves");

        retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi);
        if (retval == C_OK && rioFlush(&slave_sockets) == 0)
            retval = C_ERR;

        if (retval == C_OK) {
            size_t private_dirty = zmalloc_get_private_dirty(-1);

            if (private_dirty) {
                serverLog(LL_NOTICE,
                    "RDB: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }

            server.child_info_data.cow_size = private_dirty;
            sendChildInfo(CHILD_INFO_TYPE_RDB);

            /* If we are returning OK, at least one slave was served
             * with the RDB file as expected, so we need to send a report
             * to the parent via the pipe. The format of the message is:
             *
             * <len> <slave[0].id> <slave[0].error> ...
             *
             * len, slave IDs, and slave errors, are all uint64_t integers,
             * so basically the reply is composed of 64 bits for the len field
             * plus 2 additional 64 bit integers for each entry, for a total
             * of 'len' entries.
             *
             * The 'id' represents the slave's client ID, so that the master
             * can match the report with a specific slave, and 'error' is
             * set to 0 if the replication process terminated with a success
             * or the error code if an error occurred. */
            void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds));
            uint64_t *len = msg;
            uint64_t *ids = len+1;
            int j, msglen;

            *len = numfds;
            for (j = 0; j < numfds; j++) {
                *ids++ = clientids[j];
                *ids++ = slave_sockets.io.fdset.state[j];
            }

            /* Write the message to the parent. If we have no good slaves or
             * we are unable to transfer the message to the parent, we exit
             * with an error so that the parent will abort the replication
             * process with all the children that were waiting. */
            msglen = sizeof(uint64_t)*(1+2*numfds);
            if (*len == 0 ||
                write(server.rdb_pipe_write_result_to_parent,msg,msglen)
                != msglen)
            {
                retval = C_ERR;
            }
            zfree(msg);
        }
        zfree(clientids);
        rioFreeFdset(&slave_sockets);
        exitFromChild((retval == C_OK) ? 0 : 1);
    } else {
        /* Parent */
        if (childpid == -1) {
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                strerror(errno));

            /* Undo the state change. The caller will perform cleanup on
             * all the slaves in BGSAVE_START state, but an early call to
             * replicationSetupSlaveForFullResync() turned it into BGSAVE_END */
            listRewind(server.slaves,&li);
            while((ln = listNext(&li))) {
                client *slave = ln->value;
                int j;

                for (j = 0; j < numfds; j++) {
                    if (slave->id == clientids[j]) {
                        slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
                        break;
                    }
                }
            }
            /* No child will ever use the result pipe: close both ends. */
            close(pipefds[0]);
            close(pipefds[1]);
            closeChildInfoPipe();
        } else {
            server.stat_fork_time = ustime()-start;
            server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
            latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);

            serverLog(LL_NOTICE,"Background RDB transfer started by pid %d",
                childpid);
            server.rdb_save_time_start = time(NULL);
            server.rdb_child_pid = childpid;
            server.rdb_child_type = RDB_CHILD_TYPE_SOCKET;
            updateDictResizePolicy();
        }
        /* The parent's copies of the arrays are released on both the
         * success and the failure path. */
        zfree(clientids);
        zfree(fds);
        return (childpid == -1) ? C_ERR : C_OK;
    }
    return C_OK; /* Unreached. */
}

/* SAVE: perform a synchronous save by calling rdbSave() directly. It is
 * refused with an error while a background save child is running. */
void saveCommand(client *c) {
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }
    rdbSaveInfo rsi, *rsiptr;
    /* 'rsiptr' is NULL when no replication info should be persisted. */
    rsiptr = rdbPopulateSaveInfo(&rsi);
    if (rdbSave(server.rdb_filename,rsiptr) == C_OK) {
        addReply(c,shared.ok);
    } else {
        addReply(c,shared.err);
    }
}

/* BGSAVE [SCHEDULE] */
void bgsaveCommand(client *c) {
    int schedule = 0;

    /* The SCHEDULE option changes the behavior of BGSAVE when an AOF rewrite
     * is in progress. Instead of returning an error a BGSAVE gets scheduled. */
    if (c->argc > 1) {
        if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"schedule")) {
            schedule = 1;
        } else {
            addReply(c,shared.syntaxerr);
            return;
        }
    }

    rdbSaveInfo rsi, *rsiptr;
    rsiptr = rdbPopulateSaveInfo(&rsi);

    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
    } else if (server.aof_child_pid != -1) {
        if (schedule) {
            /* Only a flag is set here; the save itself is not started by
             * this function. */
            server.rdb_bgsave_scheduled = 1;
            addReplyStatus(c,"Background saving scheduled");
        } else {
            addReplyError(c,
                "An AOF log rewriting in progress: can't BGSAVE right now. "
                "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
                "possible.");
        }
    } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {
        addReplyStatus(c,"Background saving started");
    } else {
        addReply(c,shared.err);
    }
}

/* Populate the rdbSaveInfo structure used to persist the replication
 * information inside the RDB file.
Currently the structure explicitly
 * contains just the currently selected DB from the master stream, however
 * if the rdbSave*() family functions receive a NULL rsi structure also
 * the Replication ID/offset is not saved. The function populates 'rsi'
 * that is normally stack-allocated in the caller, returns the populated
 * pointer if the instance has a valid master client, otherwise NULL
 * is returned, and the RDB saving will not persist any replication related
 * information. */
rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) {
    rdbSaveInfo defaults = RDB_SAVE_INFO_INIT;

    /* Always start from the default values, whatever the outcome. */
    *rsi = defaults;

    if (server.masterhost == NULL && server.repl_backlog != NULL) {
        /* Master with a replication backlog. Without a backlog the
         * instance isn't in any replication chain and the info would be
         * useless: a connecting slave would trigger a full
         * synchronization, with a new replid and replid2 cleared.
         *
         * When server.slaveseldb is -1 this master didn't apply any write
         * command after a full synchronization, so DB 0 is used: this
         * lets a restarted slave reload the replication ID/offset, and
         * it is safe because the next write command must generate a
         * SELECT statement. */
        if (server.slaveseldb == -1) {
            rsi->repl_stream_db = 0;
        } else {
            rsi->repl_stream_db = server.slaveseldb;
        }
    } else if (server.master != NULL) {
        /* Slave with a connected master: take the currently selected DB
         * from the master client. */
        rsi->repl_stream_db = server.master->db->id;
    } else if (server.cached_master != NULL) {
        /* Disconnected slave with a cached master. The slave increments
         * master_repl_offset only with data arriving from the master, so
         * the offset in the cached master is still valid. */
        rsi->repl_stream_db = server.cached_master->db->id;
    } else {
        /* No source for the replication info: nothing to persist. */
        return NULL;
    }
    return rsi;
}