diff options
Diffstat (limited to 'src')
402 files changed, 24727 insertions, 4983 deletions
diff --git a/src/blob/blob_fileops.c b/src/blob/blob_fileops.c new file mode 100644 index 00000000..713e7e83 --- /dev/null +++ b/src/blob/blob_fileops.c @@ -0,0 +1,352 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/fop.h" + +/* + * __blob_file_create -- + * Blobs are organized in a directory structure consisting of + * <DB_HOME>/__db_bl/<blob_sub_dir>/. Below that, the blob_id + * is used to construct a path to the blob file, and to name + * the blob file. blob_id=1 would result in __db.bl001. + * blob_id=12002 would result in 012/__db.bl012002. + * + * A new id is generated into *blob_id and the file is created under + * that id. *fhpp receives the file handle on success and is NULL + * on failure. + * + * PUBLIC: int __blob_file_create __P + * PUBLIC: ((DBC *, DB_FH **, db_seq_t *)); + */ +int +__blob_file_create(dbc, fhpp, blob_id) + DBC *dbc; + DB_FH **fhpp; + db_seq_t *blob_id; +{ + DB *dbp; + DB_FH *fhp; + ENV *env; + int ret; + char *ppath; + const char *dir; + + dbp = dbc->dbp; + env = dbp->env; + fhp = *fhpp = NULL; + ppath = NULL; + dir = NULL; + DB_ASSERT(env, !DB_IS_READONLY(dbc->dbp)); + + if ((ret = __blob_generate_id(dbp, dbc->txn, blob_id)) != 0) + goto err; + + if ((ret = __blob_id_to_path( + env, dbp->blob_sub_dir, *blob_id, &ppath)) != 0) + goto err; + + if ((ret = __fop_create(env, dbc->txn, + &fhp, ppath, &dir, DB_APP_BLOB, env->db_mode, + (F_ISSET(dbc->dbp, DB_AM_NOT_DURABLE) ?
DB_LOG_NOT_DURABLE : 0))) + != 0) { + __db_errx(env, DB_STR_A("0228", + "Error creating blob file: %llu.", "%llu"), + (unsigned long long)*blob_id); + goto err; + } + +err: if (ppath != NULL) + __os_free(env, ppath); + if (ret == 0) + *fhpp = fhp; + return (ret); +} + +/* + * __blob_file_close -- + * + * PUBLIC: int __blob_file_close __P ((DBC *, DB_FH *, u_int32_t)); + */ +int +__blob_file_close(dbc, fhp, flags) + DBC *dbc; + DB_FH *fhp; + u_int32_t flags; +{ + ENV *env; + int ret, t_ret; + + env = dbc->env; + ret = t_ret = 0; + if (fhp != NULL) { + /* Only sync if the file was open for writing. */ + if (LF_ISSET(DB_FOP_WRITE)) + t_ret = __os_fsync(env, fhp); + ret = __os_closehandle(env, fhp); + if (t_ret != 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __blob_file_delete -- + * Delete a blob file. + * + * PUBLIC: int __blob_file_delete __P((DBC *, db_seq_t)); + */ +int +__blob_file_delete(dbc, blob_id) + DBC *dbc; + db_seq_t blob_id; +{ + ENV *env; + char *blob_name, *full_path; + int ret; + + env = dbc->dbp->env; + blob_name = full_path = NULL; + + if ((ret = __blob_id_to_path( + env, dbc->dbp->blob_sub_dir, blob_id, &blob_name)) != 0) { + __db_errx(env, DB_STR_A("0229", + "Failed to construct path for blob file %llu.", + "%llu"), (unsigned long long)blob_id); + goto err; + } + + /* Log the file remove event. 
*/ + if (!IS_REAL_TXN(dbc->txn)) { + if ((ret = __db_appname( + env, DB_APP_BLOB, blob_name, NULL, &full_path)) != 0) + goto err; + ret = __os_unlink(env, full_path, 0); + } else { + ret = __fop_remove( + env, dbc->txn, NULL, blob_name, NULL, DB_APP_BLOB, 0); + } + + if (ret != 0) { + __db_errx(env, DB_STR_A("0230", + "Failed to remove blob file while deleting: %s.", + "%s"), blob_name); + goto err; + } + +err: if (blob_name != NULL) + __os_free(env, blob_name); + if (full_path != NULL) + __os_free(env, full_path); + return (ret); +} + +/* + * __blob_file_open -- + * + * PUBLIC: int __blob_file_open + * PUBLIC: __P((DB *, DB_FH **, db_seq_t, u_int32_t, int)); + */ +int +__blob_file_open(dbp, fhpp, blob_id, flags, printerr) + DB *dbp; + DB_FH **fhpp; + db_seq_t blob_id; + u_int32_t flags; + int printerr; +{ + ENV *env; + int ret; + u_int32_t oflags; + char *path, *ppath; + + env = dbp->env; + *fhpp = NULL; + ppath = path = NULL; + oflags = 0; + + if ((ret = __blob_id_to_path( + env, dbp->blob_sub_dir, blob_id, &ppath)) != 0) + goto err; + + if ((ret = __db_appname( + env, DB_APP_BLOB, ppath, NULL, &path)) != 0) { + __db_errx(env, DB_STR_A("0231", + "Failed to get path to blob file: %llu.", "%llu"), + (unsigned long long)blob_id); + goto err; + } + + if (LF_ISSET(DB_FOP_READONLY) || DB_IS_READONLY(dbp)) + oflags |= DB_OSO_RDONLY; + if ((ret = __os_open(env, path, 0, oflags, 0, fhpp)) != 0) { + /* + * In replication it is possible to try to read a blob file + * that has been deleted. In that case do not print an error. 
+ */ + if (printerr == 1) { + __db_errx(env, DB_STR_A("0232", + "Error opening blob file: %s.", "%s"), path); + } + goto err; + } + +err: if (path != NULL) + __os_free(env, path); + if (ppath != NULL) + __os_free(env, ppath); + return (ret); +} + +/* + * __blob_file_read -- + * + * PUBLIC: int __blob_file_read + * PUBLIC: __P((ENV *, DB_FH *, DBT *, off_t, u_int32_t)); + */ +int +__blob_file_read(env, fhp, dbt, offset, size) + ENV *env; + DB_FH *fhp; + DBT *dbt; + off_t offset; + u_int32_t size; +{ + int ret; + size_t bytes; + void *buf; + + bytes = 0; + buf = NULL; + + if ((ret = __os_seek(env, fhp, 0, 0, offset)) != 0) + goto err; + + if (F_ISSET(dbt, DB_DBT_USERCOPY)) { + if ((ret = __os_malloc(env, size, &buf)) != 0) + goto err; + } else + buf = dbt->data; + + if ((ret = __os_read(env, fhp, buf, size, &bytes)) != 0) { + __db_errx(env, DB_STR("0233", "Error reading blob file.")); + goto err; + } + /* + * It is okay to read off the end of the file, in which case less bytes + * will be returned than requested. This is also how the code behaves + * in the DB_DBT_PARTIAL API. 
+ */ + dbt->size = (u_int32_t)bytes; + + if (F_ISSET(dbt, DB_DBT_USERCOPY) && dbt->size != 0) { + ret = env->dbt_usercopy( + dbt, 0, buf, dbt->size, DB_USERCOPY_SETDATA); + } + +err: if (buf != NULL && buf != dbt->data) + __os_free(env, buf); + return (ret); +} + +/* + * __blob_file_write -- + * + * PUBLIC: int __blob_file_write + * PUBLIC: __P((DBC *, DB_FH *, DBT *, + * PUBLIC: off_t, db_seq_t, off_t *, u_int32_t)); + */ +int +__blob_file_write(dbc, fhp, buf, offset, blob_id, file_size, flags) + DBC *dbc; + DB_FH *fhp; + DBT *buf; + off_t offset; + db_seq_t blob_id; + off_t *file_size; + u_int32_t flags; +{ + ENV *env; + off_t size, write_offset; + char *dirname, *name; + int ret, blob_lg; + size_t data_size; + void *ptr; + + env = dbc->env; + dirname = name = NULL; + size = 0; + write_offset = offset; + DB_ASSERT(env, !DB_IS_READONLY(dbc->dbp)); + DB_ASSERT(env, fhp != NULL); + + /* File size is used to tell if the write is extending the file. */ + size = *file_size; + + if (DBENV_LOGGING(env)) { + if ((ret = __log_get_config( + env->dbenv, DB_LOG_BLOB, &blob_lg)) != 0) + goto err; + if (blob_lg == 0 && !REP_ON(env)) + LF_SET(DB_FOP_PARTIAL_LOG); + if (!LF_ISSET(DB_FOP_CREATE) && (size <= offset)) + LF_SET(DB_FOP_APPEND); + } + + if ((ret = __blob_id_to_path( + env, dbc->dbp->blob_sub_dir, blob_id, &name)) != 0) + goto err; + + if ((ret = __dbt_usercopy(env, buf)) != 0) + goto err; + + /* + * If the write overwrites some of the file, and writes off the end + * of the file, break the write into two writes, one that overwrites + * data, and an append. Otherwise if the write is aborted, the + * data written past the end of the file will not be erased. 
+ */ + if (offset < size && (offset + buf->size) > size) { + ptr = buf->data; + data_size = (size_t)(size - offset); + if ((ret = __fop_write_file(env, dbc->txn, name, dirname, + DB_APP_BLOB, fhp, offset, ptr, data_size, flags)) != 0) { + __db_errx(env, DB_STR_A("0235", + "Error writing blob file: %s.", "%s"), name); + goto err; + } + LF_SET(DB_FOP_APPEND); + ptr = (u_int8_t *)ptr + data_size; + data_size = buf->size - data_size; + write_offset = size; + } else { + if (!LF_ISSET(DB_FOP_CREATE) && (offset >= size)) + LF_SET(DB_FOP_APPEND); + ptr = buf->data; + data_size = buf->size; + } + + if ((ret = __fop_write_file(env, dbc->txn, name, dirname, + DB_APP_BLOB, fhp, write_offset, ptr, data_size, flags)) != 0) { + __db_errx(env, DB_STR_A("0236", + "Error writing blob file: %s.", "%s"), name); + goto err; + } + + if (LF_ISSET(DB_FOP_SYNC_WRITE)) + if ((ret = __os_fsync(env, fhp)) != 0) + goto err; + + /* Update the size of the file. */ + if ((offset + (off_t)buf->size) > size) + *file_size = offset + (off_t)buf->size; + +err: if (name != NULL) + __os_free(env, name); + + return (ret); +} diff --git a/src/blob/blob_page.c b/src/blob/blob_page.c new file mode 100644 index 00000000..96a2b59b --- /dev/null +++ b/src/blob/blob_page.c @@ -0,0 +1,374 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/fop.h" + +/* + * Blob file data item code. + * + * Blob file data entries are stored on linked lists of pages. The initial + * reference is a structure with an encoded version of the path where the file + * is stored. The blob file contains only the users data. + */ + +/* + * __blob_bulk -- + * Dump blob file into buffer. 
+ * The space requirements have already been checked, if the blob is + * larger than UINT32MAX then DB_BUFFER_SMALL would have already + * been returned. + * PUBLIC: int __blob_bulk + * PUBLIC: __P((DBC *, u_int32_t, db_seq_t, u_int8_t *)); + */ +int +__blob_bulk(dbc, len, blob_id, dp) + DBC *dbc; + u_int32_t len; + db_seq_t blob_id; + u_int8_t *dp; +{ + DBT dbt; + DB_FH *fhp; + ENV *env; + int ret, t_ret; + + env = dbc->dbp->env; + fhp = NULL; + memset(&dbt, 0, sizeof(dbt)); + F_SET(&dbt, DB_DBT_USERMEM); + dbt.ulen = len; + dbt.data = (void *)dp; + + if ((ret = __blob_file_open( + dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0) + goto err; + + if ((ret = __blob_file_read(env, fhp, &dbt, 0, len)) != 0) + goto err; + + /* Close any open file descriptors. */ +err: if (fhp != NULL) { + t_ret = __blob_file_close(dbc, fhp, 0); + if (ret == 0) + ret = t_ret; + } + return (ret); +} + +/* + * __blob_get -- + * Get a blob file item. Analogous to db_overflow.c:__db_goff. + * + * PUBLIC: int __blob_get __P((DBC *, + * PUBLIC: DBT *, db_seq_t, off_t, void **, u_int32_t *)); + */ +int +__blob_get(dbc, dbt, blob_id, file_size, bpp, bpsz) + DBC *dbc; + DBT *dbt; + db_seq_t blob_id; + off_t file_size; + void **bpp; + u_int32_t *bpsz; +{ + DB_FH *fhp; + ENV *env; + int ret, t_ret; + u_int32_t needed, start, tlen; + + env = dbc->dbp->env; + fhp = NULL; + ret = 0; + + /* + * Blobs larger than UINT32_MAX can only be read using + * the DB_STREAM API, or the DB_DBT_PARTIAL API. 
+ */ + if (file_size > UINT32_MAX) { + if (!F_ISSET(dbt, DB_DBT_PARTIAL)) { + dbt->size = UINT32_MAX; + ret = DB_BUFFER_SMALL; + goto err; + } else + tlen = UINT32_MAX; + } else + tlen = (u_int32_t)file_size; + + if (((ret = __db_alloc_dbt( + env, dbt, tlen, &needed, &start, bpp, bpsz)) != 0) || needed == 0) + goto err; + dbt->size = needed; + + if ((ret = __blob_file_open( + dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0) + goto err; + + if ((ret = __blob_file_read(env, fhp, dbt, dbt->doff, needed)) != 0) + goto err; + + /* Close any open file descriptors. */ +err: if (fhp != NULL) { + t_ret = __blob_file_close(dbc, fhp, 0); + if (ret == 0) + ret = t_ret; + } + /* Does the dbt need to be cleaned on error? */ + return (ret); +} + +/* + * __blob_put -- + * Put a blob file item. + * + * PUBLIC: int __blob_put __P(( + * PUBLIC: DBC *, DBT *, db_seq_t *, off_t *size, DB_LSN *)); + */ +int +__blob_put(dbc, dbt, blob_id, size, plsn) + DBC *dbc; + DBT *dbt; + db_seq_t *blob_id; + off_t *size; + DB_LSN *plsn; +{ + DBT partial; + DB_FH *fhp; + ENV *env; + int ret, t_ret; + off_t offset; + + env = dbc->dbp->env; + fhp = NULL; + offset = 0; + DB_ASSERT(env, blob_id != NULL); + DB_ASSERT(env, *blob_id == 0); + + ZERO_LSN(*plsn); + + /* If the id didn't refer to an existing blob generate a new one. */ + if ((ret = __blob_file_create(dbc, &fhp, blob_id)) != 0) + goto err; + + /* + * If doing a partial put with dbt->doff == 0, then treat like + * a normal put. Otherwise write NULLs into the file up to doff, which + * is required by the PARTIAL API. Since the file is being created, + * its size is always 0. 
+ */ + DB_ASSERT(env, *size == 0); + if (F_ISSET(dbt, DB_DBT_PARTIAL) && dbt->doff > 0) { + memset(&partial, 0, sizeof(partial)); + if ((ret = __os_malloc(env, dbt->doff, &partial.data)) != 0) + goto err; + memset(partial.data, 0, dbt->doff); + partial.size = dbt->doff; + ret = __blob_file_write( + dbc, fhp, &partial, 0, *blob_id, size, DB_FOP_CREATE); + offset = dbt->doff; + __os_free(env, partial.data); + if (ret != 0) + goto err; + } + + if ((ret = __blob_file_write( + dbc, fhp, dbt, offset, *blob_id, size, DB_FOP_CREATE)) != 0) + goto err; + + /* Close any open file descriptors. */ +err: if (fhp != NULL) { + t_ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE); + if (ret == 0) + ret = t_ret; + } + return (ret); +} + +/* + * __blob_repl -- + * Replace a blob file contents. It would be nice if this could be done + * by truncating the file and writing in the new data, but undoing a + * truncate would require a lot of logging, so it is performed by + * deleting the old blob file, and creating a new one. + * + * PUBLIC: int __blob_repl __P((DBC *, DBT *, db_seq_t, db_seq_t *,off_t *)); + */ +int +__blob_repl(dbc, nval, blob_id, new_blob_id, size) + DBC *dbc; + DBT *nval; + db_seq_t blob_id; + db_seq_t *new_blob_id; + off_t *size; +{ + DBT partial; + DB_FH *fhp, *new_fhp; + DB_LSN lsn; + ENV *env; + int ret, t_ret; + off_t current, old_size; + + fhp = new_fhp = NULL; + *new_blob_id = 0; + old_size = *size; + env = dbc->env; + memset(&partial, 0, sizeof(partial)); + + /* + * Handling partial replace. + * 1. doff > blob file size : Pad the end of the blob file with NULLs + * up to doff, then append the data. + * 2. doff == size: Write the data to the existing blob file. + * 3. dlen == size: Write the data to the existing blob file. + * 4. Create a new blob file. Copy old blob data up to doff + * to the new file. Append the new data. Append data + * from the old file from doff + dlen to the end of the + * old file to the new file. Delete the old file. 
+ */ + if (F_ISSET(nval, DB_DBT_PARTIAL)) { + if ((nval->doff > *size) || + ((nval->doff == *size) || (nval->dlen == nval->size))) { + /* Open the file for appending. */ + if ((ret = __blob_file_open( + dbc->dbp, &fhp, blob_id, 0, 1)) != 0) + goto err; + *new_blob_id = blob_id; + + /* Pad the end of the blob with NULLs. */ + if (nval->doff > *size) { + partial.size = nval->doff - (u_int32_t)*size; + if ((ret = __os_malloc( + env, partial.size, &partial.data)) != 0) + goto err; + memset(partial.data, 0, partial.size); + if ((ret = __blob_file_write(dbc, fhp, + &partial, *size, blob_id, size, 0)) != 0) + goto err; + } + + /* Write in the data. */ + if ((ret = __blob_file_write(dbc, fhp, + nval, nval->doff, blob_id, size, 0)) != 0) + goto err; + + /* Close the file */ + ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE); + fhp = NULL; + if (ret != 0) + goto err; + } else { + /* Open the old blob file. */ + if ((ret = __blob_file_open( + dbc->dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0) + goto err; + /* Create the new blob file. */ + if ((ret = __blob_file_create( + dbc, &new_fhp, new_blob_id)) != 0) + goto err; + + *size = 0; + /* Copy data to the new file up to doff. */ + if (nval->doff != 0) { + partial.ulen = partial.size = nval->doff; + if ((ret = __os_malloc( + env, partial.ulen, &partial.data)) != 0) + goto err; + if ((ret = __blob_file_read( + env, fhp, &partial, 0, partial.size)) != 0) + goto err; + if ((ret = __blob_file_write( + dbc, new_fhp, &partial, 0, + *new_blob_id, size, DB_FOP_CREATE)) != 0) + goto err; + } + + /* Write the partial data into the new file. */ + if ((ret = __blob_file_write( + dbc, new_fhp, nval, nval->doff, + *new_blob_id, size, DB_FOP_CREATE)) != 0) + goto err; + + /* Copy remaining blob data into the new file. 
*/ + current = nval->doff + nval->dlen; + while (current < old_size) { + if (partial.ulen < MEGABYTE) { + if ((ret = __os_realloc(env, + MEGABYTE, &partial.data)) != 0) + goto err; + partial.size = partial.ulen = MEGABYTE; + } + if ((old_size - current) < partial.ulen) { + partial.size = + (u_int32_t)(old_size - current); + } else + partial.size = MEGABYTE; + + if ((ret = __blob_file_read(env, fhp, + &partial, current, partial.size)) != 0) + goto err; + if ((ret = __blob_file_write( + dbc, new_fhp, &partial, *size, + *new_blob_id, size, DB_FOP_CREATE)) != 0) + goto err; + current += partial.size; + } + + /* Close the old file. */ + ret = __blob_file_close(dbc, fhp, 0); + fhp = NULL; + if (ret != 0) + goto err; + + /* Delete the old blob file. */ + if ((ret = __blob_del(dbc, blob_id)) != 0) + goto err; + } + goto err; + } + + if ((ret = __blob_del(dbc, blob_id)) != 0) + goto err; + + *size = 0; + if ((ret = __blob_put(dbc, nval, new_blob_id, size, &lsn)) != 0) + goto err; + +err: if (fhp != NULL) { + t_ret = __blob_file_close(dbc, fhp, DB_FOP_WRITE); + if (ret == 0) + ret = t_ret; + } + if (new_fhp != NULL) { + t_ret = __blob_file_close(dbc, new_fhp, DB_FOP_WRITE); + if (ret == 0) + ret = t_ret; + } + if (partial.data != NULL) + __os_free(env, partial.data); + return (ret); +} + +/* + * __blob_del -- + * Delete a blob file. The onpage record is handled separately.. + * + * PUBLIC: int __blob_del __P((DBC *, db_seq_t)); + */ +int +__blob_del(dbc, blob_id) + DBC *dbc; + db_seq_t blob_id; +{ + int ret; + + ret = __blob_file_delete(dbc, blob_id); + + return (ret); +} diff --git a/src/blob/blob_stream.c b/src/blob/blob_stream.c new file mode 100644 index 00000000..ab21aa0f --- /dev/null +++ b/src/blob/blob_stream.c @@ -0,0 +1,283 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/fop.h" + +static int __db_stream_close __P((DB_STREAM *, u_int32_t)); +static int __db_stream_read + __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t)); +static int __db_stream_size __P((DB_STREAM *, db_off_t *, u_int32_t)); +static int __db_stream_write __P((DB_STREAM *, DBT *, db_off_t, u_int32_t)); + +/* + * __db_stream_init + * DB_STREAM initializer. + * + * PUBLIC: int __db_stream_init __P((DBC *, DB_STREAM **, u_int32_t)); + */ +int +__db_stream_init(dbc, dbsp, flags) + DBC *dbc; + DB_STREAM **dbsp; + u_int32_t flags; +{ + DB_STREAM *dbs; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + off_t size; + + dbs = NULL; + env = dbc->env; + + if ((ret = __os_malloc(env, sizeof(DB_STREAM), &dbs)) != 0) + return (ret); + memset(dbs, 0, sizeof(DB_STREAM)); + + ENV_ENTER(env, ip); + /* Should the copy be transient? */ + if ((ret = __dbc_idup(dbc, &dbs->dbc, DB_POSITION)) != 0) + goto err; + dbs->flags = flags; + + /* + * Make sure we have a write lock on the db record if writing + * to the blob. 
+ */ + if (F_ISSET(dbs, DB_FOP_WRITE)) + F_SET(dbc, DBC_RMW); + + if ((ret = __dbc_get_blob_id(dbs->dbc, &dbs->blob_id)) != 0) { + if (ret == EINVAL) + __db_errx(env, DB_STR("0211", + "Error, cursor does not point to a blob.")); + goto err; + } + + if ((ret = __dbc_get_blob_size(dbs->dbc, &size)) != 0) + goto err; + dbs->file_size = size; + + if ((ret = __blob_file_open( + dbs->dbc->dbp, &dbs->fhp, dbs->blob_id, flags, 1)) != 0) + goto err; + ENV_LEAVE(env, ip); + + dbs->close = __db_stream_close; + dbs->read = __db_stream_read; + dbs->size = __db_stream_size; + dbs->write = __db_stream_write; + + *dbsp = dbs; + return (0); + +err: if (dbs != NULL && dbs->dbc != NULL) + (void)__dbc_close(dbs->dbc); + ENV_LEAVE(env, ip); + if (dbs != NULL) + __os_free(env, dbs); + return (ret); +} + +/* + * __db_stream_close -- + * + * DB_STREAM->close + */ +static int +__db_stream_close(dbs, flags) + DB_STREAM *dbs; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbs->dbc->env; + + if ((ret = __db_fchk(env, "DB_STREAM->close", flags, 0)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + ret = __db_stream_close_int(dbs); + + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __db_stream_close_int -- + * Close a DB_STREAM object. 
+ * + * PUBLIC: int __db_stream_close_int __P ((DB_STREAM *)); + */ +int +__db_stream_close_int(dbs) + DB_STREAM *dbs; +{ + DBC *dbc; + ENV *env; + int ret, t_ret; + + dbc = dbs->dbc; + env = dbc->env; + + ret = __blob_file_close(dbc, dbs->fhp, dbs->flags); + + if ((t_ret = __dbc_close(dbs->dbc)) != 0 && ret == 0) + ret = t_ret; + + __os_free(env, dbs); + + return (ret); +} + +/* + * __db_stream_read -- + * + * DB_STREAM->read + */ +static int +__db_stream_read(dbs, data, offset, size, flags) + DB_STREAM *dbs; + DBT *data; + db_off_t offset; + u_int32_t size; + u_int32_t flags; +{ + DBC *dbc; + ENV *env; + int ret; + u_int32_t needed, start; + + dbc = dbs->dbc; + env = dbc->dbp->env; + ret = 0; + + if ((ret = __db_fchk(env, "DB_STREAM->read", flags, 0)) != 0) + return (ret); + + if (F_ISSET(data, DB_DBT_PARTIAL)) { + ret = EINVAL; + __db_errx(env, DB_STR("0212", + "Error, do not use DB_DBT_PARTIAL with DB_STREAM.")); + goto err; + } + + if (offset > dbs->file_size) { + data->size = 0; + goto err; + } + + if ((ret = __db_alloc_dbt( + env, data, size, &needed, &start, NULL, NULL)) != 0) + goto err; + data->size = needed; + + if (needed == 0) + goto err; + + ret = __blob_file_read(env, dbs->fhp, data, offset, size); + +err: return (ret); +} + +/* + * __db_stream_size -- + * + * DB_STREAM->size + */ +static int +__db_stream_size(dbs, size, flags) + DB_STREAM *dbs; + db_off_t *size; + u_int32_t flags; +{ + int ret; + + if ((ret = __db_fchk(dbs->dbc->env, "DB_STREAM->size", flags, 0)) != 0) + return (ret); + + *size = dbs->file_size; + + return (0); +} + +/* + * __db_stream_write -- + * + * DB_STREAM->write + */ +static int +__db_stream_write(dbs, data, offset, flags) + DB_STREAM *dbs; + DBT *data; + db_off_t offset; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + off_t file_size; + u_int32_t wflags; + + env = dbs->dbc->env; + + if ((ret = __db_fchk( + env, "DB_STREAM->write", flags, DB_STREAM_SYNC_WRITE)) != 0) + return (ret); + + if (F_ISSET(dbs, 
DB_FOP_READONLY)) { + ret = EINVAL; + __db_errx(env, DB_STR("0213", "Error, blob is read only.")); + return (ret); + } + if (F_ISSET(data, DB_DBT_PARTIAL)) { + ret = EINVAL; + __db_errx(env, DB_STR("0214", + "Error, do not use DB_DBT_PARTIAL with DB_STREAM.")); + return (ret); + } + if (offset < 0 ) { + ret = EINVAL; + __db_errx(env, DB_STR_A("0215", + "Error, invalid offset value: %lld", "%lld"), + (long long)offset); + return (ret); + } + /* Catch overflow. */ + if (offset + (db_off_t)data->size < offset) { + ret = EINVAL; + __db_errx(env, DB_STR_A("0216", + "Error, this write will exceed the maximum blob size: %lu %lld", + "%lu %lld"), (u_long)data->size, (long long)offset); + return (ret); + } + + ENV_ENTER(env, ip); + wflags = dbs->flags; + if (LF_ISSET(DB_STREAM_SYNC_WRITE)) + wflags |= DB_FOP_SYNC_WRITE; + file_size = dbs->file_size; + if ((ret = __blob_file_write(dbs->dbc, dbs->fhp, + data, offset, dbs->blob_id, &file_size, wflags)) != 0) + goto err; + if (file_size != dbs->file_size) { + dbs->file_size = file_size; + if ((ret = __dbc_set_blob_size(dbs->dbc, dbs->file_size)) != 0) + goto err; + } +err: ENV_LEAVE(env, ip); + + return (ret); +} diff --git a/src/blob/blob_util.c b/src/blob/blob_util.c new file mode 100644 index 00000000..b2e3474b --- /dev/null +++ b/src/blob/blob_util.c @@ -0,0 +1,1189 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/db_am.h" +#include "dbinc/blob.h" +#include "dbinc/fop.h" +#include "dbinc/txn.h" +#include "dbinc_auto/sequence_ext.h" + +static int __blob_open_meta_db __P(( + DB *, DB_TXN *, DB **, DB_SEQUENCE **, int, int)); +static int __blob_clean_dir + __P((ENV *, DB_TXN *, const char *, const char *, int)); +static int __blob_copy_dir __P((DB *, const char *, const char *)); + +#define BLOB_ID_KEY "blob_id" +#define BLOB_SEQ_DB_NAME "blob_id_seq" +#define BLOB_DIR_ID_KEY "blob_dir_id" +#define BLOB_DIR_SEQ_DB_NAME "blob_dir_id_seq" + +/* + * __blob_make_sub_dir -- + * Create the name of the subdirectory in the blob directory + * for the given database file and subdatabase ids. + * + * PUBLIC: int __blob_make_sub_dir __P((ENV *, char **, db_seq_t, db_seq_t)); + */ +int +__blob_make_sub_dir(env, blob_sub_dir, file_id, db_id) + ENV *env; + char **blob_sub_dir; + db_seq_t file_id; + db_seq_t db_id; +{ + char fname[MAX_BLOB_PATH_SZ], dname[MAX_BLOB_PATH_SZ]; + int ret; + size_t len; + + *blob_sub_dir = NULL; + memset(fname, 0, MAX_BLOB_PATH_SZ); + memset(dname, 0, MAX_BLOB_PATH_SZ); + + if (db_id == 0 && file_id == 0) + return (0); + + if (db_id < 0 || file_id < 0) + return (EINVAL); + + /* The master db has no subdb id. */ + if (db_id != 0) + (void)snprintf(dname, MAX_BLOB_PATH_SZ, + "%s%llu", BLOB_DIR_PREFIX, (unsigned long long)db_id); + (void)snprintf(fname, MAX_BLOB_PATH_SZ, "%s%llu", + BLOB_DIR_PREFIX, (unsigned long long)file_id); + + len = strlen(fname) + (db_id ? 
strlen(dname) : 0) + 3; + if ((ret = __os_malloc(env, len, blob_sub_dir)) != 0) + goto err; + if (db_id != 0) + (void)sprintf(*blob_sub_dir, "%s%c%s%c", fname, + PATH_SEPARATOR[0], dname, PATH_SEPARATOR[0]); + else + (void)sprintf(*blob_sub_dir, "%s%c", fname, PATH_SEPARATOR[0]); + + return (0); + +err: if (*blob_sub_dir != NULL) + __os_free(env, *blob_sub_dir); + + return (ret); +} + +/* + * __blob_make_meta_fname -- + * Construct a (usually partial) path name of a blob metadata data file. + * It usually is relative to the environment home directory; only when a + * blob directory is configured and is an absolute path does this make a + * full path. + * + * When dbp is set it constructs the blob metadata filename for that db; + * otherwise it constructs the environment-wide directory id filename. + * + * PUBLIC: int __blob_make_meta_fname __P((ENV *, DB *, char **)); + */ +int +__blob_make_meta_fname(env, dbp, meta_fname) + ENV *env; + DB *dbp; + char **meta_fname; +{ + char *fname, *sub_dir; + int ret; + size_t len; + + fname = NULL; + len = strlen(BLOB_META_FILE_NAME) + 1; + if (dbp == NULL) { + sub_dir = ""; + } else { + sub_dir = dbp->blob_sub_dir; + DB_ASSERT(env, sub_dir != NULL); + len += strlen(sub_dir); + } + if ((ret = __os_malloc(env, len, &fname)) != 0) + goto err; + + snprintf(fname, len, "%s%s", sub_dir, BLOB_META_FILE_NAME); + *meta_fname = fname; + return (0); +err: + if (fname != NULL) + __os_free(env, fname); + return (ret); +} + +/* + * __blob_get_dir -- + * Get the root directory of this database's blob files. + * *dirp is set to NULL, and 0 is returned, when the database has no + * blob subdirectory. + * + * PUBLIC: int __blob_get_dir __P((DB *, char **)); + */ +int +__blob_get_dir(dbp, dirp) + DB *dbp; + char **dirp; +{ + char *blob_dir; + int ret; + + /* + * Start from NULL so the err path never tests an indeterminate + * pointer if __db_appname fails without storing a result. + */ + blob_dir = NULL; + *dirp = NULL; + + if (dbp->blob_sub_dir == NULL) + return (0); + + /* Get the path of the blob directory for this database.
*/ + if ((ret = __db_appname(dbp->env, + DB_APP_BLOB, dbp->blob_sub_dir, NULL, &blob_dir)) != 0) + goto err; + + *dirp = blob_dir; + return (0); + +err: if (blob_dir != NULL) + __os_free(dbp->env, blob_dir); + + return (ret); +} + +/* + * __blob_open_meta_db -- + * Open or create a blob meta database. This can be either + * the environment-wide db used to generate blob directory ids (__db1), or + * the per-db db used to generate blob ids (__db.bl001). + * + * A non-zero "file" selects the environment-wide database, zero the + * per-db one. When "create" is zero and the meta database file does + * not exist, ENOENT is returned. *meta_db and *seq are only assigned + * on success. + */ +static int +__blob_open_meta_db(dbp, txn, meta_db, seq, file, create) + DB *dbp; + DB_TXN *txn; + DB **meta_db; + DB_SEQUENCE **seq; + int file; + int create; +{ +#ifdef HAVE_64BIT_TYPES + ENV *env; + DB *blob_meta_db; + DBT key; + DB_SEQUENCE *blob_seq; + DB_THREAD_INFO *ip; + DB_TXN *local_txn; + char *fullname, *fname, *dname, *path; + int free_paths, ret, use_txn; + u_int32_t flags; + + flags = 0; + fullname = fname = NULL; + blob_meta_db = NULL; + blob_seq = NULL; + local_txn = NULL; + env = dbp->env; + free_paths = use_txn = 0; + memset(&key, 0, sizeof(DBT)); + + /* + * Get the directory of the database, the meta db file name, + * and the sub-db name. + * file: blob directory/meta-file-name + * else: blob directory/per-db-blobdir/meta-file-name + */ + if (file) { + key.data = BLOB_DIR_ID_KEY; + key.size = (u_int32_t)strlen(BLOB_DIR_ID_KEY); + dname = BLOB_DIR_SEQ_DB_NAME; + fname = BLOB_META_FILE_NAME; + } else { + key.data = BLOB_ID_KEY; + key.size = (u_int32_t)strlen(BLOB_ID_KEY); + dname = BLOB_SEQ_DB_NAME; + /* + * file is 0 on this branch, so the per-db name is always the + * one wanted. Any non-zero return is a failure, as at every + * other call site in this file. + */ + if ((ret = + __blob_make_meta_fname(env, dbp, &fname)) != 0) + goto err; + free_paths = 1; + if (dbp->open_flags & DB_THREAD) + LF_SET(DB_THREAD); + } + + if ((ret = __db_appname(env, DB_APP_BLOB, fname, NULL, &fullname)) != 0) + goto err; + + path = fullname; +#ifdef DB_WIN32 + /* + * Absolute paths on windows can result in it creating a "C" or "D" + * directory in the working directory.
+ */ + if (__os_abspath(path)) + path += 2; +#endif + /* + * Create the blob, database file, and database name directories. The + * mkdir isn't logged, so __fop_create_recover needs to do this as well. + */ + if (__os_exists(env, fullname, NULL) != 0) { + if (!create) { + ret = ENOENT; + goto err; + } else if ((ret = __db_mkpath(env, path)) != 0) + goto err; + } + + if ((ret = __db_create_internal(&blob_meta_db, env, 0)) != 0) + goto err; + + if (create) + LF_SET(DB_CREATE); + + /* Disable blobs in the blob meta databases themselves. */ + if ((ret = __db_set_blob_threshold(blob_meta_db, 0, 0)) != 0) + goto err; + + /* + * To avoid concurrency issues, the blob meta database is + * opened and operated on in a local transaction. The one + * exception is when the blob meta database is created in the + * same txn as the parent db. Then the blob meta database + * shares the given txn, so if the txn is rolled back, the + * creation of the blob meta database will also be rolled back. + */ + if (!file && IS_REAL_TXN(dbp->cur_txn)) + use_txn = 1; + + ENV_GET_THREAD_INFO(env, ip); + if (IS_REAL_TXN(txn)) { + if (use_txn) + local_txn = txn; + else { + if ((ret = __txn_begin( + env, ip, NULL, &local_txn, DB_IGNORE_LEASE)) != 0) + goto err; + } + } + if ((ret = __db_open(blob_meta_db, ip, local_txn, fname, dname, + DB_BTREE, flags | DB_INTERNAL_BLOB_DB, 0, PGNO_BASE_MD)) != 0) + goto err; + + /* Open the sequence that holds the blob ids. */ + if ((ret = db_sequence_create(&blob_seq, blob_meta_db, 0)) != 0) + goto err; + + /* No-op if already initialized, 0 is an invalid value for blob ids. 
*/ + if ((ret = __seq_initial_value(blob_seq, 1)) != 0) + goto err; + if ((ret = __seq_open(blob_seq, local_txn, &key, flags)) != 0) + goto err; + + if (local_txn != NULL && use_txn == 0 && + (ret = __txn_commit(local_txn, 0)) != 0) { + local_txn = NULL; + goto err; + } + __os_free(env, fullname); + if (free_paths) + __os_free(env, fname); + *meta_db = blob_meta_db; + *seq = blob_seq; + return (0); + +err: + if (fullname) + __os_free(env, fullname); + if (fname != NULL && free_paths) + __os_free(env, fname); + if (local_txn != NULL && use_txn == 0) + (void)__txn_abort(local_txn); + if (blob_seq != NULL) + (void)__seq_close(blob_seq, 0); + if (blob_meta_db != NULL) + (void)__db_close(blob_meta_db, NULL, 0); + return (ret); + +#else /*HAVE_64BIT_TYPES*/ + __db_errx(dbp->env, DB_STR("0217", + "library build did not include support for blobs")); + return (DB_OPNOTSUP); +#endif +} + +/* + * __blob_generate_dir_ids -- + * + * Generate the unique ids used to create a blob directory for the database. + * Only one argument is needed. Files with one database only need the + * file id. The master database only needs the file id, and + * subdatabases inherit the file id from the master, so they only need the + * subdatabase id. 
+ * + * PUBLIC: int __blob_generate_dir_ids + * PUBLIC: __P((DB *, DB_TXN *, db_seq_t *)); + */ +int +__blob_generate_dir_ids(dbp, txn, id) + DB *dbp; + DB_TXN *txn; + db_seq_t *id; +{ + DB *blob_meta_db; + DB_SEQUENCE *blob_seq; + int ret; + u_int32_t flags; + +#ifdef HAVE_64BIT_TYPES + flags = 0; + blob_meta_db = NULL; + blob_seq = NULL; + + if ((ret = __blob_open_meta_db( + dbp, txn, &blob_meta_db, &blob_seq, 1, 1)) != 0) + goto err; + + if (IS_REAL_TXN(txn)) + LF_SET(DB_AUTO_COMMIT | DB_TXN_NOSYNC); + + /* An id that is already non-zero is kept; only 0 requests a new one. */ + DB_ASSERT(dbp->env, id != NULL); + if (*id == 0) { + if ((ret = __seq_get(blob_seq, 0, 1, id, flags)) != 0) + goto err; + } + +err: if (blob_seq != NULL) + (void)__seq_close(blob_seq, 0); + if (blob_meta_db != NULL) + (void)__db_close(blob_meta_db, NULL, 0); + return (ret); +#else /*HAVE_64BIT_TYPES*/ + /* + * Quiet only the arguments that are unused here. dbp is dereferenced + * below, so it must not be overwritten. + */ + COMPQUIET(txn, NULL); + COMPQUIET(id, NULL); + __db_errx(dbp->env, DB_STR("0218", + "library build did not include support for blobs")); + return (DB_OPNOTSUP); +#endif +} + +/* + * __blob_generate_id -- + * Generate a new blob ID. + * + * PUBLIC: int __blob_generate_id __P((DB *, DB_TXN *, db_seq_t *)); + */ +int +__blob_generate_id(dbp, txn, blob_id) + DB *dbp; + DB_TXN *txn; + db_seq_t *blob_id; +{ +#ifdef HAVE_64BIT_TYPES + DB_TXN *ltxn; + int ret; + u_int32_t flags; + flags = DB_IGNORE_LEASE; + ltxn = NULL; + + if (dbp->blob_seq == NULL) { + if ((ret = __blob_open_meta_db(dbp, txn, + &dbp->blob_meta_db, &dbp->blob_seq, 0, 1)) != 0) + goto err; + } + + /* + * If this is the opening transaction of the database, use it instead + * of auto commit. Otherwise it could deadlock with the transaction + * used to open the blob meta database in __blob_open_meta_db.
+ */ + if (IS_REAL_TXN(dbp->cur_txn)) + ltxn = txn; + + if (IS_REAL_TXN(txn) && ltxn == NULL) + LF_SET(DB_AUTO_COMMIT | DB_TXN_NOSYNC); + + if ((ret = __seq_get(dbp->blob_seq, ltxn, 1, blob_id, flags)) != 0) + goto err; + +err: return (ret); +#else /*HAVE_64BIT_TYPES*/ + COMPQUIET(blob_id, NULL); + __db_errx(dbp->env, DB_STR("0219", + "library build did not include support for blobs")); + return (DB_OPNOTSUP); +#endif +} + +/* + * __blob_highest_id + * + * Returns the highest id in the blob meta database. + * + * PUBLIC: int __blob_highest_id __P((DB *, DB_TXN *, db_seq_t *)); + */ +int +__blob_highest_id(dbp, txn, id) + DB *dbp; + DB_TXN *txn; + db_seq_t *id; +{ +#ifdef HAVE_64BIT_TYPES + int ret; + + *id = 0; + if (dbp->blob_sub_dir == NULL) { + if ((ret = __blob_make_sub_dir(dbp->env, &dbp->blob_sub_dir, + dbp->blob_file_id, dbp->blob_sdb_id)) != 0) + goto err; + } + if (dbp->blob_seq == NULL) { + ret = __blob_open_meta_db(dbp, txn, + &dbp->blob_meta_db, &dbp->blob_seq, 0, 0); + /* + * It is not an error if the blob meta database does not + * exist. + */ + if (ret == ENOENT) + ret = 0; + if (ret != 0) + goto err; + } + + ret = __seq_get(dbp->blob_seq, txn, 0, id, DB_CURRENT); +err: + return (ret); +#else /*HAVE_64BIT_TYPES*/ + COMPQUIET(id, NULL); + __db_errx(dbp->env, DB_STR("0245", + "library build did not include support for blobs")); + return (DB_OPNOTSUP); +#endif +} + +/* + * __blob_calculate_dirs + * + * Use a blob id to to determine the path below the blob subdirectory in + * which the blob file is located. Assumes enough space exists in the path + * variable to hold the path. + * + * PUBLIC: void __blob_calculate_dirs __P((db_seq_t, char *, int *, int *)); + */ +void +__blob_calculate_dirs(blob_id, path, len, depth) + db_seq_t blob_id; + char *path; + int *len; + int *depth; +{ + int i; + db_seq_t factor, tmp; + + /* Calculate the subdirectories from the blob id. 
*/ + factor = 1; + for ((*depth) = 0, tmp = blob_id/BLOB_DIR_ELEMS; + tmp != 0; tmp = tmp/BLOB_DIR_ELEMS, (*depth)++) + factor *= BLOB_DIR_ELEMS; + + for (i = (*depth); i > 0; i--) { + tmp = (blob_id / factor) % BLOB_DIR_ELEMS; + factor /= BLOB_DIR_ELEMS; + (*len) += sprintf(path + (*len), + "%03llu%c", (unsigned long long)tmp, PATH_SEPARATOR[0]); + } +} + +/* + * __blob_id_to_path -- + * Generate the file name and blob specific part of the path for a particular + * blob_id. The __db_appname API is used to generate a fully qualified path. + * The caller must deallocate the path. + * + * PUBLIC: int __blob_id_to_path __P((ENV *, const char *, db_seq_t, char **)); + */ +int +__blob_id_to_path(env, blob_sub_dir, blob_id, ppath) + ENV *env; + const char *blob_sub_dir; + db_seq_t blob_id; + char **ppath; +{ + char *path, *tmp_path; + int depth, name_len, ret; + size_t len; + + name_len = 0; + path = tmp_path = *ppath = NULL; + + if (blob_id < 1) { + ret = EINVAL; + goto err; + } + + len = MAX_BLOB_PATH_SZ + strlen(blob_sub_dir) + 1; + if ((ret = __os_malloc(env, len, &path)) != 0) + goto err; + + memset(path, 0, len); + name_len += sprintf(path, "%s", blob_sub_dir); + + __blob_calculate_dirs(blob_id, path, &name_len, &depth); + + /* + * Populate the file name. Ensure there are 3 digits for each directory + * level (even if they are 0). + */ + (void)sprintf(path + name_len, "%s%0*llu", + BLOB_FILE_PREFIX, (depth + 1) * 3, (unsigned long long)blob_id); + + /* If this is the first file in the directory, ensure it exists. 
*/ + if (blob_id % BLOB_DIR_ELEMS == 0 && depth > 0) { + if ((ret = __db_appname( + env, DB_APP_BLOB, path, NULL, &tmp_path)) != 0 ) + goto err; + + if ((ret = __db_mkpath(env, tmp_path)) != 0) { + __db_errx(env, DB_STR("0221", + "Error creating blob directory.")); + ret = EINVAL; + goto err; + } + __os_free(env, tmp_path); + } + + *ppath = path; + return (0); + +err: + if (tmp_path != NULL) + __os_free(env, tmp_path); + if (path != NULL) + __os_free(env, path); + + return (ret); +} + +/* + * __blob_str_to_id + * + * If the given string is a positive number, it returns it as a signed + * 64 bit integer. Otherwise the number is returned as 0. + * + * PUBLIC: int __blob_str_to_id __P((ENV *, const char **, db_seq_t *)); + */ +int +__blob_str_to_id(env, path, id) + ENV *env; + const char **path; + db_seq_t *id; +{ + db_seq_t i; + const char *p; + char buf[2]; + + p = *path; + i = 10; + *id = 0; + buf[1] = '\0'; + while (p[0] >= '0' && p[0] <= '9') { + *id *= i; + buf[0] = p[0]; + *id += atoi(buf); + if (*id < 0) { + __db_errx(env, DB_STR("0246", + "Blob id integer overflow.")); + return (EINVAL); + } + p++; + } + *path = p; + return (0); +} + +/* + * __blob_path_to_dir_ids -- + * Get the file and subdatabase ids from a path to a blob file + * or a path in the blob directory structure. Skips the + * subdatabase directory id if sdb_id is NULL. + * + * PUBLIC: int __blob_path_to_dir_ids + * PUBLIC: __P((ENV *, const char *, db_seq_t *, db_seq_t *)); + */ +int +__blob_path_to_dir_ids(env, path, file_id, sdb_id) + ENV *env; + const char *path; + db_seq_t *file_id; + db_seq_t *sdb_id; +{ + int ret; + size_t len; + const char *p; + + *file_id = 0; + if (sdb_id != NULL) + *sdb_id = 0; + ret = 0; + p = path; + + /* + * The blob file and subdatabase directories are of the form __db###, + * so search the string for any directories that match that form. 
+ */ + len = strlen(path); + do { + p = strstr(p, BLOB_DIR_PREFIX); + if (p == NULL || p > (path + len + 4)) + return (ret); + p += 4; + } while (p[0] < '0' || p[0] > '9'); + + /* The file id should be next in the path. */ + if ((ret = __blob_str_to_id(env, &p, file_id)) != 0) + return (ret); + + /* Quit now if a subdatabase argument was not passed. */ + if (sdb_id == NULL) + return (ret); + + p = strstr(p, BLOB_DIR_PREFIX); + /* It is okay for the path not to include a sdb_id. */ + if (p == NULL || p > (path + 4 + len)) + return (ret); + + p += 4; + ret = __blob_str_to_id(env, &p, sdb_id); + + return (ret); +} + +/* + * __blob_salvage -- + * + * Print a blob file during salvage. The function assumes the DBT already has + * a buffer large enough to hold "size" bytes. + * + * PUBLIC: int __blob_salvage __P((ENV *, db_seq_t, off_t, size_t, + * PUBLIC: db_seq_t, db_seq_t, DBT *)); + */ +int +__blob_salvage(env, blob_id, offset, size, file_id, sdb_id, dbt) + ENV *env; + db_seq_t blob_id; + off_t offset; + size_t size; + db_seq_t file_id; + db_seq_t sdb_id; + DBT *dbt; +{ + DB_FH *fhp; + char *blob_sub_dir, *dir, *path; + int ret; + size_t bytes; + + blob_sub_dir = dir = path = NULL; + fhp = NULL; + + if (file_id == 0 && sdb_id == 0) { + ret = ENOENT; + goto err; + } + + if ((ret = __blob_make_sub_dir( + env, &blob_sub_dir, file_id, sdb_id)) != 0) + goto err; + + if ((ret = __blob_id_to_path(env, blob_sub_dir, blob_id, &dir)) != 0) + goto err; + + if ((ret = __db_appname(env, DB_APP_BLOB, dir, NULL, &path)) != 0) + goto err; + + if ((ret = __os_open(env, path, 0, DB_OSO_RDONLY, 0, &fhp)) != 0) + goto err; + + if ((ret = __os_seek(env, fhp, 0, 0, offset)) != 0) + goto err; + + if ((ret = __os_read(env, fhp, dbt->data, size, &bytes)) != 0) + goto err; + + dbt->size = (u_int32_t)bytes; + if (bytes != size) + ret = EIO; + +err: if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (dir != NULL) + __os_free(env, dir); + if (path != NULL) + __os_free(env, path); + if 
(blob_sub_dir != NULL) + __os_free(env, blob_sub_dir); + return (ret); +} + +/* + * __blob_vrfy -- + * + * Checks that a blob file for the given blob id exists, and is the given size. + * + * PUBLIC: int __blob_vrfy __P((ENV *, db_seq_t, off_t, + * PUBLIC: db_seq_t, db_seq_t, db_pgno_t, u_int32_t)); + */ +int +__blob_vrfy(env, blob_id, blob_size, file_id, sdb_id, pgno, flags) + ENV *env; + db_seq_t blob_id; + off_t blob_size; + db_seq_t file_id; + db_seq_t sdb_id; + db_pgno_t pgno; + u_int32_t flags; +{ + DB_FH *fhp; + char *blob_sub_dir, *dir, *path; + int isdir, ret; + off_t actual_size; + u_int32_t mbytes, bytes; + + blob_sub_dir = dir = path = NULL; + fhp = NULL; + isdir = 0; + ret = DB_VERIFY_BAD; + + if ((ret = __blob_make_sub_dir( + env, &blob_sub_dir, file_id, sdb_id)) != 0) + goto err; + + if (__blob_id_to_path(env, blob_sub_dir, blob_id, &dir) != 0) { + EPRINT((env, DB_STR_A("0222", + "Page %lu: Error getting path to blob file for %llu", + "%lu %llu"), (u_long)pgno, (unsigned long long)blob_id)); + goto err; + } + if (__db_appname(env, DB_APP_BLOB, dir, NULL, &path) != 0) { + EPRINT((env, DB_STR_A("0223", + "Page %lu: Error getting path to blob file for %llu", + "%lu %llu"), (u_long)pgno, (unsigned long long)blob_id)); + goto err; + } + if ((__os_exists(env, path, &isdir)) != 0 || isdir != 0) { + EPRINT((env, DB_STR_A("0224", + "Page %lu: blob file does not exist at %s", + "%lu %s"), (u_long)pgno, path)); + goto err; + } + if (__os_open(env, path, 0, DB_OSO_RDONLY, 0, &fhp) != 0) { + EPRINT((env, DB_STR_A("0225", + "Page %lu: Error opening blob file at %s", + "%lu %s"), (u_long)pgno, path)); + goto err; + } + if (__os_ioinfo(env, path, fhp, &mbytes, &bytes, NULL) != 0) { + EPRINT((env, DB_STR_A("0226", + "Page %lu: Error getting blob file size at %s", + "%lu %s"), (u_long)pgno, path)); + goto err; + } + + actual_size = ((off_t)mbytes * (off_t)MEGABYTE) + bytes; + if (blob_size != actual_size) { + EPRINT((env, DB_STR_A("0227", +"Page %lu: blob file size 
does not match size in database record: %llu %llu", + "%lu %llu %llu"), (u_long)pgno, + (unsigned long long)actual_size, + (unsigned long long)blob_size)); + goto err; + } + + ret = 0; + +err: if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (dir != NULL) + __os_free(env, dir); + if (path != NULL) + __os_free(env, path); + if (blob_sub_dir != NULL) + __os_free(env, blob_sub_dir); + return (ret); +} + +/* + * __blob_del_hierarchy -- + * + * Deletes the entire blob directory. Used by replication. + * + * PUBLIC: int __blob_del_hierarchy __P((ENV *)); + */ +int +__blob_del_hierarchy(env) + ENV *env; +{ + int ret; + char *blob_dir; + + blob_dir = NULL; + + if ((ret = __db_appname(env, DB_APP_BLOB, NULL, NULL, &blob_dir)) != 0) + goto err; + + if ((ret = __blob_clean_dir(env, NULL, blob_dir, NULL, 0)) != 0) + goto err; + +err: if (blob_dir != NULL) + __os_free(env, blob_dir); + return (ret); +} + +/* + * __blob_del_all -- + * + * Deletes all the blob files and meta databases in a database's blob + * directory. Does not delete the directories if the delete is transactionally + * protected, since there is no current way to undo a directory delete in case + * the operation is aborted. + * + * PUBLIC: int __blob_del_all __P((DB *, DB_TXN *, int)); + */ +int +__blob_del_all(dbp, txn, istruncate) + DB *dbp; + DB_TXN *txn; + int istruncate; +{ +#ifdef HAVE_64BIT_TYPES + ENV *env; + char *path; + int isdir, ret; + + env = dbp->env; + path = NULL; + ret = 0; + + if (dbp->blob_sub_dir == NULL) { + if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir, + dbp->blob_file_id, dbp->blob_sdb_id)) != 0) + goto err; + } + + /* Do nothing if blobs are not enabled. */ + if (dbp->blob_sub_dir == NULL || + (dbp->blob_file_id == 0 && dbp->blob_sdb_id == 0)) + goto err; + + if ((ret = __blob_get_dir(dbp, &path)) != 0) + goto err; + + /* Close the blob meta data databases, they are about to be deleted. 
*/ + if (!istruncate) { + if (dbp->blob_seq != NULL) { + if ((ret = __seq_close(dbp->blob_seq, 0)) != 0) + goto err; + dbp->blob_seq = NULL; + } + if (dbp->blob_meta_db != NULL) { + if ((ret = + __db_close(dbp->blob_meta_db, NULL, 0)) != 0) + goto err; + dbp->blob_meta_db = NULL; + } + } + + /* + * The blob directory may not exist if blobs were enabled, + * but none were created. + */ + if (__os_exists(env, path, &isdir) != 0) + goto err; + + if ((ret = __blob_clean_dir( + env, txn, path, dbp->blob_sub_dir, istruncate)) != 0) + goto err; + + if (!IS_REAL_TXN(txn) && !istruncate) { + if ((ret = __os_rmdir(env, path)) != 0) + goto err; + } + +err: if (path != NULL) + __os_free(env, path); + return (ret); + +#else /*HAVE_64BIT_TYPES*/ + __db_errx(dbp->env, DB_STR("0220", + "library build did not include support for blobs")); + return (DB_OPNOTSUP); +#endif + +} + +/* + * __blob_clean_dir -- + * + * Delete all files in the given directory, and all files + * in all sub-directories. Does not remove directories if the operation is + * transactionally protected. + */ +static int +__blob_clean_dir(env, txn, dir, subdir, istruncate) + ENV *env; + DB_TXN *txn; + const char *dir; + const char *subdir; + int istruncate; +{ + DB *meta; + DB_THREAD_INFO *ip; + char *blob_dir, **dirs, *fname, full_path[DB_MAXPATHLEN], *local_path; + int count, i, isdir, ret, t_ret; + + count = 0; + dirs = NULL; + fname = NULL; + meta = NULL; + + /* Get a list of all files in the directory. */ + if ((ret = __os_dirlist(env, dir, 1, &dirs, &count)) != 0) { + if (ret == ENOENT) + ret = 0; + goto err; + } + + for (i = 0; i < count; i++) { + (void)sprintf(full_path, "%s%c%s%c", + dir, PATH_SEPARATOR[0], dirs[i], '\0'); + + if (__os_exists(env, full_path, &isdir) != 0) + continue; + + /* If it is a directory, clean it. Else remove the file. */ + if (isdir) { + if ((ret = __blob_clean_dir( + env, txn, full_path, subdir, istruncate)) != 0) + goto err; + /* Delete the top directory. 
*/ + if (!IS_REAL_TXN(txn)) { + if ((ret = __os_rmdir(env, full_path)) != 0) + goto err; + } + } else if (strcmp(dirs[i], BLOB_META_FILE_NAME) == 0 ) { + /* Ignore the meta db when truncating. */ + if (istruncate) + continue; + blob_dir = (env->dbenv->db_blob_dir != NULL ? + env->dbenv->db_blob_dir : BLOB_DEFAULT_DIR); + if ((fname = strstr(full_path, blob_dir)) == NULL) + goto err; + fname += strlen(blob_dir) + 1; + if ((ret = __db_create_internal(&meta, env, 0)) != 0) + goto err; + ENV_GET_THREAD_INFO(env, ip); + if ((ret = __db_remove_int(meta, + ip, txn, fname, NULL, 0)) != 0) + goto err; + /* + * Closing the local DB handle releases the transaction + * locks, but those have to remain until the + * transaction is resolved, so NULL the DB locker. + * See __env_dbremove_pp for more details. + */ + if (IS_REAL_TXN(txn)) + meta->locker = NULL; + if ((t_ret = __db_close( + meta, NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + meta = NULL; + if (ret != 0) + goto err; + } else { + if (!IS_REAL_TXN(txn)) + ret = __os_unlink(env, full_path, 0); + else { + local_path = (subdir == NULL ? full_path : + strstr(full_path, subdir)); + if (local_path != NULL) + ret = __fop_remove(env, txn, NULL, + local_path, NULL, DB_APP_BLOB, 0); + } + if (ret != 0) + goto err; + } + } +err: if (meta != NULL) { + if ((t_ret = __db_close( + meta, NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + } + if (dirs != NULL) + __os_dirfree(env, dirs, count); + + return (ret); +} + +/* + * __blob_copy_all -- + * Copy all files in the blob directory. + * + * PUBLIC: int __blob_copy_all __P((DB*, const char *, u_int32_t)); + */ +int __blob_copy_all(dbp, target, flags) + DB *dbp; + const char *target; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + char *blobdir, *fullname, *metafname, new_target[DB_MAXPATHLEN]; + const char *path; + int ret; + + env = dbp->env; + blobdir = NULL; + fullname = NULL; + metafname = NULL; + ret = 0; + + /* Do nothing if blobs are not enabled. 
*/ + if (dbp->blob_sub_dir == NULL || dbp->blob_threshold == 0) + return (0); + + /* Create the directory structure in the target directory. */ + if (env->dbenv->db_blob_dir != NULL) + path = env->dbenv->db_blob_dir; + else + path = BLOB_DEFAULT_DIR; + + /* + * Default blob directory will be maintained in the target + * directory only when it is backing up a single directory. + */ + (void)snprintf(new_target, sizeof(new_target), "%s%c%s%c%c", + target, PATH_SEPARATOR[0], LF_ISSET(DB_BACKUP_SINGLE_DIR) ? + BLOB_DEFAULT_DIR : path, PATH_SEPARATOR[0], '\0'); + path = new_target; +#ifdef DB_WIN32 + /* + * Absolute paths on windows can result in it creating a "C" or "D" + * directory in the working directory. + */ + if (__os_abspath(path)) + path += 2; +#endif + if ((ret = __db_mkpath(env, path)) != 0) + goto err; + + /* Copy the directory id database. */ + if ((ret = __blob_make_meta_fname(env, NULL, &metafname)) != 0) + goto err; + if ((ret = __db_appname(env, + DB_APP_BLOB, metafname, NULL, &fullname)) != 0) + goto err; + path = fullname; + /* Remove env home from the full path of directory id database. */ + if (!__os_abspath(fullname) && + env->db_home != NULL && (env->db_home)[0] != '\0') + path += (strlen(env->db_home) + 1); + ENV_GET_THREAD_INFO(env, ip); + + if ((ret = __db_dbbackup( + dbp->dbenv, ip, path, new_target, 0, 0, metafname)) != 0) + goto err; + + if ((ret = __blob_get_dir(dbp, &blobdir)) != 0) + goto err; + + /* + * The blob directory may not exist if blobs were enabled, + * but none were created. 
+ */ + if (__os_exists(env, blobdir, NULL) != 0) + goto err; + + (void)sprintf(new_target + strlen(new_target), + "%s%c", dbp->blob_sub_dir, '\0'); + if ((ret = __blob_copy_dir(dbp, blobdir, new_target)) != 0) + goto err; + +err: if (blobdir != NULL) + __os_free(env, blobdir); + if (metafname != NULL) + __os_free(env, metafname); + if (fullname != NULL) + __os_free(env, fullname); + return (ret); +} + +/* + * __blob_copy_dir -- + * Copy all files in the given directory, and all files + * in all sub-directories. + */ +static int +__blob_copy_dir(dbp, dir, target) + DB *dbp; + const char *dir; + const char *target; +{ + DB_THREAD_INFO *ip; + ENV *env; + char **dirs, full_path[DB_MAXPATHLEN], new_target[DB_MAXPATHLEN]; + int count, i, isdir, ret; + + env = dbp->env; + count = 0; + dirs = NULL; + + /* Create the directory sturcture in the target directory. */ + if ((ret = __db_mkpath(env, target)) != 0) + goto err; + + ENV_GET_THREAD_INFO(env, ip); + /* Get a list of all files in the directory. */ + if ((ret = __os_dirlist(env, dir, 1, &dirs, &count)) != 0) + goto err; + + for (i = 0; i < count; i++) { + (void)sprintf(full_path, "%s%c%s%c", + dir, PATH_SEPARATOR[0], dirs[i], '\0'); + + if (__os_exists(env, full_path, &isdir) != 0) + continue; + + /* + * If it is a directory, copy the files in it. + * Else if it is the meta database, call __db_dbbackup, else + * copy the file. 
+ */ + if (isdir) { + (void)sprintf(new_target, + "%s%c%s%c%c", target, PATH_SEPARATOR[0], + dirs[i], PATH_SEPARATOR[0], '\0'); + if ((ret = __blob_copy_dir( + dbp, full_path, new_target)) != 0) + goto err; + } else { + if (strcmp(dirs[i], BLOB_META_FILE_NAME) == 0) { + (void)sprintf(full_path, "%s%c%s%c", + dbp->blob_sub_dir, + PATH_SEPARATOR[0], dirs[i], '\0'); + if ((ret = __db_dbbackup(dbp->dbenv, ip, + full_path, target, 0, 0, + BLOB_META_FILE_NAME)) != 0) + goto err; + } else { + if ((ret = backup_data_copy( + dbp->dbenv, dirs[i], dir, target, 0)) != 0) + goto err; + } + } + } + +err: + if (dirs != NULL) + __os_dirfree(env, dirs, count); + return (ret); +} diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index b455ff23..be4c6b01 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -22,13 +22,16 @@ static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int)); static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t)); static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t)); static int __bam_merge __P((DBC *, - DBC *, u_int32_t, DBT *, DB_COMPACT *,int *)); -static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *)); + DBC *, u_int32_t, DBT *, DB_COMPACT *, int *, int *)); +static int __bam_merge_internal __P((DBC *, + DBC *, int, DB_COMPACT *, int *, int *)); static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *)); -static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *)); -static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *)); +static int __bam_merge_records __P((DBC *, + DBC *, u_int32_t, DB_COMPACT *, int *)); +static int __bam_truncate_internal_overflow __P((DBC *, + PAGE *, DB_COMPACT *, int *)); static int __bam_truncate_root_page __P((DBC *, - PAGE *, u_int32_t, DB_COMPACT *)); + PAGE *, u_int32_t, DB_COMPACT *, int *)); #ifdef HAVE_FTRUNCATE static int __bam_savekey __P((DBC *, int, DBT *)); @@ -145,13 +148,13 @@ __bam_csearch(dbc, start, sflag, level) * PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *)); */ int -__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) +__bam_compact_int(dbc, start, stop, factor, spanp, c_data, isdonep) DBC *dbc; DBT *start, *stop; u_int32_t factor; int *spanp; DB_COMPACT *c_data; - int *donep; + int *isdonep; { BTREE_CURSOR *cp, *ncp; DB *dbp; @@ -168,7 +171,7 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) int check_dups, check_trunc, clear_root, do_commit, isdone; int merged, next_p, pgs_done, ret, t_ret, tdone; -#ifdef DEBUG +#ifdef DEBUG_WOP #define CTRACE(dbc, location, t, start, f) do { \ DBT __trace; \ DB_SET_DBT(__trace, t, strlen(t)); \ @@ -182,8 +185,8 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) CTRACE(dbc, 
location, __buf, start, f); \ } while (0) #else -#define CTRACE(dbc, location, t, start, f) -#define PTRACE(dbc, location, p, start, f) +#define CTRACE(dbc, location, t, start, f) NOP_STATEMENT +#define PTRACE(dbc, location, p, start, f) NOP_STATEMENT #endif ndbc = NULL; @@ -551,11 +554,10 @@ retry: pg = NULL; if (ret != 0) goto err1; } - pgs_done++; - /* Get a fresh low numbered page. */ + /* Try to swap to a lower numbered page. */ if ((ret = __db_exchange_page(dbc, &cp->csp->page, ncp->csp->page, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + PGNO_INVALID, DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; @@ -598,8 +600,8 @@ retry: pg = NULL; merged = 0; for (epg = cp->sp; epg != cp->csp; epg++) { PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0); - if ((ret = __bam_merge_internal(dbc, - ndbc, LEVEL(epg->page), c_data, &merged)) != 0) + if ((ret = __bam_merge_internal(dbc, ndbc, + LEVEL(epg->page), c_data, &merged, &pgs_done)) != 0) break; if (merged) break; @@ -627,7 +629,7 @@ retry: pg = NULL; } PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0); - /* if we remove the next page, then we need its next locked */ + /* If we remove the next page, then we need its next locked. */ npgno = NEXT_PGNO(ncp->csp->page); if (npgno != PGNO_INVALID) { TRY_LOCK2(dbc, ndbc, npgno, @@ -637,9 +639,8 @@ retry: pg = NULL; } /*lint -e{794} */ if ((ret = __bam_merge(dbc, - ndbc, factor, stop, c_data, &isdone)) != 0) + ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0) goto err1; - pgs_done++; /* * __bam_merge could have freed our stack if it * deleted a page possibly collapsing the tree. @@ -722,8 +723,8 @@ retry: pg = NULL; /* Get a fresh low numbered page. 
*/ pgno = PGNO(pg); if ((ret = __db_exchange_page(dbc, - &cp->csp->page, NULL, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + &cp->csp->page, NULL, PGNO_INVALID, + DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; @@ -734,10 +735,7 @@ retry: pg = NULL; LOCK_INIT(next_lock); saved_pgno = PGNO_INVALID; pg = cp->csp->page; - if (pgno != PGNO(pg)) { - pgs_done++; - pgno = PGNO(pg); - } + pgno = PGNO(pg); } /* * If we are going to leave this parent commit @@ -752,7 +750,7 @@ retry: pg = NULL; goto next_page; } - /* If they have the same parent, just dup the cursor */ + /* If they have the same parent, just dup the cursor. */ if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0) goto err1; if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0) @@ -842,17 +840,15 @@ retry: pg = NULL; pgno = PGNO(pg); /* Get a fresh low numbered page. */ if ((ret = __db_exchange_page(dbc, &cp->csp->page, - npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + npg, PGNO_INVALID, + DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; LOCK_INIT(prev_lock); prev_pgno = PGNO_INVALID; pg = cp->csp->page; - if (pgno != PGNO(pg)) { - pgs_done++; - pgno = PGNO(pg); - } + pgno = PGNO(pg); } c_data->compact_pages_examine++; @@ -887,11 +883,9 @@ retry: pg = NULL; */ PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0); if ((ret = __bam_merge(dbc, - ndbc, factor, stop, c_data, &isdone)) != 0) + ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0) goto err1; - pgs_done++; - if ((ret = __TLPUT(dbc, nnext_lock)) != 0) goto err1; LOCK_INIT(nnext_lock); @@ -932,7 +926,7 @@ next_page: pg = NULL; if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0) goto err; - if (npgno != PGNO_INVALID && + if (npgno != PGNO_INVALID && !do_commit && (ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0) goto err; if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? 
STK_NOLOCK : 0)) != 0) @@ -1010,9 +1004,6 @@ err: /* if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) - ret = t_ret; - if (pg != NULL && (t_ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority) != 0) && ret == 0) @@ -1022,7 +1013,11 @@ err: /* dbc->thread_info, npg, dbc->priority) != 0) && ret == 0) ret = t_ret; -out: *donep = isdone; +out: + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + *isdonep = isdone; /* For OPD trees return if we did anything in the span variable. */ if (F_ISSET(dbc, DBC_OPD)) @@ -1035,12 +1030,13 @@ out: *donep = isdone; * __bam_merge -- do actual merging of leaf pages. */ static int -__bam_merge(dbc, ndbc, factor, stop, c_data, donep) +__bam_merge(dbc, ndbc, factor, stop, c_data, isdonep, pgs_donep) DBC *dbc, *ndbc; u_int32_t factor; DBT *stop; DB_COMPACT *c_data; - int *donep; + int *isdonep; + int *pgs_donep; { BTREE_CURSOR *cp, *ncp; DB *dbp; @@ -1064,9 +1060,9 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep) /* Find if the stopping point is on this page. 
*/ if (stop != NULL && stop->size != 0) { - if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0) + if ((ret = __bam_compact_isdone(dbc, stop, npg, isdonep)) != 0) return (ret); - if (*donep) + if (*isdonep) return (0); } @@ -1080,20 +1076,23 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep) ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) || (int)(P_FREESPACE(dbp, pg) - ((dbp->pgsize - P_OVERHEAD(dbp)) - - P_FREESPACE(dbp, npg))) < (int)factor) - ret = __bam_merge_records(dbc, ndbc, factor, c_data); - else + P_FREESPACE(dbp, npg))) < (int)factor) { + ret = __bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep); + } else { /*lint -e{794} */ free_page: ret = __bam_merge_pages(dbc, ndbc, c_data); + (*pgs_donep)++; + } return (ret); } static int -__bam_merge_records(dbc, ndbc, factor, c_data) +__bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep) DBC *dbc, *ndbc; u_int32_t factor; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BKEYDATA *bk, *tmp_bk; @@ -1126,8 +1125,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data) if (c_data->compact_truncate != PGNO_INVALID && PGNO(ncp->csp->page) > c_data->compact_truncate) { /* Get a fresh low numbered page. */ - if ((ret = __db_exchange_page(ndbc, - &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + if ((ret = __db_exchange_page(ndbc, &ncp->csp->page, + pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } @@ -1197,6 +1196,7 @@ __bam_merge_records(dbc, ndbc, factor, c_data) /* If we have hit the first record then there is nothing we can move. */ if (indx == 0) goto done; + (*pgs_donep)++; if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) { if (indx == nent) return (__bam_merge_pages(dbc, ndbc, c_data)); @@ -1237,7 +1237,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data) indx -= adj; } bk = GET_BKEYDATA(dbp, npg, indx); - len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len; + len = (B_TYPE(bk->type) == B_KEYDATA) ? bk->len : + ((B_TYPE(bk->type) == B_BLOB) ? 
BBLOB_DSIZE : BOVERFLOW_SIZE); if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) { if (F_ISSET(dbc, DBC_OPD)) { if (dbp->dup_compare == __bam_defcmp) @@ -1281,8 +1282,9 @@ noprefix: } while (indx != 0 && ninp[indx] == ninp[indx - adj]); bk = GET_BKEYDATA(dbp, npg, indx); - len = - (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len; + len = (B_TYPE(bk->type) == B_KEYDATA) ? + bk->len : ((B_TYPE(bk->type) == B_BLOB) ? + BBLOB_DSIZE : BOVERFLOW_SIZE); } /* @@ -1346,6 +1348,13 @@ no_check: is_dup = first_dup = next_dup = 0; BOVERFLOW_SIZE, &data, NULL)) != 0) goto err; break; + case B_BLOB: + data.size = BBLOB_SIZE; + data.data = bk; + if ((ret = __db_pitem(dbc, pg, + pind, BBLOB_SIZE, &data, NULL)) != 0) + goto err; + break; default: __db_errx(env, DB_STR_A("1022", "Unknown record format, page %lu, indx 0", @@ -1538,15 +1547,20 @@ err: return (ret); /* * __bam_merge_internal -- * Merge internal nodes of the tree. + * + * The first key of an internal page does not have a guaranteed- + * useful key. */ static int -__bam_merge_internal(dbc, ndbc, level, c_data, merged) +__bam_merge_internal(dbc, ndbc, level, c_data, merged, pgs_donep) DBC *dbc, *ndbc; int level; DB_COMPACT *c_data; int *merged; + int *pgs_donep; { BINTERNAL bi, *bip, *fip; + BOVERFLOW bo; BTREE_CURSOR *cp, *ncp; DB *dbp; DBT data, hdr; @@ -1579,7 +1593,6 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) dbmp = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; ncp = (BTREE_CURSOR *)ndbc->internal; - *merged = 0; ret = 0; /* @@ -1608,11 +1621,11 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) * Check for overflow keys on both pages while we have * them locked. 
*/ - if ((ret = - __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0) + if ((ret = __bam_truncate_internal_overflow(dbc, + pg, c_data, pgs_donep)) != 0) goto err; - if ((ret = - __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0) + if ((ret = __bam_truncate_internal_overflow(dbc, + npg, c_data, pgs_donep)) != 0) goto err; } @@ -1624,7 +1637,12 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) */ fip = NULL; if (TYPE(pg) == P_IBTREE) { - /* See where we run out of space. */ + /* See where we run out of space. This does not yet include + * whatever extra pages are needed if an overflow key is + * going to be added to one or more parent pages. It would be + * better to use as little of the key that as necessary, though + * the effort of determining that might not be worthwhile. + */ freespace = P_FREESPACE(dbp, pg); /* * The leftmost key of an internal page is not accurate. @@ -1704,12 +1722,37 @@ fits: memset(&bi, 0, sizeof(bi)); if (fip == NULL) { data.size = bip->len; data.data = bip->data; + } else if (fip->type == B_OVERFLOW) { + DB_ASSERT(dbc->env, + fip->len == sizeof(BOVERFLOW)); + /* Cast to "BOVERFLOW *" to calm down lint. */ + memmove(&bo, + (BOVERFLOW *)fip->data, sizeof(BOVERFLOW)); + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, bo.tlen, + bo.pgno, &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + data.size = sizeof(bo); + data.data = &bo; + } else if (fip->type == B_BLOB) { + /* Blobs should never appear as keys. */ + DB_ASSERT(dbc->env, + !(fip->type == B_BLOB && + TYPE(pg) == P_IBTREE)); } else { data.size = fip->len; data.data = fip->data; } bi.len = data.size; - B_TSET(bi.type, bip->type); + /* + * Set bi.type according to the data's type, to ensure + * that it is B_OVERLOW iff the data is BOVERFLOW. + */ + B_TSET(bi.type, fip == NULL ? 
bip->type : fip->type); bi.pgno = bip->pgno; bi.nrecs = bip->nrecs; hdr.data = &bi; @@ -1750,7 +1793,12 @@ fits: memset(&bi, 0, sizeof(bi)); if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0) goto err; pind++; - if (fip != NULL) { + /* add bip test so fortify does not complain */ + if (fip != NULL && bip != NULL) { + if (B_TYPE(bip->type) == B_OVERFLOW && + (ret = __db_doff(dbc, + ((BOVERFLOW *)bip->data)->pgno)) != 0) + goto err; /* reset size to be for the record being deleted. */ size = BINTERNAL_SIZE(bip->len); fip = NULL; @@ -1848,14 +1896,14 @@ fits: memset(&bi, 0, sizeof(bi)); PGNO(npg) > c_data->compact_truncate && ncp->csp != ncp->sp) { if ((ret = __db_exchange_page(ndbc, &ncp->csp->page, - pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } if (c_data->compact_truncate != PGNO_INVALID && PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) { if ((ret = __db_exchange_page(dbc, &cp->csp->page, ncp->csp->page, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } } @@ -1875,13 +1923,13 @@ err: cp->csp = save_csp; * We may or may not have a write lock on this page. */ static int -__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) +__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, pgs_donep) DBC *dbc; PAGE **ppg; u_int32_t factor; int have_lock; DB_COMPACT *c_data; - int *donep; + int *pgs_donep; { BOVERFLOW *bo; BTREE_CURSOR *cp; @@ -1896,15 +1944,19 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) DB_ASSERT(NULL, dbc != NULL); dbp = dbc->dbp; dbmp = dbp->mpf; + /* XXX Don't reserve any free bytes (Force 100% fillfactor) in OPD trees + * to ensure forward progress. 
+ */ + factor = 0; cp = (BTREE_CURSOR *)dbc->internal; for (i = 0; i < NUM_ENT(*ppg); i++) { bo = GET_BOVERFLOW(dbp, *ppg, i); - if (B_TYPE(bo->type) == B_KEYDATA) + if (B_TYPE(bo->type) == B_KEYDATA || + B_TYPE(bo->type) == B_BLOB) continue; c_data->compact_pages_examine++; if (bo->pgno > c_data->compact_truncate) { - (*donep)++; if (!have_lock) { /* * The caller should have the page at @@ -1925,8 +1977,9 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0) goto err; } + pgno = bo->pgno; if ((ret = __bam_truncate_root_page(dbc, - *ppg, i, c_data)) != 0) + *ppg, i, c_data, pgs_donep)) != 0) goto err; /* Just in case it should move. Could it? */ bo = GET_BOVERFLOW(dbp, *ppg, i); @@ -1934,13 +1987,13 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) if (B_TYPE(bo->type) == B_OVERFLOW) { if ((ret = __db_truncate_overflow(dbc, - bo->pgno, have_lock ? NULL : ppg, c_data)) != 0) + bo->pgno, have_lock ? NULL : ppg, + c_data, pgs_donep)) != 0) goto err; - (*donep)++; continue; } if ((ret = __bam_compact_opd(dbc, bo->pgno, - have_lock ? NULL : ppg, factor, c_data, donep)) != 0) + have_lock ? NULL : ppg, factor, c_data, pgs_donep)) != 0) goto err; } @@ -1955,13 +2008,13 @@ err: * PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *)); */ int -__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep) +__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, pgs_donep) DBC *dbc; db_pgno_t root_pgno; PAGE **ppg; u_int32_t factor; DB_COMPACT *c_data; - int *donep; + int *pgs_donep; { BTREE_CURSOR *cp; DBC *opd; @@ -2021,7 +2074,7 @@ __bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep) NULL, factor, &span, c_data, &isdone)) != 0) break; /* For OPD the number of pages dirtied is returned in span. */ - *donep += span; + *pgs_donep += span; } while (!isdone); if (start.data != NULL) @@ -2041,11 +2094,12 @@ done: * The page is reference by the pg/indx passed in. 
*/ static int -__bam_truncate_root_page(dbc, pg, indx, c_data) +__bam_truncate_root_page(dbc, pg, indx, c_data, pgs_donep) DBC *dbc; PAGE *pg; u_int32_t indx; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BOVERFLOW *bo; @@ -2053,8 +2107,8 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) db_pgno_t *pgnop; u_int32_t tlen; - COMPQUIET(c_data, NULL); COMPQUIET(bo, NULL); + COMPQUIET(c_data, NULL); dbp = dbc->dbp; if (TYPE(pg) == P_IBTREE) { bi = GET_BINTERNAL(dbp, pg, indx); @@ -2075,7 +2129,7 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) DB_ASSERT(dbp->env, IS_DIRTY(pg)); - return (__db_truncate_root(dbc, pg, indx, pgnop, tlen)); + return (__db_truncate_root(dbc, pg, indx, pgnop, tlen, pgs_donep)); } /* @@ -2086,10 +2140,11 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) * nodes they will get copied adding pages to the database. */ static int -__bam_truncate_internal_overflow(dbc, page, c_data) +__bam_truncate_internal_overflow(dbc, page, c_data, pgs_donep) DBC *dbc; PAGE *page; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BOVERFLOW *bo; @@ -2104,10 +2159,11 @@ __bam_truncate_internal_overflow(dbc, page, c_data) continue; bo = (BOVERFLOW *)(bi->data); if (bo->pgno > c_data->compact_truncate && (ret = - __bam_truncate_root_page(dbc, page, indx, c_data)) != 0) + __bam_truncate_root_page(dbc, page, + indx, c_data, pgs_donep)) != 0) break; - if ((ret = __db_truncate_overflow( - dbc, bo->pgno, NULL, c_data)) != 0) + if ((ret = __db_truncate_overflow(dbc, + bo->pgno, NULL, c_data, pgs_donep)) != 0) break; } return (ret); @@ -2142,7 +2198,7 @@ __bam_compact_isdone(dbc, stop, pg, isdone) } else { DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE); if ((ret = __bam_cmp(dbc, stop, pg, 0, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) return (ret); *isdone = cmp <= 0; @@ -2328,7 +2384,7 @@ __bam_savekey(dbc, next, start) if (len == 0) { no_key: __db_errx(env, DB_STR("1023", "Compact cannot handle zero length key")); - ret = 
DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } } else { @@ -2360,14 +2416,15 @@ retry: return (DB_LOCK_NOTGRANTED); * Find high numbered pages in the internal nodes of a tree and * swap them for lower numbered pages. * PUBLIC: int __bam_truncate_ipages __P((DB *, - * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *)); + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *)); */ int -__bam_truncate_ipages(dbp, ip, txn, c_data) +__bam_truncate_ipages(dbp, ip, txn, c_data, pgs_donep) DB *dbp; DB_THREAD_INFO *ip; DB_TXN *txn; DB_COMPACT *c_data; + int *pgs_donep; { BTMETA *meta; BTREE *bt; @@ -2480,8 +2537,9 @@ new_txn: pgno = PGNO(cp->csp->page); if (pgno > c_data->compact_truncate) { - if ((ret = __db_exchange_page(dbc, &cp->csp->page, - NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + if ((ret = __db_exchange_page(dbc, + &cp->csp->page, NULL, PGNO_INVALID, + DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } @@ -2561,7 +2619,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) && } if (PGNO(meta) > c_data->compact_truncate) { dbmeta = (DBMETA *)meta; - ret = __db_move_metadata(dbc, &dbmeta, c_data); + ret = __db_move_metadata(dbc, + &dbmeta, c_data, pgs_donep); meta = (BTMETA *)dbmeta; if (ret != 0) goto err; @@ -2583,8 +2642,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) && * page latch is released. */ ++dbp->mpf->mfp->revision; - if ((ret = __db_exchange_page(dbc, - &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + if ((ret = __db_exchange_page(dbc, &root, NULL, + PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) goto err; if (PGNO(root) == bt->bt_root) goto err; diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c index 5c009071..8923c5fa 100644 --- a/src/btree/bt_compare.c +++ b/src/btree/bt_compare.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -49,27 +49,39 @@ /* * __bam_cmp -- - * Compare a key to a given record. + * Compare a key to a given record. We always start the comparison + * at an offset and update the offset with longest matching count + * after the comparison. * * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), + * PUBLIC: int *, size_t *)); */ int -__bam_cmp(dbc, dbt, h, indx, func, cmpp) +__bam_cmp(dbc, dbt, h, indx, func, cmpp, locp) DBC *dbc; const DBT *dbt; PAGE *h; u_int32_t indx; - int (*func)__P((DB *, const DBT *, const DBT *)); + int (*func)__P((DB *, const DBT *, const DBT *, size_t *)); int *cmpp; + size_t *locp; { + BBLOB bl; BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; DB *dbp; DBT pg_dbt; + off_t blob_size; + int ret; + db_seq_t blob_id; dbp = dbc->dbp; + ret = 0; + + /* Assert that the func is non-Null. */ + DB_ASSERT(dbp->env, func != NULL); /* * Returns: @@ -91,11 +103,49 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) bo = (BOVERFLOW *)bk; - else { + else if (B_TYPE(bk->type) == B_BLOB) { + /* + * This is very slow, but since blobs cannot be + * in databases with duplicates or be keys, it should + * only happen when using DB_GET_BOTH or DB_SET. 
+ */ + memcpy(&bl, bk, BBLOB_SIZE); + memset(&pg_dbt, 0, sizeof(DBT)); + GET_BLOB_SIZE(dbc->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) + pg_dbt.size = UINT32_MAX; + else + pg_dbt.size = (u_int32_t)blob_size; + blob_id = (db_seq_t)bl.id; + pg_dbt.flags = DB_DBT_USERMEM; + if ((ret = __os_malloc( + dbc->env, pg_dbt.size, &pg_dbt.data)) != 0) + return (ret); + pg_dbt.ulen = pg_dbt.size; + if ((ret = __blob_get(dbc, + &pg_dbt, blob_id, blob_size, NULL, NULL)) != 0) { + __os_free(dbc->env, pg_dbt.data); + return (ret); + } + *cmpp = func(dbp, dbt, &pg_dbt, locp); + /* + * There is no way to directly compare a blob file that + * is greater in size than UINT32_MAX, so instead we + * compare the data up to UINT32_MAX, and if they are + * equal return that the blob is larger, since it is + * longer than the input data. + */ + if (*cmpp == 0 && (blob_size > UINT32_MAX)) + *cmpp = -1; + __os_free(dbc->env, pg_dbt.data); + return (0); + } else { pg_dbt.app_data = NULL; pg_dbt.data = bk->data; pg_dbt.size = bk->len; - *cmpp = func(dbp, dbt, &pg_dbt); + *cmpp = func(dbp, dbt, &pg_dbt, locp); return (0); } break; @@ -123,13 +173,14 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) } bi = GET_BINTERNAL(dbp, h, indx); - if (B_TYPE(bi->type) == B_OVERFLOW) + if (B_TYPE(bi->type) == B_OVERFLOW) { + DB_ASSERT(dbp->env, bi->len == BOVERFLOW_SIZE); bo = (BOVERFLOW *)(bi->data); - else { + } else { pg_dbt.app_data = NULL; pg_dbt.data = bi->data; pg_dbt.size = bi->len; - *cmpp = func(dbp, dbt, &pg_dbt); + *cmpp = func(dbp, dbt, &pg_dbt, locp); return (0); } break; @@ -141,42 +192,56 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) * Overflow. */ return (__db_moff(dbc, dbt, bo->pgno, bo->tlen, - func == __bam_defcmp ? NULL : func, cmpp)); + func == __bam_defcmp ? NULL : func, cmpp, locp)); } /* * __bam_defcmp -- - * Default comparison routine. 
+ * Keep track of how far along in the two keys we find matching + * characters, and use that as an offset into the keys to begin + * future comparisons. This will save us the overhead of always + * starting the comparisons on the first character. * - * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *)); + * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *)); */ int -__bam_defcmp(dbp, a, b) +__bam_defcmp(dbp, a, b, locp) DB *dbp; const DBT *a, *b; + size_t *locp; { - size_t len; + size_t len, i, start; u_int8_t *p1, *p2; COMPQUIET(dbp, NULL); - + start = (locp == NULL ? 0 : *locp); /* * Returns: * < 0 if a is < b * = 0 if a is = b * > 0 if a is > b * + * We start the comparison from 'locp' and store the last match + * location in 'locp'. + * * XXX * If a size_t doesn't fit into a long, or if the difference between * any two characters doesn't fit into an int, this routine can lose. * What we need is a signed integral type that's guaranteed to be at * least as large as a size_t, and there is no such thing. */ + p1 = (u_int8_t *)a->data + start; + p2 = (u_int8_t *)b->data + start; len = a->size > b->size ? b->size : a->size; - for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) - if (*p1 != *p2) - return ((long)*p1 - (long)*p2); - return ((long)a->size - (long)b->size); + for (i = start; i < len; ++p1, ++p2, ++i) + if (*p1 != *p2) { + if (locp != NULL) + *locp = i; + return (*p1 < *p2 ? -1 : 1); + } + if (locp != NULL) + *locp = len; + return (a->size == b->size ? 0 : (a->size < b->size ? -1 : 1)); } /* diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c index 3f293461..479e7248 100644 --- a/src/btree/bt_compress.c +++ b/src/btree/bt_compress.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ #include "db_config.h" @@ -352,16 +352,20 @@ __bam_compress_marshal_data(dbp, data, destbuf) * __bam_compress_dupcmp -- * Duplicate comparison function for compressed BTrees. * - * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *)); + * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *, + * PUBLIC: size_t *)); */ int -__bam_compress_dupcmp(db, a, b) +__bam_compress_dupcmp(db, a, b, locp) DB *db; const DBT *a; const DBT *b; + size_t *locp; { DBT dcmp_a, dcmp_b; + COMPQUIET(locp, NULL); + /* Decompress the initial data in a */ CMP_UNMARSHAL_DATA(a, &dcmp_a); dcmp_a.ulen = 0; @@ -380,7 +384,7 @@ __bam_compress_dupcmp(db, a, b) /* Call the user's duplicate compare function */ return ((BTREE *)db->bt_internal)-> - compress_dup_compare(db, &dcmp_a, &dcmp_b); + compress_dup_compare(db, &dcmp_a, &dcmp_b, NULL); } /* @@ -636,7 +640,7 @@ __bamc_next_decompress(dbc) db = dbc->dbp; if (cp->compcursor >= cp->compend) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); cp->prevKey = cp->currentKey; cp->prevData = cp->currentData; @@ -1251,7 +1255,7 @@ __bamc_compress_merge_delete(dbc, stream, countp) * chunk, but don't delete any more * entries. */ - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); moreStream = 0; iSmallEnough = 0; } else @@ -1318,7 +1322,7 @@ __bamc_compress_merge_delete(dbc, stream, countp) CMP_FREE_DBT(env, &nextk); CMP_FREE_DBT(env, &nextc); - return (ret != 0 ? ret : bulk_ret); + return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret)); } /* @@ -1389,7 +1393,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) * in the database */ if (ifound == 0) { - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); } else ++chunk_count; break; @@ -1463,7 +1467,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) * current chunk, but don't delete * any more entries. 
*/ - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); moreStream = 0; iSmallEnough = 0; } else @@ -1541,7 +1545,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) CMP_FREE_DBT(env, &pdestdata); CMP_FREE_DBT(env, &nextk); - return (ret != 0 ? ret : bulk_ret); + return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret)); } /******************************************************************************/ @@ -1641,8 +1645,8 @@ __bamc_compress_get_prev_dup(dbc, flags) if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0) return (ret); - if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0) - return (DB_NOTFOUND); + if (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); return (0); } @@ -1684,7 +1688,7 @@ __bamc_compress_get_prev_nodup(dbc, flags) do if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0) return (ret); - while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0); + while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0); return (0); } @@ -1702,7 +1706,7 @@ __bamc_compress_get_next(dbc, flags) if (F_ISSET(cp, C_COMPRESS_DELETED)) { if (cp->currentKey == 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); F_CLR(cp, C_COMPRESS_DELETED); return (0); } else if (cp->currentKey) { @@ -1722,7 +1726,7 @@ __bamc_compress_get_next(dbc, flags) * to the right place */ __bamc_compress_reset(dbc); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } else if (ret != 0) return (ret); @@ -1753,17 +1757,18 @@ __bamc_compress_get_next_dup(dbc, key, flags) * deleted entry. */ if (cp->currentKey == 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); F_CLR(cp, C_COMPRESS_DELETED); - return (t->bt_compare(dbp, - cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND); + return (t->bt_compare(dbp, cp->currentKey, + &cp->del_key, NULL) == 0 ? 
0 : DB_NOTFOUND); } else if (cp->currentKey == 0) return (EINVAL); /* Check that the next entry has the same key as the previous entry */ ret = __bamc_next_decompress(dbc); - if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0) - return (DB_NOTFOUND); + if (ret == 0 && t->bt_compare(dbp, + cp->currentKey, cp->prevKey, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); if (ret != DB_NOTFOUND) return (ret); @@ -1783,7 +1788,7 @@ __bamc_compress_get_next_dup(dbc, key, flags) * will end up pointing to the right place */ __bamc_compress_reset(dbc); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } else if (ret != 0) return (ret); @@ -1791,8 +1796,8 @@ __bamc_compress_get_next_dup(dbc, key, flags) return (ret); /* Check the keys are the same */ - if (t->bt_compare(dbp, cp->currentKey, key) != 0) - return (DB_NOTFOUND); + if (t->bt_compare(dbp, cp->currentKey, key, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); return (0); } @@ -1828,7 +1833,7 @@ __bamc_compress_get_next_nodup(dbc, flags) do if ((ret = __bamc_compress_get_next(dbc, flags)) != 0) return (ret); - while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0); + while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0); return (ret); } @@ -1888,14 +1893,14 @@ __bamc_compress_get_set(dbc, key, data, method, flags) if (ret == 0 && __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) { /* We didn't find the key */ - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } break; case DB_GET_BOTH: if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) && - __bam_defcmp(dbp, cp->currentData, data) != 0))) { + __bam_defcmp(dbp, cp->currentData, data, NULL) != 0))) { /* We didn't find the key/data pair */ - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } break; default: @@ -1923,7 +1928,7 @@ __bamc_compress_get_bothc(dbc, data, flags) position */ if (__db_compare_both(dbp, cp->currentKey, cp->currentData, cp->currentKey, data) >= 0) - return (DB_NOTFOUND); + 
return (DBC_ERR(dbc, DB_NOTFOUND)); cmp = 0; /* Perform a linear search for the data in the current chunk */ @@ -1933,7 +1938,7 @@ __bamc_compress_get_bothc(dbc, data, flags) continue; if (ret == 0) - return (cmp == 0 ? 0 : DB_NOTFOUND); + return (cmp == 0 ? 0 : DBC_ERR(dbc, DB_NOTFOUND)); if (ret != DB_NOTFOUND) return (ret); @@ -2277,7 +2282,7 @@ __bamc_compress_iput(dbc, key, data, flags) switch (flags) { case DB_CURRENT: if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto end; } @@ -2290,7 +2295,7 @@ __bamc_compress_iput(dbc, key, data, flags) if (F_ISSET(dbp, DB_AM_DUPSORT) && ((BTREE *)dbp->bt_internal)->compress_dup_compare( - dbp, cp->currentData, data) != 0) { + dbp, cp->currentData, data, NULL) != 0) { __db_errx(env, DB_STR("1032", "Existing data sorts differently from put data")); ret = EINVAL; @@ -2464,7 +2469,7 @@ __bamc_compress_idel(dbc, flags) if (F_ISSET(cp, C_COMPRESS_DELETED)) return DB_KEYEMPTY; if (cp->currentKey == 0) - return DB_NOTFOUND; + return (DBC_ERR(dbc, DB_NOTFOUND)); if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, cp->currentKey->data, cp->currentKey->size)) != 0) @@ -3015,7 +3020,8 @@ __bam_compress_count(dbc, nkeysp, ndatap) if (ret != 0) goto err; - if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0) + if (t->bt_compare(dbp, + cp_n->currentKey, cp_n->prevKey, NULL) != 0) nkeys += 1; } diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c index 348ce5c2..85baeed8 100644 --- a/src/btree/bt_conv.c +++ b/src/btree/bt_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -88,7 +88,12 @@ __bam_mswap(env, pg) SWAP32(p); /* re_len */ SWAP32(p); /* re_pad */ SWAP32(p); /* root */ - p += 92 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* threshold */ + SWAP32(p); /* file id lo */ + SWAP32(p); /* file id hi */ + SWAP32(p); /* sdb id lo */ + SWAP32(p); /* sdb id hi */ + p += 87 * sizeof(u_int32_t); /* unused */ SWAP32(p); /* crypto_magic */ return (0); diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c index 78606009..d3398ee8 100644 --- a/src/btree/bt_curadj.c +++ b/src/btree/bt_curadj.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 860c31ce..d63b7373 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -938,7 +938,7 @@ __bamc_get(dbc, key, data, flags, pgnop) case DB_CURRENT: /* It's not possible to return a deleted record. 
*/ if (F_ISSET(cp, C_DELETED)) { - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } @@ -979,7 +979,7 @@ __bamc_get(dbc, key, data, flags, pgnop) goto err; if (flags == DB_GET_BOTH) { if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1000,7 +1000,7 @@ __bamc_get(dbc, key, data, flags, pgnop) dbc, PGNO_INVALID, key, flags, &exact)) != 0) return (ret); if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -1047,7 +1047,7 @@ __bamc_get(dbc, key, data, flags, pgnop) if ((ret = __bamc_next(dbc, 1, 0)) != 0) goto err; if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1077,7 +1077,7 @@ __bamc_get(dbc, key, data, flags, pgnop) if ((ret = __bamc_prev(dbc)) != 0) goto err; if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1173,12 +1173,15 @@ __bam_bulk(dbc, data, flags) DBT *data; u_int32_t flags; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; BTREE_CURSOR *cp; PAGE *pg; db_indx_t *inp, indx, pg_keyoff; int32_t *endp, key_off, *offp, *saveoffp; + off_t blob_size; + db_seq_t blob_id; u_int8_t *dbuf, *dp, *np; u_int32_t key_size, pagesize, size, space; int adj, is_key, need_pg, next_key, no_dup, rec_key, ret; @@ -1279,6 +1282,7 @@ next_pg: */ if (is_key && pg_keyoff != inp[indx]) { bk = GET_BKEYDATA(dbc->dbp, pg, indx); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; size = key_size = bo->tlen; @@ -1403,6 +1407,31 @@ get_key_space: *offp-- = (int32_t)(np - dbuf); np += size; *offp-- = (int32_t)size; + } else if (B_TYPE(bk->type) == B_BLOB) { + blob_size = 0; + blob_id = 0; + memcpy(&bl, bk, BBLOB_SIZE); + GET_BLOB_SIZE(dbc->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) { + size = UINT32_MAX; + goto back_up; + } + size = 
(u_int32_t)blob_size; + if (size > space) + goto back_up; + blob_id = (db_seq_t)bl.id; + if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0) + return (ret); + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } + space -= size; + *offp-- = (int32_t)(np - dbuf); + np += size; + *offp-- = (int32_t)size; } else { if (need_pg) { dp = np; @@ -1764,11 +1793,11 @@ __bam_getbothc(dbc, data) */ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare, - &cmp)) != 0) + &cmp, NULL)) != 0) return (ret); if (cmp <= 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* Discard the current page, we're going to do a full search. */ if ((ret = __memp_fput(mpf, @@ -1791,7 +1820,7 @@ __bam_getbothc(dbc, data) */ if (cp->indx + P_INDX >= NUM_ENT(cp->page) || !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); cp->indx += P_INDX; return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH)); @@ -1842,7 +1871,7 @@ __bam_getlte(dbc, key, data) /* Check if we're still on the correct key */ if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx, - ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0) + ((BTREE*)dbp->bt_internal)->bt_compare, &exact, NULL)) != 0) goto end; exact = (exact == 0); } @@ -1884,8 +1913,8 @@ __bam_getlte(dbc, key, data) if (data != NULL) { /* Check if we're still on the correct data */ if ((ret = __bam_cmp( - dbc, data, ocp->page, ocp->indx, - dbp->dup_compare, &exact)) != 0) + dbc, data, ocp->page, ocp->indx, + dbp->dup_compare, &exact, NULL)) != 0) goto end; exact = (exact == 0); } else @@ -1915,7 +1944,8 @@ __bam_getlte(dbc, key, data) else { /* Check if we're still on the correct data */ if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0) + cp->indx + O_INDX, dbp->dup_compare, + &exact, NULL)) != 0) goto end; exact = (exact == 0); } @@ -1982,7 +2012,7 @@ 
__bam_getboth_finddatum(dbc, data, flags) if (!IS_CUR_DELETED(dbc)) { if ((ret = __bam_cmp( dbc, data, cp->page, cp->indx + O_INDX, - __bam_defcmp, &cmp)) != 0) + __bam_defcmp, &cmp, NULL)) != 0) return (ret); if (cmp == 0) return (0); @@ -1992,7 +2022,8 @@ __bam_getboth_finddatum(dbc, data, flags) !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) break; } - return (DB_NOTFOUND); + + return (DBC_ERR(dbc, DB_NOTFOUND)); } /* @@ -2008,18 +2039,18 @@ __bam_getboth_finddatum(dbc, data, flags) break; if (base == (top - P_INDX)) { if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE)) return (0); cp->indx = top; - return DB_NOTFOUND; + return (DBC_ERR(dbc, DB_NOTFOUND)); } for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { cp->indx = base + ((lim >> 1) * P_INDX); if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp == 0) { /* @@ -2039,7 +2070,7 @@ __bam_getboth_finddatum(dbc, data, flags) /* No match found; if we're looking for an exact match, we're done. */ if (flags == DB_GET_BOTH) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* * Base is the smallest index greater than the data item, may be zero @@ -2049,7 +2080,7 @@ __bam_getboth_finddatum(dbc, data, flags) cp->indx = base; while (cp->indx < top && IS_CUR_DELETED(dbc)) cp->indx += P_INDX; - return (cp->indx < top ? 0 : DB_NOTFOUND); + return (cp->indx < top ? 
0 : DBC_ERR(dbc, DB_NOTFOUND)); } /* @@ -2082,7 +2113,7 @@ split: ret = stack = 0; switch (flags) { case DB_CURRENT: if (F_ISSET(cp, C_DELETED)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_AFTER: case DB_BEFORE: @@ -2206,7 +2237,8 @@ split: ret = stack = 0; */ for (;; cp->indx += P_INDX) { if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, + &cmp, NULL)) != 0) goto err; if (cmp < 0) { iiop = DB_BEFORE; @@ -2479,7 +2511,7 @@ __bamc_next(dbc, initial_move, deleted_okay) */ if (cp->indx >= NUM_ENT(cp->page)) { if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); if (ret != 0) @@ -2539,7 +2571,7 @@ __bamc_prev(dbc) if (cp->indx == 0) { if ((pgno = PREV_PGNO(cp->page)) == PGNO_INVALID) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); if (ret != 0) @@ -2711,11 +2743,11 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) if (h->next_pgno == PGNO_INVALID) { indx = NUM_ENT(h) - P_INDX; if ((ret = __bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp > 0) { if (FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); else indx += P_INDX; } @@ -2725,10 +2757,10 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) if (h->prev_pgno == PGNO_INVALID) { indx = 0; if ((ret = __bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); if (cmp <= 0) goto fast_hit; } @@ -2736,7 +2768,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) { DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX); if ((ret = 
__bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp == 0) @@ -2752,7 +2784,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) indx = base; if (indx > 0 && indx < NUM_ENT(h)) { if (FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); goto fast_hit; } } @@ -3068,7 +3100,7 @@ __bam_opd_exists(dbc, pgno) if (NUM_ENT(h) == 0) ret = 0; else - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority); diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 37496b3f..a1ccef71 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -61,15 +61,18 @@ __bam_ditem(dbc, h, indx) PAGE *h; u_int32_t indx; { + BBLOB bl; BINTERNAL *bi; BKEYDATA *bk; DB *dbp; + db_seq_t blob_id; u_int32_t nbytes; int ret; db_indx_t *inp; dbp = dbc->dbp; inp = P_INP(dbp, h); + ret = 0; /* The page should already have been dirtied by our caller. */ DB_ASSERT(dbp->env, IS_DIRTY(h)); @@ -139,6 +142,13 @@ __bam_ditem(dbc, h, indx) dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0) return (ret); break; + case B_BLOB: + nbytes = BBLOB_SIZE; + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + if ((ret = __blob_del(dbc, blob_id)) != 0) + return (ret); + break; case B_KEYDATA: nbytes = BKEYDATA_SIZE(bk->len); break; @@ -241,7 +251,7 @@ __bam_dpages(dbc, use_top, flags) * single item deleted, and the rest of the pages are to be removed. * * Recno always has a stack to the root and __bam_merge operations - * may have unneeded items in the sack. We find the lowest page + * may have unneeded items in the stack. 
We find the lowest page * in the stack that has more than one record in it and start there. */ ret = 0; @@ -493,7 +503,9 @@ stop: done = 1; /* * __bam_pupdate -- - * Update parent key pointers up the tree. + * Update parent key pointers up the tree after putting a new key + * at the start of a leaf page. + * * * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *)); */ diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c index 5cf93d2e..2fb33be2 100644 --- a/src/btree/bt_method.c +++ b/src/btree/bt_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,7 +15,7 @@ static int __bam_set_bt_minkey __P((DB *, u_int32_t)); static int __bam_get_bt_compare - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); static int __bam_get_bt_prefix __P((DB *, size_t(**)(DB *, const DBT *, const DBT *))); static int __bam_set_bt_prefix @@ -233,7 +233,7 @@ incompat: static int __bam_get_bt_compare(dbp, funcp) DB *dbp; - int (**funcp) __P((DB *, const DBT *, const DBT *)); + int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *)); { BTREE *t; @@ -251,13 +251,13 @@ __bam_get_bt_compare(dbp, funcp) * __bam_set_bt_compare -- * Set the comparison function. * - * PUBLIC: int __bam_set_bt_compare - * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + * PUBLIC: int __bam_set_bt_compare __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *))); */ int __bam_set_bt_compare(dbp, func) DB *dbp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); { BTREE *t; @@ -351,6 +351,13 @@ __bam_set_bt_compress(dbp, compress, decompress) return (EINVAL); } + /* Compression is incompatible with blob storage. 
*/ + if (dbp->blob_threshold > 0) { + __db_errx(dbp->env, DB_STR("1198", + "compression cannot be used with blobs enabled.")); + return (EINVAL); + } + if (compress != 0 && decompress != 0) { t->bt_compress = compress; t->bt_decompress = decompress; diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c index 7be141c1..46a866d0 100644 --- a/src/btree/bt_open.c +++ b/src/btree/bt_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -44,6 +44,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/db_swap.h" @@ -119,6 +120,7 @@ __bam_metachk(dbp, name, btm) int ret; env = dbp->env; + ret = 0; /* * At this point, all we know is that the magic number is for a Btree. @@ -136,6 +138,7 @@ __bam_metachk(dbp, name, btm) return (DB_OLD_VERSION); case 8: case 9: + case 10: break; default: __db_errx(env, DB_STR_A("1009", @@ -269,6 +272,29 @@ __bam_metachk(dbp, name, btm) /* Set the page size. */ dbp->pgsize = btm->dbmeta.pagesize; + dbp->blob_threshold = btm->blob_threshold; + GET_BLOB_FILE_ID(env, btm, dbp->blob_file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB_SDB_ID(env, btm, dbp->blob_sdb_id, ret); + if (ret != 0) + return (ret); + /* Blob databases must be upgraded. */ + if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) { + __db_errx(env, DB_STR_A("1207", +"%s: databases that support blobs must be upgraded.", "%s"), + name); + return (EINVAL); + } +#ifndef HAVE_64BIT_TYPES + if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) { + __db_errx(env, DB_STR_A("1199", + "%s: blobs require 64 integer compiler support.", "%s"), + name); + return (DB_OPNOTSUP); + } +#endif + /* Copy the file's ID. 
*/ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN); @@ -442,6 +468,9 @@ __bam_init_meta(dbp, meta, pgno, lsnp) meta->minkey = t->bt_minkey; meta->re_len = t->re_len; meta->re_pad = (u_int32_t)t->re_pad; + meta->blob_threshold = dbp->blob_threshold; + SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, BTMETA); + SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, BTMETA); #ifdef HAVE_PARTITION if ((part = dbp->p_internal) != NULL) { @@ -535,6 +564,12 @@ __bam_new_file(dbp, ip, txn, fhp, name) pginfo.type = dbp->type; pdbt.data = &pginfo; pdbt.size = sizeof(pginfo); + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids(dbp, txn, + &dbp->blob_file_id)) != 0) + return (ret); + + } if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0) return (ret); meta = (BTMETA *)buf; @@ -613,6 +648,12 @@ __bam_new_subdb(mdbp, dbp, ip, txn) meta = NULL; root = NULL; + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids(dbp, txn, + &dbp->blob_sdb_id)) != 0) + return (ret); + } + if ((ret = __db_cursor(mdbp, ip, txn, &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0) return (ret); diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c index 13316181..5cd0ac12 100644 --- a/src/btree/bt_put.c +++ b/src/btree/bt_put.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -56,8 +56,8 @@ static int __bam_dup_check __P((DBC *, u_int32_t, static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t)); static int __bam_ovput __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *)); -static u_int32_t - __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t)); +static int __bam_partsize + __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t *)); /* * __bam_iitem -- @@ -71,18 +71,22 @@ __bam_iitem(dbc, key, data, op, flags) DBT *key, *data; u_int32_t op, flags; { + BBLOB bl, blob_buf; BKEYDATA *bk, bk_tmp; BTREE *t; BTREE_CURSOR *cp; DB *dbp; - DBT bk_hdr, tdbt; + DBT bk_hdr, blob_dbt, tdbt; DB_MPOOLFILE *mpf; ENV *env; + DB_LSN lsn; PAGE *h; db_indx_t cnt, indx; + off_t blob_size; + db_seq_t blob_id, new_blob_id; u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace; char tmp_ch; - int cmp, bigkey, bigdata, del, dupadjust; + int cmp, bigkey, bigdata, blobdata, del, dupadjust; int padrec, replace, ret, t_ret, was_deleted; COMPQUIET(cnt, 0); @@ -95,6 +99,7 @@ __bam_iitem(dbc, key, data, op, flags) h = cp->page; indx = cp->indx; del = dupadjust = replace = was_deleted = 0; + blobdata = 0; /* * Fixed-length records with partial puts: it's an error to specify @@ -112,8 +117,12 @@ __bam_iitem(dbc, key, data, op, flags) * longer than the fixed-length, and we never require less than * the fixed-length record size. */ - data_size = F_ISSET(data, DB_DBT_PARTIAL) ? - __bam_partsize(dbp, op, data, h, indx) : data->size; + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __bam_partsize( + dbp, op, data, h, indx, &data_size)) != 0) + return (ret); + } else + data_size = data->size; padrec = 0; if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { if (data_size > t->re_len) @@ -190,6 +199,13 @@ __bam_iitem(dbc, key, data, op, flags) } if (!F_ISSET(data, DB_DBT_STREAMING) && (padrec || F_ISSET(data, DB_DBT_PARTIAL))) { + /* Partial puts need to be handled in the blob functions. 
*/ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? + O_INDX : 0)); + if (B_TYPE(bk->type) == B_BLOB) + goto dup_cmp; + } tdbt = *data; if ((ret = __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0) @@ -204,10 +220,10 @@ __bam_iitem(dbc, key, data, op, flags) * screwing up the duplicate sort order. We have to do this after * we build the real record so that we're comparing the real items. */ - if (op == DB_CURRENT && dbp->dup_compare != NULL) { +dup_cmp:if (op == DB_CURRENT && dbp->dup_compare != NULL) { if ((ret = __bam_cmp(dbc, data, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0), - dbp->dup_compare, &cmp)) != 0) + dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp != 0) { __db_errx(env, DB_STR("1004", @@ -218,10 +234,30 @@ __bam_iitem(dbc, key, data, op, flags) /* * If the key or data item won't fit on a page, we'll have to store - * them on overflow pages. + * them on overflow pages. The exception is if we are inserting + * into an existing blob file, in that case it remains a blob + * file regardless of its new size. */ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA( + dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (B_TYPE(bk->type) == B_BLOB) { + blobdata = 1; + bigdata = 0; + } else + bigdata = data_size > cp->ovflsize; + } else { + if (dbp->blob_threshold && + (dbp->blob_threshold <= data_size || + F_ISSET(data, DB_DBT_BLOB))) { + blobdata = 1; + bigdata = 0; + } else { + blobdata = 0; + bigdata = data_size > cp->ovflsize; + } + } needed = 0; - bigdata = data_size > cp->ovflsize; switch (op) { case DB_KEYFIRST: /* We're adding a new key and data pair. */ @@ -232,6 +268,8 @@ __bam_iitem(dbc, key, data, op, flags) needed += BKEYDATA_PSIZE(key->size); if (bigdata) needed += BOVERFLOW_PSIZE; + else if (blobdata) + needed += BBLOB_PSIZE; else needed += BKEYDATA_PSIZE(data_size); break; @@ -254,6 +292,8 @@ __bam_iitem(dbc, key, data, op, flags) indx + (TYPE(h) == P_LBTREE ? 
O_INDX : 0)); if (B_TYPE(bk->type) == B_KEYDATA) have_bytes = BKEYDATA_PSIZE(bk->len); + else if (B_TYPE(bk->type) == B_BLOB) + have_bytes = BBLOB_PSIZE; else have_bytes = BOVERFLOW_PSIZE; need_bytes = 0; @@ -263,6 +303,8 @@ __bam_iitem(dbc, key, data, op, flags) } if (bigdata) need_bytes += BOVERFLOW_PSIZE; + else if (blobdata) + need_bytes += BBLOB_PSIZE; else need_bytes += BKEYDATA_PSIZE(data_size); @@ -405,7 +447,8 @@ __bam_iitem(dbc, key, data, op, flags) * because we're going to immediately re-add the item into the * same slot. */ - if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { + if (bigdata || (B_TYPE(bk->type) != B_KEYDATA && + B_TYPE(bk->type) != B_BLOB)) { /* * If streaming, don't delete the overflow item, * just delete the item pointing to the overflow item. @@ -448,13 +491,65 @@ __bam_iitem(dbc, key, data, op, flags) bk_hdr.size = SSZA(BKEYDATA, data); ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), &bk_hdr, data); - } else if (replace) - ret = __bam_ritem(dbc, h, indx, data, 0); - else - ret = __db_pitem(dbc, h, indx, - BKEYDATA_SIZE(data->size), NULL, data); + } else if (replace) { + /* + * If updating a blob, replace the blob file with the + * new blob data and update the blob db record. + */ + if (blobdata) { + memcpy(&bl, + P_ENTRY(dbp, h, indx), BBLOB_SIZE); + memset(&blob_dbt, 0, sizeof(DBT)); + blob_dbt.size = BBLOB_DSIZE; + if (F_ISSET(data, DB_DBT_BLOB_REC)) { + /* + * Replace the blob record with the + * blob record in the data DBT. 
+ */ + blob_dbt.data = BBLOB_DATA(data->data); + } else { + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE( + dbp->env, bl, blob_size, ret); + if (ret != 0) + goto err; + if ((ret = __blob_repl( + dbc, data, blob_id, + &new_blob_id, &blob_size)) != 0) + goto err; + blob_dbt.data = BBLOB_DATA((&bl)); + SET_BLOB_ID(&bl, new_blob_id, BBLOB); + SET_BLOB_SIZE(&bl, blob_size, BBLOB); + } + ret = __bam_ritem( + dbc, h, indx, &blob_dbt, B_BLOB); + } else + ret = __bam_ritem(dbc, h, indx, data, 0); + } else + if (blobdata) { + new_blob_id = 0; + blob_size = 0; + if ((ret = __blob_put(dbc, data, + &new_blob_id, &blob_size, &lsn)) != 0) + goto err; + memset(&blob_buf, 0, BBLOB_SIZE); + blob_buf.type = B_BLOB; + blob_buf.len = BBLOB_DSIZE; + tdbt.data = &blob_buf; + tdbt.size = BBLOB_SIZE; + SET_BLOB_ID(&blob_buf, new_blob_id, BBLOB); + SET_BLOB_SIZE(&blob_buf, blob_size, BBLOB); + SET_BLOB_FILE_ID( + &blob_buf, dbp->blob_file_id, BBLOB); + SET_BLOB_SDB_ID( + &blob_buf, dbp->blob_sdb_id, BBLOB); + ret = __db_pitem(dbc, h, + indx, BBLOB_SIZE, &tdbt, NULL); + } else + ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(data->size), NULL, data); } - if (ret != 0) { +err: if (ret != 0) { if (del == 1 && (t_ret = __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) { __db_err(env, t_ret, DB_STR("1005", @@ -504,32 +599,61 @@ __bam_iitem(dbc, key, data, op, flags) * __bam_partsize -- * Figure out how much space a partial data item is in total. */ -static u_int32_t -__bam_partsize(dbp, op, data, h, indx) +static int +__bam_partsize(dbp, op, data, h, indx, data_size) DB *dbp; u_int32_t op, indx; DBT *data; PAGE *h; + u_int32_t *data_size; { + BBLOB bl; BKEYDATA *bk; + int ret; + off_t blob_size; u_int32_t nbytes; + ret = 0; + /* * If the record doesn't already exist, it's simply the data we're * provided. 
*/ - if (op != DB_CURRENT) - return (data->doff + data->size); + if (op != DB_CURRENT) { + *data_size = data->doff + data->size; + return (0); + } /* * Otherwise, it's the data provided plus any already existing data * that we're not replacing. */ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); - nbytes = - B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len; + switch (B_TYPE(bk->type)) { + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + GET_BLOB_SIZE(dbp->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + /* + * It is not possible to add data past UINT32_MAX in the + * partial API, so this is safe. + */ + if (blob_size > UINT32_MAX) + nbytes = UINT32_MAX; + else + nbytes = (u_int32_t)blob_size; + break; + case B_OVERFLOW: + nbytes = ((BOVERFLOW *)bk)->tlen; + break; + default: + nbytes = bk->len; + } - return (__db_partsize(nbytes, data)); + *data_size = __db_partsize(nbytes, data); + + return (ret); } /* @@ -848,6 +972,7 @@ __bam_irep(dbc, h, indx, hdr, data) bi = GET_BINTERNAL(dbp, h, indx); bn = (BINTERNAL *) hdr->data; + DB_ASSERT(dbc->env, B_TYPE(bi->type) != B_BLOB); if (B_TYPE(bi->type) == B_OVERFLOW && (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) return (ret); @@ -892,6 +1017,7 @@ __bam_dup_check(dbc, op, h, indx, sz, cntp) /* Count the key once. */ bk = GET_BKEYDATA(dbp, h, indx); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); sz += B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; @@ -994,6 +1120,7 @@ __bam_dup_convert(dbc, h, indx, cnt) * overflow, then free up those pages). */ bk = GET_BKEYDATA(dbp, h, dindx + 1); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); hdr.data = bk; hdr.size = B_TYPE(bk->type) == B_KEYDATA ? 
BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c index 026564b6..eb44d04b 100644 --- a/src/btree/bt_rec.c +++ b/src/btree/bt_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c index f465cc5a..1203ea35 100644 --- a/src/btree/bt_reclaim.c +++ b/src/btree/bt_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c index 9356a742..abbd8efb 100644 --- a/src/btree/bt_recno.c +++ b/src/btree/bt_recno.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -234,7 +234,7 @@ __ramc_del(dbc, flags) retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0) goto err; if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } stack = 1; @@ -256,7 +256,7 @@ retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0) * if the record was "deleted", we could never have found it. */ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) { - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } @@ -391,7 +391,7 @@ retry: switch (flags) { * a dup, so we set flags to DB_NEXT and keep going. 
*/ if (!F_ISSET(dbc, DBC_OPD)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_NEXT_NODUP: /* @@ -431,7 +431,7 @@ retry: switch (flags) { * is a dup, so we set flags to DB_PREV and keep going. */ if (!F_ISSET(dbc, DBC_OPD)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_PREV_NODUP: /* @@ -443,7 +443,7 @@ retry: switch (flags) { flags = DB_PREV; if (cp->recno != RECNO_OOB) { if (cp->recno == 1) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } --cp->recno; @@ -458,7 +458,7 @@ retry: switch (flags) { if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) goto err; if (cp->recno == 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -476,7 +476,7 @@ retry: switch (flags) { cp->recno++; break; } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; /* NOTREACHED */ case DB_GET_BOTH: @@ -522,7 +522,7 @@ retry: switch (flags) { 1, &exact)) != 0) goto err; if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -561,22 +561,22 @@ retry: switch (flags) { (void)__bam_stkrel(dbc, STK_CLRDBC); continue; } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; default: - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) { if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, - __bam_defcmp, &cmp)) != 0) + __bam_defcmp, &cmp, NULL)) != 0) return (ret); if (cmp == 0) break; if (!F_ISSET(dbc, DBC_OPD)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } (void)__bam_stkrel(dbc, STK_CLRDBC); @@ -1331,7 +1331,7 @@ __ram_sread(dbc, top) if (0) { eof: t->re_eof = 1; - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } err: if (!was_modified) t->re_modified = 0; @@ -1368,7 +1368,7 @@ retry: /* Find the slot for insertion. 
*/ if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) && !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) { - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); goto err; } diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c index 36d1c667..4ada6e2d 100644 --- a/src/btree/bt_rsearch.c +++ b/src/btree/bt_rsearch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -147,7 +147,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) __TLPUT(dbc, lock)) != 0 && ret == 0) ret = t_ret; if (ret == 0) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto done; } } @@ -197,7 +197,8 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) lock)) != 0 && ret == 0) ret = t_ret; if (ret == 0) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, + DB_NOTFOUND); goto err; } } diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index e809a852..e3d69d16 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -51,8 +51,9 @@ /* * __bam_get_root -- - * Fetch the root of a tree and see if we want to keep - * it in the stack. + * Try to appropriately lock and fetch the root page of a tree; + * if successful enter it into the cursor's stack; on error, leave the stack + * unchanged. 
* * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *)); */ @@ -232,9 +233,11 @@ retry: if (lock_mode == DB_LOCK_WRITE) } else if (atomic_read(&mpf->mfp->multiversion) != 0 && lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) { - (void)__memp_fput(mpf, - dbc->thread_info, h, dbc->priority); + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); (void)__LPUT(dbc, lock); + return (ret); } } @@ -272,9 +275,10 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) db_recno_t recno; int adjust, cmp, deloffset, ret, set_stack, stack, t_ret; int getlock, was_next; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); u_int32_t get_mode, wait; u_int8_t level, saved_level; + size_t pos, pos_h, pos_l; if (F_ISSET(dbc, DBC_OPD)) LOCK_CHECK_OFF(dbc->thread_info); @@ -288,6 +292,7 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) t = dbp->bt_internal; recno = 0; t_ret = 0; + func = NULL; BT_STK_CLR(cp); LOCK_INIT(saved_lock); @@ -339,11 +344,17 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) BT_STK_CLR(cp); - /* Choose a comparison function. */ + /* + * Choose a comparison function. + * We apply the prefix search optimization only when there + * is no user-specified comparison function set. + */ func = F_ISSET(dbc, DBC_OPD) ? (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : t->bt_compare; + pos_h = 0; + pos_l = 0; for (;;) { if (TYPE(h) == P_LBTREE) adjust = P_INDX; @@ -389,9 +400,11 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) * match on a leaf page, we're done. */ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) { + /* We compare from the common prefix */ + pos = pos_l > pos_h ? 
pos_h : pos_l; DB_BINARY_SEARCH_INCR(indx, base, lim, adjust); if ((ret = __bam_cmp(dbc, key, h, indx, - func, &cmp)) != 0) + func, &cmp, &pos)) != 0) goto err; if (cmp == 0) { if (LEVEL(h) == LEAFLEVEL || @@ -403,9 +416,19 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) } goto next; } - if (cmp > 0) + /* + * We have to maintain the offset in the keys where + * we begin comparing for both ends of the key range + * in which we are binary searching. So, update either + * the high or low position here, depending on how + * the comparison turned out. + */ + if (cmp > 0) { DB_BINARY_SEARCH_SHIFT_BASE(indx, base, lim, adjust); + pos_l = pos; + } else + pos_h = pos; } /* @@ -421,7 +444,7 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) *exactp = 0; if (LF_ISSET(SR_EXACT)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -444,13 +467,13 @@ get_next: /* * at the root if the tree recently collapsed. */ if (PGNO(h) == root_pgno) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } indx = cp->sp->indx + 1; if (indx == NUM_ENT(cp->sp->page)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); cp->csp++; goto err; } @@ -863,7 +886,7 @@ found: *exactp = 1; * DB_NOTFOUND. */ if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8299c69a..f7719dc4 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -63,7 +63,7 @@ __bam_split(dbc, arg, root_pgnop) db_pgno_t *root_pgnop; { BTREE_CURSOR *cp; - DB_LOCK metalock, next_lock; + DB_LOCK meta_lock, next_lock; enum { UP, DOWN } dir; db_pgno_t pgno, next_pgno, root_pgno; int exact, level, ret; @@ -72,17 +72,16 @@ __bam_split(dbc, arg, root_pgnop) LOCK_CHECK_OFF(dbc->thread_info); cp = (BTREE_CURSOR *)dbc->internal; + LOCK_INIT(meta_lock); LOCK_INIT(next_lock); next_pgno = PGNO_INVALID; /* - * First get a lock on the metadata page, we will have to allocate + * First get a lock on the metadata page; we will have to allocate * pages and cannot get a lock while we have the search tree pinned. */ - pgno = PGNO_BASE_MD; - if ((ret = __db_lget(dbc, - 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0) goto err; root_pgno = BAM_ROOT_PGNO(dbc); @@ -189,7 +188,7 @@ no_split: /* Once we've split the leaf page, we're done. */ if (root_pgnop != NULL) *root_pgnop = BAM_ROOT_PGNO(dbc); err: -done: (void)__LPUT(dbc, metalock); +done: (void)__LPUT(dbc, meta_lock); (void)__TLPUT(dbc, next_lock); if (F_ISSET(dbc, DBC_OPD)) @@ -685,6 +684,7 @@ __bam_broot(dbc, rootp, split, lp, rp) DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -772,7 +772,30 @@ __ram_root(dbc, rootp, lp, rp) /* * __bam_pinsert -- - * Insert a new key into a parent page, completing the split. + * + * Construct an internal index item and place it in the parent page. It is + * primarily used by __bam_page() to add a new page into the tree. The sole + * other use is by __bam_pupdate() after a reverse split or compact has + * removed pages underneath it, in order to replace the parent's key/nrecs + * to match the new subtree. + * + * Parameters: + * parent - the page from the cursor stack to be modified. 
The next entry + * in the stack (i.e., the next lower level in the tree) contains + * the key of the new item. The indx field must have been set + * when searching down the tree, to point to the new/replaced + * parent item. + * split - the indx in the cursor stack of the 'source' of the new item. + * lchild - the left child page is used *only* when attempting to use + * prefix key compression on a leaf (data) page. + * rchild - right child page. The source of the pgno of the new item. + * flags - BPI_REPLACE | BPI_NORENCUM + * BPI_NOLOGGING + * + * The pgno of the item always comes from rchild, which often is the same + * as parent[1].page. The key for DB_BTREE comes from the next lower page + * in the stack under parent, not from either lchild or rchild parameter -- + * though often rchild is a copy of parent[1].page. * * PUBLIC: int __bam_pinsert * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int)); @@ -867,12 +890,27 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags) size = BINTERNAL_SIZE(child_bi->len); break; case B_OVERFLOW: - /* Reuse the overflow key. */ + /* Copy the overflow key. */ child_bo = (BOVERFLOW *)child_bi->data; memset(&bo, 0, sizeof(bo)); bo.type = B_OVERFLOW; bo.tlen = child_bo->tlen; - bo.pgno = child_bo->pgno; + if (LF_ISSET(BPI_REPLACE)) { + /* + * Replace (compact or reverse split) needs to + * copy in case the data item gets removed. 
+ */ + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, + child_bo->tlen, child_bo->pgno, + &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + } else + bo.pgno = child_bo->pgno; bi.len = BOVERFLOW_SIZE; B_TSET(bi.type, B_OVERFLOW); bi.pgno = rchild->pgno; @@ -881,6 +919,7 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags) DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); size = BINTERNAL_SIZE(BOVERFLOW_SIZE); break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -982,8 +1021,8 @@ noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); size = BINTERNAL_SIZE(BOVERFLOW_SIZE); - break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -1153,23 +1192,32 @@ __bam_psplit(dbc, cp, lp, rp, splitret) nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); break; case P_LBTREE: - if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == - B_KEYDATA) - nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, off)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) { + case B_KEYDATA: + nbytes += BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, off)->len); + break; + case B_BLOB: + nbytes += BBLOB_SIZE; + break; + default: nbytes += BOVERFLOW_SIZE; - + } ++off; /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == - B_KEYDATA) - nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, off)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) { + case B_KEYDATA: + nbytes += BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, off)->len); + break; + case B_BLOB: + nbytes += BBLOB_SIZE; + break; + default: nbytes += BOVERFLOW_SIZE; + } break; case P_IRECNO: nbytes += RINTERNAL_SIZE; @@ -1269,7 +1317,7 @@ __bam_copy(dbp, pp, cp, nxt, stop) PAGE *pp, *cp; u_int32_t nxt, stop; { - BINTERNAL internal; + BINTERNAL *bi, internal; db_indx_t *cinp, nbytes, off, 
*pinp; cinp = P_INP(dbp, cp); @@ -1302,12 +1350,17 @@ __bam_copy(dbp, pp, cp, nxt, stop) /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) == - B_KEYDATA) - nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, nxt)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type)) { + case B_KEYDATA: + nbytes = BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, nxt)->len); + break; + case B_BLOB: + nbytes = BBLOB_SIZE; + break; + default: nbytes = BOVERFLOW_SIZE; + } break; case P_IRECNO: nbytes = RINTERNAL_SIZE; @@ -1316,17 +1369,18 @@ __bam_copy(dbp, pp, cp, nxt, stop) return (__db_pgfmt(dbp->env, pp->pgno)); } cinp[off] = HOFFSET(cp) -= nbytes; + /* Minimize the first key on an IBTREE page; it isn't valid. */ + bi = GET_BINTERNAL(dbp, pp, nxt); if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) { internal.len = 0; UMRW_SET(internal.unused); internal.type = B_KEYDATA; - internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno; - internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs; + internal.pgno = bi->pgno; + internal.nrecs = bi->nrecs; memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes); } else - memcpy(P_ENTRY(dbp, cp, off), - P_ENTRY(dbp, pp, nxt), nbytes); + memcpy(P_ENTRY(dbp, cp, off), bi, nbytes); } return (0); } diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 668c4fdb..04c0fbcb 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -278,6 +278,8 @@ __bam_stat_print(dbc, flags) "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad); } __db_dl(env, + "Number of pages in the database", (u_long)sp->bt_pagecnt); + __db_dl(env, "Underlying database page size", (u_long)sp->bt_pagesize); if (dbp->type == DB_BTREE) __db_dl(env, "Overflow key/data size", @@ -288,6 +290,10 @@ __bam_stat_print(dbc, flags) "Number of records in the tree", (u_long)sp->bt_nkeys); __db_dl(env, "Number of data items in the tree", (u_long)sp->bt_ndata); + if (dbp->type == DB_BTREE) { + __db_dl(env, + "Number of blobs in the tree", (u_long)sp->bt_nblobs); + } __db_dl(env, "Number of tree internal pages", (u_long)sp->bt_int_pg); @@ -372,6 +378,10 @@ __bam_stat_callback(dbc, h, cookie, putp) /* Ignore off-page duplicates. */ if (B_TYPE(type) != B_DUPLICATE) ++sp->bt_ndata; + + /* Count blobs. */ + if (B_TYPE(type) == B_BLOB) + ++sp->bt_nblobs; } ++sp->bt_leaf_pg; diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c index c9123351..66e27d56 100644 --- a/src/btree/bt_upgrade.c +++ b/src/btree/bt_upgrade.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_upgrade.h" #include "dbinc/btree.h" @@ -151,3 +152,94 @@ __bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp) return (ret); } + +/* + * __bam_60_btreemeta-- + * Upgrade the version number. 
+ * + * PUBLIC: int __bam_60_btreemeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_60_btreemeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BTMETA33 *bmeta; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + COMPQUIET(dbp, NULL); + bmeta = (BTMETA33 *)h; + + bmeta->dbmeta.version = 10; + *dirtyp = 1; + + return (0); +} + +/* + * __bam_60_lbtree -- + * Upgrade the blob records on the database btree leaf pages. + * + * PUBLIC: int __bam_60_lbtree + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_60_lbtree(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BBLOB60 bl60; + BBLOB60P1 bl60p1; + BKEYDATA *bk; + db_seq_t blob_id, blob_size, file_id, sdb_id; + db_indx_t indx; + int ret; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + ret = 0; + + DB_ASSERT(dbp->env, BBLOB60_SIZE == BBLOB_SIZE); + for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_BLOB ) { + memcpy(&bl60, bk, BBLOB60_SIZE); + memset(&bl60p1, 0, BBLOB_SIZE); + bl60p1.type = bl60.type; + bl60p1.len = BBLOB_DSIZE; + bl60p1.encoding = bl60.encoding; + GET_BLOB60_ID(dbp->env, bl60, blob_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SIZE(dbp->env, bl60, blob_size, ret); + if (ret != 0) + return (ret); + GET_BLOB60_FILE_ID(dbp->env, &bl60, file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SDB_ID(dbp->env, &bl60, sdb_id, ret); + if (ret != 0) + return (ret); + SET_BLOB_ID(&bl60p1, blob_id, BBLOB60P1); + SET_BLOB_SIZE(&bl60p1, blob_size, BBLOB60P1); + SET_BLOB_FILE_ID(&bl60p1, file_id, BBLOB60P1); + SET_BLOB_SDB_ID(&bl60p1, sdb_id, BBLOB60P1); + memcpy(bk, &bl60p1, BBLOB_SIZE); + *dirtyp = 1; + } + } + + return (ret); +} diff 
--git a/src/btree/bt_verify.c b/src/btree/bt_verify.c index 99354a58..8ceb50e6 100644 --- a/src/btree/bt_verify.c +++ b/src/btree/bt_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_verify.h" #include "dbinc/btree.h" @@ -20,8 +21,8 @@ static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *, static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, db_indx_t *, u_int32_t)); static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *, - BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), - u_int32_t)); + BINTERNAL *, BINTERNAL *, + int (*)(DB *, const DBT *, const DBT *, size_t *), u_int32_t)); static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, db_indx_t *, u_int32_t)); @@ -44,6 +45,7 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) VRFY_PAGEINFO *pip; int isbad, t_ret, ret; db_indx_t ovflsize; + db_seq_t blob_id; env = dbp->env; isbad = 0; @@ -201,6 +203,56 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) "%lu %lu"), (u_long)pgno, (u_long)pip->re_len)); } +/* + * Where 64-bit integer support is not available, + * return an error if the file has any blobs. 
+ */ + t_ret = 0; +#ifdef HAVE_64BIT_TYPES + GET_BLOB_FILE_ID(env, meta, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1187", + "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + t_ret = 0; + GET_BLOB_SDB_ID(env, meta, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1188", + "Page %lu: blob subdatabase id overflow.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#else /* HAVE_64BIT_TYPES */ + /* + * db_seq_t is an int on systems that do not have 64-bit integers, so + * this will compile and run. + */ + GET_BLOB_FILE_ID(env, meta, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1200", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + t_ret = 0; + GET_BLOB_SDB_ID(env, meta, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1201", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#endif + /* * We do not check that the rest of the page is 0, because it may * not be and may still be correct. @@ -268,8 +320,7 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) if (F_ISSET(pip, VRFY_HAS_DUPS)) { EPRINT((env, DB_STR_A("1043", - "Page %lu: Recno database has dups", - "%lu"), (u_long)pgno)); + "Page %lu: Recno database has dups", "%lu"), (u_long)pgno)); ret = DB_VERIFY_BAD; goto err; } @@ -547,12 +598,15 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) db_indx_t *nentriesp; u_int32_t flags; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; ENV *env; VRFY_CHILDINFO child; VRFY_ITEM *pagelayout; VRFY_PAGEINFO *pip; + off_t blob_size; + db_seq_t blob_id, file_id, sdb_id; u_int32_t himark, offset; /* * These would be db_indx_ts * but for alignment. 
@@ -563,6 +617,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) env = dbp->env; isbad = isdupitem = 0; nentries = 0; + file_id = sdb_id = 0; memset(&child, 0, sizeof(VRFY_CHILDINFO)); if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -668,6 +723,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) else endoff = offset + BKEYDATA_SIZE(bk->len) - 1; break; + case B_BLOB: + endoff = offset + BBLOB_SIZE - 1; + break; case B_DUPLICATE: /* * Flag that we have dups; we'll check whether @@ -731,6 +789,52 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * already been done. */ break; + case B_BLOB: + if (TYPE(h) == P_IBTREE) { + isbad = 1; + EPRINT((env, DB_STR_A("1189", + "Page %lu: blob item in internal btree page at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } else if (TYPE(h) == P_LRECNO) { + isbad = 1; + EPRINT((env, DB_STR_A("1190", + "Page %lu: blob item referenced by recno page at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } + /* + * Blob item. Check that the blob file exists and is + * the same file size as is stored in the database + * record. 
+ */ + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0 || blob_size < 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1192", + "Page %lu: blob file size value has overflowed at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } + file_id = (db_seq_t)bl.file_id; + sdb_id = (db_seq_t)bl.sdb_id; + if (file_id == 0 && sdb_id == 0) { + isbad = 1; + EPRINT((dbp->env, DB_STR_A("1195", + "Page %lu: invalid blob dir ids %llu %llu at item %lu", + "%lu %ll %ll %lu"), (u_long)pip->pgno, + (long long)file_id, + (long long)sdb_id, (u_long)i)); + break; + } + if ((ret = __blob_vrfy(env, blob_id, + blob_size, file_id, sdb_id, pgno, flags)) != 0) { + isbad = 1; + break; + } + break; case B_DUPLICATE: if (TYPE(h) == P_IBTREE) { isbad = 1; @@ -751,9 +855,17 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) (BOVERFLOW *)(((BINTERNAL *)bk)->data) : (BOVERFLOW *)bk; - if (B_TYPE(bk->type) == B_OVERFLOW) + if (B_TYPE(bk->type) == B_OVERFLOW) { + if (TYPE(h) == P_IBTREE && + bk->len != BOVERFLOW_SIZE) { + EPRINT((env, DB_STR_A("1196", + "Page %lu: bad length %u in B_OVERFLOW item %lu", + "%lu %u %lu"), + (u_long)pgno, bk->len, (u_long)i)); + isbad = 1; + } /* Make sure tlen is reasonable. */ - if (bo->tlen > dbp->pgsize * vdp->last_pgno) { + if (bo->tlen >= dbp->pgsize * vdp->last_pgno) { isbad = 1; EPRINT((env, DB_STR_A("1056", "Page %lu: impossible tlen %lu, item %lu", @@ -762,6 +874,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) /* Don't save as a child. 
*/ break; } + } if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || bo->pgno == PGNO_INVALID) { @@ -918,8 +1031,8 @@ __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags) VRFY_PAGEINFO *pip; db_indx_t i, *inp; int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret; - int (*dupfunc) __P((DB *, const DBT *, const DBT *)); - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*dupfunc) __P((DB *, const DBT *, const DBT *, size_t *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); void *buf1, *buf2, *tmpbuf; /* @@ -1066,6 +1179,11 @@ retry: p1 = &dbta; if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; goto overflow; + } else if (B_TYPE(bk->type) == B_BLOB) { + isbad = 1; + EPRINT((env, DB_STR_A("1197", + "Page %lu: Blob found in key item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); } else { p2->data = bk->data; p2->size = bk->len; @@ -1124,7 +1242,8 @@ overflow: if (!ovflok) { /* Compare with the last key. */ if (p1->data != NULL && p2->data != NULL) { - cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2); + cmp = inp[i] == inp[i - adj] ? 0 : + func(dbp, p1, p2, NULL); /* comparison succeeded */ if (cmp > 0) { @@ -1236,8 +1355,8 @@ overflow: if (!ovflok) { * until we do the structure check * and see whether DUPSORT is set. */ - if (dupfunc(dbp, &dup_1, &dup_2) > 0 && - pip != NULL) + if (dupfunc(dbp, &dup_1, &dup_2, + NULL) > 0 && pip != NULL) F_SET(pip, VRFY_DUPS_UNSORTED); if (freedup_1) @@ -1409,7 +1528,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) db_recno_t child_nrecs, nrecs; u_int32_t child_level, child_relen, j, level, relen, stflags; u_int8_t leaf_type; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); int isbad, p, ret, t_ret, toplevel; if (levelp != NULL) /* Don't leave uninitialized on error. 
*/ @@ -1524,7 +1643,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) * Don't do the prev/next_pgno checks if we've lost * leaf pages due to another corruption. */ - if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) { + if (!F_ISSET(vdp, SALVAGE_LEAFCHAIN_BROKEN)) { if (pip->pgno != vdp->next_pgno) { isbad = 1; EPRINT((env, DB_STR_A("1075", @@ -1547,7 +1666,7 @@ bad_prev: isbad = 1; } vdp->prev_pgno = pip->pgno; vdp->next_pgno = pip->next_pgno; - F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN); + F_CLR(vdp, SALVAGE_LEAFCHAIN_BROKEN); /* * Overflow pages are common to all three leaf types; @@ -1694,7 +1813,7 @@ bad_prev: isbad = 1; * spew error messages about erroneous prev/next_pgnos, * since that's probably not the real problem. */ - F_SET(vdp, VRFY_LEAFCHAIN_BROKEN); + F_SET(vdp, SALVAGE_LEAFCHAIN_BROKEN); ret = DB_VERIFY_BAD; goto err; @@ -2042,7 +2161,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) DB_THREAD_INFO *ip; PAGE *h; BINTERNAL *lp, *rp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); u_int32_t flags; { BOVERFLOW *bo; @@ -2050,7 +2169,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) DBT dbt; ENV *env; db_indx_t last; - int ret, cmp; + int cmp, ret, t_ret; env = dbp->env; memset(&dbt, 0, sizeof(DBT)); @@ -2077,7 +2196,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) return (__db_unknown_path(env, "__bam_vrfy_treeorder")); } - /* Populate a dummy cursor. */ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) return (ret); @@ -2095,9 +2213,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) * parent and falsely report a failure.) 
*/ if (lp != NULL && TYPE(h) != P_IBTREE) { - if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, - PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) - return (ret); if (lp->type == B_KEYDATA) { dbt.data = lp->data; dbt.size = lp->len; @@ -2105,13 +2220,13 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) bo = (BOVERFLOW *)lp->data; if ((ret = __db_goff(dbc, &dbt, bo->tlen, bo->pgno, NULL, NULL)) != 0) - return (ret); - } else - return ( - __db_unknown_path(env, "__bam_vrfy_treeorder")); + goto err; + } else { + ret = __db_unknown_path(env, "__bam_vrfy_treeorder"); + goto err; + } - /* On error, fall through, free if needed, and return. */ - if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) { + if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp, NULL)) == 0) { if (cmp > 0) { EPRINT((env, DB_STR_A("1092", "Page %lu: first item on page sorted greater than parent entry", @@ -2126,7 +2241,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) if (dbt.data != lp->data) __os_ufree(env, dbt.data); if (ret != 0) - return (ret); + goto err; } if (rp != NULL) { @@ -2137,13 +2252,14 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) bo = (BOVERFLOW *)rp->data; if ((ret = __db_goff(dbc, &dbt, bo->tlen, bo->pgno, NULL, NULL)) != 0) - return (ret); - } else - return ( - __db_unknown_path(env, "__bam_vrfy_treeorder")); + goto err; + } else { + ret = __db_unknown_path(env, "__bam_vrfy_treeorder"); + goto err; + } - /* On error, fall through, free if needed, and return. 
*/ - if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) { + if ((ret = __bam_cmp(dbc, + &dbt, h, last, func, &cmp, NULL)) == 0) { if (cmp < 0) { EPRINT((env, DB_STR_A("1094", "Page %lu: last item on page sorted greater than parent entry", @@ -2158,6 +2274,9 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) if (dbt.data != rp->data) __os_ufree(env, dbt.data); } +err: + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -2186,14 +2305,20 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) { BKEYDATA *bk; BOVERFLOW *bo; + BBLOB bl; DBT dbt, repldbt, unknown_key, unknown_data; ENV *env; VRFY_ITEM *pgmap; db_indx_t i, last, beg, end, *inp; db_pgno_t ovflpg; + off_t blob_size, blob_offset, remaining; + u_int32_t blob_buf_size; + u_int8_t *blob_buf; u_int32_t himark, ovfl_bufsz; + db_seq_t blob_id, file_id, sdb_id; void *ovflbuf; int adj, ret, t_ret, t2_ret; + char *prefix; #ifdef HAVE_COMPRESSION DBT kcpy, *last_key; int unknown_dup_key; @@ -2202,6 +2327,8 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) env = dbp->env; ovflbuf = pgmap = NULL; inp = P_INP(dbp, h); + blob_buf_size = 0; + blob_buf = NULL; memset(&dbt, 0, sizeof(DBT)); dbt.flags = DB_DBT_REALLOC; @@ -2543,6 +2670,68 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) } #endif break; + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0 || blob_size < 0) + goto err; + file_id = (db_seq_t)bl.file_id; + sdb_id = (db_seq_t)bl.sdb_id; + + /* Read the blob, in pieces if it is too large.*/ + blob_offset = 0; + if (blob_size > MEGABYTE) { + if (blob_buf_size < MEGABYTE) { + if ((ret = __os_realloc( + env, MEGABYTE, &blob_buf)) != 0) + goto err; + blob_buf_size = MEGABYTE; + } + } else if (blob_buf_size < blob_size) { + blob_buf_size = (u_int32_t)blob_size; + if ((ret = __os_realloc(env, + blob_buf_size, &blob_buf)) != 0) + 
goto err; + } + dbt.data = blob_buf; + dbt.ulen = blob_buf_size; + remaining = blob_size; + prefix = " "; + do { + if ((ret = __blob_salvage(env, blob_id, + blob_offset, + ((remaining < blob_buf_size) ? + (size_t)remaining : blob_buf_size), + file_id, sdb_id, &dbt)) != 0) { + if (LF_ISSET(DB_AGGRESSIVE)) { + ret = DB_VERIFY_BAD; + break; + } + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + if (remaining > blob_buf_size) + F_SET(vdp, SALVAGE_STREAM_BLOB); + else + F_CLR(vdp, SALVAGE_STREAM_BLOB); + if ((t_ret = __db_vrfy_prdbt( + &dbt, 0, prefix, + handle, callback, 0, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + prefix = NULL; + blob_offset += dbt.size; + if (remaining < blob_buf_size) + remaining = 0; + else + remaining -= blob_buf_size; + } while (remaining > 0); + F_CLR(vdp, SALVAGE_STREAM_BLOB); + break; default: /* * We should never get here; __db_vrfy_inpitem should @@ -2572,6 +2761,8 @@ err: if (pgmap != NULL) __os_free(env, ovflbuf); if (repldbt.data != NULL) __os_free(env, repldbt.data); + if (blob_buf != NULL) + __os_free(env, blob_buf); #ifdef HAVE_COMPRESSION if (kcpy.data != NULL) __os_free(env, kcpy.data); diff --git a/src/btree/btree.src b/src/btree/btree.src index 08e5a206..02088b88 100644 --- a/src/btree/btree.src +++ b/src/btree/btree.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/bsearch.c b/src/clib/bsearch.c index 3e55009a..de15358b 100644 --- a/src/clib/bsearch.c +++ b/src/clib/bsearch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/clib/getcwd.c b/src/clib/getcwd.c index 83e8b62d..028fc3f2 100644 --- a/src/clib/getcwd.c +++ b/src/clib/getcwd.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1989, 1991, 1993 diff --git a/src/clib/getopt.c b/src/clib/getopt.c index ca98e7f1..4e4dc6c8 100644 --- a/src/clib/getopt.c +++ b/src/clib/getopt.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1987, 1993, 1994 diff --git a/src/clib/isalpha.c b/src/clib/isalpha.c index 6bf1ffb7..39114c08 100644 --- a/src/clib/isalpha.c +++ b/src/clib/isalpha.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/isdigit.c b/src/clib/isdigit.c index d1b2a65e..e4e1d3d8 100644 --- a/src/clib/isdigit.c +++ b/src/clib/isdigit.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/isprint.c b/src/clib/isprint.c index 685e20ea..310894d5 100644 --- a/src/clib/isprint.c +++ b/src/clib/isprint.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/clib/isspace.c b/src/clib/isspace.c index df450d3b..48a20617 100644 --- a/src/clib/isspace.c +++ b/src/clib/isspace.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/memcmp.c b/src/clib/memcmp.c index 7fec827c..7db1d3ad 100644 --- a/src/clib/memcmp.c +++ b/src/clib/memcmp.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993 diff --git a/src/clib/memmove.c b/src/clib/memmove.c index 34a181cc..866843dc 100644 --- a/src/clib/memmove.c +++ b/src/clib/memmove.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993 diff --git a/src/clib/printf.c b/src/clib/printf.c index a2c01296..f36eeb15 100644 --- a/src/clib/printf.c +++ b/src/clib/printf.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/raise.c b/src/clib/raise.c index ad0e567f..223f797f 100644 --- a/src/clib/raise.c +++ b/src/clib/raise.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/clib/rand.c b/src/clib/rand.c index 6b810060..426627a9 100644 --- a/src/clib/rand.c +++ b/src/clib/rand.c @@ -13,6 +13,7 @@ * PUBLIC: void srand __P((unsigned int)); * PUBLIC: #endif */ +#ifndef HAVE_RAND int rand(void) /* RAND_MAX assumed to be 32767 */ { DB_GLOBAL(rand_next) = DB_GLOBAL(rand_next) * 1103515245 + 12345; @@ -23,3 +24,4 @@ void srand(unsigned int seed) { DB_GLOBAL(rand_next) = seed; } +#endif diff --git a/src/clib/snprintf.c b/src/clib/snprintf.c index 6b31d850..8f1a6855 100644 --- a/src/clib/snprintf.c +++ b/src/clib/snprintf.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/clib/strerror.c b/src/clib/strerror.c index 62bd7dd5..b2d148e4 100644 --- a/src/clib/strerror.c +++ b/src/clib/strerror.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1988, 1993 diff --git a/src/clib/time.c b/src/clib/time.c index abc2ab2d..3a3f0c3e 100644 --- a/src/clib/time.c +++ b/src/clib/time.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/clock.c b/src/common/clock.c index e1f917af..21a17de6 100644 --- a/src/common/clock.c +++ b/src/common/clock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/common/crypto_stub.c b/src/common/crypto_stub.c index 95faebdb..b961a620 100644 --- a/src/common/crypto_stub.c +++ b/src/common/crypto_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/db_byteorder.c b/src/common/db_byteorder.c index 71428f0a..13bc2d52 100644 --- a/src/common/db_byteorder.c +++ b/src/common/db_byteorder.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/db_compint.c b/src/common/db_compint.c index 9f5ccf9a..10317b2f 100644 --- a/src/common/db_compint.c +++ b/src/common/db_compint.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" diff --git a/src/common/db_err.c b/src/common/db_err.c index 6edc37b6..7acaa174 100644 --- a/src/common/db_err.c +++ b/src/common/db_err.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -18,6 +18,11 @@ static void __db_msgcall __P((const DB_ENV *, const char *, va_list)); static void __db_msgfile __P((const DB_ENV *, const char *, va_list)); +#if defined(HAVE_ERROR_HISTORY) +static void __db_thread_once_func __P((void)); +static void __db_deferred_free __P((void *)); +#endif + /* * __db_fchk -- * General flags checking routine. 
@@ -62,6 +67,9 @@ __db_ferr(env, name, iscombo) const char *name; int iscombo; { + int ret; + + ret = USR_ERR(env, EINVAL); if (iscombo) __db_errx(env, DB_STR_A("0054", "illegal flag combination specified to %s", "%s"), name); @@ -69,7 +77,7 @@ __db_ferr(env, name, iscombo) __db_errx(env, DB_STR_A("0055", "illegal flag specified to %s", "%s"), name); - return (EINVAL); + return (ret); } /* @@ -145,9 +153,24 @@ __db_assert(env, e, file, line) if (DB_GLOBAL(j_assert) != NULL) DB_GLOBAL(j_assert)(e, file, line); else { - __db_errx(env, DB_STR_A("0059", - "assert failure: %s/%d: \"%s\"", - "%s %d %s"), file, line, e); + /* + * If a panic has preceded this assertion failure, print that + * message as well -- it might be relevant. + */ +#ifdef HAVE_FAILCHK_BROADCAST + if (PANIC_ISSET(env)) { + REGENV *renv; + renv = (env == NULL || env->reginfo == NULL) ? + NULL : env->reginfo->primary; + __db_errx(env, DB_STR_A("0242", + "assert failure (%s/%d: %s) after panic %s", + "%s %d %s %s"), file, line, e, + renv == NULL ? "" : renv->failure_symptom); + } else +#endif + __db_errx(env, DB_STR_A("0059", + "assert failure: %s/%d: \"%s\"", + "%s %d %s"), file, line, e); __os_abort(env); /* NOTREACHED */ @@ -156,8 +179,49 @@ __db_assert(env, e, file, line) #endif /* + * __env_panic_event - + * Notify the application of a db_register, failchk, or generic panic. + * + * PUBLIC: void __env_panic_event __P((ENV *, int)); + */ +void +__env_panic_event(env, errval) + ENV *env; + int errval; +{ + DB_ENV *dbenv; + REGENV *renv; + u_int32_t event; + void *info; + DB_EVENT_FAILCHK_INFO failinfo; + + dbenv = env->dbenv; + info = &errval; + if (dbenv->db_paniccall != NULL) /* Deprecated */ + dbenv->db_paniccall(dbenv, errval); + /* + * We check for DB_EVENT_FAILCHK and DB_EVENT_REG_PANIC first because + * they are not set by themselves. If one of those is set, it means that + * this panic is somewhat an expected consequence of a previous failure. + */ + renv = (env->reginfo == NULL) ? 
NULL : env->reginfo->primary; + if (renv != NULL && renv->failure_panic) { + event = DB_EVENT_FAILCHK_PANIC; + failinfo.error = errval; + (void)strncpy(failinfo.symptom, + renv->failure_symptom, sizeof(failinfo.symptom)); + failinfo.symptom[sizeof(failinfo.symptom) - 1] = '\0'; + info = &failinfo; + } else if (renv != NULL && renv->reg_panic) + event = DB_EVENT_REG_PANIC; + else + event = DB_EVENT_PANIC; + DB_EVENT(env, event, info); +} + +/* * __env_panic_msg -- - * Just report that someone else paniced. + * Report that we noticed a panic which had been set somewhere else. * * PUBLIC: int __env_panic_msg __P((ENV *)); */ @@ -165,28 +229,16 @@ int __env_panic_msg(env) ENV *env; { - DB_ENV *dbenv; int ret; - dbenv = env->dbenv; - ret = DB_RUNRECOVERY; + /* Make a note saying where this panic was detected. */ + (void)USR_ERR(env, ret); __db_errx(env, DB_STR("0060", "PANIC: fatal region error detected; run recovery")); - if (dbenv->db_paniccall != NULL) /* Deprecated */ - dbenv->db_paniccall(dbenv, ret); - - /* Must check for DB_EVENT_REG_PANIC panic first because it is never - * set by itself. If set, it means panic came from DB_REGISTER code - * only, otherwise it could be from many possible places in the code. - */ - if ((env->reginfo != NULL) && - (((REGENV *)env->reginfo->primary)->reg_panic)) - DB_EVENT(env, DB_EVENT_REG_PANIC, &ret); - else - DB_EVENT(env, DB_EVENT_PANIC, &ret); + __env_panic_event(env, ret); return (ret); } @@ -202,28 +254,13 @@ __env_panic(env, errval) ENV *env; int errval; { - DB_ENV *dbenv; - - dbenv = env->dbenv; - if (env != NULL) { __env_panic_set(env, 1); - __db_err(env, errval, DB_STR("0061", "PANIC")); + if (errval != DB_RUNRECOVERY) + __db_err(env, errval, DB_STR("0061", "PANIC")); - if (dbenv->db_paniccall != NULL) /* Deprecated */ - dbenv->db_paniccall(dbenv, errval); - - /* Must check for DB_EVENT_REG_PANIC first because it is never - * set by itself. 
If set, it means panic came from DB_REGISTER - * code only, otherwise it could be from many possible places - * in the code. - */ - if ((env->reginfo != NULL) && - (((REGENV *)env->reginfo->primary)->reg_panic)) - DB_EVENT(env, DB_EVENT_REG_PANIC, &errval); - else - DB_EVENT(env, DB_EVENT_PANIC, &errval); + __env_panic_event(env, errval); } #if defined(DIAGNOSTIC) && !defined(CONFIG_TEST) @@ -302,6 +339,9 @@ db_strerror(error) case DB_LOG_VERIFY_BAD: return (DB_STR("0071", "DB_LOG_VERIFY_BAD: Log verification failed")); + case DB_META_CHKSUM_FAIL: + return (DB_STR("0247", + "DB_META_CHKSUM_FAIL: Checksum mismatch detected on a database metadata page")); case DB_NOSERVER: return (DB_STR("0072", "DB_NOSERVER: No message dispatch call-back function has been configured")); @@ -419,18 +459,21 @@ __db_syserr(env, error, fmt, va_alist) DB_ENV *dbenv; dbenv = env == NULL ? NULL : env->dbenv; + if (env != NULL) + (void)USR_ERR(env, error); /* * The same as DB->err, except we don't default to writing to stderr * after any output channel has been configured, and we use a system- * specific function to translate errors to strings. */ - DB_REAL_ERR(dbenv, error, DB_ERROR_SYSTEM, 0, fmt); + DB_REAL_ERR(dbenv, + error, error == 0 ? DB_ERROR_NOT_SET : DB_ERROR_SYSTEM, 0, fmt); } /* * __db_err -- - * Standard error routine. + * Standard error routine with an error code. * * PUBLIC: void __db_err __P((const ENV *, int, const char *, ...)) * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4))); @@ -450,6 +493,10 @@ __db_err(env, error, fmt, va_alist) dbenv = env == NULL ? NULL : env->dbenv; + /* (If no deferred messages yet, at least?) add this calls' info. + (void)USR_ERR(env, error); + */ + /* * The same as DB->err, except we don't default to writing to stderr * once an output channel has been configured. @@ -459,7 +506,7 @@ __db_err(env, error, fmt, va_alist) /* * __db_errx -- - * Standard error routine. + * Standard error routine without any error code. 
* * PUBLIC: void __db_errx __P((const ENV *, const char *, ...)) * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3))); @@ -500,25 +547,54 @@ __db_errcall(dbenv, error, error_set, fmt, ap) const char *fmt; va_list ap; { - char *p; - char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ - char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ + char *end, *p; + char buf[2048 + DB_ERROR_HISTORY_SIZE]; + char sysbuf[1024]; +#ifdef HAVE_ERROR_HISTORY + DB_MSGBUF *deferred_mb; + ptrdiff_t len; +#endif p = buf; + /* Reserve 1 byte at the end for '\0'. */ + end = buf + sizeof(buf) - 1; if (fmt != NULL) p += vsnprintf(buf, sizeof(buf), fmt, ap); + if (error_set != DB_ERROR_NOT_SET) - p += snprintf(p, - sizeof(buf) - (size_t)(p - buf), ": %s", + p += snprintf(p, (size_t)(end - p), ": %s", error_set == DB_ERROR_SET ? db_strerror(error) : __os_strerror(error, sysbuf, sizeof(sysbuf))); +#ifdef HAVE_ERROR_HISTORY + /* + * Append any messages (e.g., diagnostics) stashed away in the deferred + * msgbuf. Strncpy() can't be trusted to append '\0', do it "manually". + */ + if ((deferred_mb = __db_deferred_get()) != NULL && + (len = deferred_mb->cur - deferred_mb->buf) != 0) { + p += snprintf(p, + (size_t)(end - p), "\nErrors during this API call:"); + if (len > (end - p)) + len = end - p; + if (len != 0) { + memmove(p, deferred_mb->buf, (size_t)len); + p[len] = '\0'; + } + } +#endif + dbenv->db_errcall(dbenv, dbenv->db_errpfx, buf); } /* * __db_errfile -- - * Do the error message work for FILE *s. + * Do the error message work for FILE *s. Combine the messages into a + * single fprintf() call, to avoid interspersed output when there are + * multiple active threads. + * + * Display a ": " after the dbenv prefix, if it has one. + * Display a ": " before the error message string, if it error was set. 
* * PUBLIC: void __db_errfile * PUBLIC: __P((const DB_ENV *, int, db_error_set_t, const char *, va_list)); @@ -532,29 +608,62 @@ __db_errfile(dbenv, error, error_set, fmt, ap) va_list ap; { FILE *fp; - int need_sep; - char sysbuf[1024]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ + char *defintro, *defmsgs, *error_str, *prefix, *sep1, *sep2; + char sysbuf[200]; + char prefix_buf[200]; + char full_fmt[4096]; +#ifdef HAVE_ERROR_HISTORY + DB_MSGBUF *deferred_mb; + size_t room; +#endif + prefix = sep1 = sep2 = error_str = ""; fp = dbenv == NULL || dbenv->db_errfile == NULL ? stderr : dbenv->db_errfile; - need_sep = 0; + if (fmt == NULL) + fmt = ""; if (dbenv != NULL && dbenv->db_errpfx != NULL) { - (void)fprintf(fp, "%s", dbenv->db_errpfx); - need_sep = 1; + prefix = __db_fmt_quote(prefix_buf, + sizeof(prefix_buf), dbenv->db_errpfx); + sep1 = ": "; } - if (fmt != NULL && fmt[0] != '\0') { - if (need_sep) - (void)fprintf(fp, ": "); - need_sep = 1; - (void)vfprintf(fp, fmt, ap); + switch (error_set) { + case DB_ERROR_NOT_SET: + break; + case DB_ERROR_SET: + error_str = db_strerror(error); + sep2 = ": "; + break; + case DB_ERROR_SYSTEM: + error_str = __os_strerror(error, sysbuf, sizeof(sysbuf)); + sep2 = ": "; + break; } - if (error_set != DB_ERROR_NOT_SET) - (void)fprintf(fp, "%s%s", - need_sep ? ": " : "", - error_set == DB_ERROR_SET ? db_strerror(error) : - __os_strerror(error, sysbuf, sizeof(sysbuf))); - (void)fprintf(fp, "\n"); +#ifdef HAVE_ERROR_HISTORY + if ((deferred_mb = __db_deferred_get()) != NULL && + deferred_mb->cur != deferred_mb->buf) { + defmsgs = + __db_fmt_quote(deferred_mb->buf, deferred_mb->len, NULL); + defintro = "\nErrors during this API call:"; + /* + * If there are more deferred messages than will be displayed + * change the introductory message to warn of the truncation. 
+ */ + room = sizeof(full_fmt) - (strlen(sep1) + + strlen(fmt) + strlen(sep2) + strlen(error_str)); + if (deferred_mb->len + strlen(defintro) > room) { + defintro = + "\nFirst recorded errors during this API call:"; + memmove(defmsgs + room - 4, "...\n", 4); + } + + } else +#endif + defmsgs = defintro = ""; + (void)snprintf(full_fmt, sizeof(full_fmt), "%s%s%s%s%s%s%s\n", prefix, + sep1, fmt, sep2, error_str, defintro, defmsgs); + (void)vfprintf(fp, full_fmt, ap); (void)fflush(fp); } @@ -562,15 +671,15 @@ __db_errfile(dbenv, error, error_set, fmt, ap) * __db_msgadd -- * Aggregate a set of strings into a buffer for the callback API. * - * PUBLIC: void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...)) + * PUBLIC: void __db_msgadd __P((const ENV *, DB_MSGBUF *, const char *, ...)) * PUBLIC: __attribute__ ((__format__ (__printf__, 3, 4))); */ void #ifdef STDC_HEADERS -__db_msgadd(ENV *env, DB_MSGBUF *mbp, const char *fmt, ...) +__db_msgadd(const ENV *env, DB_MSGBUF *mbp, const char *fmt, ...) #else __db_msgadd(env, mbp, fmt, va_alist) - ENV *env; + const ENV *env; DB_MSGBUF *mbp; const char *fmt; va_dcl @@ -592,17 +701,17 @@ __db_msgadd(env, mbp, fmt, va_alist) * Aggregate a set of strings into a buffer for the callback API. * * PUBLIC: void __db_msgadd_ap - * PUBLIC: __P((ENV *, DB_MSGBUF *, const char *, va_list)); + * PUBLIC: __P((const ENV *, DB_MSGBUF *, const char *, va_list)); */ void __db_msgadd_ap(env, mbp, fmt, ap) - ENV *env; + const ENV *env; DB_MSGBUF *mbp; const char *fmt; va_list ap; { - size_t len, olen; - char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ + size_t len, nlen, olen; + char buf[2048]; len = (size_t)vsnprintf(buf, sizeof(buf), fmt, ap); @@ -613,9 +722,16 @@ __db_msgadd_ap(env, mbp, fmt, ap) */ olen = (size_t)(mbp->cur - mbp->buf); if (olen + len >= mbp->len) { - if (__os_realloc(env, mbp->len + len + 256, &mbp->buf)) + /* Don't write too much for preallocated DB_MSGBUFs. 
*/ + if (F_ISSET(mbp, DB_MSGBUF_PREALLOCATED)) { + memset(mbp->cur, '*', mbp->len - olen); + mbp->cur = mbp->buf + mbp->len; return; - mbp->len += (len + 256); + } + nlen = mbp->len + len + (env == NULL ? 8192 : 256); + if (__os_realloc(env, nlen, &mbp->buf)) + return; + mbp->len = nlen; mbp->cur = mbp->buf + olen; } @@ -648,6 +764,42 @@ __db_msg(env, fmt, va_alist) } /* + * __db_debug_msg -- + * Save a message to be displayed only if this API call returns an error. + * The message is discarded if this API call succeeds. + * + * PUBLIC: void __db_debug_msg __P((const ENV *, const char *, ...)); + */ +void +#ifdef STDC_HEADERS +__db_debug_msg(const ENV *env, const char *fmt, ...) +#else +__db_debug_msg(env, fmt, va_alist) + const ENV *env; + const char *fmt; + va_dcl +#endif +{ +#ifdef HAVE_ERROR_HISTORY + DB_MSGBUF *mb; + va_list ap; + + if (env == NULL || (mb = __db_deferred_get()) == NULL) + return; + +#ifdef STDC_HEADERS + va_start(ap, fmt); +#else + va_start(ap); +#endif + __db_msgadd_ap(env, mb, fmt, ap); + va_end(ap); +#endif + COMPQUIET(env, NULL); + COMPQUIET(fmt, NULL); +} + +/* * __db_repmsg -- * Replication system message routine. * @@ -665,7 +817,7 @@ __db_repmsg(env, fmt, va_alist) #endif { va_list ap; - char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ + char buf[2048]; #ifdef STDC_HEADERS va_start(ap, fmt); @@ -679,7 +831,7 @@ __db_repmsg(env, fmt, va_alist) /* * __db_msgcall -- - * Do the message work for callback functions. + * Do the message work for callback functions in DB_REAL_MSG(). */ static void __db_msgcall(dbenv, fmt, ap) @@ -687,16 +839,15 @@ __db_msgcall(dbenv, fmt, ap) const char *fmt; va_list ap; { - char buf[2048]; /* !!!: END OF THE STACK DON'T TRUST SPRINTF. */ + char buf[2048]; (void)vsnprintf(buf, sizeof(buf), fmt, ap); - dbenv->db_msgcall(dbenv, buf); } /* * __db_msgfile -- - * Do the message work for FILE *s. + * Do the message work for FILE *s in DB_REAL_MSG(). 
*/ static void __db_msgfile(dbenv, fmt, ap) @@ -805,6 +956,13 @@ __db_check_txn(dbp, txn, assoc_locker, read_op) if (IS_RECOVERING(env) || F_ISSET(dbp, DB_AM_RECOVER)) return (0); + if (txn != NULL && dbp->blob_threshold && + F_ISSET(txn, (TXN_READ_UNCOMMITTED | TXN_SNAPSHOT))) { + __db_errx(env, DB_STR("0237", +"Blob enabled databases do not support DB_READ_UNCOMMITTED and TXN_SNAPSHOT")); + return (EINVAL); + } + /* * Check for common transaction errors: * an operation on a handle whose open commit hasn't completed. @@ -1095,9 +1253,9 @@ __db_space_err(dbp) /* * __db_failed -- - * Common failed thread message. + * Common failed thread message, e.g., after it is seen to have crashed. * - * PUBLIC: int __db_failed __P((const ENV *, + PUBLIC: int __db_failed __P((const ENV *, * PUBLIC: const char *, pid_t, db_threadid_t)); */ int @@ -1108,11 +1266,321 @@ __db_failed(env, msg, pid, tid) db_threadid_t tid; { DB_ENV *dbenv; - char buf[DB_THREADID_STRLEN]; + int ret; + char tidstr[DB_THREADID_STRLEN], failmsg[DB_FAILURE_SYMPTOM_SIZE]; dbenv = env->dbenv; + (void)dbenv->thread_id_string(dbenv, pid, tid, tidstr); + ret = USR_ERR(env, DB_RUNRECOVERY); + snprintf(failmsg, sizeof(failmsg), DB_STR_A("0113", + "Thread/process %s failed: %s", "%s %s"), tidstr, msg); + (void)__env_failure_remember(env, failmsg); + __db_errx(env, "%s", failmsg); + return (ret); +} - __db_errx(env, DB_STR_A("0113", "Thread/process %s failed: %s", - "%s %s"), dbenv->thread_id_string(dbenv, pid, tid, buf), msg); - return (DB_RUNRECOVERY); +/* + * __env_failure_remember -- + * If this failure of a process in the environment is about to set panic + * for the first time, record that a crashed thread was thw culprit. + * Do nothing if panic has already been set. There are no mutexes here; + * in order to avoid hanging on any crashed threads. 
+ * + * PUBLIC: int __env_failure_remember __P((const ENV *, const char *)); + */ +int +__env_failure_remember(env, reason) + const ENV *env; + const char *reason; +{ + REGENV *renv; + + renv = env->reginfo->primary; + if (renv == NULL || renv->panic || renv->failure_panic) + return (0); + renv->failure_panic = 1; + if (renv->failure_symptom[0] == '\0') { + (void)strncpy(renv->failure_symptom, + reason, sizeof(renv->failure_symptom)); + renv->failure_symptom[sizeof(renv->failure_symptom) - 1] = '\0'; + } + return (0); +} + +#if defined(HAVE_ERROR_HISTORY) +/* + * __db_deferred_free -- + * Pthread_exit() calls this to release DB_GLOBAL(msgs_key)'s + * thread-local storage. + */ +static void +__db_deferred_free(void *p) +{ + DB_MSGBUF *mb; + + if ((mb = p) != NULL) { + (void)pthread_setspecific(DB_GLOBAL(msgs_key), NULL); + if (mb->buf != NULL) + __os_free(NULL, mb->buf); + free(mb); + } +} + +/* + * __db_thread_once_func -- + * The pthread_once() functions to initialize thread local storage. + */ +static void +__db_thread_once_func() +{ + (void)pthread_key_create(&DB_GLOBAL(msgs_key), __db_deferred_free); +} + +/* + * __db_thread_init -- + * Initialization hook to be called at least once per process, before + * deferring any messages. + * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: void __db_thread_init __P((void)); + * PUBLIC: #endif + */ +void +__db_thread_init() +{ + /* + * Assign the thread-local storage identifier. Tell thread exit to clean + * up withl __db_deferred_free(). + */ + (void)pthread_once(&DB_GLOBAL(thread_once), __db_thread_once_func); +} + +/* + * __db_diags -- + * + * Save the context which triggers the "first notice" of an error code; + * i.e., its creation. It doesn't touch anything when err == 0. 
+ * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: int __db_diags __P((const ENV *, int)); + * PUBLIC: #endif + */ + int +__db_diags(env, err) + const ENV *env; + int err; +{ + DB_MSGBUF *mb; + + if (err != 0 && (mb = __db_deferred_get()) != NULL) + (void)__db_remember_context(env, mb, err); + return (err); +} + +/* + * __db_deferred_get -- + * Get this thread's deferred DB_MSGBUF, possibly allocating it. + * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: DB_MSGBUF *__db_deferred_get __P((void)); + * PUBLIC: #endif + */ +DB_MSGBUF * +__db_deferred_get() +{ + DB_MSGBUF *mb; + + if ((mb = pthread_getspecific(DB_GLOBAL(msgs_key))) == NULL) { + if ((mb = calloc(1, sizeof(*mb))) != NULL) + if (pthread_setspecific(DB_GLOBAL(msgs_key), mb) != 0) { + /* Nothing else is safe do on an error. */ + free(mb); + mb = NULL; + } + } + return (mb); +} + +/* + * __db_deferred_discard -- + * Discard any saved-up deferred messages, at e.g. the end of the command. + * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: void __db_deferred_discard __P((void)); + * PUBLIC: #endif + */ +void +__db_deferred_discard() +{ + DB_MSGBUF *mb; + + if ((mb = pthread_getspecific(DB_GLOBAL(msgs_key))) != NULL) + mb->cur = mb->buf; +} + +/* + * __db_remember_context + * Save the context which triggers the "first notice" of an error code; + * i.e., its creation. Include the time, thread, recent portion of the + * stack, and the error number. Add replication info too? + * + * Return the error number passed in, or 0? + * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: int __db_remember_context __P((const ENV *, DB_MSGBUF *, int)); + * PUBLIC: #endif + */ + int + __db_remember_context(env, mb, err) + const ENV *env; + DB_MSGBUF *mb; + int err; +{ + DB_ENV *dbenv; + LOG *lp; + db_timespec now; + pid_t pid; + db_threadid_t tid; + char threadid[DB_THREADID_STRLEN], timestr[CTIME_BUFLEN]; + + /* Limit the amount of context messges which are remembered. 
*/ + if (mb->len >= DB_ERROR_HISTORY_SIZE) + return (0); + + lp = NULL; + if (env == NULL) { + dbenv = NULL; + threadid[0] = '\0'; + } else { + dbenv = env->dbenv; + dbenv->thread_id(dbenv, &pid, &tid); + (void)dbenv->thread_id_string(dbenv, pid, tid, threadid); + if (LOGGING_ON(env) && !IS_RECOVERING(env)) + lp = env->lg_handle->reginfo.primary; + } + + __os_gettime(env, &now, 0); + (void)__db_ctimespec(&now, timestr); + __db_msgadd(env, mb, "\n[%s][%s] %s", + timestr, threadid, db_strerror(err)); + if (lp != NULL) + __db_msgadd(env, mb, " lsn [%lu][%lu]", + (u_long)lp->lsn.file, (u_long)lp->lsn.offset); + +#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS) + /* + * Add many frames of stack trace to the record, skipping the first two + * frames: __os_stack_msgadd() and __db_remember_context(). + */ + __db_msgadd(env, mb, " from\n"); + __os_stack_msgadd(env, mb, 15, 2, NULL); +#endif + + return (0); +} +#endif + +/* + * __db_ctimespec -- + * Format a timespec in microseconds, similar to a terse __os_ctime(), + * storing the results into a CTIME_BUFLEN sized buffer. + * The result format depends on the availability of localtime, etc + * MM/DD HH:MM:SS.uuuuuu if strftime is available, or + * Jan DD HH:MM:SS.uuuuuu if only __os_ctime() is available. + * Both are small enough to use __os_ctime() sized buffer, e.g. 26. + * The other fields (year, day-of-week, ...) are intentionally removed. + * + * PUBLIC: char * __db_ctimespec __P((const db_timespec *, char *)); + */ +char * +__db_ctimespec(timespec, buf) + const db_timespec *timespec; + char *buf; +{ + char *d, date[CTIME_BUFLEN]; +#ifdef HAVE_STRFTIME + struct tm *tm_p; +#ifdef HAVE_LOCALTIME_R + struct tm tm; +#endif +#endif + + /* Print the time readably if possible; else print seconds. 
*/ +#ifdef HAVE_STRFTIME +#ifdef HAVE_LOCALTIME_R + tm_p = localtime_r(&timespec->tv_sec, &tm); +#else + tm_p = localtime(&timespec->tv_sec); +#endif + if (tm_p != NULL) { + d = date; + (void)strftime(d, sizeof(date), DB_GLOBAL(time_format), tm_p); + } + else +#endif + { + /* Trim off the leading day-of-week; then the trailing year. */ + d = __os_ctime(&timespec->tv_sec, date) + 4; + d[sizeof("Jan 01 00:00:00")] = '\0'; + } + (void)snprintf(buf, CTIME_BUFLEN, + "%s.%06lu", d, (u_long)(timespec->tv_nsec / NS_PER_US)); + buf[CTIME_BUFLEN - 1] = '\0'; /* In case of buggy snprintf. */ + return (buf); +} + +/* + * __db_fmt_quote -- + * Copy a printf format string, quoting (doubling) each '%' along the way. + * Use this when inserting a user-defined string into a *printf format. + * If the src parameter is NULL, then quote in-place, shifting the + * rest of the string down by one character for each quote. + * + * PUBLIC: char *__db_fmt_quote __P((char *, size_t, const char *)); + */ +char * +__db_fmt_quote(dest, destsize, src) + char *dest; + size_t destsize; + const char *src; +{ + char *d, *end; + const char *s; + size_t len; + + /* Stop early enough so that dest always has room for a '\0'. */ + end = dest + destsize - 1; + if (src == NULL) { + d = dest; + while ((d = strchr(d, '%')) != NULL && d[1] != '\0') { + /* + * Shift the rest of the string by one byte to make + * space for another '%'. By starting at d and adding 1 + * to the length, we double the '%' while copying the + * string and its terminating '\0'. + */ + len = strlen(d) + 1; + memmove(d + 1, d, len); + /* + * We're done if the string now is larger than the + * reserved size; else advance over both '%'s. + */ + if (d + len >= end) { + DB_ASSERT(NULL, d + len == end); + *end = '\0'; + break; + } + d += 2; + } + } else { + for (s = src, d = dest; *s != '\0' && d < end; d++, s++) + if ((*d = *s) == '%') { + /* Discard a % at the end of the string.
*/ + if (s[1] == '\0') + break; + *++d = '%'; + } + *d = '\0'; + } + return (dest); } diff --git a/src/common/db_getlong.c b/src/common/db_getlong.c index cac55a0e..2dca6891 100644 --- a/src/common/db_getlong.c +++ b/src/common/db_getlong.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/db_idspace.c b/src/common/db_idspace.c index a9cbb1bf..4ac18e42 100644 --- a/src/common/db_idspace.c +++ b/src/common/db_idspace.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/db_log2.c b/src/common/db_log2.c index 9c929f84..42eb7e3a 100644 --- a/src/common/db_log2.c +++ b/src/common/db_log2.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 diff --git a/src/common/db_shash.c b/src/common/db_shash.c index a056e4b1..df862c04 100644 --- a/src/common/db_shash.c +++ b/src/common/db_shash.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/dbt.c b/src/common/dbt.c index 90409f2c..4a9970d9 100644 --- a/src/common/dbt.c +++ b/src/common/dbt.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/mkpath.c b/src/common/mkpath.c index c684692c..163dbfba 100644 --- a/src/common/mkpath.c +++ b/src/common/mkpath.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/openflags.c b/src/common/openflags.c index cec1f081..91d6e51b 100644 --- a/src/common/openflags.c +++ b/src/common/openflags.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/os_method.c b/src/common/os_method.c index 1ee06d7a..34627d59 100644 --- a/src/common/os_method.c +++ b/src/common/os_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/util_arg.c b/src/common/util_arg.c index 73416cb7..f5db1831 100644 --- a/src/common/util_arg.c +++ b/src/common/util_arg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/util_cache.c b/src/common/util_cache.c index 1206940b..f0bc398d 100644 --- a/src/common/util_cache.c +++ b/src/common/util_cache.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/util_log.c b/src/common/util_log.c index d158d3f0..ffe69394 100644 --- a/src/common/util_log.c +++ b/src/common/util_log.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/util_sig.c b/src/common/util_sig.c index 02a0fcb2..b159cc80 100644 --- a/src/common/util_sig.c +++ b/src/common/util_sig.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/common/zerofill.c b/src/common/zerofill.c index 37662ddc..09d0dafe 100644 --- a/src/common/zerofill.c +++ b/src/common/zerofill.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/crypto/aes_method.c b/src/crypto/aes_method.c index 47193539..fed98f2b 100644 --- a/src/crypto/aes_method.c +++ b/src/crypto/aes_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * Some parts of this code originally written by Adam Stubblefield, * -- astubble@rice.edu. diff --git a/src/crypto/crypto.c b/src/crypto/crypto.c index b731496f..ba115dd3 100644 --- a/src/crypto/crypto.c +++ b/src/crypto/crypto.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * Some parts of this code originally written by Adam Stubblefield * -- astubble@rice.edu @@ -15,6 +15,8 @@ #include "dbinc/db_page.h" #include "dbinc/crypto.h" +static void randomize __P((ENV *, void *, size_t)); + /* * __crypto_region_init -- * Initialize crypto. @@ -110,7 +112,7 @@ __crypto_region_init(env) * existing one, we are done with the passwd in the env. We smash * N-1 bytes so that we don't overwrite the nul. */ - memset(dbenv->passwd, 0xff, dbenv->passwd_len-1); + randomize(env, dbenv->passwd, dbenv->passwd_len - 1); __os_free(env, dbenv->passwd); dbenv->passwd = NULL; dbenv->passwd_len = 0; @@ -135,9 +137,10 @@ __crypto_env_close(env) dbenv = env->dbenv; if (dbenv->passwd != NULL) { - memset(dbenv->passwd, 0xff, dbenv->passwd_len-1); + randomize(env, dbenv->passwd, dbenv->passwd_len - 1); __os_free(env, dbenv->passwd); dbenv->passwd = NULL; + dbenv->passwd_len = 0; } if (!CRYPTO_ON(env)) @@ -225,7 +228,8 @@ __crypto_algsetup(env, db_cipher, alg, do_init) /* * __crypto_decrypt_meta -- - * Perform decryption on a metapage if needed. + * Perform decryption on a possible metadata page, if needed. This is used + * to help decide whether this is a real DB. Don't trust random data. 
* * PUBLIC: int __crypto_decrypt_meta __P((ENV *, DB *, u_int8_t *, int)); */ @@ -241,6 +245,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk) DB_CIPHER *db_cipher; size_t pg_off; int ret; + unsigned added_flags; u_int8_t *iv; /* @@ -293,6 +298,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk) */ if (meta->encrypt_alg != 0) { db_cipher = env->crypto_handle; + added_flags = 0; if (!F_ISSET(dbp, DB_AM_ENCRYPT)) { if (!CRYPTO_ON(env)) { __db_errx(env, DB_STR("0178", @@ -300,12 +306,14 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk) return (EINVAL); } /* - * User has a correct, secure env, but has encountered - * a database in that env that is secure, but user - * didn't dbp->set_flags. Since it is existing, use - * encryption if it is that way already. + * User has a correct, secure env and has encountered + * a database in that env that APPEARS TO BE secure, but + * user didn't set the encryption flags. Since the db + * already exists, turn encryption on. Remember what was + * set, so the flags can restored if it doesn't decrypt. */ - F_SET(dbp, DB_AM_ENCRYPT|DB_AM_CHKSUM); + added_flags = DB_AM_ENCRYPT | DB_AM_CHKSUM; + F_SET(dbp, added_flags); } /* * This was checked in set_flags when DB_AM_ENCRYPT was set. 
@@ -316,6 +324,7 @@ __crypto_decrypt_meta(env, dbp, mbuf, do_metachk) meta->encrypt_alg != db_cipher->alg) { __db_errx(env, DB_STR("0179", "Database encrypted using a different algorithm")); + F_CLR(dbp, added_flags); return (EINVAL); } DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM)); @@ -334,12 +343,14 @@ alg_retry: if (!F_ISSET(db_cipher, CIPHER_ANY)) { if (do_metachk && (ret = db_cipher->decrypt(env, db_cipher->data, iv, mbuf + pg_off, - DBMETASIZE - pg_off))) + DBMETASIZE - pg_off))) { + F_CLR(dbp, added_flags); return (ret); - if (((BTMETA *)meta)->crypto_magic != - meta->magic) { + } + if (((BTMETA *)meta)->crypto_magic != meta->magic) { __db_errx(env, DB_STR("0180", "Invalid password")); + F_CLR(dbp, added_flags); return (EINVAL); } /* @@ -409,3 +420,45 @@ __crypto_set_passwd(env_src, env_dest) sh_passwd = R_ADDR(infop, cipher->passwd); return (__env_set_encrypt(env_dest->dbenv, sh_passwd, DB_ENCRYPT_AES)); } + +/* + * randomize + * + */ +static void +randomize(env, base, size) + ENV *env; + void *base; + size_t size; +{ + size_t i, copysize; + u_int8_t last, *p; + u_int32_t value; + + last = ((u_int8_t *)base)[size]; + for (i = 0, p = base; i < size; i += copysize, p += copysize) { + value = __os_random(); + if ((copysize = (size - i)) > sizeof(int32_t)) + copysize = sizeof(int32_t); + switch (copysize) + { + default: + memmove(p, &value, sizeof(int32_t)); + break; + case 3: + p[2] = (u_int8_t)(value >> 16); + /* FALLTHROUGH */ + case 2: + p[1] = (u_int8_t)(value >> 8); + /* FALLTHROUGH */ + case 1: + p[0] = (u_int8_t)(value); + break; + case 0: + DB_ASSERT(env, "randomize size 0?"); + break; + } + + } + DB_ASSERT(env, last == *p); +} diff --git a/src/crypto/mersenne/mt19937db.c b/src/crypto/mersenne/mt19937db.c index 2d53c312..0460b994 100644 --- a/src/crypto/mersenne/mt19937db.c +++ b/src/crypto/mersenne/mt19937db.c @@ -156,7 +156,7 @@ __db_genrand(env) * function will return 4 bytes if we don't send in a key. 
*/ do { - __os_gettime(env, &ts, 1); + __os_gettime(env, &ts, 0); __db_chksum(NULL, (u_int8_t *)&ts.tv_sec, sizeof(ts.tv_sec), NULL, (u_int8_t *)&seed); } while (seed == 0); diff --git a/src/crypto/rijndael/rijndael-api-fst.c b/src/crypto/rijndael/rijndael-api-fst.c index 3fd6489d..5d67937c 100644 --- a/src/crypto/rijndael/rijndael-api-fst.c +++ b/src/crypto/rijndael/rijndael-api-fst.c @@ -56,7 +56,7 @@ __db_makeKey(key, direction, keyLen, keyMaterial) { u8 cipherKey[MAXKB]; - if (key == NULL) { + if (key == NULL || keyMaterial == NULL) { return BAD_KEY_INSTANCE; } @@ -72,9 +72,7 @@ __db_makeKey(key, direction, keyLen, keyMaterial) return BAD_KEY_MAT; } - if (keyMaterial != NULL) { - memcpy(cipherKey, keyMaterial, key->keyLen/8); - } + memcpy(cipherKey, keyMaterial, key->keyLen/8); if (direction == DIR_ENCRYPT) { key->Nr = __db_rijndaelKeySetupEnc(key->rk, cipherKey, keyLen); diff --git a/src/db/crdel.src b/src/db/crdel.src index 70473899..a1cbc0ed 100644 --- a/src/db/crdel.src +++ b/src/db/crdel.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/crdel_rec.c b/src/db/crdel_rec.c index 08e7bae8..2c529627 100644 --- a/src/db/crdel_rec.c +++ b/src/db/crdel_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -81,7 +81,7 @@ __crdel_metasub_recover(env, dbtp, lsnp, op, info) /* * If this was an in-memory database and we are re-creating * and this is the meta-data page, then we need to set up a - * bunch of fields in the dbo as well. + * bunch of fields in the dbp as well. 
*/ if (F_ISSET(file_dbp, DB_AM_INMEM) && argp->pgno == PGNO_BASE_MD && diff --git a/src/db/db.c b/src/db/db.c index 0d9d1e6e..ffeb6d2b 100644 --- a/src/db/db.c +++ b/src/db/db.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -41,6 +41,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc_auto/sequence_ext.h" #include "dbinc/db_page.h" #include "dbinc/db_swap.h" #include "dbinc/btree.h" @@ -92,6 +93,9 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp) if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0) return (ret); + /* Set the creation directory. */ + dbp->dirname = subdbp->dirname; + /* * It's always a btree. * Run in the transaction we've created. @@ -105,6 +109,20 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp) DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE)); /* + * If creating the master database, disable blobs, but assign it a + * blob file id if blobs are enabled in the subdatabase. This means + * that subdatabses can only support blobs if the first subdatabse + * supports blobs. This is a temporary restriction, but is needed at + * the moment to prevent an infinite loop. + */ + dbp->blob_threshold = 0; + if (LF_ISSET(DB_CREATE) && subdbp->blob_threshold != 0) { + if ((ret = __blob_generate_dir_ids( + dbp, txn, &dbp->blob_file_id)) != 0) + return (ret); + } + + /* * If there was a subdb specified, then we only want to apply * DB_EXCL to the subdb, not the actual file. We only got here * because there was a subdb specified. @@ -819,6 +837,21 @@ __db_refresh(dbp, txn, flags, deferred_closep, reuse) if (dbp->mpf == NULL) LF_SET(DB_NOSYNC); +#ifdef HAVE_64BIT_TYPES + /* Close the blob meta data databases. 
*/ + if (dbp->blob_seq != NULL) { + if ((t_ret = __seq_close(dbp->blob_seq, 0)) != 0 && ret == 0) + ret = t_ret; + dbp->blob_seq = NULL; + } + if (dbp->blob_meta_db != NULL) { + if ((t_ret = __db_close( + dbp->blob_meta_db, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + dbp->blob_meta_db = NULL; + } +#endif + /* If never opened, or not currently open, it's easy. */ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) goto never_opened; @@ -1164,6 +1197,10 @@ never_opened: __os_free(dbp->env, dbp->dname); dbp->dname = NULL; } + if (dbp->blob_sub_dir != NULL) { + __os_free(dbp->env, dbp->blob_sub_dir); + dbp->blob_sub_dir = NULL; + } /* Discard any memory used to store returned data. */ if (dbp->my_rskey.data != NULL) @@ -1235,8 +1272,11 @@ __db_disassociate(sdbp) sdbp->s_refcnt = 0; while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) - if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0) - ret = t_ret; + if ((t_ret = __dbc_destroy(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } F_CLR(sdbp, DB_AM_SECONDARY); return (ret); diff --git a/src/db/db.src b/src/db/db.src index 879c7856..4a90ac16 100644 --- a/src/db/db.src +++ b/src/db/db.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_am.c b/src/db/db_am.c index 1cf3a505..84bb04bb 100644 --- a/src/db/db_am.c +++ b/src/db/db_am.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -205,6 +205,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) /* Refresh the DBC structure. 
*/ dbc->dbtype = dbtype; RESET_RET_MEM(dbc); + dbc->db_stream = __dbc_db_stream; dbc->set_priority = __dbc_set_priority; dbc->get_priority = __dbc_get_priority; dbc->priority = dbp->priority; @@ -314,11 +315,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) if (F2_ISSET(dbp, DB2_AM_EXCL)) { F_SET(dbc, DBC_DONTLOCK); if (IS_REAL_TXN(txn)&& !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) { - /* - * Exclusive databases can only have one active - * transaction at a time since there are no internal + /* + * Exclusive databases can only have one active + * transaction at a time since there are no internal * locks to prevent one transaction from reading and - * writing another's uncommitted changes. + * writing another's uncommitted changes. */ if (dbp->cur_txn != NULL && dbp->cur_txn != txn) { __db_errx(env, DB_STR("0749", @@ -332,7 +333,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) memset(&req, 0, sizeof(req)); req.lock = dbp->handle_lock; req.op = DB_LOCK_TRADE; - if ((ret = __lock_vec(env, txn->locker, 0, + if ((ret = __lock_vec(env, txn->locker, 0, &req, 1, 0)) != 0) goto err; dbp->cur_txn = txn; @@ -397,10 +398,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) if (ip != NULL) { dbc->thread_info = ip; #ifdef DIAGNOSTIC - if (dbc->locker != NULL) + if (dbc->locker != NULL) { + dbc->locker->prev_locker = ip->dbth_locker; ip->dbth_locker = R_OFFSET(&(env->lk_handle->reginfo), dbc->locker); - else + } else ip->dbth_locker = INVALID_ROFF; #endif } else if (txn != NULL) diff --git a/src/db/db_backup.c b/src/db/db_backup.c index 66d7382a..1c72e4d7 100644 --- a/src/db/db_backup.c +++ b/src/db/db_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -24,8 +24,9 @@ static int backup_read_data_dir __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t)); static int backup_dir_clean __P((DB_ENV *, const char *, const char *, int *, u_int32_t)); -static int backup_data_copy - __P((DB_ENV *, const char *, const char *, const char *, int)); +static int backup_lgconf_chk __P((DB_ENV *)); +static int __db_backup + __P((DB_ENV *, const char *, DB_THREAD_INFO *, int, u_int32_t)); /* * __db_dbbackup_pp -- @@ -47,9 +48,9 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags) "DB_ENV->dbbackup", flags, DB_EXCL)) != 0) return (ret); ENV_ENTER(dbenv->env, ip); - - ret = __db_dbbackup(dbenv, ip, dbfile, target, flags); - + REPLICATION_WRAP(dbenv->env, + (__db_dbbackup( + dbenv, ip, dbfile, target, flags, 0, NULL)), 0, ret); ENV_LEAVE(dbenv->env, ip); return (ret); } @@ -58,15 +59,17 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags) * __db_dbbackup -- * Copy a database file coordinated with mpool. * - * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, - * PUBLIC: const char *, const char *, u_int32_t)); + * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, + * PUBLIC: const char *, u_int32_t, u_int32_t, const char *)); */ int -__db_dbbackup(dbenv, ip, dbfile, target, flags) +__db_dbbackup(dbenv, ip, dbfile, target, flags, oflags, full_path) DB_ENV *dbenv; DB_THREAD_INFO *ip; const char *dbfile, *target; u_int32_t flags; + u_int32_t oflags; + const char *full_path; { DB *dbp; DB_FH *fp; @@ -77,8 +80,8 @@ __db_dbbackup(dbenv, ip, dbfile, target, flags) retry_count = 0; retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && - (ret = __db_open(dbp, ip, NULL, dbfile, NULL, - DB_UNKNOWN, DB_AUTO_COMMIT | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) { + (ret = __db_open(dbp, ip, NULL, dbfile, NULL, DB_UNKNOWN, + DB_AUTO_COMMIT | DB_RDONLY | oflags, 0, PGNO_BASE_MD)) != 0) { if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) { (void)__db_close(dbp, NULL, DB_NOSYNC); 
dbp = NULL; @@ -91,9 +94,16 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && } } + /* Hot backup requires DB_LOG_BLOB. */ + if (ret == 0 && dbp->blob_threshold != 0 && + (ret = backup_lgconf_chk(dbenv)) != 0) + goto err; + + if (full_path == NULL) + full_path = dbfile; if (ret == 0) { if ((ret = __memp_backup_open(dbenv->env, - dbp->mpf, dbfile, target, flags, &fp, &handle)) == 0) { + dbp->mpf, full_path, target, flags, &fp, &handle)) == 0) { if (dbp->type == DB_HEAP) ret = __heap_backup( dbenv, dbp, ip, fp, handle, flags); @@ -104,10 +114,21 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && fp, handle, flags); } if ((t_ret = __memp_backup_close(dbenv->env, - dbp->mpf, dbfile, fp, handle)) != 0 && ret == 0) + dbp->mpf, full_path, fp, handle)) != 0 && ret == 0) ret = t_ret; } + /* + * Copy blob files. Since no locking is done here, it is possible + * that a blob file may be copied in the middle of being written. + * This is not a problem since hotbackup requires DB_LOG_BLOB and + * catastrophic recovery, which will fix any inconsistances in the + * blob files. + */ + if (ret == 0 && dbp->blob_threshold != 0 && + (t_ret = __blob_copy_all(dbp, target, flags)) != 0) + ret= t_ret; + #ifdef HAVE_QUEUE /* * For compatibility with the 5.2 and patch versions of db_copy @@ -117,7 +138,7 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && ret = __qam_backup_extents(dbp, ip, target, flags); #endif - if (dbp != NULL && +err: if (dbp != NULL && (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) ret = t_ret; @@ -205,8 +226,11 @@ backup_dir_clean(dbenv, backup_dir, log_dir, remove_maxp, flags) /* * backup_data_copy -- * Copy a non-database file into the backup directory. 
+ * + * PUBLIC: int backup_data_copy __P(( + * PUBLIC: DB_ENV *, const char *, const char *, const char *, int)); */ -static int +int backup_data_copy(dbenv, file, from_dir, to_dir, log) DB_ENV *dbenv; const char *file, *from_dir, *to_dir; @@ -352,13 +376,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) ENV *env; FILE *savefile; int fcnt, ret; - size_t cnt; + size_t cnt, len; const char *bd; char **names, buf[DB_MAXPATHLEN], bbuf[DB_MAXPATHLEN]; + char fullpath[DB_MAXPATHLEN]; void (*savecall) (const DB_ENV *, const char *, const char *); env = dbenv->env; memset(bbuf, 0, sizeof(bbuf)); + memset(fullpath, 0, sizeof(fullpath)); + len = 0; bd = backup_dir; if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && dir != env->db_home) { @@ -401,6 +428,12 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) "%s: path too long", "%s"), buf); return (EINVAL); } + /* Save the original dir. */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)) { + (void)snprintf(fullpath, sizeof(fullpath), + "%s%c%c", dir, PATH_SEPARATOR[0], '\0'); + len = strlen(fullpath); + } dir = buf; } /* Get a list of file names. */ @@ -449,7 +482,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) savefile = dbenv->db_errfile; dbenv->db_errfile = NULL; - ret = __db_dbbackup(dbenv, ip, names[cnt], bd, flags); + /* + * If it is not backing up to a single directory, prefix + * the file with 'dir' so that the file and directory structure + * in the source and backup location will be the same. + */ + if (len != 0) + (void)snprintf(fullpath + len, + sizeof(fullpath) - len, "%s%c", names[cnt], '\0'); + ret = __db_dbbackup(dbenv, ip, names[cnt], + backup_dir, flags, 0, len != 0 ? fullpath : NULL); dbenv->db_errcall = savecall; dbenv->db_errfile = savefile; @@ -662,21 +704,22 @@ err: if (logd != dbenv->db_log_dir && logd != env->db_home) * __db_backup -- * Backup databases in the enviornment. 
* - * PUBLIC: int __db_backup __P((DB_ENV *, const char *, u_int32_t)); + * PUBLIC: int __db_backup_pp __P((DB_ENV *, const char *, u_int32_t)); */ int -__db_backup(dbenv, target, flags) +__db_backup_pp(dbenv, target, flags) DB_ENV *dbenv; const char *target; u_int32_t flags; { DB_THREAD_INFO *ip; ENV *env; - int copy_min, remove_max, ret; - char **dir; + u_int32_t bytes; + int remove_max, ret; env = dbenv->env; - remove_max = copy_min = 0; + bytes = 0; + remove_max = 0; #undef OKFLAGS #define OKFLAGS \ @@ -692,6 +735,11 @@ __db_backup(dbenv, target, flags) return (EINVAL); } + /* Hot backup requires DB_LOG_BLOB. */ + if ((ret = __env_get_blob_threshold_int(env, &bytes)) != 0 || + (bytes != 0 && (ret = backup_lgconf_chk(dbenv)) != 0)) + return (ret); + /* * If the target directory for the backup does not exist, create it * with mode read-write-execute for the owner. Ignore errors here, @@ -714,6 +762,30 @@ __db_backup(dbenv, target, flags) } ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__db_backup(dbenv, target, ip, remove_max, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_backup -- + * Backup databases in the enviornment. + */ +static int +__db_backup(dbenv, target, ip, remove_max, flags) + DB_ENV *dbenv; + const char *target; + DB_THREAD_INFO *ip; + int remove_max; + u_int32_t flags; +{ + ENV *env; + int copy_min, ret; + char **dir; + + env = dbenv->env; + copy_min = 0; /* * If the UPDATE option was not specified, copy all database @@ -724,6 +796,19 @@ __db_backup(dbenv, target, flags) goto end; F_SET(dbenv, DB_ENV_HOTBACKUP); if (!LF_ISSET(DB_BACKUP_UPDATE)) { + /* + * Don't allow absolute path of blob directory when + * it is not backing up to a single directory. 
+ */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + dbenv->db_blob_dir != NULL && + __os_abspath(dbenv->db_blob_dir)) { + __db_errx(env, DB_STR_A("0780", +"blob directory '%s' is absolute path, not permitted unless backup is to a single directory", + "%s"), dbenv->db_blob_dir); + ret = EINVAL; + goto err; + } if ((ret = backup_read_data_dir(dbenv, ip, env->db_home, target, flags)) != 0) goto err; @@ -734,8 +819,8 @@ __db_backup(dbenv, target, flags) * enviroment -- running recovery with them would * corrupt the source files. */ - if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) - && __os_abspath(*dir)) { + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + __os_abspath(*dir)) { __db_errx(env, DB_STR_A("0725", "data directory '%s' is absolute path, not permitted unless backup is to a single directory", "%s"), *dir); @@ -751,7 +836,17 @@ __db_backup(dbenv, target, flags) /* * Copy all log files found in the log directory. * The log directory defaults to the home directory. + * Don't allow absolute path of log directory when + * it is not backing up to a single directory. */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + dbenv->db_log_dir != NULL && __os_abspath(dbenv->db_log_dir)) { + __db_errx(env, DB_STR_A("0781", +"log directory '%s' is absolute path, not permitted unless backup is to a single directory", + "%s"), dbenv->db_log_dir); + ret = EINVAL; + goto err; + } if ((ret = backup_read_log_dir(dbenv, target, &copy_min, flags)) != 0) goto err; /* @@ -761,7 +856,7 @@ __db_backup(dbenv, target, flags) * cleanup.
*/ if (LF_ISSET(DB_BACKUP_UPDATE) && remove_max < copy_min && - !(remove_max == 0 && copy_min == 1)) { + remove_max != 0 && copy_min != 1) { __db_errx(env, DB_STR_A("0743", "the largest log file removed (%d) must be greater than or equal the smallest log file copied (%d)", "%d %d"), remove_max, copy_min); @@ -770,6 +865,28 @@ __db_backup(dbenv, target, flags) err: F_CLR(dbenv, DB_ENV_HOTBACKUP); (void)__env_set_backup(env, 0); -end: ENV_LEAVE(env, ip); +end: return (ret); +} + +/* + * __db_backup_fchk -- + * Log configure checking for backup when blob is enabled. + */ +static int +backup_lgconf_chk(dbenv) + DB_ENV *dbenv; +{ + int lgconf, ret; + + ret = 0; + + if (LOGGING_ON(dbenv->env) && ((ret = __log_get_config(dbenv, + DB_LOG_BLOB, &lgconf)) != 0 || lgconf == 0)) { + __db_errx(dbenv->env, DB_STR("0782", + "Hot backup requires DB_LOG_BLOB")); + if (ret == 0) + ret = EINVAL; + } + return (ret); } diff --git a/src/db/db_cam.c b/src/db/db_cam.c index 6ee8b579..1a330bdb 100644 --- a/src/db/db_cam.c +++ b/src/db/db_cam.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -11,6 +11,7 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" +#include "dbinc/fop.h" #include "dbinc/hash.h" #include "dbinc/heap.h" #include "dbinc/lock.h" @@ -83,6 +84,9 @@ __dbc_close(dbc) DB *dbp; DBC *opd; DBC_INTERNAL *cp; +#ifdef DIAGNOSTIC + DB_THREAD_INFO *ip; +#endif DB_TXN *txn; ENV *env; int ret, t_ret; @@ -149,6 +153,14 @@ __dbc_close(dbc) ret = t_ret; F_CLR(dbc, DBC_FAMILY); } +#ifdef DIAGNOSTIC + if (dbc->locker != NULL) { + ENV_GET_THREAD_INFO(env, ip); + if (ip != NULL) + ip->dbth_locker = dbc->locker->prev_locker; + dbc->locker->prev_locker = INVALID_ROFF; + } +#endif if ((txn = dbc->txn) != NULL) txn->cursors--; @@ -510,6 +522,305 @@ __dbc_idel(dbc, flags) return (ret); } +/* + * __dbc_db_stream -- + * + * DBC->db_stream + * + * PUBLIC: int __dbc_db_stream __P((DBC *, DB_STREAM **, u_int32_t)); + */ +int +__dbc_db_stream(dbc, dbsp, flags) + DBC *dbc; + DB_STREAM **dbsp; + u_int32_t flags; +{ + ENV *env; + int ret; + u_int32_t oflags; + + env = dbc->env; + oflags = flags; + + if ((ret = __db_fchk( + env, "DBC->db_stream", flags, + DB_STREAM_READ | DB_STREAM_WRITE | DB_STREAM_SYNC_WRITE)) != 0) + return (ret); + + if (DB_IS_READONLY(dbc->dbp)) { + LF_SET(DB_STREAM_READ); + oflags |= DB_STREAM_READ; + } + if (LF_ISSET(DB_STREAM_READ) && LF_ISSET(DB_STREAM_WRITE)) { + ret = EINVAL; + __db_errx(env, DB_STR("0750", + "Error, cannot set both DB_STREAM_WRITE and DB_STREAM_READ.")); + goto err; + } + + if (oflags & DB_STREAM_READ) + LF_SET(DB_FOP_READONLY); + else + LF_SET(DB_FOP_WRITE); + if (oflags & DB_STREAM_SYNC_WRITE) + LF_SET(DB_FOP_SYNC_WRITE); + + ret = __db_stream_init(dbc, dbsp, flags); + +err: return (ret); +} + +/* + * __dbc_get_blob_id -- + * + * Returns the blob id stored in the data record to which the cursor currently + * points. Returns EINVAL if the cursor does not point to a blob record. 
+ * + * PUBLIC: int __dbc_get_blob_id __P((DBC *, db_seq_t *)); + */ +int +__dbc_get_blob_id(dbc, blob_id) + DBC *dbc; + db_seq_t *blob_id; +{ + DBT key, data; + BBLOB bl; + HBLOB hbl; + HEAPBLOBHDR bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + if (data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bl, data.data, BBLOB_SIZE); + if (B_TYPE(bl.type) != B_BLOB) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)bl.id; + break; + case DB_HEAP: + if (data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE); + if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)bhdr.id; + break; + case DB_HASH: + if (data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&hbl, data.data, HBLOB_SIZE); + if (HPAGE_PTYPE(&hbl) != H_BLOB) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)hbl.id; + break; + default: + ret = EINVAL; + goto err; + } + +err: return (ret); +} + +/* + * __dbc_get_blob_size -- + * + * Returns the blob file size stored in the data record to which the cursor + * currently points. Returns EINVAL if the cursor does not point to a blob + * record. 
+ * + * PUBLIC: int __dbc_get_blob_size __P((DBC *, off_t *)); + */ +int +__dbc_get_blob_size(dbc, size) + DBC *dbc; + off_t *size; +{ + DBT key, data; + ENV *env; + BBLOB bl; + HBLOB hbl; + HEAPBLOBHDR bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + env = dbc->env; + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + if (data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bl, data.data, BBLOB_SIZE); + if (B_TYPE(bl.type) != B_BLOB) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, bl, *size, ret); + break; + case DB_HEAP: + if (data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE); + if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, bhdr, *size, ret); + break; + case DB_HASH: + if (data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&hbl, data.data, HBLOB_SIZE); + if (HPAGE_PTYPE(&hbl) != H_BLOB) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, hbl, *size, ret); + break; + default: + ret = EINVAL; + goto err; + } + +err: return (ret); +} + +/* + * __dbc_set_blob_size -- + * + * Sets the blob file size in the data record to which the cursor + * currently points. Returns EINVAL if the cursor does not point to a blob + * record. 
+ * + * PUBLIC: int __dbc_set_blob_size __P((DBC *, off_t)); + */ +int +__dbc_set_blob_size(dbc, size) + DBC *dbc; + off_t size; +{ + DBT key, data; + BBLOB *bl; + HBLOB *hbl; + HEAPBLOBHDR *bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + bl = (BBLOB *)data.data; + if (bl == NULL || + B_TYPE(bl->type) != B_BLOB || data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE(bl, size, BBLOB); + break; + case DB_HEAP: + bhdr = (HEAPBLOBHDR *)data.data; + if (bhdr == NULL || + !F_ISSET(&bhdr->std_hdr, HEAP_RECBLOB) || + data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE(bhdr, size, HEAPBLOBHDR); + break; + case DB_HASH: + hbl = data.data; + if (hbl == NULL || + HPAGE_PTYPE(hbl) != H_BLOB || data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE((HBLOB *)hbl, size, HBLOB); + break; + default: + ret = EINVAL; + goto err; + } + + if ((ret = __dbc_put(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + +err: return (ret); +} + #ifdef HAVE_COMPRESSION /* * __dbc_bulk_del -- @@ -632,6 +943,12 @@ __dbc_idup(dbc_orig, dbcp, flags) int_n->stream_off = int_orig->stream_off; int_n->stream_curr_pgno = int_orig->stream_curr_pgno; +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) { + if ((ret = __partc_dup(dbc_orig, dbc_n)) != 0) + goto err; + } else +#endif switch (dbc_orig->dbtype) { case 
DB_QUEUE: if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0) @@ -859,7 +1176,11 @@ __dbc_iget(dbc, key, data, flags) * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. If the DB_RMW flag was specified and the get * operation is done in an off-page duplicate tree, call the primary - * cursor's upgrade routine first. + * cursor's upgrade routine first. We fetch the primary tree's data + * page to follow the buffer latching order rules for btrees: latch from + * the top of the main tree down, even when also searching OPD trees. + * Deadlocks could otherwise occur if we need to fetch the main page + * while an OPD page is latched. [#22532] */ cp = dbc->internal; if (cp->opd != NULL && @@ -868,6 +1189,10 @@ __dbc_iget(dbc, key, data, flags) flags == DB_PREV || flags == DB_PREV_DUP)) { if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0) goto err; + if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + goto err; + if (F_ISSET(dbc, DBC_TRANSIENT)) opd = cp->opd; else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0) @@ -1660,7 +1985,7 @@ __dbc_put_secondaries(dbc, tskeyp, &oldpkey, rmw | DB_SET); if (ret == 0) { cmp = __bam_defcmp(sdbp, - &oldpkey, pkey); + &oldpkey, pkey, NULL); __os_ufree(env, oldpkey.data); /* * If the secondary key is unchanged, @@ -1868,7 +2193,7 @@ __dbc_put_primary(dbc, key, data, flags) olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM; ret = __dbc_get(dbc, key, &olddata, DB_SET); if (ret == 0) { - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); goto done; } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) goto err; @@ -2100,7 +2425,7 @@ __dbc_iput(dbc, key, data, flags) if (dbc->dbtype == DB_HASH && F_ISSET( ((BTREE_CURSOR *)(dbc->internal->opd->internal)), C_DELETED)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -2228,7 +2553,7 @@ __dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata) */ for (i = 0, tskeyp = skey; i < 
nskey; i++, tskeyp++) if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, - toldskeyp, tskeyp) == 0) { + toldskeyp, tskeyp, NULL) == 0) { nsame++; F_CLR(tskeyp, DB_DBT_ISSET); break; @@ -2382,12 +2707,14 @@ __dbc_cleanup(dbc, dbc_n, failed) * cursors. */ if (!failed && ret == 0) { + MUTEX_LOCK(dbp->env, dbp->mutex); if (opd != NULL) opd->internal->pdbc = dbc; if (internal->opd != NULL) internal->opd->internal->pdbc = dbc_n; dbc->internal = dbc_n->internal; dbc_n->internal = internal; + MUTEX_UNLOCK(dbp->env, dbp->mutex); } /* @@ -3501,6 +3828,32 @@ __db_check_skeyset(sdbp, skeyp) for (key2 = key1 + 1; key2 < last_key; key2++) DB_ASSERT(env, ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, - key1, key2) != 0); + key1, key2, NULL) != 0); +} +#endif + +#ifdef HAVE_ERROR_HISTORY +/* + * __dbc_diags + * Save the context which triggers the "first notice" of an error code; + * i.e., its creation. It doesn't touch anything when err == 0. + * + * PUBLIC: int __dbc_diags __P((DBC *, int)); + */ + int + __dbc_diags(dbc, err) + DBC *dbc; + int err; +{ + DB_MSGBUF *mb; + + if (err != 0 && dbc->env != NULL && + (mb = __db_deferred_get()) != NULL) { + (void)__db_remember_context(dbc->env, mb, err); + __db_msgadd(dbc->env, mb, "DB: %s:%s\n" , + dbc->dbp->fname == NULL ? "in-mem" : dbc->dbp->fname, + dbc->dbp->dname == NULL ? "" : dbc->dbp->fname); + } + return (err); } #endif diff --git a/src/db/db_cds.c b/src/db/db_cds.c index 185d5487..d3cc990a 100644 --- a/src/db/db_cds.c +++ b/src/db/db_cds.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -43,7 +43,15 @@ static int __cdsgroup_abort(txn) DB_TXN *txn; { - return (__cdsgroup_notsup(txn->mgrp->env, "abort")); + ENV *env; + + env = txn->mgrp->env; + /* + * As the txn handle can not be used any more, we call + * __cdsgroup_commit to release the lock and destroy the handle. + */ + (void)__cdsgroup_commit(txn, 0); + return (__cdsgroup_notsup(env, "abort")); } static int @@ -83,8 +91,16 @@ static int __cdsgroup_discard(txn, flags) DB_TXN *txn; u_int32_t flags; { + ENV *env; + COMPQUIET(flags, 0); - return (__cdsgroup_notsup(txn->mgrp->env, "discard")); + env = txn->mgrp->env; + /* + * As the txn handle can not be used any more, we call + * __cdsgroup_commit to release the lock and destroy the handle. + */ + (void)__cdsgroup_commit(txn, 0); + return (__cdsgroup_notsup(env, "discard")); } static u_int32_t __cdsgroup_id(txn) diff --git a/src/db/db_compact.c b/src/db/db_compact.c index d0f4801e..afe5a997 100644 --- a/src/db/db_compact.c +++ b/src/db/db_compact.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -262,9 +262,11 @@ err: if (txn_local && txn != NULL) { done: if (LF_ISSET(DB_FREE_SPACE)) { DBMETA *meta; db_pgno_t pgno; + int pgs_done; pgno = PGNO_BASE_MD; isdone = 1; + pgs_done = 0; if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) && __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) { isdone = meta->free == PGNO_INVALID; @@ -281,7 +283,8 @@ done: if (LF_ISSET(DB_FREE_SPACE)) { } else #endif if (!isdone) - ret = __bam_truncate_ipages(dbp, ip, txn_orig, c_data); + ret = __bam_truncate_ipages(dbp, + ip, txn_orig, c_data, &pgs_done); /* Clean up the free list. 
*/ if (list != NULL) @@ -387,17 +390,26 @@ err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) #endif /* - * __db_exchange_page -- swap a page with a lower numbered page. - * The routine will optionally free the higher numbered page. The cursor - * has a stack which includes at least the immediate parent of this page. - * PUBLIC: int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int)); + * __db_exchange_page -- try to move a page 'down', to earlier in the file. + * + * This tries to move a page to a lower location the file, by swapping it + * with an earlier free page. The free page comes either from the free list or + * the newpgno parameter (e.g., __ham_compact_hash()). If the new page turns + * out to be higher than the original one, the allocation is undone and + * the caller is left unchanged. After a successful swap, this routine can + * optionally free the old, higher numbered page. + * The cursor's stack includes at least the immediate parent of this page. + * + * PUBLIC: int __db_exchange_page + * PUBLIC: __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *)); */ int -__db_exchange_page(dbc, pgp, opg, newpgno, flags) +__db_exchange_page(dbc, pgp, opg, newpgno, flags, pgs_donep) DBC *dbc; PAGE **pgp, *opg; db_pgno_t newpgno; int flags; + int *pgs_donep; { BTREE_CURSOR *cp; DB *dbp; @@ -445,7 +457,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags) * are allocating at the same time, if so, just put it back. */ if (PGNO(newpage) > PGNO(*pgp)) { - /* Its unfortunate but you can't just free a new overflow. */ + /* It is unfortunate but you can't just free a new overflow. */ + /* XXX Is the above comment still true? */ + /* XXX Should __db_new(OVERFLOW) zero OV_LEN()? 
*/ if (TYPE(newpage) == P_OVERFLOW) OV_LEN(newpage) = 0; if ((ret = __LPUT(dbc, lock)) != 0) @@ -572,7 +586,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags) if ((ret = __TLPUT(dbc, lock)) != 0) return (ret); -done: return (0); +done: + (*pgs_donep)++; + return (0); err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority); (void)__TLPUT(dbc, lock); @@ -584,15 +600,16 @@ err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority); * Walk the pages of an overflow chain and swap out * high numbered pages. We are passed the first page * but only deal with the second and subsequent pages. - * PUBLIC: int __db_truncate_overflow __P((DBC *, - * PUBLIC: db_pgno_t, PAGE **, DB_COMPACT *)); + * PUBLIC: int __db_truncate_overflow __P((DBC *, db_pgno_t, + * PUBLIC: PAGE **, DB_COMPACT *, int *)); */ int -__db_truncate_overflow(dbc, pgno, ppg, c_data) +__db_truncate_overflow(dbc, pgno, ppg, c_data, pgs_donep) DBC *dbc; db_pgno_t pgno; PAGE **ppg; DB_COMPACT *c_data; + int *pgs_donep; { DB *dbp; DB_LOCK lock; @@ -618,7 +635,7 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data) return (ret); if (pgno <= c_data->compact_truncate) continue; - if (have_lock == 0) { + if (!have_lock) { DB_ASSERT(dbp->env, ppg != NULL); ppgno = PGNO(*ppg); if ((ret = __memp_fput(dbp->mpf, dbc->thread_info, @@ -635,30 +652,32 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data) have_lock = 1; } if ((ret = __db_exchange_page(dbc, - &page, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) break; } err: if (page != NULL && - (t_ret = __memp_fput( dbp->mpf, + (t_ret = __memp_fput(dbp->mpf, dbc->thread_info, page, dbc->priority)) != 0 && ret == 0) ret = t_ret; if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) ret = t_ret; return (ret); } + /* * __db_truncate_root -- swap a root page for a lower numbered page. 
* PUBLIC: int __db_truncate_root __P((DBC *, - * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t)); + * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *)); */ int -__db_truncate_root(dbc, ppg, indx, pgnop, tlen) +__db_truncate_root(dbc, ppg, indx, pgnop, tlen, pgs_donep) DBC *dbc; PAGE *ppg; u_int32_t indx; db_pgno_t *pgnop; u_int32_t tlen; + int *pgs_donep; { DB *dbp; DBT orig; @@ -693,7 +712,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen) } else { LOCK_CHECK_OFF(dbc->thread_info); ret = __db_exchange_page(dbc, - &page, NULL, PGNO_INVALID, DB_EXCH_FREE); + &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep); LOCK_CHECK_ON(dbc->thread_info); if (ret != 0) goto err; @@ -705,8 +724,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen) /* Update the reference. */ if (DBC_LOGGING(dbc)) { - if ((ret = __db_pgno_log(dbp, - dbc->txn, &LSN(ppg), 0, PGNO(ppg), + if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(ppg), 0, PGNO(ppg), &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0) goto err; } else @@ -780,13 +798,13 @@ __db_find_free(dbc, type, size, bstart, freep) goto err; if (nelems == 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } for (i = 0; i < nelems; i++) { if (list[i] > bstart) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } start = i; @@ -812,7 +830,7 @@ __db_find_free(dbc, type, size, bstart, freep) goto found; } } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; found: /* We have size range of pages. Remove them. */ @@ -1005,13 +1023,15 @@ err: if (np != NULL && np != otherp) * __db_move_metadata -- move a meta data page to a lower page number. * The meta data page must be exclusively latched on entry. 
* - * PUBLIC: int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *)); + * PUBLIC: int __db_move_metadata + * PUBLIC: __P((DBC *, DBMETA **, DB_COMPACT *, int *)); */ int -__db_move_metadata(dbc, metap, c_data) +__db_move_metadata(dbc, metap, c_data, pgs_donep) DBC *dbc; DBMETA **metap; DB_COMPACT *c_data; + int *pgs_donep; { BTREE *bt; DB *dbp, *mdbp; @@ -1023,7 +1043,7 @@ __db_move_metadata(dbc, metap, c_data) c_data->compact_pages_examine++; if ((ret = __db_exchange_page(dbc, - (PAGE**)metap, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + (PAGE **)metap, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) return (ret); if (PGNO(*metap) == dbp->meta_pgno) diff --git a/src/db/db_conv.c b/src/db/db_conv.c index 210b4d6e..77c6b760 100644 --- a/src/db/db_conv.c +++ b/src/db/db_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -487,8 +487,12 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) { ENV *env; BINTERNAL *bi; + BBLOB *bl; BKEYDATA *bk; BOVERFLOW *bo; + HEAPBLOBHDR *bhdr; + HEAPHDR *hh; + HEAPSPLITHDR *hsh; RINTERNAL *ri; db_indx_t i, *inp, len, tmp; u_int8_t *end, *p, *pgend; @@ -500,8 +504,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) M_32_SWAP(h->lsn.file); M_32_SWAP(h->lsn.offset); M_32_SWAP(h->pgno); - M_32_SWAP(h->prev_pgno); - M_32_SWAP(h->next_pgno); + if (TYPE(h) == P_HEAP) { + M_32_SWAP(((HEAPPG *)h)->high_pgno); + M_16_SWAP(((HEAPPG *)h)->high_indx); + M_16_SWAP(((HEAPPG *)h)->free_indx); + } else { + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + } M_16_SWAP(h->entries); M_16_SWAP(h->hf_offset); } @@ -527,6 +537,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) continue; switch (HPAGE_TYPE(dbp, h, i)) { + case H_BLOB: + p = HBLOB_ID(P_ENTRY(dbp, h, i)); + SWAP64(p); /* id */ + SWAP64(p); /* size */ + p = HBLOB_FILE_ID(P_ENTRY(dbp, h, i)); + SWAP64(p); /* file id */ + SWAP64(p); /* sdb id */ + break; case H_KEYDATA: break; case H_DUPLICATE: @@ -599,6 +617,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) if ((u_int8_t *)bk >= pgend) continue; switch (B_TYPE(bk->type)) { + case B_BLOB: + bl = (BBLOB *)bk; + M_16_SWAP(bl->len); + M_64_SWAP(bl->id); /* id */ + M_64_SWAP(bl->size); /* size */ + M_64_SWAP(bl->file_id); /* file id */ + M_64_SWAP(bl->sdb_id); /* sdb id */ + break; case B_KEYDATA: M_16_SWAP(bk->len); break; @@ -663,6 +689,32 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) } break; case P_HEAP: + for (i = 0; i <= HEAP_HIGHINDX(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + if (inp[i] == 0) + continue; + + hh = (HEAPHDR *)P_ENTRY(dbp, h, i); + if ((u_int8_t *)hh >= pgend) + continue; + M_16_SWAP(hh->size); + if (F_ISSET(hh, HEAP_RECSPLIT)) { + hsh = (HEAPSPLITHDR *)hh; + M_32_SWAP(hsh->tsize); + M_32_SWAP(hsh->nextpg); + M_16_SWAP(hsh->nextindx); + } else if (F_ISSET(hh, HEAP_RECBLOB)) { + bhdr = (HEAPBLOBHDR 
*)hh; + M_64_SWAP(bhdr->id); /* id */ + M_64_SWAP(bhdr->size); /* size */ + M_64_SWAP(bhdr->file_id); /* file id */ + } + + if (!pgin) + M_16_SWAP(inp[i]); + } + break; case P_IHEAP: case P_INVALID: case P_OVERFLOW: @@ -678,8 +730,14 @@ out: if (!pgin) { M_32_SWAP(h->lsn.file); M_32_SWAP(h->lsn.offset); M_32_SWAP(h->pgno); - M_32_SWAP(h->prev_pgno); - M_32_SWAP(h->next_pgno); + if (TYPE(h) == P_HEAP) { + M_32_SWAP(((HEAPPG *)h)->high_pgno); + M_16_SWAP(((HEAPPG *)h)->high_indx); + M_16_SWAP(((HEAPPG *)h)->free_indx); + } else { + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + } M_16_SWAP(h->entries); M_16_SWAP(h->hf_offset); } @@ -718,7 +776,10 @@ __db_pageswap(env, dbp, pp, len, pdata, pgin) case P_HASHMETA: return (__ham_mswap(env, pp)); - +#ifdef HAVE_HEAP + case P_HEAPMETA: + return (__heap_mswap(env, pp)); +#endif case P_QAMMETA: return (__qam_mswap(env, pp)); @@ -794,12 +855,17 @@ __db_recordswap(op, size, hdr, data, pgin) void *hdr, *data; u_int32_t pgin; { + BBLOB *bl; BKEYDATA *bk; BOVERFLOW *bo; BINTERNAL *bi; + DBT *dbt; + HEAPHDR *hh; + HEAPBLOBHDR bhdr; + HEAPSPLITHDR *hsh; RINTERNAL *ri; db_indx_t tmp; - u_int8_t *p, *end; + u_int8_t buf[HEAPBLOBREC_SIZE], *end, *p; if (size == 0) return; @@ -812,6 +878,14 @@ __db_recordswap(op, size, hdr, data, pgin) case B_KEYDATA: M_16_SWAP(bk->len); break; + case B_BLOB: + bl = (BBLOB *)bk; + M_16_SWAP(bl->len); + M_64_SWAP(bl->id); /* id */ + M_64_SWAP(bl->size); /* size */ + M_64_SWAP(bl->file_id); /* file id */ + M_64_SWAP(bl->sdb_id); /* sdb id */ + break; case B_DUPLICATE: case B_OVERFLOW: bo = (BOVERFLOW *)hdr; @@ -835,6 +909,7 @@ __db_recordswap(op, size, hdr, data, pgin) } else bo = (BOVERFLOW *)data; M_32_SWAP(bo->pgno); + M_32_SWAP(bo->tlen); } break; case P_IRECNO: @@ -867,10 +942,10 @@ __db_recordswap(op, size, hdr, data, pgin) SWAP16(p); } break; - /* These two record types include the full header. */ + /* These three record types include the full header. 
*/ case H_OFFDUP: p = (u_int8_t *)hdr; - p += SSZ(HOFFPAGE, pgno); + p += SSZ(HOFFDUP, pgno); SWAP32(p); /* pgno */ break; case H_OFFPAGE: @@ -879,11 +954,61 @@ __db_recordswap(op, size, hdr, data, pgin) SWAP32(p); /* pgno */ SWAP32(p); /* tlen */ break; + case H_BLOB: + p = HBLOB_ID(hdr); + SWAP64(p); /* id */ + SWAP64(p); /* size */ + p = HBLOB_FILE_ID(hdr); + SWAP64(p); /* file id */ + SWAP64(p); /* sdb id */ + break; default: DB_ASSERT(NULL, op != op); } break; - + case P_HEAP: + hh = (HEAPHDR *)hdr; + M_16_SWAP(hh->size); + if (F_ISSET(hh, HEAP_RECSPLIT)) { + hsh = (HEAPSPLITHDR *)hdr; + M_32_SWAP(hsh->tsize); + M_32_SWAP(hsh->nextpg); + M_16_SWAP(hsh->nextindx); + }else if (F_ISSET(hh, HEAP_RECBLOB)) { + /* + * Heap blob records are broken into two parts when + * logged, the shared header and the part that is + * unique to blob records, which is stored in the + * log data field. + */ + if (data != NULL) { + dbt = NULL; + if (pgin) { + dbt = data; + memcpy(buf + sizeof(HEAPHDR), + dbt->data, HEAPBLOBREC_DSIZE); + } else { + memcpy(buf + sizeof(HEAPHDR), + data, HEAPBLOBREC_DSIZE); + } + memcpy(&bhdr, buf, HEAPBLOBREC_SIZE); + M_64_SWAP(bhdr.id); /* id */ + M_64_SWAP(bhdr.size); /* size */ + M_64_SWAP(bhdr.file_id); /* file id */ + memcpy(buf, &bhdr, HEAPBLOBREC_SIZE); + if (pgin) { + memcpy(dbt->data, + HEAPBLOBREC_DATA(buf), + HEAPBLOBREC_DSIZE); + } else { + memcpy(data, + HEAPBLOBREC_DATA(buf), + HEAPBLOBREC_DSIZE); + } + } + break; + } + break; default: DB_ASSERT(NULL, op != op); } diff --git a/src/db/db_copy.c b/src/db/db_copy.c index 359c74be..d9786702 100644 --- a/src/db/db_copy.c +++ b/src/db/db_copy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/db/db_dispatch.c b/src/db/db_dispatch.c index 06de4ef7..7cb7f9ca 100644 --- a/src/db/db_dispatch.c +++ b/src/db/db_dispatch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 @@ -639,7 +639,7 @@ __db_txnlist_find(env, hp, txnid, statusp) DB_TXNLIST *entry; if (txnid == 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); return (__db_txnlist_find_internal(env, hp, TXNLIST_TXNID, txnid, &entry, 0, statusp)); @@ -666,7 +666,7 @@ __db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok) int ret; if (txnid == 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); ret = __db_txnlist_find_internal(env, hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status); @@ -715,7 +715,7 @@ __db_txnlist_find_internal(env, ret = 0; if (hp == NULL) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); switch (type) { case TXNLIST_TXNID: @@ -759,7 +759,7 @@ __db_txnlist_find_internal(env, return (ret); } - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); } /* diff --git a/src/db/db_dup.c b/src/db/db_dup.c index 9fd04791..e66ec92b 100644 --- a/src/db/db_dup.c +++ b/src/db/db_dup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_iface.c b/src/db/db_iface.c index 59e0ba53..da6140a4 100644 --- a/src/db/db_iface.c +++ b/src/db/db_iface.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -159,9 +159,15 @@ __db_associate_arg(dbp, sdbp, callback, flags) env = dbp->env; + if (dbp->blob_threshold || sdbp->blob_threshold) { + __db_errx(env, DB_STR("0751", + "Secondary and primary databases cannot support blobs.")); + return (EINVAL); + } + if (sdbp->type == DB_HEAP) { - __db_errx(env, - "Heap databases may not be used as secondary databases"); + __db_errx(env, DB_STR("0752", + "Heap databases may not be used as secondary databases")); return (EINVAL); } @@ -288,6 +294,7 @@ __db_cursor_pp(dbp, txn, dbcp, flags) int rep_blocked, ret; env = dbp->env; + (*dbcp) = NULL; DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor"); @@ -331,7 +338,8 @@ __db_cursor_pp(dbp, txn, dbcp, flags) * If a family transaction was passed in, the transaction handle in * the cursor may not match. */ - txn = (*dbcp)->txn; + if ((*dbcp) != NULL) + txn = (*dbcp)->txn; if (txn != NULL && ret == 0) TAILQ_INSERT_HEAD(&(txn->my_cursors), *dbcp, txn_cursors); @@ -434,6 +442,13 @@ __db_cursor_arg(dbp, flags) return (__db_fnl(env, "DB->cursor")); } + if (dbp->blob_threshold && + LF_ISSET(DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT)) { + __db_errx(dbp->env, DB_STR("0753", +"Blob enabled databases do not support READ_UNCOMMITTED and TXN_SNAPSHOT.")); + return (EINVAL); + } + LF_CLR(DB_CURSOR_BULK | DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT); @@ -828,6 +843,12 @@ __db_get_arg(dbp, key, data, flags) env = dbp->env; + if (dbp->blob_threshold && LF_ISSET(DB_READ_UNCOMMITTED)) { + __db_errx(env, DB_STR("0754", + "Blob enabled databases do not support DB_READ_UNCOMMITTED.")); + return (EINVAL); + } + /* * Check for read-modify-write validity. 
DB_RMW doesn't make sense * with CDB cursors since if you're going to write the cursor, you @@ -876,6 +897,9 @@ __db_get_arg(dbp, key, data, flags) break; case DB_CONSUME: case DB_CONSUME_WAIT: + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, + "DB->get CONSUME/CONSUME_WAIT")); if (dirty) { __db_errx(env, DB_STR_A("0583", "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT", @@ -1148,6 +1172,13 @@ __db_open_pp(dbp, txn, fname, dname, type, flags, mode) /* Save the current DB handle flags for refresh. */ dbp->orig_flags = dbp->flags; + if (fname == 0 && PREFMAS_IS_SET(env)) { + __db_errx(env, DB_STR("0783", "In-memory databases are not " + "supported in Replication Manager preferred master mode")); + ret = EINVAL; + goto err; + } + /* Check for replication block. */ handle_check = IS_ENV_REPLICATED(env); if (handle_check && @@ -1389,6 +1420,18 @@ __db_open_arg(dbp, txn, fname, dname, type, flags) return (EINVAL); } + if (LF_ISSET(DB_MULTIVERSION) && dbp->blob_threshold) { + __db_errx(env, DB_STR("0755", + "DB_MULTIVERSION illegal with blob enabled databases")); + return (EINVAL); + } + + if (LF_ISSET(DB_READ_UNCOMMITTED) && dbp->blob_threshold) { + __db_errx(env, DB_STR("0756", + "DB_READ_UNCOMMITTED illegal with blob enabled databases")); + return (EINVAL); + } + /* DB_TRUNCATE is neither transaction recoverable nor lockable. */ if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) { __db_errx(env, DB_STR_A("0599", @@ -1901,8 +1944,6 @@ __db_compact_pp(dbp, txn, start, stop, c_data, flags, end) ret = __db_compact_int(dbp, ip, txn, start, stop, dp, flags, end); break; - case DB_HEAP: - break; default: ret = __dbh_am_chk(dbp, DB_OK_BTREE); break; @@ -2893,7 +2934,7 @@ __dbt_ferr(dbp, name, dbt, check_thread) * database, without having to clear flags. 
*/ if ((ret = __db_fchk(env, name, dbt->flags, - DB_DBT_APPMALLOC | DB_DBT_BULK | DB_DBT_DUPOK | + DB_DBT_APPMALLOC | DB_DBT_BLOB | DB_DBT_BULK | DB_DBT_DUPOK | DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_PARTIAL | DB_DBT_READONLY)) != 0) return (ret); diff --git a/src/db/db_join.c b/src/db/db_join.c index 751cf9e2..24d5260e 100644 --- a/src/db/db_join.c +++ b/src/db/db_join.c @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -717,7 +717,6 @@ __db_join_close(dbc) DBC *dbc; { DB *dbp; - DB_THREAD_INFO *ip; ENV *env; JOIN_CURSOR *jc; int ret, t_ret; @@ -737,7 +736,6 @@ __db_join_close(dbc) TAILQ_REMOVE(&dbp->join_queue, dbc, links); MUTEX_UNLOCK(env, dbp->mutex); - ENV_ENTER(env, ip); /* * Close any open scratch cursors. In each case, there may * not be as many outstanding as there are cursors in @@ -757,7 +755,6 @@ __db_join_close(dbc) (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0) ret = t_ret; } - ENV_LEAVE(env, ip); __os_free(env, jc->j_exhausted); __os_free(env, jc->j_curslist); @@ -796,7 +793,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods) int ret, cmp; DB *dbp; DBT ldata; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); dbp = dbc->dbp; func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; @@ -812,7 +809,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods) if ((ret = __dbc_get(dbc, key, &ldata, opmods | DB_CURRENT)) != 0) break; - cmp = func(dbp, data, &ldata); + cmp = func(dbp, data, &ldata, NULL); if (cmp == 0) { /* * We have to return the real data value. 
Copy diff --git a/src/db/db_meta.c b/src/db/db_meta.c index 8f97ebd8..53cf77cc 100644 --- a/src/db/db_meta.c +++ b/src/db/db_meta.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -939,12 +939,14 @@ done: if (last_pgnop != NULL) *last_pgnop = meta->last_pgno; /* - * The truncate point is the number of pages in the free - * list back from the last page. The number of pages - * in the free list are the number that we can swap in. - * Adjust it down slightly so if we find higher numbered - * pages early and then free other pages later we can - * truncate them. + * Set the truncation point which determines which pages may be + * relocated. Pages above are candidates to be swapped with a lower one + * from the freelist by __db_exchange_page(); pages before the truncate + * point are not relocated. + * The truncation point starts as N pages less than the last_pgno, where + * N is the size of the free list. This is reduced by 1/4 in the hope + * that partially full pages will be coalesced together, creating + * additional free pages during the compact. */ if (c_data) { c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems; diff --git a/src/db/db_method.c b/src/db/db_method.c index 82d03e5f..d807bab6 100644 --- a/src/db/db_method.c +++ b/src/db/db_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" @@ -36,14 +37,15 @@ static int __db_set_alloc __P((DB *, void *(*)(size_t), static int __db_get_append_recno __P((DB *, int (**)(DB *, DBT *, db_recno_t))); static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t))); +static int __db_get_blob_dir __P((DB *, const char **)); +static int __db_set_blob_dir __P((DB *, const char *)); +static int __db_get_blob_sub_dir __P((DB *, const char **)); static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *)); static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int)); static int __db_get_create_dir __P((DB *, const char **)); static int __db_set_create_dir __P((DB *, const char *)); static int __db_get_dup_compare - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); -static int __db_set_dup_compare - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); static int __db_get_encrypt_flags __P((DB *, u_int32_t *)); static int __db_set_encrypt __P((DB *, const char *, u_int32_t)); static int __db_get_feedback __P((DB *, void (**)(DB *, int, int))); @@ -90,6 +92,12 @@ db_create(dbpp, dbenv, flags) ip = NULL; env = dbenv == NULL ? NULL : dbenv->env; +#ifdef HAVE_ERROR_HISTORY + /* Call thread local storage initializer at least once per process. */ + if (env == NULL) + __db_thread_init(); +#endif + /* Check for invalid function flags. 
*/ switch (flags) { case 0: @@ -206,12 +214,11 @@ __db_create_internal(dbpp, env, flags) err: if (dbp != NULL) { if (dbp->mpf != NULL) (void)__memp_fclose(dbp->mpf, 0); + if (F_ISSET(env, ENV_DBLOCAL)) + (void)__env_close(dbp->dbenv, 0); __os_free(env, dbp); } - if (dbp != NULL && F_ISSET(env, ENV_DBLOCAL)) - (void)__env_close(dbp->dbenv, 0); - return (ret); } @@ -225,6 +232,7 @@ __db_init(dbp, flags) u_int32_t flags; { int ret; + u_int32_t bytes; dbp->locker = NULL; dbp->alt_close = NULL; @@ -254,6 +262,9 @@ __db_init(dbp, flags) dbp->get_alloc = __db_get_alloc; dbp->get_append_recno = __db_get_append_recno; dbp->get_assoc_flags = __db_get_assoc_flags; + dbp->get_blob_dir = __db_get_blob_dir; + dbp->get_blob_sub_dir = __db_get_blob_sub_dir; + dbp->get_blob_threshold = __db_get_blob_threshold; dbp->get_byteswapped = __db_get_byteswapped; dbp->get_cachesize = __db_get_cachesize; dbp->get_create_dir = __db_get_create_dir; @@ -290,6 +301,8 @@ __db_init(dbp, flags) dbp->rename = __db_rename_pp; dbp->set_alloc = __db_set_alloc; dbp->set_append_recno = __db_set_append_recno; + dbp->set_blob_dir = __db_set_blob_dir; + dbp->set_blob_threshold = __db_set_blob_threshold; dbp->set_cachesize = __db_set_cachesize; dbp->set_create_dir = __db_set_create_dir; dbp->set_dup_compare = __db_set_dup_compare; @@ -316,7 +329,11 @@ __db_init(dbp, flags) dbp->verify = __db_verify_pp; /* DB PUBLIC HANDLE LIST END */ - /* Access method specific. */ + if ((ret = __env_get_blob_threshold_int(dbp->env, &bytes)) != 0) + return (ret); + dbp->blob_threshold = bytes; + + /* Access method specific. */ if ((ret = __bam_db_create(dbp)) != 0) return (ret); if ((ret = __ham_db_create(dbp)) != 0) @@ -535,6 +552,182 @@ __db_set_append_recno(dbp, func) } /* + * __db_get_blob_threshold -- + * Get the current threshold size at which records are stored as blobs. 
+ * + * PUBLIC: int __db_get_blob_threshold __P((DB *, u_int32_t *)); + */ +int +__db_get_blob_threshold(dbp, bytes) + DB *dbp; + u_int32_t *bytes; +{ + /* + * While shared, this value never changes after open, so it is safe + * to access it without mutex protection. + */ + *bytes = dbp->blob_threshold; + + return (0); +} + +/* + * __db_set_blob_threshold -- + * API to allow setting the threshold size at which records are stored + * as blobs rather than in database items. No flags currently supported. + * PUBLIC: int __db_set_blob_threshold __P((DB *, u_int32_t, u_int32_t)); + */ +int +__db_set_blob_threshold(dbp, bytes, flags) + DB *dbp; + u_int32_t bytes; + u_int32_t flags; +{ + if (__db_fchk(dbp->env, "DB->set_blob_threshold", flags, 0) != 0) + return (EINVAL); + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_threshold"); + + if (bytes != 0 && F_ISSET(dbp, + (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_DUP | DB_AM_DUPSORT))) { + __db_errx(dbp->env, DB_STR("0760", +"Cannot enable blobs in databases with checksum, encryption, or duplicates.")); + return (EINVAL); + } +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && bytes != 0) { + __db_errx(dbp->env, DB_STR("0761", + "Cannot enable blobs in databases with compression.")); + return (EINVAL); + } +#endif + + dbp->blob_threshold = bytes; + + return (0); +} + +/* + * __db_blobs_enabled -- + * + * Used to tell if the database is configured to support blobs. + * PUBLIC: int __db_blobs_enabled __P((DB *)); + */ +int +__db_blobs_enabled(dbp) + DB *dbp; +{ + /* Blob threshold must be non-0. */ + if (!dbp->blob_threshold) + return (0); + /* Blobs cannot support encryption or checksum, but that may change. */ + if (F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT))) + return (0); + /* Blobs do not support compression, but that may change. 
*/ +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) + return (0); +#endif + if (dbp->env->dbenv != NULL && + F_ISSET(dbp->env->dbenv, DB_ENV_TXN_SNAPSHOT)) + return (0); + /* Cannot support blobs in recno or queue. */ + if (dbp->type == DB_RECNO || dbp->type == DB_QUEUE) + return (0); + /* + * Cannot support dups because that would require comparing + * blob data items. + */ + if (F_ISSET(dbp, (DB_AM_DUP | DB_AM_DUPSORT))) + return (0); + /* No place to put blob files when using an in-memory db. */ + if (F_ISSET(dbp, (DB_AM_INMEM))) + return (0); + + /* BDB managed databases should not support blobs. */ + if ((dbp->fname != NULL && IS_DB_FILE(dbp->fname)) || + (dbp->dname != NULL && IS_DB_FILE(dbp->dname))) + return (0); + + return (1); +} + +/* + * __db_get_blob_sub_dir -- + * + * Returns the subdirectory of the blob directory in which the blob files + * for the given db are stored, or NULL if there is none. + * + */ +static int +__db_get_blob_sub_dir(dbp, dir) + DB *dbp; + const char **dir; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_blob_sub_dir"); + + *dir = dbp->blob_sub_dir; + + return (0); +} + +/* + * __db_get_blob_dir -- + * + * Get the blob directory for this database. + */ +static int +__db_get_blob_dir(dbp, dir) + DB *dbp; + const char **dir; +{ + DB_ENV *dbenv; + ENV *env; + + env = dbp->env; + dbenv = dbp->env->dbenv; + *dir = NULL; + + if (dbenv == NULL) + return (0); + + if (dbenv->db_blob_dir != NULL) + *dir = dbenv->db_blob_dir; + else if (env->db_home != NULL) + *dir = BLOB_DEFAULT_DIR; + + return (0); +} + +/* + * __db_set_blob_dir -- + * + * Set the blob directory in a local environment. 
+ */ +static int +__db_set_blob_dir(dbp, dir) + DB *dbp; + const char *dir; +{ + DB_ENV *dbenv; + ENV *env; + + DB_ILLEGAL_IN_ENV(dbp, "DB->set_blob_dir"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_dir"); + env = dbp->env; + dbenv = dbp->env->dbenv; + + if (dbenv == NULL) + return (0); + + if (dbenv->db_blob_dir != NULL) + __os_free(env, dbenv->db_blob_dir); + dbenv->db_blob_dir = NULL; + + return (__os_strdup(env, dir, &dbenv->db_blob_dir)); +} + +/* * __db_get_cachesize -- * Get underlying cache size. */ @@ -607,7 +800,7 @@ __db_get_create_dir(dbp, dirp) static int __db_get_dup_compare(dbp, funcp) DB *dbp; - int (**funcp) __P((DB *, const DBT *, const DBT *)); + int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *)); { DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); @@ -628,11 +821,14 @@ __db_get_dup_compare(dbp, funcp) /* * __db_set_dup_compare -- * Set duplicate comparison routine. + * + * PUBLIC: int __db_set_dup_compare __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *))); */ -static int +int __db_set_dup_compare(dbp, func) DB *dbp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); { int ret; @@ -900,6 +1096,13 @@ __db_set_flags(dbp, flags) ENV_REQUIRES_CONFIG(env, env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN); + if (dbp->blob_threshold && + LF_ISSET(DB_CHKSUM | DB_ENCRYPT | DB_DUP | DB_DUPSORT)) { + __db_errx(dbp->env, DB_STR("0763", +"Cannot enable checksum, encryption, or duplicates with blob support.")); + return (EINVAL); + } + __db_map_flags(dbp, &flags, &dbp->flags); if ((ret = __bam_set_flags(dbp, &flags)) != 0) diff --git a/src/db/db_open.c b/src/db/db_open.c index fefda48f..21074b15 100644 --- a/src/db/db_open.c +++ b/src/db/db_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. 
All rights reserved. * * $Id$ */ @@ -119,6 +119,15 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno) goto err; /* + * Silently disabled blobs in databases that cannot support them. + * Most illegal configurations will have already been caught, this + * is to allow a user to set an environment wide blob threshold, but + * not have to explicitly turn it off for in-memory or queue databases. + */ + if (!__db_blobs_enabled(dbp)) + dbp->blob_threshold = 0; + + /* * If both fname and subname are NULL, it's always a create, so make * sure that we have both DB_CREATE and a type specified. It would * be nice if this checking were done in __db_open where most of the @@ -259,6 +268,11 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno) if (ret != 0) goto err; + if (dbp->blob_file_id != 0) + if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir, + dbp->blob_file_id, dbp->blob_sdb_id)) != 0) + goto err; + #ifdef HAVE_PARTITION if (dbp->p_internal != NULL && (ret = __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0) @@ -432,8 +446,10 @@ err: return (ret); /* * __db_chk_meta -- - * Take a buffer containing a meta-data page and check it for a valid LSN, - * checksum (and verify the checksum if necessary) and possibly decrypt it. + * Validate a buffer containing a possible meta-data page. It is + * byte-swapped as necessary and checked for having a valid magic number. + * If it does, then it can validate the LSN, checksum (if necessary), + * and possibly decrypt it. * * Return 0 on success, >0 (errno). * @@ -447,44 +463,64 @@ __db_chk_meta(env, dbp, meta, flags) u_int32_t flags; { DB_LSN swap_lsn; - int is_hmac, ret, swapped; - u_int32_t magic, orig_chk; + int is_hmac, needs_swap, ret; + u_int32_t magic; u_int8_t *chksum; ret = 0; - swapped = 0; + needs_swap = 0; + /* + * We can verify that this is some kind of db now, before any potential + * decryption, because the first P_OVERHEAD() bytes of most pages are + * cleartext. 
This gets called both before and after swapping, so we + * need to check for byte swapping ourselves. + */ + magic = meta->magic; +magic_retry: + switch (magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_HEAPMAGIC: + case DB_QAMMAGIC: + case DB_RENAMEMAGIC: + break; + default: + if (needs_swap) + /* It's already been swapped, so it isn't a BDB file. */ + return (EINVAL); + M_32_SWAP(magic); + needs_swap = 1; + goto magic_retry; + } + + if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) { + swap_lsn = meta->lsn; + if (needs_swap) { + M_32_SWAP(swap_lsn.file); + M_32_SWAP(swap_lsn.offset); + } + if (!IS_REP_CLIENT(env) && !IS_NOT_LOGGED_LSN(swap_lsn) && + !IS_ZERO_LSN(swap_lsn) && (ret = + __log_check_page_lsn(env, dbp, &swap_lsn)) != 0) + return (ret); + } if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) { if (dbp != NULL) F_SET(dbp, DB_AM_CHKSUM); - - is_hmac = meta->encrypt_alg == 0 ? 0 : 1; - chksum = ((BTMETA *)meta)->chksum; - - /* - * If we need to swap, the checksum function overwrites the - * original checksum with 0, so we need to save a copy of the - * original for swapping later. - */ - orig_chk = *(u_int32_t *)chksum; - /* * We cannot add this to __db_metaswap because that gets done * later after we've verified the checksum or decrypted. 
*/ if (LF_ISSET(DB_CHK_META)) { - swapped = 0; -chk_retry: if ((ret = + is_hmac = meta->encrypt_alg != 0; + chksum = ((BTMETA *)meta)->chksum; + if (needs_swap && !is_hmac) + M_32_SWAP(*(u_int32_t *)chksum); + if ((ret = __db_check_chksum(env, NULL, env->crypto_handle, - chksum, meta, DBMETASIZE, is_hmac)) != 0) { - if (is_hmac || swapped) - return (DB_CHKSUM_FAIL); - - M_32_SWAP(orig_chk); - swapped = 1; - *(u_int32_t *)chksum = orig_chk; - goto chk_retry; - } + chksum, meta, DBMETASIZE, is_hmac)) != 0) + return (DB_CHKSUM_FAIL); } } else if (dbp != NULL) F_CLR(dbp, DB_AM_CHKSUM); @@ -492,44 +528,8 @@ chk_retry: if ((ret = #ifdef HAVE_CRYPTO if (__crypto_decrypt_meta(env, dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)) != 0) - ret = DB_CHKSUM_FAIL; - else + ret = DB_CHKSUM_FAIL; #endif - - /* Now that we're decrypted, we can check LSN. */ - if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) { - /* - * This gets called both before and after swapping, so we - * need to check ourselves. If we already swapped it above, - * we'll know that here. - */ - - swap_lsn = meta->lsn; - magic = meta->magic; -lsn_retry: - if (swapped) { - M_32_SWAP(swap_lsn.file); - M_32_SWAP(swap_lsn.offset); - M_32_SWAP(magic); - } - switch (magic) { - case DB_BTREEMAGIC: - case DB_HASHMAGIC: - case DB_HEAPMAGIC: - case DB_QAMMAGIC: - case DB_RENAMEMAGIC: - break; - default: - if (swapped) - return (EINVAL); - swapped = 1; - goto lsn_retry; - } - if (!IS_REP_CLIENT(env) && - !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn)) - /* Need to do check. */ - ret = __log_check_page_lsn(env, dbp, &swap_lsn); - } return (ret); } @@ -598,7 +598,6 @@ swap_retry: } /* - * We can only check the meta page if we are sure we have a meta page. * If it is random data, then this check can fail. So only now can we * checksum and decrypt. 
Don't distinguish between configuration and * checksum match errors here, because we haven't opened the database @@ -606,9 +605,9 @@ swap_retry: * If DB_SKIP_CHK is set, it means the checksum was already checked * and the page was already decrypted. */ - if (!LF_ISSET(DB_SKIP_CHK) && + if (!LF_ISSET(DB_SKIP_CHK) && (ret = __db_chk_meta(env, dbp, meta, flags)) != 0) { - if (ret == DB_CHKSUM_FAIL) + if (ret == DB_CHKSUM_FAIL) __db_errx(env, DB_STR_A("0640", "%s: metadata page checksum error", "%s"), name); goto bad_format; @@ -669,10 +668,9 @@ swap_retry: } if (FLD_ISSET(meta->metaflags, - DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) - if ((ret = - __partition_init(dbp, meta->metaflags)) != 0) - return (ret); + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && + (ret = __partition_init(dbp, meta->metaflags)) != 0) + return (ret); return (0); bad_format: diff --git a/src/db/db_overflow.c b/src/db/db_overflow.c index d992ec0d..22f349ed 100644 --- a/src/db/db_overflow.c +++ b/src/db/db_overflow.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -58,39 +58,26 @@ */ /* - * __db_goff -- - * Get an offpage item. + * __db_alloc_dbt * - * PUBLIC: int __db_goff __P((DBC *, - * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); + * Allocate enough space in the dbt to hold the data. Also used by the + * blob file API. 
+ * + * PUBLIC: int __db_alloc_dbt __P((ENV *, DBT *, u_int32_t, u_int32_t *, + * PUBLIC: u_int32_t *, void **, u_int32_t *)); */ int -__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) - DBC *dbc; +__db_alloc_dbt(env, dbt, tlen, nd, st, bpp, bpsz) + ENV *env; DBT *dbt; u_int32_t tlen; - db_pgno_t pgno; + u_int32_t *nd; + u_int32_t *st; void **bpp; u_int32_t *bpsz; { - DB *dbp; - DB_MPOOLFILE *mpf; - DB_TXN *txn; - DBC_INTERNAL *cp; - ENV *env; - PAGE *h; - DB_THREAD_INFO *ip; - db_indx_t bytes; - u_int32_t curoff, needed, start; - u_int8_t *p, *src; int ret; - - dbp = dbc->dbp; - cp = dbc->internal; - env = dbp->env; - ip = dbc->thread_info; - mpf = dbp->mpf; - txn = dbc->txn; + u_int32_t needed, start; /* * Check if the buffer is big enough; if it is not and we are @@ -110,6 +97,8 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) start = 0; needed = tlen; } + *nd = needed; + *st = start; /* * If the caller has not requested any data, return success. This @@ -123,7 +112,7 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) } if (F_ISSET(dbt, DB_DBT_USERCOPY)) - goto skip_alloc; + return (0); /* Allocate any necessary memory. */ if (F_ISSET(dbt, DB_DBT_USERMEM)) { @@ -152,7 +141,48 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) return (DB_BUFFER_SMALL); } -skip_alloc: + return (0); +} + +/* + * __db_goff -- + * Get an offpage item. 
+ * + * PUBLIC: int __db_goff __P((DBC *, + * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); + */ +int +__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) + DBC *dbc; + DBT *dbt; + u_int32_t tlen; + db_pgno_t pgno; + void **bpp; + u_int32_t *bpsz; +{ + DB *dbp; + DB_MPOOLFILE *mpf; + DB_TXN *txn; + DBC_INTERNAL *cp; + ENV *env; + PAGE *h; + DB_THREAD_INFO *ip; + db_indx_t bytes; + u_int32_t curoff, needed, start; + u_int8_t *p, *src; + int ret; + + dbp = dbc->dbp; + cp = dbc->internal; + env = dbp->env; + ip = dbc->thread_info; + mpf = dbp->mpf; + txn = dbc->txn; + + if (((ret = __db_alloc_dbt( + env, dbt, tlen, &needed, &start, bpp, bpsz)) != 0) || needed == 0) + return (ret); + /* Set up a start page in the overflow chain if streaming. */ if (cp->stream_start_pgno != PGNO_INVALID && pgno == cp->stream_start_pgno && start >= cp->stream_off && @@ -485,28 +515,33 @@ __db_doff(dbc, pgno) /* * __db_moff -- - * Match on overflow pages. + * Match on overflow pages from a specific offset. * - * Given a starting page number and a key, return <0, 0, >0 to indicate if the - * key on the page is less than, equal to or greater than the key specified. - * We optimize this by doing chunk at a time comparison unless the user has - * specified a comparison function. In this case, we need to materialize - * the entire object and call their comparison routine. + * Given a starting page number and a key, store <0, 0, >0 in 'cmpp' to indicate + * if the key on the page is less than, equal to or greater than the key + * specified. We optimize this by doing a chunk at a time comparison unless the + * user has specified a comparison function. In this case, we need to + * materialize the entire object and call their comparison routine. + * + * We start the comparison at an offset and update the offset with the + * longest matching count after the comparison. * * __db_moff and __db_coff are generic functions useful in searching and * ordering off page items. 
__db_moff matches an overflow DBT with an offpage * item. __db_coff compares two offpage items for lexicographic sort order. * * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), + * PUBLIC: int *, size_t *)); */ int -__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) +__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp, locp) DBC *dbc; const DBT *dbt; db_pgno_t pgno; u_int32_t tlen; - int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp; + size_t *locp; { DB *dbp; DBT local_dbt; @@ -517,6 +552,7 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) u_int32_t bufsize, cmp_bytes, key_left; u_int8_t *p1, *p2; int ret; + size_t pos, start; dbp = dbc->dbp; ip = dbc->thread_info; @@ -535,39 +571,76 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) &local_dbt, tlen, pgno, &buf, &bufsize)) != 0) return (ret); /* Pass the key as the first argument */ - *cmpp = cmpfunc(dbp, dbt, &local_dbt); + *cmpp = cmpfunc(dbp, dbt, &local_dbt, NULL); __os_free(dbp->env, buf); return (0); } + /* + * We start the comparison from the location of 'locp' and store the + * last matching location into 'locp'. + */ + start = (locp == NULL ? 0 : *locp); + pos = 0; + + /* Subtract prefix length from lengths. */ + tlen -= (u_int32_t)start; + key_left = dbt->size - (u_int32_t)start; + p1 = (u_int8_t *)dbt->data + start; + /* While there are both keys to compare. */ - for (*cmpp = 0, p1 = dbt->data, - key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { + for (*cmpp = 0; key_left > 0 && + tlen > 0 && pgno != PGNO_INVALID;) { if ((ret = __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0) return (ret); - cmp_bytes = OV_LEN(pagep) < key_left ? 
OV_LEN(pagep) : key_left; - tlen -= cmp_bytes; - key_left -= cmp_bytes; - for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); - cmp_bytes-- > 0; ++p1, ++p2) - if (*p1 != *p2) { - *cmpp = (long)*p1 - (long)*p2; - break; + /* + * Figure out where to start comparison, and how many + * bytes to compare. + */ + if (pos >= start) { + p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); + cmp_bytes = OV_LEN(pagep); + } else if (pos + OV_LEN(pagep) > start) { + p2 = (u_int8_t *)pagep + + P_OVERHEAD(dbp) + (start - pos); + cmp_bytes = OV_LEN(pagep) - (u_int32_t)(start - pos); + } else { + p2 = NULL; + cmp_bytes = 0; + } + + pos += OV_LEN(pagep); + + if (cmp_bytes != 0) { + if (cmp_bytes > key_left) + cmp_bytes = key_left; + tlen -= cmp_bytes; + key_left -= cmp_bytes; + for (;cmp_bytes-- > 0; ++p1, ++p2) { + if (*p1 != *p2) { + *cmpp = (long)*p1 - (long)*p2; + break; + } + if (locp != NULL) + ++(*locp); } + + } pgno = NEXT_PGNO(pagep); if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0) return (ret); if (*cmpp != 0) return (0); } - if (key_left > 0) /* DBT is longer than the page key. */ - *cmpp = 1; - else if (tlen > 0) /* DBT is shorter than the page key. */ - *cmpp = -1; - else - *cmpp = 0; + + if (*cmpp == 0) { + if (key_left > 0) /* DBT is longer than the page key. */ + *cmpp = 1; + else if (tlen > 0) /* DBT is shorter than the page key. */ + *cmpp = -1; + } return (0); } @@ -587,13 +660,13 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) * DBT type. 
* * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), int *)); */ int __db_coff(dbc, dbt, match, cmpfunc, cmpp) DBC *dbc; const DBT *dbt, *match; - int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp; { DB *dbp; DB_THREAD_INFO *ip; @@ -643,7 +716,7 @@ __db_coff(dbc, dbt, match, cmpfunc, cmpp) match_pgno, &match_buf, &match_bufsz)) != 0) goto err1; /* The key needs to be the first argument for sort order */ - *cmpp = cmpfunc(dbp, &local_key, &local_match); + *cmpp = cmpfunc(dbp, &local_key, &local_match, NULL); err1: if (dbt_buf != NULL) __os_free(dbp->env, dbt_buf); @@ -657,6 +730,7 @@ err1: if (dbt_buf != NULL) if ((ret = __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0) return (ret); + DB_ASSERT(dbc->env, TYPE(dbt_pagep) == P_OVERFLOW); if ((ret = __memp_fget(mpf, &match_pgno, ip, txn, 0, &match_pagep)) != 0) { @@ -664,6 +738,7 @@ err1: if (dbt_buf != NULL) mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED); return (ret); } + DB_ASSERT(dbc->env, TYPE(match_pagep) == P_OVERFLOW); cmp_bytes = page_space < max_data ? page_space : max_data; for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp), p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp); diff --git a/src/db/db_ovfl_vrfy.c b/src/db/db_ovfl_vrfy.c index fa630f7b..55eb2b70 100644 --- a/src/db/db_ovfl_vrfy.c +++ b/src/db/db_ovfl_vrfy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 diff --git a/src/db/db_pr.c b/src/db/db_pr.c index d95440f9..4933498e 100644 --- a/src/db/db_pr.c +++ b/src/db/db_pr.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -11,6 +11,7 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" +#include "dbinc/fop.h" #include "dbinc/hash.h" #include "dbinc/heap.h" #include "dbinc/mp.h" @@ -25,6 +26,11 @@ static int __db_hmeta __P((ENV *, DB *, HMETA *, u_int32_t)); static void __db_meta __P((ENV *, DB *, DBMETA *, FN const *, u_int32_t)); static void __db_proff __P((ENV *, DB_MSGBUF *, void *)); static int __db_qmeta __P((ENV *, DB *, QMETA *, u_int32_t)); +static int __db_prblob __P((DBC *, DBT *, DBT *, int, const char *, + void *, int (*callback) __P((void *, const void *)), int, int)); +static int __db_prblob_id __P((DB *, db_seq_t, + off_t, DBT *, int, const char *, void *, + int (*callback) __P((void *, const void *)))); #ifdef HAVE_STATISTICS static void __db_prdb __P((DB *, u_int32_t)); static int __db_prtree __P((DB *, DB_TXN *, @@ -515,6 +521,11 @@ __db_bmeta(env, dbp, h, flags) __db_msg(env, "\tre_len: %#lx re_pad: %#lx", (u_long)h->re_len, (u_long)h->re_pad); __db_msg(env, "\troot: %lu", (u_long)h->root); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); + __db_msg(env, "\tblob_sdb_lo: %lu", (u_long)h->blob_sdb_lo); + __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi); return (0); } @@ -549,6 +560,11 @@ __db_hmeta(env, dbp, h, flags) __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor); __db_msg(env, "\tnelem: %lu", (u_long)h->nelem); __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); + __db_msg(env, "\tblob_sdb_lo: %lu", 
(u_long)h->blob_sdb_lo); + __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi); __db_msgadd(env, &mb, "\tspare points:\n\t"); for (i = 0; i < NCACHED; i++) { __db_msgadd(env, &mb, "%lu (%lu) ", (u_long)h->spares[i], @@ -604,6 +620,9 @@ __db_heapmeta(env, dbp, h, flags) __db_msg(env, "\tnregions: %lu", (u_long)h->nregions); __db_msg(env, "\tgbytes: %lu", (u_long)h->gbytes); __db_msg(env, "\tbytes: %lu", (u_long)h->bytes); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); return (0); } @@ -682,14 +701,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) { BINTERNAL *bi; BKEYDATA *bk; + BBLOB bl; HOFFPAGE a_hkd; + HBLOB hblob; QAMDATA *qp, *qep; RINTERNAL *ri; HEAPHDR *hh; HEAPSPLITHDR *hs; + HEAPBLOBHDR bhdr; db_indx_t dlen, len, i, *inp, max; db_pgno_t pgno; db_recno_t recno; + off_t blob_size; + db_seq_t blob_id; u_int32_t qlen; u_int8_t *ep, *hk, *p; int deleted, ret; @@ -899,6 +923,23 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) (u_long)a_hkd.tlen, (u_long)a_hkd.pgno); DB_MSGBUF_FLUSH(env, mbp); break; + case H_BLOB: + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + /* + * No point printing the blob file, it is + * likely not readable by humans. + */ + DB_MSGBUF_FLUSH(env, mbp); + break; default: DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu", @@ -925,6 +966,7 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) __db_proff(env, mbp, bi->data); break; default: + /* B_BLOB does not appear on internal pages. 
*/ DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu", (u_long)B_TYPE(bi->type)); @@ -950,6 +992,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) case B_OVERFLOW: __db_proff(env, mbp, bk); break; + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + DB_MSGBUF_FLUSH(env, mbp); + break; default: DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, @@ -961,9 +1016,27 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) break; case P_HEAP: hh = sp; - if (!F_ISSET(hh,HEAP_RECSPLIT)) + if (!F_ISSET(hh,HEAP_RECSPLIT) && + !F_ISSET(hh, HEAP_RECBLOB)) hdata = (u_int8_t *)hh + sizeof(HEAPHDR); - else { + else if (F_ISSET(hh, HEAP_RECBLOB)) { + memcpy(&bhdr, hh, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, bhdr, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + /* + * No point printing the blob file, it is + * likely not readable by humans. 
+ */ + DB_MSGBUF_FLUSH(env, mbp); + break; + } else { hs = sp; __db_msgadd(env, mbp, "split: 0x%02x tsize: %lu next: %lu.%lu ", @@ -1276,10 +1349,16 @@ __db_dump(dbp, subname, callback, handle, pflag, keyflag) ENV *env; db_recno_t recno; int is_recno, is_heap, ret, t_ret; + u_int32_t blob_threshold; void *pointer; env = dbp->env; is_heap = 0; + memset(&dataret, 0, sizeof(DBT)); + memset(&keyret, 0, sizeof(DBT)); + + if ((ret = __db_get_blob_threshold(dbp, &blob_threshold)) != 0) + return (ret); if ((ret = __db_prheader( dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0) @@ -1317,8 +1396,8 @@ retry: while ((ret = !is_heap ? DB_NEXT | DB_MULTIPLE_KEY : DB_NEXT )) == 0) { if (is_heap) { /* Never dump keys for HEAP */ - if ((ret = __db_prdbt( - &data, pflag, " ", handle, callback, 0, 0)) != 0) + if ((ret = __db_prdbt(&data, + pflag, " ", handle, callback, 0, 0, 0)) != 0) goto err; continue; } @@ -1337,17 +1416,24 @@ retry: while ((ret = if ((keyflag && (ret = __db_prdbt(&keyret, pflag, " ", - handle, callback, is_recno, 0)) != 0) || + handle, callback, is_recno, 0, 0)) != 0) || (ret = __db_prdbt(&dataret, pflag, " ", - handle, callback, 0, 0)) != 0) + handle, callback, 0, 0, 0)) != 0) goto err; } } if (ret == DB_BUFFER_SMALL) { - data.size = (u_int32_t)DB_ALIGN(data.size, 1024); - if ((ret = __os_realloc(env, data.size, &data.data)) != 0) - goto err; - data.ulen = data.size; + if (blob_threshold != 0 && data.size >= blob_threshold) { + if ((ret = __db_prblob(dbcp, &key, &data, pflag, + " ", handle, callback, is_heap, keyflag)) != 0) + goto err; + } else { + data.size = (u_int32_t)DB_ALIGN(data.size, 1024); + if ((ret = __os_realloc( + env, data.size, &data.data)) != 0) + goto err; + data.ulen = data.size; + } goto retry; } if (ret == DB_NOTFOUND) @@ -1365,14 +1451,153 @@ err: if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0) } /* + * __db_prblob + * Print a blob file. 
+ */
+static int
+__db_prblob(dbc, key, data, checkprint,
+    prefix, handle, callback, is_heap, keyflag)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_heap;
+	int keyflag;
+{
+	DBC *local;
+	DBT partial;
+	int ret, t_ret;
+	off_t blob_size;
+	db_seq_t blob_id;
+
+	local = NULL;
+	/*
+	 * "partial" is a DB_DBT_PARTIAL DBT whose offset and length are left
+	 * at zero; it is only used to step cursors with DB_NEXT.
+	 */
+	memset(&partial, 0, sizeof(DBT));
+	partial.flags = DB_DBT_PARTIAL;
+
+	/* Work on a duplicate so dbc is not moved until the blob is printed. */
+	if ((ret = __dbc_idup(dbc, &local, DB_POSITION)) != 0)
+		goto err;
+
+	/*
+	 * Move the duplicate cursor to the blob.  Leave through err on
+	 * failure so the duplicate is closed instead of being leaked.
+	 */
+	if ((ret = __dbc_get(local, key, &partial, DB_NEXT)) != 0)
+		goto err;
+
+	if ((ret = __dbc_get_blob_id(local, &blob_id)) != 0) {
+		/*
+		 * It is possible this is not a blob.  Non-blob items that are
+		 * larger than the blob threshold can exist if the item was
+		 * smaller than the threshold when created, then later updated
+		 * to larger than the threshold value.  In that case grow the
+		 * caller's buffer and return without printing anything.
+		 */
+		if (ret == EINVAL) {
+			data->size = (u_int32_t)DB_ALIGN(data->size, 1024);
+			if ((ret = __os_realloc(
+			    dbc->env, data->size, &data->data)) != 0)
+				goto err;
+			data->ulen = data->size;
+		}
+		goto err;
+	}
+
+	/*
+	 * The blob file is read through the caller's buffer; make sure it
+	 * holds at least MEGABYTE bytes.  Use __os_realloc, as above, and
+	 * record the new length only after the allocation has succeeded.
+	 */
+	if (data->ulen < MEGABYTE) {
+		if ((ret = __os_realloc(
+		    dbc->env, MEGABYTE, &data->data)) != 0)
+			goto err;
+		data->ulen = MEGABYTE;
+	}
+
+	if ((ret = __dbc_get_blob_size(local, &blob_size)) != 0)
+		goto err;
+
+	/* Print the key first when keys were requested; never for heap. */
+	if (keyflag && !is_heap && (ret = __db_prdbt(
+	    key, checkprint, " ", handle, callback, 0, 0, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_prblob_id(local->dbp, blob_id, blob_size,
+	    data, checkprint, prefix, handle, callback)) != 0)
+		goto err;
+
+	/* Move the cursor. */
+	ret = __dbc_get(dbc, key, &partial, DB_NEXT);
+
+	/* Close the duplicate on every path; keep the first error seen. */
+err:	if (local != NULL) {
+		if ((t_ret = __dbc_close(local)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_prblob_id --
+ *	Print a blob file identified by the given id.
+ */
+static int
+__db_prblob_id(dbp, blob_id,
+    blob_size, data, checkprint, prefix, handle, callback)
+	DB *dbp;
+	db_seq_t blob_id;
+	off_t blob_size;
+	DBT *data;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+{
+	DB_FH *fhp;
+	const char *pre;
+	int ret, skip_newline, t_ret;
+	off_t left, offset;
+
+	fhp = NULL;
+	offset = 0;
+
+	if ((ret = __blob_file_open(
+	    dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0)
+		goto err;
+
+	/*
+	 * Print the blob in chunks of at most data->ulen bytes.  Only the
+	 * first chunk is passed the prefix, and the newline is suppressed for
+	 * every chunk that still has data following it.
+	 */
+	left = blob_size;
+	while (left > 0) {
+		if ((ret = __blob_file_read(
+		    dbp->env, fhp, data, offset, data->ulen)) != 0)
+			goto err;
+		/*
+		 * A read that succeeds but yields no bytes cannot make
+		 * progress: neither left nor offset would change and the loop
+		 * would never end.  Presumably the file is shorter than
+		 * blob_size (TODO confirm against __blob_file_read); report
+		 * it as an I/O error instead of spinning.
+		 */
+		if (data->size == 0) {
+			ret = EIO;
+			goto err;
+		}
+		if (offset == 0)
+			pre = prefix;
+		else
+			pre = NULL;
+		skip_newline = data->size < left ? 1 : 0;
+		if ((ret = __db_prdbt(data, checkprint, pre,
+		    handle, callback, 0, 0, skip_newline)) != 0)
+			goto err;
+		/* Clamp so that a final over-long read cannot drive left negative. */
+		if (data->size > left)
+			left = 0;
+		else
+			left = left - data->size;
+		offset = offset + data->size;
+	}
+
+	/* Close the file handle on every path; keep the first error seen. */
+err:	if (fhp != NULL) {
+		if ((t_ret = __os_closehandle(dbp->env, fhp)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
  * __db_prdbt --
  *	Print out a DBT data element.
* * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *, - * PUBLIC: int (*)(void *, const void *), int, int)); + * PUBLIC: int (*)(void *, const void *), int, int, int)); */ int -__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) +__db_prdbt(dbtp, checkprint, + prefix, handle, callback, is_recno, is_heap, no_newline) DBT *dbtp; int checkprint; const char *prefix; @@ -1380,16 +1605,17 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) int (*callback) __P((void *, const void *)); int is_recno; int is_heap; + int no_newline; { - static const u_char hex[] = "0123456789abcdef"; db_recno_t recno; DB_HEAP_RID rid; - size_t len; + size_t count, len; int ret; + u_int8_t *p; #define DBTBUFLEN 100 - u_int8_t *p, *hp; - char buf[DBTBUFLEN], hbuf[DBTBUFLEN]; + char buf[DBTBUFLEN], hexbuf[2 * DBTBUFLEN + 1]; + ret = 0; /* * !!! * This routine is the routine that dumps out items in the format @@ -1409,13 +1635,8 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) /* If we're printing data as hex, print keys as hex too. */ if (!checkprint) { - for (len = strlen(buf), p = (u_int8_t *)buf, - hp = (u_int8_t *)hbuf; len-- > 0; ++p) { - *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4]; - *hp++ = hex[*p & 0x0f]; - } - *hp = '\0'; - ret = callback(handle, hbuf); + (void)__db_tohex(buf, strlen(buf), hexbuf); + ret = callback(handle, hexbuf); } else ret = callback(handle, buf); @@ -1433,44 +1654,46 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) /* If we're printing data as hex, print keys as hex too. 
*/ if (!checkprint) { - for (len = strlen(buf), p = (u_int8_t *)buf, - hp = (u_int8_t *)hbuf; len-- > 0; ++p) { - *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4]; - *hp++ = hex[*p & 0x0f]; - } - *hp = '\0'; - ret = callback(handle, hbuf); + (void)__db_tohex(buf, strlen(buf), hexbuf); + ret = callback(handle, hexbuf); } else ret = callback(handle, buf); if (ret != 0) return (ret); } else if (checkprint) { + /* + * Prepare buf for the 'isprint()' case: printable single char + * strings; prepare hexbuf for the other case '\<2 hex digits>'. + */ + buf[1] = '\0'; + hexbuf[0] = '\\'; for (len = dbtp->size, p = dbtp->data; len--; ++p) if (isprint((int)*p)) { if (*p == '\\' && (ret = callback(handle, "\\")) != 0) return (ret); - snprintf(buf, DBTBUFLEN, "%c", *p); + buf[0] = (char)*p; if ((ret = callback(handle, buf)) != 0) return (ret); } else { - snprintf(buf, DBTBUFLEN, "\\%c%c", - hex[(u_int8_t)(*p & 0xf0) >> 4], - hex[*p & 0x0f]); - if ((ret = callback(handle, buf)) != 0) + (void)__db_tohex(p, 1, hexbuf + 1); + if ((ret = callback(handle, hexbuf)) != 0) return (ret); } } else - for (len = dbtp->size, p = dbtp->data; len--; ++p) { - snprintf(buf, DBTBUFLEN, "%c%c", - hex[(u_int8_t)(*p & 0xf0) >> 4], - hex[*p & 0x0f]); - if ((ret = callback(handle, buf)) != 0) + for (len = dbtp->size, p = dbtp->data, count = DBTBUFLEN; + len > 0; len -= count, p += count) { + if (count > len) + count = len; + (void)__db_tohex(p, count, hexbuf); + if ((ret = callback(handle, hexbuf)) != 0) return (ret); } - - return (callback(handle, "\n")); + if (no_newline == 0) + return (callback(handle, "\n")); + else + return (ret); } /* @@ -1598,7 +1821,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) goto err; DB_INIT_DBT(dbt, subname, strlen(subname)); if ((ret = __db_prdbt(&dbt, 1, - NULL, handle, callback, 0, 0)) != 0) + NULL, handle, callback, 0, 0, 0)) != 0) goto err; } switch (dbtype) { @@ -1868,7 +2091,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, 
callback, vdp, meta_pgno) goto err; for (i = 0; i < tmp_u_int32 - 1; i++) if ((ret = __db_prdbt(&keys[i], - pflag, " ", handle, callback, 0, 0)) != 0) + pflag, " ", handle, callback, 0, 0, 0)) != 0) goto err; } } @@ -1954,3 +2177,33 @@ __db_dbtype_to_string(type) } return ("UNKNOWN TYPE"); } + +/* + * __db_tohex -- + * Generate a hex string representation of a byte array. + * The size of the destination must be at least 2*len + 1 bytes long, + * to allow for the '\0' terminator, which is always added. + * + * PUBLIC: char *__db_tohex __P((const void *, size_t, char *)); + */ +char * +__db_tohex(source, len, dest) + const void *source; + size_t len; + char *dest; +{ + static const char hex[] = "0123456789abcdef"; + const u_int8_t *s; + char *d; + + s = source; + d = dest; + while (len > 0) { + *d++ = hex[(*s & 0xf0) >> 4]; + *d++ = hex[*s & 0x0f]; + s++; + len--; + } + *d = '\0'; + return ((char *)dest); +} diff --git a/src/db/db_rec.c b/src/db/db_rec.c index 8ba1124e..98b29b22 100644 --- a/src/db/db_rec.c +++ b/src/db/db_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -1194,8 +1194,9 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info) DB_LSN copy_lsn; DB_MPOOLFILE *mpf; PAGE *pagep; - int cmp_n, cmp_p, ret, type; + int cmp_n, cmp_p, ret, t_ret, type; + pagep = NULL; ip = ((DB_TXNHEAD *)info)->thread_info; REC_PRINT(__db_pg_init_print); REC_INTRO(__db_pg_init_read, ip, 0); @@ -1247,11 +1248,12 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info) memcpy((u_int8_t*)pagep + HOFFSET(pagep), argp->data.data, argp->data.size); } - if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) - goto out; done: *lsnp = argp->prev_lsn; out: + if (pagep != NULL && (t_ret = + __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; REC_CLOSE; } diff --git a/src/db/db_reclaim.c b/src/db/db_reclaim.c index b902769a..abae33d9 100644 --- a/src/db/db_reclaim.c +++ b/src/db/db_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -181,6 +181,7 @@ __db_truncate_callback(dbc, p, cookie, putp) switch (*H_PAIRDATA(dbp, p, indx)) { case H_OFFDUP: break; + case H_BLOB: case H_OFFPAGE: case H_KEYDATA: ++*countp; diff --git a/src/db/db_remove.c b/src/db/db_remove.c index 591a29b2..d6118fae 100644 --- a/src/db/db_remove.c +++ b/src/db/db_remove.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -18,7 +18,7 @@ #include "dbinc/txn.h" static int __db_dbtxn_remove __P((DB *, - DB_THREAD_INFO *, DB_TXN *, const char *, const char *)); + DB_THREAD_INFO *, DB_TXN *, const char *, const char *, APPNAME)); static int __db_subdb_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t)); @@ -264,7 +264,7 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags) /* Handle transactional file removes separately. */ if (IS_REAL_TXN(txn)) { - ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb); + ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb, DB_APP_DATA); goto err; } @@ -293,6 +293,10 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags) (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0) goto err; + if (dbp->db_am_remove == NULL && + (ret = __blob_del_all(dbp, txn, 0)) != 0) + goto err; + ret = F_ISSET(dbp, DB_AM_INMEM) ? __db_inmem_remove(dbp, NULL, real_name) : __fop_remove(env, @@ -407,6 +411,10 @@ __db_subdb_remove(dbp, ip, txn, name, subdb, flags) txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0) goto err; + if (sdbp->blob_threshold != 0) + if ((ret = __blob_del_all(sdbp, txn, 0)) != 0) + goto err; + DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name); /* Have the handle locked so we will not lock pages. 
*/ @@ -460,18 +468,21 @@ err: } static int -__db_dbtxn_remove(dbp, ip, txn, name, subdb) +__db_dbtxn_remove(dbp, ip, txn, name, subdb, appname) DB *dbp; DB_THREAD_INFO *ip; DB_TXN *txn; const char *name, *subdb; + APPNAME appname; { ENV *env; int ret; char *tmpname; + u_int32_t flags; env = dbp->env; tmpname = NULL; + flags = DB_NOSYNC; /* * This is a transactional remove, so we have to keep the name @@ -488,7 +499,12 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb) DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name); if ((ret = __db_rename_int(dbp, - txn->thread_info, txn, name, subdb, tmpname, DB_NOSYNC)) != 0) + txn->thread_info, txn, name, subdb, tmpname, flags)) != 0) + goto err; + + /* Delete all blob files, if this database supports blobs. */ + if (appname != DB_APP_BLOB && (dbp->blob_file_id != 0 || + dbp->blob_sdb_id != 0) && (ret = __blob_del_all(dbp, txn, 0)) != 0) goto err; /* @@ -501,7 +517,7 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb) ret = F_ISSET(dbp, DB_AM_INMEM) ? __db_inmem_remove(dbp, txn, tmpname) : __fop_remove(env, - txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA, + txn, dbp->fileid, tmpname, &dbp->dirname, appname, F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0); DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name); diff --git a/src/db/db_rename.c b/src/db/db_rename.c index 2812b948..5b2bed42 100644 --- a/src/db/db_rename.c +++ b/src/db/db_rename.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -285,10 +285,11 @@ __db_rename_int(dbp, ip, txn, name, subdb, newname, flags) * taken care of in the fop layer. 
*/ if (IS_REAL_TXN(txn)) { - if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0) + if ((ret = + __fop_dummy(dbp, txn, old, newname, DB_APP_DATA)) != 0) goto err; } else { - if ((ret = __fop_dbrename(dbp, old, newname)) != 0) + if ((ret = __fop_dbrename(dbp, old, newname, DB_APP_DATA)) != 0) goto err; } diff --git a/src/db/db_ret.c b/src/db/db_ret.c index 709605f6..ddd0ef51 100644 --- a/src/db/db_ret.c +++ b/src/db/db_ret.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -29,18 +29,27 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) void **memp; u_int32_t *memsize; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; DB *dbp; + ENV *env; + HBLOB hblob; + HEAPBLOBHDR bhdr; HEAPHDR *hdr; + db_seq_t blob_id; + int ret; HOFFPAGE ho; + off_t blob_size; u_int32_t len; u_int8_t *hk; void *data; if (F_ISSET(dbt, DB_DBT_READONLY)) return (0); + ret = 0; dbp = dbc->dbp; + env = dbp->env; switch (TYPE(h)) { case P_HASH_UNSORTED: @@ -50,6 +59,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) memcpy(&ho, hk, sizeof(HOFFPAGE)); return (__db_goff(dbc, dbt, ho.tlen, ho.pgno, memp, memsize)); + } else if (HPAGE_PTYPE(hk) == H_BLOB) { + /* Get the record instead of the blob item. 
*/ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = HBLOB_SIZE; + break; + } + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); } len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx); data = HKEYDATA_DATA(hk); @@ -58,6 +81,21 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx); if (F_ISSET(hdr,(HEAP_RECSPLIT | HEAP_RECFIRST))) return (__heapc_gsplit(dbc, dbt, memp, memsize)); + else if (F_ISSET(hdr, HEAP_RECBLOB)) { + /* Get the record instead of the blob item. */ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = HEAPBLOBREC_SIZE; + break; + } + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(env, bhdr, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); + } len = hdr->size; data = (u_int8_t *)hdr + sizeof(HEAPHDR); break; @@ -69,6 +107,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) bo = (BOVERFLOW *)bk; return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, memp, memsize)); + } else if (B_TYPE(bk->type) == B_BLOB) { + /* Get the record instead of the blob item. */ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = BBLOB_SIZE; + break; + } + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); } len = bk->len; data = bk->data; @@ -167,3 +219,71 @@ __db_retcopy(env, dbt, data, len, memp, memsize) return (ret); } + +/* + * __db_dbt_clone -- + * Clone a DBT from another DBT. + * The input dest DBT must be a zero initialized DBT that will be populated. 
+ * The function does not allocate a dest DBT to allow for cloning into stack
+ * or locally allocated variables. It is the caller's responsibility to free
+ * the memory allocated in dest->data.
+ *
+ * PUBLIC: int __db_dbt_clone __P((ENV *, DBT *, const DBT *));
+ */
+int
+__db_dbt_clone(env, dest, src)
+	ENV *env;
+	DBT *dest;
+	const DBT *src;
+{
+	u_int32_t err_flags;
+	int ret;
+
+	DB_ASSERT(env, dest->data == NULL);
+
+	ret = 0;
+
+	/* The function does not support the following DBT flags. */
+	err_flags = DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_MULTIPLE | DB_DBT_PARTIAL;
+	if (F_ISSET(src, err_flags)) {
+		__db_errx(env, DB_STR("0758",
+		    "Unsupported flags when cloning the DBT."));
+		return (EINVAL);
+	}
+
+	if ((ret = __os_malloc(env, src->size, &dest->data)) != 0)
+		return (ret);
+
+	memcpy(dest->data, src->data, src->size);
+	dest->ulen = src->size;
+	dest->size = src->size;
+	dest->flags = DB_DBT_USERMEM;
+
+	return (ret);
+}
+
+/*
+ * __db_dbt_clone_free --
+ *	Free a DBT cloned by __db_dbt_clone.  The data pointer, size and
+ *	ulen are reset.  Returns EINVAL, without freeing anything, if the
+ *	DBT's flags are not exactly DB_DBT_USERMEM.
+ *
+ * PUBLIC: int __db_dbt_clone_free __P((ENV *, DBT *));
+ */
+int
+__db_dbt_clone_free(env, dbt)
+	ENV *env;
+	DBT *dbt;
+{
+	/* Currently only DB_DBT_USERMEM is supported. */
+	if (dbt->flags != DB_DBT_USERMEM) {
+		__db_errx(env, DB_STR("0759",
+		    "Unsupported flags when freeing the cloned DBT."));
+		return (EINVAL);
+	}
+
+	if (dbt->data != NULL) {
+		__os_free(env, dbt->data);
+		/*
+		 * Clear the pointer so a second free is harmless and the
+		 * DBT again satisfies __db_dbt_clone's requirement that
+		 * dest->data be NULL.
+		 */
+		dbt->data = NULL;
+	}
+	dbt->size = dbt->ulen = 0;
+
+	return (0);
+}
diff --git a/src/db/db_setid.c b/src/db/db_setid.c
index 697c3ff7..5c61a139 100644
--- a/src/db/db_setid.c
+++ b/src/db/db_setid.c
@@ -1,7 +1,7 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved.
* * $Id$ */ diff --git a/src/db/db_setlsn.c b/src/db/db_setlsn.c index 1a3280ed..acee80f6 100644 --- a/src/db/db_setlsn.c +++ b/src/db/db_setlsn.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_sort_multiple.c b/src/db/db_sort_multiple.c index c5e2e941..7facb80e 100644 --- a/src/db/db_sort_multiple.c +++ b/src/db/db_sort_multiple.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" @@ -34,7 +34,7 @@ __db_compare_both(db, akey, adata, bkey, bdata) t = (BTREE *)db->bt_internal; - cmp = t->bt_compare(db, akey, bkey); + cmp = t->bt_compare(db, akey, bkey, NULL); if (cmp != 0) return cmp; if (!F_ISSET(db, DB_AM_DUPSORT)) return (0); @@ -44,9 +44,9 @@ __db_compare_both(db, akey, adata, bkey, bdata) #ifdef HAVE_COMPRESSION if (DB_IS_COMPRESSED(db)) - return t->compress_dup_compare(db, adata, bdata); + return t->compress_dup_compare(db, adata, bdata, NULL); #endif - return db->dup_compare(db, adata, bdata); + return db->dup_compare(db, adata, bdata, NULL); } #define DB_SORT_SWAP(a, ad, b, bd) \ diff --git a/src/db/db_stati.c b/src/db/db_stati.c index 61744e81..b7367f37 100644 --- a/src/db/db_stati.c +++ b/src/db/db_stati.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/db/db_truncate.c b/src/db/db_truncate.c index 0eeb0c64..d57a23b2 100644 --- a/src/db/db_truncate.c +++ b/src/db/db_truncate.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -191,6 +191,10 @@ __db_truncate(dbp, ip, txn, countp) if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) ret = t_ret; + /* Delete all blob files. */ + if (ret == 0) + ret = __blob_del_all(dbp, txn, 1); + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL); DB_TEST_RECOVERY_LABEL diff --git a/src/db/db_upg.c b/src/db/db_upg.c index de5d0dc7..7dcc3b1c 100644 --- a/src/db/db_upg.c +++ b/src/db/db_upg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -13,6 +13,7 @@ #include "dbinc/db_swap.h" #include "dbinc/btree.h" #include "dbinc/hash.h" +#include "dbinc/heap.h" #include "dbinc/qam.h" /* @@ -98,6 +99,27 @@ static int (* const func_46_list[P_PAGETYPE_MAX]) NULL, /* P_IHEAP */ }; +static int (* const func_60_list[P_PAGETYPE_MAX]) + __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = { + NULL, /* P_INVALID */ + NULL, /* __P_DUPLICATE */ + NULL, /* P_HASH_UNSORTED */ + NULL, /* P_IBTREE */ + NULL, /* P_IRECNO */ + __bam_60_lbtree, /* P_LBTREE */ + NULL, /* P_LRECNO */ + NULL, /* P_OVERFLOW */ + __ham_60_hashmeta, /* P_HASHMETA */ + __bam_60_btreemeta, /* P_BTREEMETA */ + NULL, /* P_QAMMETA */ + NULL, /* P_QAMDATA */ + NULL, /* P_LDUP */ + __ham_60_hash, /* P_HASH */ + __heap_60_heapmeta, /* P_HEAPMETA */ + __heap_60_heap, /* P_HEAP */ + NULL, /* P_IHEAP */ +}; + static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const []) (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *)); static int __db_set_lastpgno __P((DB *, char *, DB_FH *)); @@ -181,6 +203,34 @@ __db_upgrade(dbp, fname, flags) goto err; /* FALLTHROUGH */ case 9: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. 
+ */ + meta = (DBMETA *)mbuf; + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0777", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 10: break; default: __db_errx(env, DB_STR_A("0666", @@ -307,6 +357,34 @@ __db_upgrade(dbp, fname, flags) /* FALLTHROUGH */ case 9: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + meta = (DBMETA*)mbuf; + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. + */ + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0778", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 10: break; default: __db_errx(env, DB_STR_A("0668", @@ -317,9 +395,45 @@ __db_upgrade(dbp, fname, flags) } break; case DB_HEAPMAGIC: - /* - * There's no upgrade needed for Heap yet. - */ + switch (((DBMETA *)mbuf)->version) { + case 1: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + meta = (DBMETA*)mbuf; + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. 
+ */ + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0779", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 2: + break; + default: + __db_errx(env, DB_STR_A("0776", + "%s: unsupported heap version: %lu", + "%s %lu"), real_name, + (u_long)((DBMETA *)mbuf)->version); + ret = DB_OLD_VERSION; + goto err; + } break; case DB_QAMMAGIC: switch (((DBMETA *)mbuf)->version) { diff --git a/src/db/db_upg_opd.c b/src/db/db_upg_opd.c index 992115ad..6f6dfb71 100644 --- a/src/db/db_upg_opd.c +++ b/src/db/db_upg_opd.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -37,6 +37,9 @@ static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t)); * __db_31_offdup -- * Convert 3.0 off-page duplicates to 3.1 off-page duplicates. * + * This code and its descendants should be removed when support for + * upgrading from a 3.0 database format is removed. + * * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *)); */ int @@ -317,7 +320,7 @@ __db_build_ri(dbp, fhp, ipage, page, indx, nomemp) /* * __db_up_ovref -- - * Increment/decrement the reference count on an overflow page. + * Increment the reference count on an overflow page. */ static int __db_up_ovref(dbp, fhp, pgno) diff --git a/src/db/db_vrfy.c b/src/db/db_vrfy.c index 9cb94ad2..a8c80cae 100644 --- a/src/db/db_vrfy.c +++ b/src/db/db_vrfy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -553,7 +553,7 @@ __db_vrfy_pagezero(dbp, vdp, fhp, name, flags) if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0) return (ret); - if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) { + if ((ret = __db_chk_meta(env, dbp, meta, DB_CHK_META)) != 0) { EPRINT((env, DB_STR_A("0522", "Page %lu: metadata page corrupted", "%lu"), (u_long)PGNO_BASE_MD)); @@ -920,7 +920,7 @@ err1: if (ret == 0) * If we've seen a Queue metadata page, we may need to walk Queue * extent pages that won't show up between 0 and vdp->last_pgno. */ - if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret = + if (F_ISSET(vdp, SALVAGE_QMETA_SET) && (t_ret = __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) { if (ret == 0) ret = t_ret; @@ -1563,6 +1563,10 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) * If we don't have FTRUNCATE then mpool could include some * zeroed pages at the end of the file, we assume the meta page * is correct. Queue does not update the meta page's last_pgno. + * + * We have seen one false positive after a failure while rolling the log + * forward, last_pgno was updated and the file had not yet been + * extended. [#18418] */ if (pgno == PGNO_BASE_MD && dbtype != DB_QUEUE && meta->last_pgno != vdp->last_pgno) { @@ -2401,6 +2405,15 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp) * length, so it's not possible to certify it as safe. */ switch (B_TYPE(bk->type)) { + case B_BLOB: + len = bk->len; + if (len != BBLOB_DSIZE) { + EPRINT((env, DB_STR_A("0771", + "Page %lu: item %lu illegal size.", + "%lu %lu"), (u_long)pgno, (u_long)i)); + return (DB_VERIFY_BAD); + } + break; case B_KEYDATA: len = bk->len; break; diff --git a/src/db/db_vrfy_stub.c b/src/db/db_vrfy_stub.c index 5037f33e..a9eed84c 100644 --- a/src/db/db_vrfy_stub.c +++ b/src/db/db_vrfy_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_vrfyutil.c b/src/db/db_vrfyutil.c index d72e1188..3a64bd50 100644 --- a/src/db/db_vrfyutil.c +++ b/src/db/db_vrfyutil.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -43,6 +43,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp) if ((ret = __db_create_internal(&cdbp, env, 0)) != 0) goto err; + if ((ret = __db_set_blob_threshold(cdbp, 0, 0)) != 0) + goto err; + if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0) goto err; @@ -60,6 +63,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp) if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0) goto err; + if ((ret = __db_set_blob_threshold(pgdbp, 0, 0)) != 0) + goto err; + if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0) goto err; @@ -928,5 +934,6 @@ __db_vrfy_prdbt(dbtp, checkprint, prefix, } return ( __db_prdbt(dbtp, checkprint, - prefix, handle, callback, is_recno, is_heap)); + prefix, handle, callback, is_recno, is_heap, + vdp != NULL && F_ISSET(vdp, SALVAGE_STREAM_BLOB) ? 1 : 0)); } diff --git a/src/db/partition.c b/src/db/partition.c index f8beaf16..86491ba3 100644 --- a/src/db/partition.c +++ b/src/db/partition.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -32,13 +32,12 @@ static int __partc_writelock __P((DBC*)); static int __partition_chk_meta __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t)); static int __partition_setup_keys __P((DBC *, - DB_PARTITION *, DBMETA *, u_int32_t)); + DB_PARTITION *, u_int32_t, u_int32_t)); static int __part_key_cmp __P((const void *, const void *)); static inline void __part_search __P((DB *, DB_PARTITION *, DBT *, u_int32_t *)); -static char *Alloc_err = DB_STR_A("0644", - "Partition open failed to allocate %d bytes", "%d"); +#define ALLOC_ERR DB_STR_A("0764","Partition failed to allocate %d bytes","%d") /* * Allocate a partition cursor and copy flags to the partition cursor. @@ -70,20 +69,27 @@ static inline void __part_search(dbp, part, key, part_idp) { db_indx_t base, indx, limit; int cmp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); + size_t pos, pos_h, pos_l; DB_ASSERT(dbp->env, part->nparts != 0); COMPQUIET(cmp, 0); COMPQUIET(indx, 0); + pos_h = 0; + pos_l = 0; func = ((BTREE *)dbp->bt_internal)->bt_compare; DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) { + pos = pos_l > pos_h ? 
pos_h : pos_l; DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX); - cmp = func(dbp, key, &part->keys[indx]); + cmp = func(dbp, key, &part->keys[indx], &pos); if (cmp == 0) break; - if (cmp > 0) + if (cmp > 0) { DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX); + pos_l = pos; + } else + pos_h = pos; } if (cmp == 0) *part_idp = indx; @@ -146,7 +152,8 @@ __partition_set(dbp, parts, keys, callback) { DB_PARTITION *part; ENV *env; - int ret; + u_int32_t i; + int ret, t_ret; DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition"); env = dbp->dbenv->env; @@ -155,6 +162,11 @@ __partition_set(dbp, parts, keys, callback) __db_errx(env, DB_STR("0646", "Must specify at least 2 partitions.")); return (EINVAL); + } else if (parts > PART_MAXIMUM) { + __db_errx(env, DB_STR_A("0772", + "Must not specify more than %u partitions.", "%u"), + (unsigned int)PART_MAXIMUM); + return (EINVAL); } if (keys == NULL && callback == NULL) { @@ -178,11 +190,59 @@ bad: __db_errx(env, DB_STR("0648", (part->callback != NULL && keys != NULL)) goto bad; + /* + * Free a key array that was allocated by an earlier set_partition call. + */ + if (part->keys != NULL) { + for (i = 0; i < part->nparts - 1; i++) { + /* + * Always free all entries in the key array and return + * the first error code. + */ + if ((t_ret = __db_dbt_clone_free(dbp->env, + &part->keys[i])) != 0 && ret == 0) + ret = t_ret; + } + __os_free(dbp->env, part->keys); + part->keys = NULL; + } + + if (ret != 0) + return (ret); + part->nparts = parts; - part->keys = keys; part->callback = callback; - return (0); + /* + * Take a copy of the users key array otherwise we cannot be sure + * that the memory will still be valid when the database is opened. 
+ */ + if (keys != NULL) { + if ((ret = __os_calloc(dbp->env, + part->nparts - 1, sizeof(DBT), &part->keys)) != 0) + goto err; + + for (i = 0, parts = 0; i < part->nparts - 1; i++, parts++) + if ((ret = __db_dbt_clone(dbp->env, + &part->keys[i], &keys[i])) != 0) + goto err; + } + +err: if (ret != 0 && part->keys != NULL) { + /* + * Always free those entries cloned successfully in the key + * array and the one which fails in __db_dbt_clone, and + * return the first error code. As ret != 0 here, so it is + * safe to ignore any error from __db_dbt_clone_free. + */ + for (i = 0; i < parts; i++) + (void)__db_dbt_clone_free(dbp->env, &part->keys[i]); + if (parts < part->nparts - 1 && part->keys[parts].data != NULL) + __os_free(dbp->env, part->keys[parts].data); + __os_free(dbp->env, part->keys); + part->keys = NULL; + } + return (ret); } /* @@ -288,15 +348,16 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) if ((ret = __os_calloc(env, part->nparts, sizeof(*part->handles), &part->handles)) != 0) { - __db_errx(env, - Alloc_err, part->nparts * sizeof(*part->handles)); + __db_errx(env, ALLOC_ERR, + (int)(part->nparts * sizeof(*part->handles))); goto err; } DB_ASSERT(env, fname != NULL); if ((ret = __os_malloc(env, strlen(fname) + PART_LEN + 1, &name)) != 0) { - __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(fname) + PART_LEN + 1)); goto err; } @@ -330,6 +391,9 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) part_db->dup_compare = dbp->dup_compare; part_db->app_private = dbp->app_private; part_db->api_internal = dbp->api_internal; + part_db->blob_threshold = dbp->blob_threshold; + part_db->blob_file_id = dbp->blob_file_id; + part_db->blob_sdb_id = dbp->blob_sdb_id; if (dbp->type == DB_BTREE) __bam_copy_config(dbp, part_db, part->nparts); @@ -388,7 +452,8 @@ __partition_chk_meta(dbp, ip, txn, flags) DB_MPOOLFILE *mpf; ENV *env; db_pgno_t base_pgno; - int ret, t_ret; + int ret, 
set_keys, t_ret; + u_int32_t pgsize; dbc = NULL; meta = NULL; @@ -397,6 +462,14 @@ __partition_chk_meta(dbp, ip, txn, flags) mpf = dbp->mpf; env = dbp->env; ret = 0; + set_keys = 0; + + /* + * Just to fix the lint warning. + * The real value will be set later, and we will + * only use the value after being set properly. + */ + pgsize = dbp->pgsize; /* Get a cursor on the main db. */ dbp->p_internal = NULL; @@ -475,10 +548,12 @@ __partition_chk_meta(dbp, ip, txn, flags) } } else if (meta->magic != DB_BTREEMAGIC) { __db_errx(env, DB_STR("0658", - "Partitioning only supported on BTREE nad HASH.")); + "Partitioning only supported on BTREE and HASH.")); ret = EINVAL; - } else - ret = __partition_setup_keys(dbc, part, meta, flags); + } else { + set_keys = 1; + pgsize = meta->pagesize; + } err: /* Put the metadata page back. */ if (meta != NULL && (t_ret = __memp_fput(mpf, @@ -487,6 +562,15 @@ err: /* Put the metadata page back. */ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) ret = t_ret; + /* + * We can only call __partition_setup_keys after putting + * the meta page and releasing the meta lock, or self-deadlock + * will occur. + */ + if (ret == 0 && set_keys && (t_ret = + __partition_setup_keys(dbc, part, pgsize, flags)) != 0) + ret = t_ret; + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) ret = t_ret; @@ -502,7 +586,7 @@ err: /* Put the metadata page back. */ struct key_sort { DB *dbp; DBT *key; - int (*compare) __P((DB *, const DBT *, const DBT *)); + int (*compare) __P((DB *, const DBT *, const DBT *, size_t *)); }; static int __part_key_cmp(a, b) @@ -512,7 +596,7 @@ static int __part_key_cmp(a, b) ka = a; kb = b; - return (ka->compare(ka->dbp, ka->key, kb->key)); + return (ka->compare(ka->dbp, ka->key, kb->key, NULL)); } /* * __partition_setup_keys -- @@ -520,25 +604,22 @@ static int __part_key_cmp(a, b) * are creating a partitioned database. 
*/ static int -__partition_setup_keys(dbc, part, meta, flags) +__partition_setup_keys(dbc, part, pgsize, flags) DBC *dbc; DB_PARTITION *part; - DBMETA *meta; - u_int32_t flags; + u_int32_t flags, pgsize; { BTREE *t; DB *dbp; - DBT data, key, *keys, *kp; + DBT data, key, *keys, *kp, *okp; ENV *env; - u_int32_t ds, i, j; - u_int8_t *dd; + db_pgno_t last_pgno; + u_int32_t cgetflags, i, j; + size_t dsize; struct key_sort *ks; - int have_keys, ret; - int (*compare) __P((DB *, const DBT *, const DBT *)); - void *dp; + int have_keys, ret, t_ret; + int (*compare) __P((DB *, const DBT *, const DBT *, size_t *)); - COMPQUIET(dd, NULL); - COMPQUIET(ds, 0); memset(&data, 0, sizeof(data)); memset(&key, 0, sizeof(key)); ks = NULL; @@ -549,6 +630,9 @@ __partition_setup_keys(dbc, part, meta, flags) /* Need to just read the main database. */ dbp->p_internal = NULL; have_keys = 0; + dsize = 0; + + keys = part->keys; /* First verify that things what we expect. */ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) { @@ -581,11 +665,15 @@ __partition_setup_keys(dbc, part, meta, flags) } if (LF_ISSET(DB_CREATE) && have_keys == 0) { - /* Insert the keys into the master database. */ + /* + * Insert the keys into the master database. We will also + * compute the total size of the keys for later use. + */ for (i = 0; i < part->nparts - 1; i++) { if ((ret = __db_put(dbp, dbc->thread_info, dbc->txn, &part->keys[i], &data, 0)) != 0) goto err; + dsize += part->keys[i].size; } /* @@ -604,39 +692,71 @@ __partition_setup_keys(dbc, part, meta, flags) } done: if (F_ISSET(part, PART_RANGE)) { /* - * Allocate one page to hold the keys plus space at the - * end of the buffer to put an array of DBTs. If there - * is not enough space __dbc_get will return how much - * is needed and we realloc. + * If we just did the insert, we have known the total size of + * the keys. 
Otherwise, the keys must have been in the database, + * and we can calculate the size by checking the last pgno of + * the corresponding mpoolfile. + * + * We make the size aligned at 1024 for performance. */ + if (dsize == 0) { + ret = __memp_get_last_pgno(dbp->mpf, &last_pgno); + if (ret != 0) + goto err; + if (last_pgno > 1) + last_pgno--; + dsize = last_pgno * pgsize; + } + dsize = DB_ALIGN(dsize, 1024); + if ((ret = __os_malloc(env, - meta->pagesize + (sizeof(DBT) * part->nparts), + dsize + (sizeof(DBT) * part->nparts), &part->data)) != 0) { - __db_errx(env, Alloc_err, meta->pagesize); + __db_errx(env, ALLOC_ERR, (int)dsize); goto err; } + memset(part->data, 0, + dsize + (sizeof(DBT) * part->nparts)); + + kp = okp = (DBT *) + ((u_int8_t *)part->data + dsize); memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); - data.data = part->data; - data.ulen = meta->pagesize; data.flags = DB_DBT_USERMEM; -again: if ((ret = __dbc_get(dbc, &key, &data, - DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) { - if ((ret = __os_realloc(env, - data.size + (sizeof(DBT) * part->nparts), - &part->data)) != 0) + j = 0; + cgetflags = DB_FIRST; + while ((ret = __dbc_get(dbc, &key, &data, cgetflags)) == 0) { + /* It is an error if we get more keys than expect. */ + if ((u_int32_t)(kp - okp) > part->nparts) { + ret = EINVAL; goto err; - data.data = part->data; - data.ulen = data.size; - goto again; + } + kp->size = key.size; + kp->data = (u_int8_t *)part->data + j; + /* It is an error if the keys overflow the space. */ + if (j + kp->size > dsize) { + ret = EINVAL; + goto err; + } + memcpy(kp->data, key.data, kp->size); + j += kp->size; + cgetflags = DB_NEXT; + kp++; } + + /* + * We should get part->nparts keys back, otherwise it means + * the passed-in keys are not valid. + */ + if (ret == DB_NOTFOUND && (u_int32_t)(kp - okp) == part->nparts) + ret = 0; + if (ret == 0) { /* * They passed in keys, they must match. 
*/ - keys = NULL; compare = NULL; - if (have_keys == 1 && (keys = part->keys) != NULL) { + if (have_keys == 1 && keys != NULL) { t = dbc->dbp->bt_internal; compare = t->bt_compare; if ((ret = __os_malloc(env, (part->nparts - 1) @@ -651,20 +771,15 @@ again: if ((ret = __dbc_get(dbc, &key, &data, qsort(ks, (size_t)part->nparts - 1, sizeof(struct key_sort), __part_key_cmp); } - DB_MULTIPLE_INIT(dp, &data); part->keys = (DBT *) - ((u_int8_t *)part->data + data.size); + ((u_int8_t *)part->data + dsize); + F_SET(part, PART_KEYS_SETUP); j = 0; for (kp = part->keys; kp < &part->keys[part->nparts]; kp++, j++) { - DB_MULTIPLE_KEY_NEXT(dp, - &data, kp->data, kp->size, dd, ds); - if (dp == NULL) { - ret = DB_NOTFOUND; - break; - } - if (keys != NULL && j != 0 && - compare(dbc->dbp, ks[j - 1].key, kp) != 0) { + if (have_keys == 1 && keys != NULL && j != 0 && + compare(dbc->dbp, ks[j - 1].key, + kp, NULL) != 0) { if (kp->data == NULL && F_ISSET(dbp, DB_AM_RECOVER)) goto err; @@ -683,6 +798,24 @@ again: if ((ret = __dbc_get(dbc, &key, &data, err: dbp->p_internal = part; if (ks != NULL) __os_free(env, ks); + + /* + * We only free the original copy of the key array when + * the keys have been setup properly, otherwise we let + * the close function to free the memory. + */ + if (keys != NULL && F_ISSET(part, PART_KEYS_SETUP)) { + for (i = 0; i < part->nparts - 1; i++) + /* + * Always free all entries in the key array and return + * the first error code. 
+ */ + if ((t_ret = __db_dbt_clone_free(env, + &keys[i])) != 0 && ret == 0) + ret = t_ret; + __os_free(env, keys); + } + return (ret); } @@ -1183,6 +1316,15 @@ __partition_close(dbp, txn, flags) ret = t_ret; __os_free(env, part->handles); } + if (!F_ISSET(part, PART_KEYS_SETUP) && part->keys != NULL) { + for (i = 0; i < part->nparts - 1; i++) { + if (part->keys[i].data != NULL && (t_ret = + __db_dbt_clone_free(env, &part->keys[i])) != 0 && + ret == 0) + ret = t_ret; + } + __os_free(env, part->keys); + } if (part->dirs != NULL) __os_free(env, (char **)part->dirs); if (part->data != NULL) @@ -1471,7 +1613,8 @@ __part_fileid_reset(env, ip, fname, nparts, encrypted) if ((ret = __os_malloc(env, strlen(fname) + PART_LEN + 1, &name)) != 0) { - __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(fname) + PART_LEN + 1)); return (ret); } @@ -1747,7 +1890,8 @@ __part_rr(dbp, ip, txn, name, subdb, newname, flags) COMPQUIET(np, NULL); if (newname != NULL && (ret = __os_malloc(env, strlen(newname) + PART_LEN + 1, &np)) != 0) { - __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(newname) + PART_LEN + 1)); goto err; } for (i = 0; i < part->nparts; i++, pdbp++) { @@ -1790,6 +1934,32 @@ err: /* } return (ret); } + +/* + * __partc_dup -- + * Duplicate a cursor on a partitioned database. + * + * PUBLIC: int __partc_dup __P((DBC *, DBC *)); + */ +int +__partc_dup(dbc_orig, dbc_n) + DBC *dbc_orig; + DBC *dbc_n; +{ + PART_CURSOR *orig, *new; + + orig = (PART_CURSOR *)dbc_orig->internal; + new = (PART_CURSOR *)dbc_n->internal; + + /* + * A cursor on a partitioned database contains the identifier + * of the underlying database and a regular cursor that points + * to the underlying database. Copy both pieces. 
+ */ + new->part_id = orig->part_id; + + return (__dbc_dup(orig->sub_cursor, &new->sub_cursor, DB_POSITION)); +} #ifdef HAVE_VERIFY /* * __part_verify -- diff --git a/src/dbinc/atomic.h b/src/dbinc/atomic.h index 096176a5..61f2ead9 100644 --- a/src/dbinc/atomic.h +++ b/src/dbinc/atomic.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 2009, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -79,12 +79,11 @@ typedef struct { #define WINCE_ATOMIC_MAGIC(p) \ /* \ * Memory mapped regions on Windows CE cause problems with \ - * InterlockedXXX calls. Each page in a mapped region needs to \ - * have been written to prior to an InterlockedXXX call, or the \ - * InterlockedXXX call hangs. This does not seem to be \ - * documented anywhere. For now, read/write a non-critical \ - * piece of memory from the shared region prior to attempting \ - * shared region prior to attempting an InterlockedExchange \ + * InterlockedXXX calls. Each process making an InterlockedXXX \ + * call must make sure that it has written to the page prior to \ + * the call, or the InterlockedXXX call hangs. This does not \ + * seem to be documented anywhere. Write a non-critical piece \ + * of memory from the shared region prior to attempting an \ * InterlockedXXX operation. \ */ \ (p)->dummy = 0 @@ -144,7 +143,7 @@ typedef LONG volatile *interlocked_val; #define atomic_inc(env, p) __atomic_inc(p) #define atomic_dec(env, p) __atomic_dec(p) #define atomic_compare_exchange(env, p, o, n) \ - __atomic_compare_exchange((p), (o), (n)) + __atomic_compare_exchange_int((p), (o), (n)) static inline int __atomic_inc(db_atomic_t *p) { int temp; @@ -176,7 +175,7 @@ static inline int __atomic_dec(db_atomic_t *p) * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html * which configure could be changed to use. 
*/ -static inline int __atomic_compare_exchange( +static inline int __atomic_compare_exchange_int( db_atomic_t *p, atomic_value_t oldval, atomic_value_t newval) { atomic_value_t was; diff --git a/src/dbinc/blob.h b/src/dbinc/blob.h new file mode 100644 index 00000000..f4ff475b --- /dev/null +++ b/src/dbinc/blob.h @@ -0,0 +1,103 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2013, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#ifndef _DB_BLOB_H_ +#define _DB_BLOB_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * How many characters can the path for a blob file use? + * Up to 6 subdirectory separators. + * Up to 6 directory names of up to three characters each. + * Up to 21 characters for blob_id identifier. + * 7 characters for the standard prefix (__db.bl) + * 1 for luck (or NULL) + * The largest blob id, 9,223,372,036,854,775,807 would + * produce a path and file name: + * 009/223/372/036/854/775/807/__db.bl009223372036854775807 + */ +#define MAX_BLOB_PATH "009/223/372/036/854/775/807/__db.bl009223372036854775807" +#define MAX_BLOB_PATH_SZ sizeof(MAX_BLOB_PATH) +#define BLOB_DEFAULT_DIR "__db_bl" +#define BLOB_META_FILE_NAME "__db_blob_meta.db" +#define BLOB_DIR_PREFIX "__db" +#define BLOB_FILE_PREFIX "__db.bl" + +#define BLOB_DIR_ELEMS 1000 + +#define IS_BLOB_META(name) \ + (name != NULL && strstr(name, BLOB_META_FILE_NAME) != NULL) +#define IS_BLOB_FILE(name) \ + (name != NULL && strstr(name, BLOB_FILE_PREFIX) != NULL) + +/* + * Combines two unsigned 32 bit integers into a 64 bit integer. + * Blob database file ids and sub database ids are 64 bit integers, + * but have to be stored on database metadata pages that must + * be readable on 32 bit only compilers. So the ids are split into + * two 32 bit integers, and combined when needed. 
+ */ +#define GET_LO_HI(e, lo, hi, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (hi); \ + (o) = ((o) << 32); \ + (o) += (lo); \ + } else { \ + if ((hi) > 0) { \ + __db_errx((e), DB_STR("0765", \ + "Offset or id size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (lo); \ + } \ +} while (0); + +#define GET_BLOB_FILE_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->blob_file_lo, (p)->blob_file_hi, o, ret); + +#define GET_BLOB_SDB_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->blob_sdb_lo, (p)->blob_sdb_hi, o, ret); + +/* Splits a 64 bit integer into two unsigned 32 bit integers. */ +#define SET_LO_HI(p, v, type, field_lo, field_hi) do { \ + u_int32_t tmp; \ + if (sizeof((v)) == 8) { \ + tmp = (u_int32_t)((v) >> 32); \ + memcpy(((u_int8_t *)p) + SSZ(type, field_hi), \ + &tmp, sizeof(u_int32_t)); \ + } else { \ + memset(((u_int8_t *)p) + SSZ(type, field_hi), \ + 0, sizeof(u_int32_t)); \ + } \ + tmp = (u_int32_t)(v); \ + memcpy(((u_int8_t *)p) + SSZ(type, field_lo), \ + &tmp, sizeof(u_int32_t)); \ +} while (0); + +#define SET_LO_HI_VAR(v, field_lo, field_hi) do { \ + if (sizeof((v)) == 8) \ + field_hi = (u_int32_t)((v) >> 32); \ + else \ + field_hi = 0; \ + field_lo = (u_int32_t)(v); \ +} while (0); + +#define SET_BLOB_META_FILE_ID(p, v, type) \ + SET_LO_HI(p, v, type, blob_file_lo, blob_file_hi); + +#define SET_BLOB_META_SDB_ID(p, v, type) \ + SET_LO_HI(p, v, type, blob_sdb_lo, blob_sdb_hi); + +#if defined(__cplusplus) +} +#endif +#endif /* !_DB_BLOB_H_ */ diff --git a/src/dbinc/btree.h b/src/dbinc/btree.h index 86bbec14..a8b9e1ee 100644 --- a/src/dbinc/btree.h +++ b/src/dbinc/btree.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -472,7 +472,7 @@ struct __btree { /* Btree access method. 
*/ u_int32_t bt_minkey; /* Minimum keys per page. */ /* Btree comparison function. */ - int (*bt_compare) __P((DB *, const DBT *, const DBT *)); + int (*bt_compare) __P((DB *, const DBT *, const DBT *, size_t *)); /* Btree prefix function. */ size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); /* Btree compress function. */ @@ -483,7 +483,8 @@ struct __btree { /* Btree access method. */ int (*bt_decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)); /* dup_compare for compression */ - int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *)); + int (*compress_dup_compare) __P((DB *, const DBT *, const DBT *, + size_t *)); #endif /* Recno access method. */ @@ -539,7 +540,7 @@ typedef enum { * Flags for __bam_pinsert. */ #define BPI_SPACEONLY 0x01 /* Only check for space to update. */ -#define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */ +#define BPI_NORECNUM 0x02 /* Don't update the left's recnum. */ #define BPI_NOLOGGING 0x04 /* Don't log the update. */ #define BPI_REPLACE 0x08 /* Replace the record. */ diff --git a/src/dbinc/clock.h b/src/dbinc/clock.h index caeaee70..b2815ea2 100644 --- a/src/dbinc/clock.h +++ b/src/dbinc/clock.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -125,6 +125,13 @@ typedef struct { timespecadd((vvp), &__tmp); \ } while (0) +#define TIMESPEC_SUB_DB_TIMEOUT(vvp, t) \ + do { \ + db_timespec __tmp; \ + DB_TIMEOUT_TO_TIMESPEC(t, &__tmp); \ + timespecsub((vvp), &__tmp); \ + } while (0) + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/crypto.h b/src/dbinc/crypto.h index ea7a9cf0..4d889fd9 100644 --- a/src/dbinc/crypto.h +++ b/src/dbinc/crypto.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/cxx_int.h b/src/dbinc/cxx_int.h index 5492ead7..368bac86 100644 --- a/src/dbinc/cxx_int.h +++ b/src/dbinc/cxx_int.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/db.in b/src/dbinc/db.in index a948910e..b592b746 100644 --- a/src/dbinc/db.in +++ b/src/dbinc/db.in @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ * @@ -102,6 +102,7 @@ extern "C" { @FILE_t_decl@ @off_t_decl@ +@db_off_t_decl@ @pid_t_decl@ @size_t_decl@ #ifdef HAVE_MIXED_SIZE_ADDRESSING @@ -131,9 +132,9 @@ typedef u_int16_t db_indx_t; /* Page offset type. */ #define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ typedef u_int32_t db_recno_t; /* Record number type. */ -#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ +#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a recno tree. */ -typedef u_int32_t db_timeout_t; /* Type of a timeout. */ +typedef u_int32_t db_timeout_t; /* Type of a timeout in microseconds. 
*/ /* * Region offsets are the difference between a pointer in a region and the @@ -157,6 +158,10 @@ struct __db_compact; typedef struct __db_compact DB_COMPACT; struct __db_dbt; typedef struct __db_dbt DBT; struct __db_distab; typedef struct __db_distab DB_DISTAB; struct __db_env; typedef struct __db_env DB_ENV; +struct __db_event_mutex_died_info; + typedef struct __db_event_mutex_died_info DB_EVENT_MUTEX_DIED_INFO; +struct __db_event_failchk_info; + typedef struct __db_event_failchk_info DB_EVENT_FAILCHK_INFO; struct __db_h_stat; typedef struct __db_h_stat DB_HASH_STAT; struct __db_heap_rid; typedef struct __db_heap_rid DB_HEAP_RID; struct __db_heap_stat; typedef struct __db_heap_stat DB_HEAP_STAT; @@ -189,6 +194,7 @@ struct __db_repmgr_site;typedef struct __db_repmgr_site DB_REPMGR_SITE; struct __db_repmgr_stat;typedef struct __db_repmgr_stat DB_REPMGR_STAT; struct __db_seq_record; typedef struct __db_seq_record DB_SEQ_RECORD; struct __db_seq_stat; typedef struct __db_seq_stat DB_SEQUENCE_STAT; +struct __db_stream; typedef struct __db_stream DB_STREAM; struct __db_site; typedef struct __db_site DB_SITE; struct __db_sequence; typedef struct __db_sequence DB_SEQUENCE; struct __db_thread_info;typedef struct __db_thread_info DB_THREAD_INFO; @@ -226,18 +232,20 @@ struct __db_dbt { void *app_data; -#define DB_DBT_APPMALLOC 0x001 /* Callback allocated memory. */ -#define DB_DBT_BULK 0x002 /* Internal: Insert if duplicate. */ -#define DB_DBT_DUPOK 0x004 /* Internal: Insert if duplicate. */ -#define DB_DBT_ISSET 0x008 /* Lower level calls set value. */ -#define DB_DBT_MALLOC 0x010 /* Return in malloc'd memory. */ -#define DB_DBT_MULTIPLE 0x020 /* References multiple records. */ -#define DB_DBT_PARTIAL 0x040 /* Partial put/get. */ -#define DB_DBT_REALLOC 0x080 /* Return in realloc'd memory. */ -#define DB_DBT_READONLY 0x100 /* Readonly, don't update. */ -#define DB_DBT_STREAMING 0x200 /* Internal: DBT is being streamed. 
*/ -#define DB_DBT_USERCOPY 0x400 /* Use the user-supplied callback. */ -#define DB_DBT_USERMEM 0x800 /* Return in user's memory. */ +#define DB_DBT_APPMALLOC 0x0001 /* Callback allocated memory. */ +#define DB_DBT_BULK 0x0002 /* Internal: Insert if duplicate. */ +#define DB_DBT_DUPOK 0x0004 /* Internal: Insert if duplicate. */ +#define DB_DBT_ISSET 0x0008 /* Lower level calls set value. */ +#define DB_DBT_MALLOC 0x0010 /* Return in malloc'd memory. */ +#define DB_DBT_MULTIPLE 0x0020 /* References multiple records. */ +#define DB_DBT_PARTIAL 0x0040 /* Partial put/get. */ +#define DB_DBT_REALLOC 0x0080 /* Return in realloc'd memory. */ +#define DB_DBT_READONLY 0x0100 /* Readonly, don't update. */ +#define DB_DBT_STREAMING 0x0200 /* Internal: DBT is being streamed. */ +#define DB_DBT_USERCOPY 0x0400 /* Use the user-supplied callback. */ +#define DB_DBT_USERMEM 0x0800 /* Return in user's memory. */ +#define DB_DBT_BLOB 0x1000 /* Data item is a blob. */ +#define DB_DBT_BLOB_REC 0x2000 /* Internal: Blob database record. */ u_int32_t flags; }; @@ -274,6 +282,23 @@ struct __db_mutex_stat { /* SHARED */ #endif }; +/* Buffers passed to __mutex_describe() must be at least this large. */ +#define DB_MUTEX_DESCRIBE_STRLEN 128 + +/* This is the info of a DB_EVENT_MUTEX_DIED event notification. */ +struct __db_event_mutex_died_info { + pid_t pid; /* Process which last owned the mutex */ + db_threadid_t tid; /* Thread which last owned the mutex */ + db_mutex_t mutex; /* ID of the mutex */ + char desc[DB_MUTEX_DESCRIBE_STRLEN]; +}; + +/* This is the info of a DB_EVENT_FAILCHK event notification. */ +#define DB_FAILURE_SYMPTOM_SIZE 120 +struct __db_event_failchk_info { + int error; + char symptom[DB_FAILURE_SYMPTOM_SIZE]; +}; /* This is the length of the buffer passed to DB_ENV->thread_id_string() */ #define DB_THREADID_STRLEN 128 @@ -400,6 +425,8 @@ struct __db_lock_stat { /* SHARED */ uintmax_t st_lockers_nowait; /* Locker lock granted without wait. 
*/ uintmax_t st_region_wait; /* Region lock granted after wait. */ uintmax_t st_region_nowait; /* Region lock granted without wait. */ + uintmax_t st_nlockers_hit; /* Lockers found in thread info. */ + uintmax_t st_nlockers_reused; /* Lockers reallocated from thread info. */ u_int32_t st_hash_len; /* Max length of bucket. */ roff_t st_regsize; /* Region size. */ #endif @@ -469,7 +496,7 @@ struct __db_lockreq { /******************************************************* * Logging. *******************************************************/ -#define DB_LOGVERSION 19 /* Current log version. */ +#define DB_LOGVERSION 22 /* Current log version. */ #define DB_LOGVERSION_LATCHING 15 /* Log version using latching: db-4.8 */ #define DB_LOGCHKSUM 12 /* Check sum headers: db-4.5 */ #define DB_LOGOLDVER 8 /* Oldest version supported: db-4.2 */ @@ -595,7 +622,8 @@ typedef enum { LOGREC_PGDDBT, LOGREC_PGLIST, LOGREC_POINTER, - LOGREC_TIME + LOGREC_TIME, + LOGREC_LONGARG } log_rec_type_t; typedef const struct __log_rec_spec { @@ -755,6 +783,7 @@ struct __db_mpool_stat { /* SHARED */ uintmax_t st_mvcc_frozen; /* Buffers frozen. */ uintmax_t st_mvcc_thawed; /* Buffers thawed. */ uintmax_t st_mvcc_freed; /* Frozen buffers freed. */ + uintmax_t st_mvcc_reused; /* Outdated invisible buffers reused. */ uintmax_t st_alloc; /* Number of page allocations. */ uintmax_t st_alloc_buckets; /* Buckets checked during allocation. */ uintmax_t st_alloc_max_buckets;/* Max checked during allocation. */ @@ -762,6 +791,8 @@ struct __db_mpool_stat { /* SHARED */ uintmax_t st_alloc_max_pages; /* Max checked during allocation. */ uintmax_t st_io_wait; /* Thread waited on buffer I/O. */ uintmax_t st_sync_interrupted; /* Number of times sync interrupted. */ + u_int32_t st_oddfsize_detect; /* Odd file size detected. */ + u_int32_t st_oddfsize_resolve; /* Odd file size resolved. */ roff_t st_regsize; /* Region size. */ roff_t st_regmax; /* Region max. 
*/ #endif @@ -956,7 +987,7 @@ struct __db_txn { #define TXN_SNAPSHOT 0x08000 /* Snapshot Isolation. */ #define TXN_SYNC 0x10000 /* Write and sync on prepare/commit. */ #define TXN_WRITE_NOSYNC 0x20000 /* Write only on prepare/commit. */ -#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */ +#define TXN_BULK 0x40000 /* Enable bulk loading optimization. */ u_int32_t flags; }; @@ -1065,30 +1096,34 @@ struct __db_txn_token { /* * Event notification types. (Tcl testing interface currently assumes there are - * no more than 32 of these.) + * no more than 32 of these.). Comments include any relevant event_info types. */ #define DB_EVENT_PANIC 0 -#define DB_EVENT_REG_ALIVE 1 -#define DB_EVENT_REG_PANIC 2 -#define DB_EVENT_REP_CLIENT 3 -#define DB_EVENT_REP_CONNECT_BROKEN 4 -#define DB_EVENT_REP_CONNECT_ESTD 5 -#define DB_EVENT_REP_CONNECT_TRY_FAILED 6 -#define DB_EVENT_REP_DUPMASTER 7 -#define DB_EVENT_REP_ELECTED 8 -#define DB_EVENT_REP_ELECTION_FAILED 9 -#define DB_EVENT_REP_INIT_DONE 10 -#define DB_EVENT_REP_JOIN_FAILURE 11 -#define DB_EVENT_REP_LOCAL_SITE_REMOVED 12 -#define DB_EVENT_REP_MASTER 13 -#define DB_EVENT_REP_MASTER_FAILURE 14 -#define DB_EVENT_REP_NEWMASTER 15 -#define DB_EVENT_REP_PERM_FAILED 16 -#define DB_EVENT_REP_SITE_ADDED 17 -#define DB_EVENT_REP_SITE_REMOVED 18 -#define DB_EVENT_REP_STARTUPDONE 19 -#define DB_EVENT_REP_WOULD_ROLLBACK 20 /* Undocumented; C API only. */ -#define DB_EVENT_WRITE_FAILED 21 +#define DB_EVENT_REG_ALIVE 1 /* int: pid which was in env */ +#define DB_EVENT_REG_PANIC 2 /* int: error causing the panic. 
*/ +#define DB_EVENT_REP_AUTOTAKEOVER_FAILED 3 +#define DB_EVENT_REP_CLIENT 4 +#define DB_EVENT_REP_CONNECT_BROKEN 5 /* DB_REPMGR_CONN_ERR */ +#define DB_EVENT_REP_CONNECT_ESTD 6 /* int: EID of remote site */ +#define DB_EVENT_REP_CONNECT_TRY_FAILED 7 /* DB_REPMGR_CONN_ERR */ +#define DB_EVENT_REP_DUPMASTER 8 +#define DB_EVENT_REP_ELECTED 9 +#define DB_EVENT_REP_ELECTION_FAILED 10 +#define DB_EVENT_REP_INIT_DONE 11 +#define DB_EVENT_REP_INQUEUE_FULL 12 +#define DB_EVENT_REP_JOIN_FAILURE 13 +#define DB_EVENT_REP_LOCAL_SITE_REMOVED 14 +#define DB_EVENT_REP_MASTER 15 +#define DB_EVENT_REP_MASTER_FAILURE 16 +#define DB_EVENT_REP_NEWMASTER 17 /* int: new master's site id */ +#define DB_EVENT_REP_PERM_FAILED 18 +#define DB_EVENT_REP_SITE_ADDED 19 /* int: eid */ +#define DB_EVENT_REP_SITE_REMOVED 20 /* int: eid */ +#define DB_EVENT_REP_STARTUPDONE 21 +#define DB_EVENT_REP_WOULD_ROLLBACK 22 /* Undocumented; C API only. */ +#define DB_EVENT_WRITE_FAILED 23 +#define DB_EVENT_MUTEX_DIED 24 /* DB_EVENT_MUTEX_DIED_INFO */ +#define DB_EVENT_FAILCHK_PANIC 25 /* DB_EVENT_FAILCHK_INFO */ #define DB_EVENT_NO_SUCH_EVENT 0xffffffff /* OOB sentinel value */ /* Replication Manager site status. */ @@ -1102,6 +1137,7 @@ struct __db_repmgr_site { u_int32_t status; #define DB_REPMGR_ISPEER 0x01 +#define DB_REPMGR_ISVIEW 0x02 u_int32_t flags; }; @@ -1117,6 +1153,7 @@ struct __db_rep_stat { /* SHARED */ * circumstances, garbaged). */ u_int32_t st_startup_complete; /* Site completed client sync-up. */ + u_int32_t st_view; /* Site is a view. */ #ifndef __TEST_DB_NO_STATISTICS uintmax_t st_log_queued; /* Log records currently queued.+ */ u_int32_t st_status; /* Current replication status. */ @@ -1194,6 +1231,7 @@ struct __db_rep_stat { /* SHARED */ /* Undocumented statistics only used by the test system. */ #ifdef CONFIG_TEST u_int32_t st_filefail_cleanups; /* # of FILE_FAIL cleanups done. */ + uintmax_t st_log_futuredup; /* Future log records that are dups. 
*/ #endif #endif }; @@ -1204,10 +1242,18 @@ struct __db_repmgr_stat { /* SHARED */ uintmax_t st_msgs_queued; /* # msgs queued for network delay. */ uintmax_t st_msgs_dropped; /* # msgs discarded due to excessive queue length. */ + u_int32_t st_incoming_queue_gbytes; /* Incoming queue size: GB. */ + u_int32_t st_incoming_queue_bytes; /* Incoming queue size: B. */ + uintmax_t st_incoming_msgs_dropped; /* # of msgs discarded due to + incoming queue full. */ uintmax_t st_connection_drop; /* Existing connections dropped. */ uintmax_t st_connect_fail; /* Failed new connection attempts. */ - uintmax_t st_elect_threads; /* # of active election threads. */ - uintmax_t st_max_elect_threads; /* Max concurrent e-threads ever. */ + u_int32_t st_elect_threads; /* # of active election threads. */ + u_int32_t st_max_elect_threads; /* Max concurrent e-threads ever. */ + u_int32_t st_site_participants; /* # of repgroup participant sites. */ + u_int32_t st_site_total; /* # of repgroup total sites. */ + u_int32_t st_site_views; /* # of repgroup view sites. */ + uintmax_t st_takeovers; /* # of automatic listener takeovers. */ }; /* Replication Manager connection error. */ @@ -1238,7 +1284,7 @@ struct __db_sequence { db_mutex_t mtx_seq; /* Mutex if sequence is threaded. */ DB_SEQ_RECORD *seq_rp; /* Pointer to current data. */ DB_SEQ_RECORD seq_record; /* Data from DB_SEQUENCE. */ - int32_t seq_cache_size; /* Number of values cached. */ + u_int32_t seq_cache_size; /* Number of values cached. */ db_seq_t seq_last_value; /* Last value cached. */ db_seq_t seq_prev_value; /* Last value returned. */ DBT seq_key; /* DBT pointing to sequence key. 
*/ @@ -1250,8 +1296,8 @@ struct __db_sequence { /* DB_SEQUENCE PUBLIC HANDLE LIST BEGIN */ int (*close) __P((DB_SEQUENCE *, u_int32_t)); int (*get) __P((DB_SEQUENCE *, - DB_TXN *, int32_t, db_seq_t *, u_int32_t)); - int (*get_cachesize) __P((DB_SEQUENCE *, int32_t *)); + DB_TXN *, u_int32_t, db_seq_t *, u_int32_t)); + int (*get_cachesize) __P((DB_SEQUENCE *, u_int32_t *)); int (*get_db) __P((DB_SEQUENCE *, DB **)); int (*get_flags) __P((DB_SEQUENCE *, u_int32_t *)); int (*get_key) __P((DB_SEQUENCE *, DBT *)); @@ -1261,7 +1307,7 @@ struct __db_sequence { int (*open) __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t)); int (*remove) __P((DB_SEQUENCE *, DB_TXN *, u_int32_t)); - int (*set_cachesize) __P((DB_SEQUENCE *, int32_t)); + int (*set_cachesize) __P((DB_SEQUENCE *, u_int32_t)); int (*set_flags) __P((DB_SEQUENCE *, u_int32_t)); int (*set_range) __P((DB_SEQUENCE *, db_seq_t, db_seq_t)); int (*stat) __P((DB_SEQUENCE *, @@ -1278,7 +1324,7 @@ struct __db_seq_stat { /* SHARED */ db_seq_t st_last_value; /* Last cached value. */ db_seq_t st_min; /* Minimum value. */ db_seq_t st_max; /* Maximum value. */ - int32_t st_cache_size; /* Cache size. */ + u_int32_t st_cache_size; /* Cache size. */ u_int32_t st_flags; /* Flag value. */ }; @@ -1300,15 +1346,15 @@ typedef enum { #define DB_RENAMEMAGIC 0x030800 /* File has been renamed. */ -#define DB_BTREEVERSION 9 /* Current btree version. */ +#define DB_BTREEVERSION 10 /* Current btree version. */ #define DB_BTREEOLDVER 8 /* Oldest btree version supported. */ #define DB_BTREEMAGIC 0x053162 -#define DB_HASHVERSION 9 /* Current hash version. */ +#define DB_HASHVERSION 10 /* Current hash version. */ #define DB_HASHOLDVER 7 /* Oldest hash version supported. */ #define DB_HASHMAGIC 0x061561 -#define DB_HEAPVERSION 1 /* Current heap version. */ +#define DB_HEAPVERSION 2 /* Current heap version. */ #define DB_HEAPOLDVER 1 /* Oldest heap version supported. 
*/ #define DB_HEAPMAGIC 0x074582 @@ -1377,6 +1423,7 @@ typedef enum { #define DB_LOCK_NOTGRANTED (-30992)/* Lock unavailable. */ #define DB_LOG_BUFFER_FULL (-30991)/* In-memory log buffer full. */ #define DB_LOG_VERIFY_BAD (-30990)/* Log verification failed. */ +#define DB_META_CHKSUM_FAIL (-30968)/* Metadata page checksum failed. */ #define DB_NOSERVER (-30989)/* Server panic return. */ #define DB_NOTFOUND (-30988)/* Key/data pair not found (EOF). */ #define DB_OLD_VERSION (-30987)/* Out-of-date version. */ @@ -1405,6 +1452,8 @@ typedef enum { #define DB_DELETED (-30897)/* Recovery file marked deleted. */ #define DB_EVENT_NOT_HANDLED (-30896)/* Forward event to application. */ #define DB_NEEDSPLIT (-30895)/* Page needs to be split. */ +#define DB_NOINTMP (-30886)/* Sequences not supported in temporary + or in-memory databases. */ #define DB_REP_BULKOVF (-30894)/* Rep bulk buffer overflow. */ #define DB_REP_LOGREADY (-30893)/* Rep log ready for recovery. */ #define DB_REP_NEWMASTER (-30892)/* We have learned of a new master. */ @@ -1415,6 +1464,13 @@ typedef enum { #define DB_TXN_CKP (-30888)/* Encountered ckp record in log. */ #define DB_VERIFY_FATAL (-30887)/* DB->verify cannot proceed. */ +/* + * This exit status indicates that a BDB utility failed because it needed a + * resource which had been held by a process which crashed or otherwise did + * not exit cleanly. + */ +#define DB_EXIT_FAILCHK 3 + /* Database handle. */ struct __db { /******************************************************* @@ -1426,7 +1482,7 @@ struct __db { /* Callbacks. */ int (*db_append_recno) __P((DB *, DBT *, db_recno_t)); void (*db_feedback) __P((DB *, int, int)); - int (*dup_compare) __P((DB *, const DBT *, const DBT *)); + int (*dup_compare) __P((DB *, const DBT *, const DBT *, size_t *)); void *app_private; /* Application-private handle. */ @@ -1450,6 +1506,8 @@ struct __db { u_int32_t adj_fileid; /* File's unique ID for curs. adj. 
*/ + u_int32_t blob_threshold; /* Blob threshold record size. */ + #define DB_LOGFILEID_INVALID -1 FNAME *log_filename; /* File's naming info for logging. */ @@ -1593,6 +1651,12 @@ struct __db { /* Reference to foreign -- set in the secondary. */ DB *s_foreign; + DB *blob_meta_db; /* Databases holding blob metadata. */ + DB_SEQUENCE *blob_seq; /* Sequence of blob ids. */ + char *blob_sub_dir; /* Subdirectory for blob files */ + db_seq_t blob_file_id; /* Id of the file blob directory. */ + db_seq_t blob_sdb_id; /* Id of the subdb blob directory. */ + /* API-private structure: used by DB 1.85, C++, Java, Perl and Tcl */ void *api_internal; @@ -1623,8 +1687,11 @@ struct __db { void *(**)(void *, size_t), void (**)(void *))); int (*get_append_recno) __P((DB *, int (**)(DB *, DBT *, db_recno_t))); int (*get_assoc_flags) __P((DB *, u_int32_t *)); + int (*get_blob_dir) __P((DB *, const char **)); + int (*get_blob_sub_dir) __P((DB *, const char **)); + int (*get_blob_threshold) __P((DB *, u_int32_t *)); int (*get_bt_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_bt_compress) __P((DB *, int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), @@ -1637,7 +1704,7 @@ struct __db { int (*get_create_dir) __P((DB *, const char **)); int (*get_dbname) __P((DB *, const char **, const char **)); int (*get_dup_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_encrypt_flags) __P((DB *, u_int32_t *)); DB_ENV *(*get_env) __P((DB *)); void (*get_errcall) __P((DB *, @@ -1647,7 +1714,7 @@ struct __db { int (*get_feedback) __P((DB *, void (**)(DB *, int, int))); int (*get_flags) __P((DB *, u_int32_t *)); int (*get_h_compare) - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); int (*get_h_ffactor) __P((DB *, u_int32_t 
*)); int (*get_h_hash) __P((DB *, u_int32_t (**)(DB *, const void *, u_int32_t))); @@ -1688,8 +1755,10 @@ struct __db { int (*set_alloc) __P((DB *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *))); int (*set_append_recno) __P((DB *, int (*)(DB *, DBT *, db_recno_t))); + int (*set_blob_dir) __P((DB *, const char *)); + int (*set_blob_threshold) __P((DB *, u_int32_t, u_int32_t)); int (*set_bt_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_bt_compress) __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); @@ -1699,7 +1768,7 @@ struct __db { int (*set_cachesize) __P((DB *, u_int32_t, u_int32_t, int)); int (*set_create_dir) __P((DB *, const char *)); int (*set_dup_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_encrypt) __P((DB *, const char *, u_int32_t)); void (*set_errcall) __P((DB *, void (*)(const DB_ENV *, const char *, const char *))); @@ -1708,7 +1777,7 @@ struct __db { int (*set_feedback) __P((DB *, void (*)(DB *, int, int))); int (*set_flags) __P((DB *, u_int32_t)); int (*set_h_compare) - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int (*set_h_ffactor) __P((DB *, u_int32_t)); int (*set_h_hash) __P((DB *, u_int32_t (*)(DB *, const void *, u_int32_t))); @@ -1808,13 +1877,34 @@ struct __db { u_int32_t orig_flags; /* Flags at open, for refresh */ u_int32_t flags; -#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */ -#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. 
*/ -#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */ - u_int32_t orig_flags2; /* Second flags word; for refresh */ +#define DB2_AM_EXCL 0x00000001 /* Exclusively lock the handle */ +#define DB2_AM_INTEXCL 0x00000002 /* Internal exclusive lock. */ +#define DB2_AM_NOWAIT 0x00000004 /* Do not wait for handle lock */ u_int32_t flags2; /* Second flags word */ }; +/* + * Stream interface for blob files. + */ +struct __db_stream { + DBC *dbc; /* Cursor pointing to the db blob record. */ + DB_FH *fhp; + + /* DB_STREAM PUBLIC HANDLE LIST BEGIN */ + int (*close) __P((DB_STREAM *, u_int32_t)); + int (*read) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t, u_int32_t)); + int (*size) __P((DB_STREAM *, db_off_t *, u_int32_t)); + int (*write) __P((DB_STREAM *, DBT *, db_off_t, u_int32_t)); + /* DB_STREAM PUBLIC HANDLE LIST END */ + + u_int32_t flags; +#define DB_STREAM_READ 0x00000001 /* Stream is read only. */ +#define DB_STREAM_WRITE 0x00000002 /* Stream is writeable. */ +#define DB_STREAM_SYNC_WRITE 0x00000004 /* Sync file on each write. */ + db_seq_t blob_id; + db_off_t file_size; +}; + /* * Macros for bulk operations. These are only intended for the C API. * For C++, use DbMultiple*Iterator or DbMultiple*Builder. 
@@ -1889,7 +1979,7 @@ struct __db { pointer = __p; \ } while (0) -#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \ +#define DB_MULTIPLE_WRITE_INIT(pointer, dbt) \ do { \ (dbt)->flags |= DB_DBT_BULK; \ pointer = (u_int8_t *)(dbt)->data + \ @@ -1897,7 +1987,7 @@ struct __db { *(u_int32_t *)(pointer) = (u_int32_t)-1; \ } while (0) -#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \ +#define DB_MULTIPLE_RESERVE_NEXT(pointer, dbt, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1914,7 +2004,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \ +#define DB_MULTIPLE_WRITE_NEXT(pointer, dbt, writedata, writedlen) \ do { \ void *__destd; \ DB_MULTIPLE_RESERVE_NEXT((pointer), (dbt), \ @@ -1925,7 +2015,7 @@ struct __db { memcpy(__destd, (writedata), (writedlen)); \ } while (0) -#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ +#define DB_MULTIPLE_KEY_RESERVE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1948,7 +2038,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ +#define DB_MULTIPLE_KEY_WRITE_NEXT(pointer, dbt, writekey, writeklen, writedata, writedlen) \ do { \ void *__destk, *__destd; \ DB_MULTIPLE_KEY_RESERVE_NEXT((pointer), (dbt), \ @@ -1962,7 +2052,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \ +#define DB_MULTIPLE_RECNO_WRITE_INIT(pointer, dbt) \ do { \ (dbt)->flags |= DB_DBT_BULK; \ pointer = (u_int8_t *)(dbt)->data + \ @@ -1970,7 +2060,7 @@ struct __db { *(u_int32_t *)(pointer) = 0; \ } while (0) -#define DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \ +#define 
DB_MULTIPLE_RECNO_RESERVE_NEXT(pointer, dbt, recno, writedata, writedlen) \ do { \ u_int32_t *__p = (u_int32_t *)(pointer); \ u_int32_t __off = ((pointer) == (u_int8_t *)(dbt)->data +\ @@ -1988,7 +2078,7 @@ struct __db { } \ } while (0) -#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\ +#define DB_MULTIPLE_RECNO_WRITE_NEXT(pointer, dbt, recno, writedata, writedlen)\ do { \ void *__destd; \ DB_MULTIPLE_RECNO_RESERVE_NEXT((pointer), (dbt), \ @@ -2003,7 +2093,7 @@ struct __db_heap_rid { db_pgno_t pgno; /* Page number. */ db_indx_t indx; /* Index in the offset table. */ }; -#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t)) +#define DB_HEAP_RID_SZ (sizeof(db_pgno_t) + sizeof(db_indx_t)) /******************************************************* * Access method cursors. @@ -2074,6 +2164,7 @@ struct __dbc { int (*close) __P((DBC *)); int (*cmp) __P((DBC *, DBC *, int *, u_int32_t)); int (*count) __P((DBC *, db_recno_t *, u_int32_t)); + int (*db_stream) __P((DBC *, DB_STREAM **, u_int32_t)); int (*del) __P((DBC *, u_int32_t)); int (*dup) __P((DBC *, DBC **, u_int32_t)); int (*get) __P((DBC *, DBT *, DBT *, u_int32_t)); @@ -2151,6 +2242,7 @@ struct __db_bt_stat { /* SHARED */ u_int32_t bt_pagecnt; /* Page count. */ u_int32_t bt_pagesize; /* Page size. */ u_int32_t bt_minkey; /* Minkey value. */ + u_int32_t bt_nblobs; /* Number of blobs. */ u_int32_t bt_re_len; /* Fixed-length record length. */ u_int32_t bt_re_pad; /* Fixed-length record pad. */ u_int32_t bt_levels; /* Tree levels. */ @@ -2179,7 +2271,7 @@ struct __db_compact { u_int32_t compact_deadlock; /* Number of deadlocks. */ db_pgno_t compact_pages_truncated; /* Pages truncated to OS. */ /* Internal. */ - db_pgno_t compact_truncate; /* Page number for truncation */ + db_pgno_t compact_truncate; /* Exchange pages above here. */ }; /* Hash statistics structure. */ @@ -2189,6 +2281,7 @@ struct __db_h_stat { /* SHARED */ u_int32_t hash_metaflags; /* Metadata flags. 
*/ u_int32_t hash_nkeys; /* Number of unique keys. */ u_int32_t hash_ndata; /* Number of data items. */ + u_int32_t hash_nblobs; /* Number of blobs. */ u_int32_t hash_pagecnt; /* Page count. */ u_int32_t hash_pagesize; /* Page size. */ u_int32_t hash_ffactor; /* Fill factor specified at create. */ @@ -2208,6 +2301,7 @@ struct __db_heap_stat { /* SHARED */ u_int32_t heap_magic; /* Magic number. */ u_int32_t heap_version; /* Version number. */ u_int32_t heap_metaflags; /* Metadata flags. */ + u_int32_t heap_nblobs; /* Number of blobs. */ u_int32_t heap_nrecs; /* Number of records. */ u_int32_t heap_pagecnt; /* Page count. */ u_int32_t heap_pagesize; /* Page size. */ @@ -2267,21 +2361,15 @@ typedef enum { * Backup configuration types. */ typedef enum { - DB_BACKUP_READ_COUNT = 1, - DB_BACKUP_READ_SLEEP = 2, - DB_BACKUP_SIZE = 3, - DB_BACKUP_WRITE_DIRECT = 4 + DB_BACKUP_READ_COUNT=1, + DB_BACKUP_READ_SLEEP=2, + DB_BACKUP_SIZE=3, + DB_BACKUP_WRITE_DIRECT=4 } DB_BACKUP_CONFIG; struct __db_env { ENV *env; /* Linked ENV structure */ - /* - * The DB_ENV structure can be used concurrently, so field access is - * protected. 
- */ - db_mutex_t mtx_db_env; /* DB_ENV structure mutex */ - /* Error message callback */ void (*db_errcall) __P((const DB_ENV *, const char *, const char *)); FILE *db_errfile; /* Error message file stream */ @@ -2304,6 +2392,7 @@ struct __db_env { char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *)); /* Application specified paths */ + char *db_blob_dir; /* Blob file directory */ char *db_log_dir; /* Database log file directory */ char *db_md_dir; /* Persistent metadata directory */ char *db_tmp_dir; /* Database tmp file directory */ @@ -2327,6 +2416,8 @@ struct __db_env { u_int32_t verbose; /* DB_VERB_XXX flags */ + u_int32_t blob_threshold; /* Blob threshold record size */ + /* Mutex configuration */ u_int32_t mutex_align; /* Mutex alignment */ u_int32_t mutex_cnt; /* Number of mutexes to configure */ @@ -2395,6 +2486,11 @@ struct __db_env { * build settings. */ db_timeout_t envreg_timeout; /* DB_REGISTER wait timeout */ + /* + * When failchk broadcasting is active, any wait for a mutex will wake + * up this frequently in order to check whether the mutex has died. 
+ */ + db_timeout_t mutex_failchk_timeout; #define DB_ENV_AUTO_COMMIT 0x00000001 /* DB_AUTO_COMMIT */ #define DB_ENV_CDB_ALLDB 0x00000002 /* CDB environment wide locking */ @@ -2414,8 +2510,8 @@ struct __db_env { #define DB_ENV_TXN_SNAPSHOT 0x00008000 /* DB_TXN_SNAPSHOT set */ #define DB_ENV_TXN_WRITE_NOSYNC 0x00010000 /* DB_TXN_WRITE_NOSYNC set */ #define DB_ENV_YIELDCPU 0x00020000 /* DB_YIELDCPU set */ -#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */ -#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */ +#define DB_ENV_HOTBACKUP 0x00040000 /* DB_HOTBACKUP_IN_PROGRESS set */ +#define DB_ENV_NOFLUSH 0x00080000 /* DB_NOFLUSH set */ u_int32_t flags; /* DB_ENV PUBLIC HANDLE LIST BEGIN */ @@ -2436,6 +2532,8 @@ struct __db_env { void *(**)(void *, size_t), void (**)(void *))); int (*get_app_dispatch) __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*get_blob_dir) __P((DB_ENV *, const char **)); + int (*get_blob_threshold) __P((DB_ENV*, u_int32_t *)); int (*get_cache_max) __P((DB_ENV *, u_int32_t *, u_int32_t *)); int (*get_cachesize) __P((DB_ENV *, u_int32_t *, u_int32_t *, int *)); int (*get_create_dir) __P((DB_ENV *, const char **)); @@ -2451,8 +2549,8 @@ struct __db_env { void (**)(const DB_ENV *, const char *, const char *))); void (*get_errfile) __P((DB_ENV *, FILE **)); void (*get_errpfx) __P((DB_ENV *, const char **)); - int (*get_flags) __P((DB_ENV *, u_int32_t *)); int (*get_feedback) __P((DB_ENV *, void (**)(DB_ENV *, int, int))); + int (*get_flags) __P((DB_ENV *, u_int32_t *)); int (*get_home) __P((DB_ENV *, const char **)); int (*get_intermediate_dir_mode) __P((DB_ENV *, const char **)); int (*get_isalive) __P((DB_ENV *, @@ -2568,17 +2666,23 @@ struct __db_env { int (*rep_set_timeout) __P((DB_ENV *, int, db_timeout_t)); int (*rep_set_transport) __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t))); + int (*rep_set_view) __P((DB_ENV *, int (*)(DB_ENV *, + const char *, 
int *, u_int32_t))); int (*rep_start) __P((DB_ENV *, DBT *, u_int32_t)); int (*rep_stat) __P((DB_ENV *, DB_REP_STAT **, u_int32_t)); int (*rep_stat_print) __P((DB_ENV *, u_int32_t)); int (*rep_sync) __P((DB_ENV *, u_int32_t)); int (*repmgr_channel) __P((DB_ENV *, int, DB_CHANNEL **, u_int32_t)); int (*repmgr_get_ack_policy) __P((DB_ENV *, int *)); + int (*repmgr_get_incoming_queue_max) + __P((DB_ENV *, u_int32_t *, u_int32_t *)); int (*repmgr_local_site) __P((DB_ENV *, DB_SITE **)); int (*repmgr_msg_dispatch) __P((DB_ENV *, void (*)(DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t), u_int32_t)); int (*repmgr_set_ack_policy) __P((DB_ENV *, int)); + int (*repmgr_set_incoming_queue_max) + __P((DB_ENV *, u_int32_t, u_int32_t)); int (*repmgr_site) __P((DB_ENV *, const char *, u_int, DB_SITE**, u_int32_t)); int (*repmgr_site_by_eid) __P((DB_ENV *, int, DB_SITE**)); @@ -2590,6 +2694,8 @@ struct __db_env { void *(*)(void *, size_t), void (*)(void *))); int (*set_app_dispatch) __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops))); + int (*set_blob_dir) __P((DB_ENV *, const char *)); + int (*set_blob_threshold) __P((DB_ENV *, u_int32_t, u_int32_t)); int (*set_cache_max) __P((DB_ENV *, u_int32_t, u_int32_t)); int (*set_cachesize) __P((DB_ENV *, u_int32_t, u_int32_t, int)); int (*set_create_dir) __P((DB_ENV *, const char *)); @@ -2662,8 +2768,8 @@ struct __db_env { /* DB_ENV PUBLIC HANDLE LIST END */ /* DB_ENV PRIVATE HANDLE LIST BEGIN */ - int (*prdbt) __P((DBT *, int, - const char *, void *, int (*)(void *, const void *), int, int)); + int (*prdbt) __P((DBT *, int, const char *, void *, + int (*)(void *, const void *), int, int, int)); /* DB_ENV PRIVATE HANDLE LIST END */ }; diff --git a/src/dbinc/db_185.in b/src/dbinc/db_185.in index 43735344..3aef2eca 100644 --- a/src/dbinc/db_185.in +++ b/src/dbinc/db_185.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 diff --git a/src/dbinc/db_am.h b/src/dbinc/db_am.h index f34578c4..2b5c49d2 100644 --- a/src/dbinc/db_am.h +++ b/src/dbinc/db_am.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -199,12 +199,16 @@ struct __db_foreign_info { #define DB_IS_PRIMARY(dbp) (LIST_FIRST(&dbp->s_secondaries) != NULL) /* * A database should be required to be readonly if it's been explicitly - * specified as such or if we're a client in a replicated environment - * and the user did not specify DB_TXN_NOT_DURABLE. + * specified as such, if we're a client in a replicated environment + * and the user did not specify DB_TXN_NOT_DURABLE, or if we're a master + * in a replicated environment and the REP_F_READONLY_MASTER flag has been + * set in preparation for a preferred master takeover. */ #define DB_IS_READONLY(dbp) \ (F_ISSET(dbp, DB_AM_RDONLY) || \ - (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE))) + (IS_REP_CLIENT((dbp)->env) && !F_ISSET((dbp), DB_AM_NOT_DURABLE)) \ + || (IS_REP_MASTER((dbp)->env) && \ + F_ISSET((dbp)->env->rep_handle->region, REP_F_READONLY_MASTER))) #ifdef HAVE_COMPRESSION /* diff --git a/src/dbinc/db_cxx.in b/src/dbinc/db_cxx.in index 84fc0f88..5b29f7e8 100644 --- a/src/dbinc/db_cxx.in +++ b/src/dbinc/db_cxx.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -76,6 +76,7 @@ class DbMpoolFile; // forward class DbPreplist; // forward class DbSequence; // forward class DbSite; // forward +class DbStream; // forward class Dbt; // forward class DbTxn; // forward @@ -159,13 +160,13 @@ extern "C" { typedef void (*db_free_fcn_type) (void *); typedef int (*bt_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef size_t (*bt_prefix_fcn_type) /*C++ version available*/ (DB *, const DBT *, const DBT *); typedef int (*dup_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef int (*h_compare_fcn_type) /*C++ version available*/ - (DB *, const DBT *, const DBT *); + (DB *, const DBT *, const DBT *, size_t *); typedef u_int32_t (*h_hash_fcn_type) /*C++ version available*/ (DB *, const void *, u_int32_t); typedef int (*pgin_fcn_type) @@ -204,7 +205,10 @@ public: virtual int get_alloc( db_malloc_fcn_type *, db_realloc_fcn_type *, db_free_fcn_type *); virtual int get_append_recno(int (**)(Db *, Dbt *, db_recno_t)); - virtual int get_bt_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_blob_dir(const char **); + virtual int get_blob_threshold(u_int32_t *); + virtual int get_bt_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_bt_compress( int (**)( Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), @@ -215,7 +219,8 @@ public: virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); virtual int get_create_dir(const char **); virtual int get_dbname(const char **, const char **); - virtual int get_dup_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_dup_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_encrypt_flags(u_int32_t *); virtual void get_errcall( void (**)(const DbEnv *, const char *, const char *)); @@ -225,7 +230,8 @@ public: virtual int 
get_flags(u_int32_t *); virtual int get_heapsize(u_int32_t *, u_int32_t *); virtual int get_heap_regionsize(u_int32_t *); - virtual int get_h_compare(int (**)(Db *, const Dbt *, const Dbt *)); + virtual int get_h_compare( + int (**)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int get_h_ffactor(u_int32_t *); virtual int get_h_hash(u_int32_t (**)(Db *, const void *, u_int32_t)); virtual int get_h_nelem(u_int32_t *); @@ -261,8 +267,11 @@ public: db_malloc_fcn_type, db_realloc_fcn_type, db_free_fcn_type); virtual void set_app_private(void *); virtual int set_append_recno(int (*)(Db *, Dbt *, db_recno_t)); + virtual int set_blob_dir(const char *); + virtual int set_blob_threshold(u_int32_t, u_int32_t); virtual int set_bt_compare(bt_compare_fcn_type); /*deprecated*/ - virtual int set_bt_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_bt_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_bt_compress( int (*) (Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *), @@ -273,7 +282,8 @@ public: virtual int set_cachesize(u_int32_t, u_int32_t, int); virtual int set_create_dir(const char *); virtual int set_dup_compare(dup_compare_fcn_type); /*deprecated*/ - virtual int set_dup_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_dup_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_encrypt(const char *, u_int32_t); virtual void set_errcall( void (*)(const DbEnv *, const char *, const char *)); @@ -284,7 +294,8 @@ public: virtual int set_heapsize(u_int32_t, u_int32_t); virtual int set_heap_regionsize(u_int32_t); virtual int set_h_compare(h_compare_fcn_type); /*deprecated*/ - virtual int set_h_compare(int (*)(Db *, const Dbt *, const Dbt *)); + virtual int set_h_compare( + int (*)(Db *, const Dbt *, const Dbt *, size_t *)); virtual int set_h_ffactor(u_int32_t); virtual int set_h_hash(h_hash_fcn_type); /*deprecated*/ virtual int set_h_hash(u_int32_t (*)(Db *, const void 
*, u_int32_t)); @@ -383,16 +394,16 @@ public: int (*associate_callback_)(Db *, const Dbt *, const Dbt *, Dbt *); int (*associate_foreign_callback_) (Db *, const Dbt *, Dbt *, const Dbt *, int *); - int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*bt_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); int (*bt_compress_callback_)( Db *, const Dbt *, const Dbt *, const Dbt *, const Dbt *, Dbt *); int (*bt_decompress_callback_)( Db *, const Dbt *, const Dbt *, Dbt *, Dbt *, Dbt *); size_t (*bt_prefix_callback_)(Db *, const Dbt *, const Dbt *); u_int32_t (*db_partition_callback_)(Db *, Dbt *); - int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*dup_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); void (*feedback_callback_)(Db *, int, int); - int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *); + int (*h_compare_callback_)(Db *, const Dbt *, const Dbt *, size_t *); u_int32_t (*h_hash_callback_)(Db *, const void *, u_int32_t); }; @@ -407,6 +418,7 @@ public: int close(); int cmp(Dbc *other_csr, int *result, u_int32_t flags); int count(db_recno_t *countp, u_int32_t flags); + int db_stream(DbStream **dbsp, u_int32_t flags); int del(u_int32_t flags); int dup(Dbc** cursorp, u_int32_t flags); int get(Dbt* key, Dbt *data, u_int32_t flags); @@ -527,6 +539,10 @@ public: int (*)(DbEnv *, const char *, void *)); virtual int get_backup_config(DB_BACKUP_CONFIG, u_int32_t *); virtual int set_backup_config(DB_BACKUP_CONFIG, u_int32_t); + virtual int get_blob_dir(const char **); + virtual int set_blob_dir(const char *); + virtual int get_blob_threshold(u_int32_t *); + virtual int set_blob_threshold(u_int32_t, u_int32_t); virtual int get_cachesize(u_int32_t *, u_int32_t *, int *); virtual int set_cachesize(u_int32_t, u_int32_t, int); virtual int get_cache_max(u_int32_t *, u_int32_t *); @@ -761,10 +777,16 @@ public: virtual int rep_set_priority(u_int32_t priority); virtual int rep_get_timeout(int which, db_timeout_t 
*timeout); virtual int rep_set_timeout(int which, db_timeout_t timeout); + virtual int rep_set_view(int (*)(DbEnv *, + const char *, int *, u_int32_t)); virtual int repmgr_channel(int eid, DbChannel **channel, u_int32_t flags); virtual int repmgr_get_ack_policy(int *policy); virtual int repmgr_set_ack_policy(int policy); + virtual int repmgr_get_incoming_queue_max(u_int32_t *gbytesp, + u_int32_t *bytesp); + virtual int repmgr_set_incoming_queue_max(u_int32_t gbytes, + u_int32_t bytes); virtual int repmgr_local_site(DbSite **site); virtual int repmgr_msg_dispatch(void (*) (DbEnv *, DbChannel *, Dbt *, u_int32_t, u_int32_t), u_int32_t flags); @@ -824,6 +846,8 @@ public: static int _backup_write_intercept(DB_ENV *dbenv, u_int32_t off_gbytes, u_int32_t off_bytes, u_int32_t size, u_int8_t *buf, void *handle); static void _paniccall_intercept(DB_ENV *dbenv, int errval); + static int _partial_rep_intercept(DB_ENV *dbenv, + const char *name, int *result, u_int32_t flags); static void _feedback_intercept(DB_ENV *dbenv, int opcode, int pct); static void _event_func_intercept(DB_ENV *dbenv, u_int32_t, void *); static int _isalive_intercept(DB_ENV *dbenv, pid_t pid, @@ -872,6 +896,7 @@ private: void (*feedback_callback_)(DbEnv *, int, int); void (*message_callback_)(const DbEnv *, const char *); void (*paniccall_callback_)(DbEnv *, int); + int (*partial_rep_callback_)(DbEnv *, const char *, int *, u_int32_t); void (*event_func_callback_)(DbEnv *, u_int32_t, void *); int (*rep_send_callback_)(DbEnv *, const Dbt *, const Dbt *, const DbLsn *, int, u_int32_t); @@ -1057,9 +1082,9 @@ public: int stat(DB_SEQUENCE_STAT **sp, u_int32_t flags); int stat_print(u_int32_t flags); - int get(DbTxn *txnid, int32_t delta, db_seq_t *retp, u_int32_t flags); - int get_cachesize(int32_t *sizep); - int set_cachesize(int32_t size); + int get(DbTxn *txnid, u_int32_t delta, db_seq_t *retp, u_int32_t flags); + int get_cachesize(u_int32_t *sizep); + int set_cachesize(u_int32_t size); int 
get_flags(u_int32_t *flagsp); int set_flags(u_int32_t flags); int get_range(db_seq_t *minp, db_seq_t *maxp); @@ -1137,6 +1162,34 @@ private: }; // +// DbStream +// +class _exported DbStream : protected DB_STREAM +{ + friend class Dbc; + +public: + int close(u_int32_t flags); + int read(Dbt *data, db_off_t offset, u_int32_t size, u_int32_t flags); + int size(db_off_t *size, u_int32_t flags); + int write(Dbt *data, db_off_t offset, u_int32_t flags); + +private: + // No data is permitted in this class (see comment at top) + + // Note: use Dbc::dbstream() to get pointers to a DbStream, + // and call Dbstream::close() rather than delete to release them. + // + DbStream(); + ~DbStream(); + + // no copying + DbStream(const DbStream &); + DbStream &operator = (const DbStream &); + +}; + +// // Transaction // class _exported DbTxn @@ -1245,6 +1298,7 @@ class _exported Dbt : private DBT friend class DbEnv; friend class DbLogc; friend class DbSequence; + friend class DbStream; public: // key/data diff --git a/src/dbinc/db_dispatch.h b/src/dbinc/db_dispatch.h index b6382871..b3aedab1 100644 --- a/src/dbinc/db_dispatch.h +++ b/src/dbinc/db_dispatch.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 diff --git a/src/dbinc/db_int.in b/src/dbinc/db_int.in index 42439107..593deef6 100644 --- a/src/dbinc/db_int.in +++ b/src/dbinc/db_int.in @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,17 @@ #endif /* !HAVE_SYSTEM_INCLUDE_FILES */ +/* + * The Windows compiler needs to be told about structures that are available + * outside a dll. 
+ */ +#if defined(DB_WIN32) && defined(_MSC_VER) && \ + !defined(DB_CREATE_DLL) && !defined(_LIB) +#define __DB_IMPORT __declspec(dllimport) +#else +#define __DB_IMPORT +#endif + #ifdef DB_WIN32 #include "dbinc/win_db.h" #endif @@ -88,22 +99,12 @@ #include "dbinc/queue.h" #include "dbinc/shqueue.h" #include "dbinc/perfmon.h" +#include "dbinc/clock.h" #if defined(__cplusplus) extern "C" { #endif -/* - * The Windows compiler needs to be told about structures that are available - * outside a dll. - */ -#if defined(DB_WIN32) && defined(_MSC_VER) && \ - !defined(DB_CREATE_DLL) && !defined(_LIB) -#define __DB_IMPORT __declspec(dllimport) -#else -#define __DB_IMPORT -#endif - /******************************************************* * Forward structure declarations. *******************************************************/ @@ -366,22 +367,27 @@ typedef struct __fn { /* * Structure used for callback message aggregation. * - * Display values in XXX_stat_print calls. + * DB_MSGBUF_FLUSH displays values in XXX_stat_print calls. + * DB_MSGBUF_REP_FLUSH displays replication system messages. */ typedef struct __db_msgbuf { char *buf; /* Heap allocated buffer. */ char *cur; /* Current end of message. */ size_t len; /* Allocated length of buffer. 
*/ + int flags; } DB_MSGBUF; +#define DB_MSGBUF_PREALLOCATED 0x0001 + #define DB_MSGBUF_INIT(a) do { \ (a)->buf = (a)->cur = NULL; \ - (a)->len = 0; \ + (a)->len = (a)->flags = 0; \ } while (0) #define DB_MSGBUF_FLUSH(env, a) do { \ if ((a)->buf != NULL) { \ if ((a)->cur != (a)->buf) \ __db_msg(env, "%s", (a)->buf); \ - __os_free(env, (a)->buf); \ + if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \ + __os_free(env, (a)->buf); \ DB_MSGBUF_INIT(a); \ } \ } while (0) @@ -392,18 +398,14 @@ typedef struct __db_msgbuf { if (regular_msg) \ DB_MSGBUF_FLUSH(env, a); \ else { \ - __os_free(env, (a)->buf); \ + if (!F_ISSET((a), DB_MSGBUF_PREALLOCATED)) \ + __os_free(env, (a)->buf); \ DB_MSGBUF_INIT(a); \ } \ } \ } while (0) -#define STAT_FMT(msg, fmt, type, v) do { \ - DB_MSGBUF __mb; \ - DB_MSGBUF_INIT(&__mb); \ - __db_msgadd(env, &__mb, fmt, (type)(v)); \ - __db_msgadd(env, &__mb, "\t%s", msg); \ - DB_MSGBUF_FLUSH(env, &__mb); \ -} while (0) +#define STAT_FMT(msg, fmt, type, v) \ + __db_msg(env, fmt "\t%s", (type)(v), msg); #define STAT_HEX(msg, v) \ __db_msg(env, "%#lx\t%s", (u_long)(v), msg) #define STAT_ISSET(msg, p) \ @@ -441,25 +443,21 @@ typedef struct __db_msgbuf { * * Error message IDs are automatically assigned by dist/s_message_id script. */ -#ifdef HAVE_LOCALIZATION -#define _(msg) msg /* Replace with localization function. */ -#else -#define _(msg) msg -#endif - #ifdef HAVE_STRIPPED_MESSAGES #define DB_STR_C(msg, fmt) fmt #else -#define DB_STR_C(msg, fmt) _(msg) +#define DB_STR_C(msg, fmt) msg #endif -#define DB_MSGID(id) "BDB" id - -#define DB_STR(id, msg) DB_MSGID(id) " " DB_STR_C(msg, "") - -#define DB_STR_A(id, msg, fmt) DB_MSGID(id) " " DB_STR_C(msg, fmt) +#ifdef HAVE_LOCALIZATION +#define _(msg) (msg) /* Replace with localization function. 
*/ +#else +#define _(msg) msg +#endif -#define DB_STR_P(msg) _(msg) +#define DB_STR(id, msg) _("BDB" id " " DB_STR_C(msg, "")) +#define DB_STR_A(id, msg, fmt) _("BDB" id " " DB_STR_C(msg, fmt)) +#define DB_STR_P(msg) _(msg) /* * There are quite a few places in Berkeley DB where we want to initialize @@ -542,6 +540,7 @@ typedef struct __db_msgbuf { /* Type passed to __db_appname(). */ typedef enum { DB_APP_NONE=0, /* No type (region). */ + DB_APP_BLOB, /* Blob file. */ DB_APP_DATA, /* Data file. */ DB_APP_LOG, /* Log file. */ DB_APP_META, /* Persistent metadata file. */ @@ -612,8 +611,13 @@ typedef enum { if (F_ISSET((env), ENV_OPEN_CALLED)) \ ENV_REQUIRES_CONFIG(env, handle, i, flags) +/* + * The ENV_ENTER and ENV_LEAVE macros announce to other threads that + * the current thread is entering or leaving the BDB api. + */ #define ENV_ENTER_RET(env, ip, ret) do { \ ret = 0; \ + DISCARD_HISTORY(env); \ PANIC_CHECK_RET(env, ret); \ if (ret == 0) { \ if ((env)->thr_hashtab == NULL) \ @@ -631,6 +635,10 @@ typedef enum { return (__ret); \ } while (0) +/* + * Publicize the current thread's intention to run failchk. This invokes + * DB_ENV->is_alive() in the mutex code, to avoid hanging on dead processes. 
+ */ #define FAILCHK_THREAD(env, ip) do { \ if ((ip) != NULL) \ (ip)->dbth_state = THREAD_FAILCHK; \ @@ -638,20 +646,15 @@ typedef enum { #define ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip) -#ifdef DIAGNOSTIC #define ENV_LEAVE(env, ip) do { \ - if ((ip) != NULL) { \ - DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE || \ - (ip)->dbth_state == THREAD_FAILCHK)); \ + if ((ip) != NULL) { \ + DB_ASSERT((env), (ip)->dbth_state == THREAD_ACTIVE || \ + (ip)->dbth_state == THREAD_FAILCHK); \ (ip)->dbth_state = THREAD_OUT; \ } \ } while (0) -#else -#define ENV_LEAVE(env, ip) do { \ - if ((ip) != NULL) \ - (ip)->dbth_state = THREAD_OUT; \ -} while (0) -#endif + + #ifdef DIAGNOSTIC #define CHECK_THREAD(env) do { \ if ((env)->thr_hashtab != NULL) \ @@ -688,6 +691,23 @@ typedef struct __pin_list { } PIN_LIST; #define PINMAX 4 +typedef enum { + MUTEX_ACTION_UNLOCKED=0, + MUTEX_ACTION_INTEND_SHARE, /* Thread is attempting a read-lock. */ + MUTEX_ACTION_SHARED /* Thread has gotten a read lock. */ +} MUTEX_ACTION; + +typedef struct __mutex_state { /* SHARED */ + db_mutex_t mutex; + MUTEX_ACTION action; +#ifdef DIAGNOSTIC + db_timespec when; +#endif +} MUTEX_STATE; + +#define MUTEX_STATE_MAX 10 /* It only needs enough for shared latches. */ + + struct __db_thread_info { /* SHARED */ pid_t dbth_pid; db_threadid_t dbth_tid; @@ -707,11 +727,25 @@ struct __db_thread_info { /* SHARED */ u_int16_t dbth_pinmax; /* Number of slots allocated. */ roff_t dbth_pinlist; /* List of pins. */ PIN_LIST dbth_pinarray[PINMAX]; /* Initial array of slots. */ + + /* + * While thread tracking is active this caches one of the lockers + * created by each thread. This locker remains allocated, with an + * invalid id, even after the locker id is freed. + */ + roff_t dbth_local_locker; + /* + * Each latch shared by this thread has an entry here. Exclusive + * ownership, for both latches and mutexes, are in the DB_MUTEX. 
+ */ + MUTEX_STATE dbth_latches[MUTEX_STATE_MAX]; #ifdef DIAGNOSTIC roff_t dbth_locker; /* Current locker for this thread. */ u_int32_t dbth_check_off; /* Count of number of LOCK_OFF calls. */ #endif + db_timespec dbth_failtime; /* Time when its crash was detected. */ }; + #ifdef DIAGNOSTIC #define LOCK_CHECK_OFF(ip) if ((ip) != NULL) \ (ip)->dbth_check_off++ @@ -729,7 +763,7 @@ struct __db_thread_info { /* SHARED */ #define LOCK_CHECK(dbc, pgno, mode) NOP_STATEMENT #endif -typedef struct __env_thread_info { +typedef struct __env_thread_info { /* SHARED */ u_int32_t thr_count; u_int32_t thr_init; u_int32_t thr_max; @@ -803,6 +837,11 @@ struct __env { #define ENV_DEF_DATA_LEN 100 u_int32_t data_len; /* Data length in __db_prbytes. */ + /* Registered processes */ + size_t num_active_pids; /* number of entries in active_pids */ + size_t size_active_pids; /* allocated size of active_pids */ + pid_t *active_pids; /* array active pids */ + /* Thread tracking */ u_int32_t thr_nbucket; /* Number of hash buckets */ DB_HASHTAB *thr_hashtab; /* Hash table of DB_THREAD_INFO */ @@ -866,6 +905,7 @@ struct __env { #define DB_TEST_PREOPEN 10 /* before __os_open */ #define DB_TEST_REPMGR_PERM 11 /* repmgr perm/archiving tests */ #define DB_TEST_SUBDB_LOCKS 12 /* subdb locking tests */ +#define DB_TEST_REPMGR_HEARTBEAT 13 /* repmgr stop sending heartbeats */ int test_abort; /* Abort value for testing */ int test_check; /* Checkpoint value for testing */ int test_copy; /* Copy value for testing */ @@ -881,7 +921,9 @@ struct __env { #define ENV_REF_COUNTED 0x00000100 /* Region references this handle */ #define ENV_SYSTEM_MEM 0x00000200 /* DB_SYSTEM_MEM set */ #define ENV_THREAD 0x00000400 /* DB_THREAD set */ -#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */ +#define ENV_FORCE_TXN_BULK 0x00000800 /* Txns use bulk mode-for testing */ +#define ENV_REMEMBER_PANIC 0x00001000 /* Panic was on during cleanup. 
*/ +#define ENV_FORCESYNCENV 0x00002000 /* Force msync on closing. */ u_int32_t flags; }; @@ -1106,7 +1148,6 @@ typedef struct __dbpginfo { @db_int_def@ #include "dbinc/globals.h" -#include "dbinc/clock.h" #include "dbinc/debug.h" #include "dbinc/region.h" #include "dbinc_auto/env_ext.h" @@ -1118,6 +1159,7 @@ typedef struct __dbpginfo { #include "dbinc/os.h" #include "dbinc_auto/clib_ext.h" #include "dbinc_auto/common_ext.h" +#include "dbinc_auto/blob_ext.h" /******************************************************* * Remaining Log. diff --git a/src/dbinc/db_join.h b/src/dbinc/db_join.h index aecf059a..8f22adcb 100644 --- a/src/dbinc/db_join.h +++ b/src/dbinc/db_join.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/db_page.h b/src/dbinc/db_page.h index 2d4de2e5..4694c4cf 100644 --- a/src/dbinc/db_page.h +++ b/src/dbinc/db_page.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -93,6 +93,7 @@ typedef struct _dbmeta33 { u_int8_t uid[DB_FILE_ID_LEN]; } DBMETA33, DBMETA; + /************************************************************************ BTREE METADATA PAGE LAYOUT ************************************************************************/ @@ -113,7 +114,13 @@ typedef struct _btmeta33 { u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */ u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */ u_int32_t root; /* 88-91: Root page. */ - u_int32_t unused2[92]; /* 92-459: Unused space. */ + u_int32_t blob_threshold; + /* 92-95: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 96-99: Blob file dir id lo. 
*/ + u_int32_t blob_file_hi; /* 100-103: Blob file dir id hi. */ + u_int32_t blob_sdb_lo; /* 104-107: Blob sdb dir id lo */ + u_int32_t blob_sdb_hi; /* 108-111: Blob sdb dir id hi */ + u_int32_t unused2[87]; /* 112-459: Unused space. */ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -142,7 +149,13 @@ typedef struct _hashmeta33 { #define NCACHED 32 /* number of spare points */ /* 96-223: Spare pages for overflow */ u_int32_t spares[NCACHED]; - u_int32_t unused[59]; /* 224-459: Unused space */ + u_int32_t blob_threshold; + /* 224-227: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 228-231: Blob file dir id lo. */ + u_int32_t blob_file_hi; /* 232-235: Blob file dir id hi. */ + u_int32_t blob_sdb_lo; /* 236-239: Blob sdb dir id lo. */ + u_int32_t blob_sdb_hi; /* 240-243: Blob sdb dir id hi. */ + u_int32_t unused[54]; /* 244-459: Unused space */ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -168,7 +181,10 @@ typedef struct _heapmeta { u_int32_t gbytes; /* 80-83: GBytes for fixed size heap. */ u_int32_t bytes; /* 84-87: Bytes for fixed size heap. */ u_int32_t region_size; /* 88-91: Max region size. */ - u_int32_t unused2[92]; /* 92-459: Unused space.*/ + u_int32_t blob_threshold; /* 92-95: Minimum blob file size. */ + u_int32_t blob_file_lo; /* 96-97: Blob file dir id lo. */ + u_int32_t blob_file_hi; /* 98-101: Blob file dir id hi. 
*/ + u_int32_t unused2[89]; /* 102-459: Unused space.*/ u_int32_t crypto_magic; /* 460-463: Crypto magic number */ u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ @@ -371,6 +387,7 @@ typedef struct __heaphdr { #define HEAP_RECSPLIT 0x01 /* Heap data record is split */ #define HEAP_RECFIRST 0x02 /* First piece of a split record */ #define HEAP_RECLAST 0x04 /* Last piece of a split record */ +#define HEAP_RECBLOB 0x08 /* Record refers to a blob */ u_int8_t flags; /* 00: Flags describing record. */ u_int8_t unused; /* 01: Padding. */ u_int16_t size; /* 02-03: The size of the stored data piece. */ @@ -384,8 +401,35 @@ typedef struct __heaphdrsplt { u_int16_t unused; /* 14-15: Padding. */ } HEAPSPLITHDR; +/* + * HEAPBLOB, the blob database record for heap. + * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, lsn, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _heapblob { + HEAPHDR std_hdr; /* 00-03: The standard data header */ + u_int8_t encoding; /* 04: Encoding of blob file. */ + u_int8_t unused[7]; /* 05-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + DB_LSN lsn; /* 48-55: LSN for blob file update. */ + u_int64_t id; /* 56-63: Blob file identifier. */ + u_int64_t size; /* 64-71: Blob file size. */ + u_int64_t file_id; /* 72-80: File directory. */ +} HEAPBLOBHDR, HEAPBLOBHDR60P1; + #define HEAP_HDRSIZE(hdr) \ - (F_ISSET((hdr), HEAP_RECSPLIT) ? sizeof(HEAPSPLITHDR) : sizeof(HEAPHDR)) + (F_ISSET((hdr), HEAP_RECSPLIT) ? 
sizeof(HEAPSPLITHDR) : \ + sizeof(HEAPHDR)) + +#define HEAPBLOBREC_SIZE (sizeof(HEAPBLOBHDR)) +#define HEAPBLOBREC_DSIZE (sizeof(HEAPBLOBHDR) - sizeof(HEAPHDR)) +#define HEAPBLOBREC_DATA(p) (((u_int8_t *)p) + sizeof(HEAPHDR)) #define HEAPPG_SZ(dbp) \ (F_ISSET((dbp), DB_AM_ENCRYPT) ? HEAPPG_SEC : \ @@ -441,12 +485,12 @@ typedef struct __heaphdrsplt { /* Return the amount of free space on a heap data page. */ #define HEAP_FREESPACE(dbp, p) \ - (HOFFSET(p) - HEAPPG_SZ(dbp) - \ + ((HOFFSET(p) - HEAPPG_SZ(dbp)) - \ (NUM_ENT(p) == 0 ? 0 : ((HEAP_HIGHINDX(p) + 1) * sizeof(db_indx_t)))) /* The maximum amount of data that can fit on an empty heap data page. */ #define HEAP_MAXDATASIZE(dbp) \ - ((dbp)->pgsize - HEAPPG_SZ(dbp) - sizeof(db_indx_t)) + (((dbp)->pgsize - HEAPPG_SZ(dbp)) - sizeof(db_indx_t)) #define HEAP_FREEINDX(p) (((HEAPPG *)p)->free_indx) #define HEAP_HIGHINDX(p) (((HEAPPG *)p)->high_indx) @@ -549,9 +593,9 @@ typedef struct _qpage { * The amount of overflow data stored on each page is stored in the * hf_offset field. * - * The implementation reference counts overflow items as it's possible - * for them to be promoted onto btree internal pages. The reference - * count is stored in the entries field. + * Before 4.3 the implementation reference counted overflow items as it + * once was possible for them to be promoted onto btree internal pages. + * The reference count is stored in the entries field. */ #define OV_LEN(p) (((PAGE *)p)->hf_offset) #define OV_REF(p) (((PAGE *)p)->entries) @@ -571,6 +615,7 @@ typedef struct _qpage { #define H_DUPLICATE 2 /* Duplicate key/data item. */ #define H_OFFPAGE 3 /* Overflow key/data item. */ #define H_OFFDUP 4 /* Overflow page of duplicates. */ +#define H_BLOB 5 /* Blob file data item. */ /* * !!! @@ -685,6 +730,78 @@ typedef struct _hoffdup { */ #define HOFFDUP_SIZE (sizeof(HOFFDUP)) +/* + * The fifth type is the H_BLOB, represented by the HBLOB structure. 
+ * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _hblob { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t encoding; /* 01: Encoding of blob file. */ + u_int8_t unused[10]; /* 02-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + u_int64_t id; /* 48-55: Blob file identifier. */ + u_int64_t size; /* 56-63: Blob file size. */ + u_int64_t file_id; /* 64-71: File directory. */ + u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */ +} HBLOB, HBLOB60P1; + +#define HBLOB_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, id)) +#define HBLOB_FILE_ID(p) (((u_int8_t *)p) + SSZ(HBLOB, file_id)) + +/* + * Return a off_t version of the u_int64_t blob size. + * Since off_t can be a 32 or 64 integer on different systems, this macro + * is used to catch cases of overflow. 
+ */ +#define GET_BLOB_SIZE(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (off_t)(p).size; \ + } else { \ + if ((p).size > INT_MAX) { \ + __db_errx((e), DB_STR("0769", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (int32_t)(p).size; \ + } \ +} while (0); + +#define SET_BLOB_FIELD(p, v, type, field) do { \ + u_int64_t tmp; \ + tmp = (u_int64_t)(v); \ + memcpy((u_int8_t *)(p) + SSZ(type, field), \ + &tmp, sizeof(u_int64_t)); \ +} while (0); + +#define SET_BLOB_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, id) + +#define SET_BLOB_SIZE(p, v, type) \ + SET_BLOB_FIELD(p, v, type, size) + +#define SET_BLOB_FILE_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, file_id) + +#define SET_BLOB_SDB_ID(p, v, type) \ + SET_BLOB_FIELD(p, v, type, sdb_id) + +/* + * Page space required to add a new HBLOB item to the page, with and + * without the index value. + */ +#define HBLOB_SIZE (sizeof(HBLOB)) +#define HBLOB_DSIZE (sizeof(HBLOB) - SSZA(HKEYDATA, data)) +#define HBLOB_PSIZE (HBLOB_SIZE + sizeof(db_indx_t)) + + /************************************************************************ BTREE PAGE LAYOUT ************************************************************************/ @@ -693,6 +810,7 @@ typedef struct _hoffdup { #define B_KEYDATA 1 /* Key/data item. */ #define B_DUPLICATE 2 /* Duplicate key/data item. */ #define B_OVERFLOW 3 /* Overflow key/data item. */ +#define B_BLOB 4 /* Blob file key/data item. */ /* * We have to store a deleted entry flag in the page. The reason is complex, @@ -746,6 +864,32 @@ typedef struct _boverflow { u_int32_t tlen; /* 08-11: Total length of item. */ } BOVERFLOW; +/* + * The fourth type is the B_BLOB, represented by the BBLOB structure. + * Saving bytes is not a concern for the blob record type - if too many + * fit onto a single page, then we're likely to introduce unnecessary + * contention for blobs. 
Using blobs implies storing large items, thus slightly + * more per-item overhead is acceptable. + * The len field is set to BBLOB_DSIZE, so that a B_BLOB can be treated just + * like a B_KEYDATA for the purposes of moving items between or on a page. + * If this proves untrue, the crypto section of the record could be optional. + * encoding, lsn, encryption, and checksum fields are unused at the moment, but + * included to make adding those features easier. + */ +typedef struct _bblob { + db_indx_t len; /* 00-01: BBLOB_DSIZE. */ + u_int8_t type; /* 02: Page type and delete flag. */ + u_int8_t encoding; /* 03: Encoding of blob file. */ + u_int8_t unused[8]; /* 04-11: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 12-31: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 32-47: IV */ + u_int64_t id; /* 48-55: Blob file identifier. */ + u_int64_t size; /* 56-63: Blob file size. */ + u_int64_t file_id; /* 64-71: File directory. */ + u_int64_t sdb_id; /* 72-79: Subdb that owns this blob. */ +} BBLOB, BBLOB60P1; +#define BBLOB_DATA(p) ((u_int8_t *)((BKEYDATA *)p)->data) + /* Get a BOVERFLOW item for a specific index. */ #define GET_BOVERFLOW(dbp, pg, indx) \ ((BOVERFLOW *)P_ENTRY(dbp, pg, indx)) @@ -759,13 +903,26 @@ typedef struct _boverflow { #define BOVERFLOW_PSIZE \ (BOVERFLOW_SIZE + sizeof(db_indx_t)) +/* + * Page space required to add a new BBLOB item to the page, with and + * without the index value. BBLOB_DSIZE is used so that a B_BLOB item + * can be treated just like a B_KEYDATA for the purposes of moving items + * between or on a page, such as when doing compaction. + */ +#define BBLOB_SIZE \ + ((u_int16_t)DB_ALIGN(sizeof(BBLOB), sizeof(u_int32_t))) +#define BBLOB_DSIZE \ + (BBLOB_SIZE - SSZA(BKEYDATA, data)) +#define BBLOB_PSIZE \ + (BBLOB_SIZE + sizeof(db_indx_t)) + #define BITEM_SIZE(bk) \ - (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \ - BKEYDATA_SIZE((bk)->len)) + (B_TYPE((bk)->type) == B_KEYDATA ? 
BKEYDATA_SIZE((bk)->len) : \ + (B_TYPE((bk)->type) == B_BLOB ? BBLOB_SIZE : BOVERFLOW_SIZE)) #define BITEM_PSIZE(bk) \ - (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \ - BKEYDATA_PSIZE((bk)->len)) + (B_TYPE((bk)->type) == B_KEYDATA ? BKEYDATA_PSIZE((bk)->len) : \ + (B_TYPE((bk)->type) == B_BLOB ? BBLOB_PSIZE : BOVERFLOW_PSIZE)) /* * Btree leaf and hash page layouts group indices in sets of two, one for the diff --git a/src/dbinc/db_swap.h b/src/dbinc/db_swap.h index 352ae227..06f4eb47 100644 --- a/src/dbinc/db_swap.h +++ b/src/dbinc/db_swap.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -51,15 +51,26 @@ extern "C" { #define M_64_SWAP(a) { \ u_int64_t _tmp; \ _tmp = (u_int64_t)a; \ - ((u_int8_t *)&a)[0] = ((u_int8_t *)&_tmp)[7]; \ - ((u_int8_t *)&a)[1] = ((u_int8_t *)&_tmp)[6]; \ - ((u_int8_t *)&a)[2] = ((u_int8_t *)&_tmp)[5]; \ - ((u_int8_t *)&a)[3] = ((u_int8_t *)&_tmp)[4]; \ - ((u_int8_t *)&a)[4] = ((u_int8_t *)&_tmp)[3]; \ - ((u_int8_t *)&a)[5] = ((u_int8_t *)&_tmp)[2]; \ - ((u_int8_t *)&a)[6] = ((u_int8_t *)&_tmp)[1]; \ - ((u_int8_t *)&a)[7] = ((u_int8_t *)&_tmp)[0]; \ + ((u_int8_t *)&(a))[0] = ((u_int8_t *)&_tmp)[7]; \ + ((u_int8_t *)&(a))[1] = ((u_int8_t *)&_tmp)[6]; \ + ((u_int8_t *)&(a))[2] = ((u_int8_t *)&_tmp)[5]; \ + ((u_int8_t *)&(a))[3] = ((u_int8_t *)&_tmp)[4]; \ + ((u_int8_t *)&(a))[4] = ((u_int8_t *)&_tmp)[3]; \ + ((u_int8_t *)&(a))[5] = ((u_int8_t *)&_tmp)[2]; \ + ((u_int8_t *)&(a))[6] = ((u_int8_t *)&_tmp)[1]; \ + ((u_int8_t *)&(a))[7] = ((u_int8_t *)&_tmp)[0]; \ } +#undef P_64_COPYSWAP +#define P_64_COPYSWAP(a, b) do { \ + ((u_int8_t *)b)[0] = ((u_int8_t *)a)[7]; \ + ((u_int8_t *)b)[1] = ((u_int8_t *)a)[6]; \ + ((u_int8_t *)b)[2] = ((u_int8_t *)a)[5]; \ + ((u_int8_t *)b)[3] = ((u_int8_t *)a)[4]; \ + 
((u_int8_t *)b)[4] = ((u_int8_t *)a)[3]; \ + ((u_int8_t *)b)[5] = ((u_int8_t *)a)[2]; \ + ((u_int8_t *)b)[6] = ((u_int8_t *)a)[1]; \ + ((u_int8_t *)b)[7] = ((u_int8_t *)a)[0]; \ +} while (0) #undef P_64_COPY #define P_64_COPY(a, b) { \ ((u_int8_t *)b)[0] = ((u_int8_t *)a)[0]; \ @@ -113,7 +124,7 @@ extern "C" { P_32_COPYSWAP(&_tmp, a); \ } while (0) #undef M_32_SWAP -#define M_32_SWAP(a) P_32_SWAP(&a) +#define M_32_SWAP(a) P_32_SWAP(&(a)) /* * Little endian <==> big endian 16-bit swap macros. @@ -139,8 +150,13 @@ extern "C" { P_16_COPYSWAP(&_tmp, a); \ } while (0) #undef M_16_SWAP -#define M_16_SWAP(a) P_16_SWAP(&a) +#define M_16_SWAP(a) P_16_SWAP(&(a)) +#undef SWAP64 +#define SWAP64(p) { \ + P_64_SWAP(p); \ + (p) += sizeof(u_int64_t); \ +} #undef SWAP32 #define SWAP32(p) { \ P_32_SWAP(p); \ @@ -168,6 +184,25 @@ extern "C" { P_32_SWAP(p); \ } while (0) +#undef DB_NTOHLL_COPYIN +#define DB_NTOHLL_COPYIN(env, i, p) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)&(i); \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + tmp[7] = *p++; \ + tmp[6] = *p++; \ + tmp[5] = *p++; \ + tmp[4] = *p++; \ + tmp[3] = *p++; \ + tmp[2] = *p++; \ + tmp[1] = *p++; \ + tmp[0] = *p++; \ + } else { \ + memcpy(&(i), p, sizeof(u_int64_t)); \ + p = (u_int8_t *)p + sizeof(u_int64_t); \ + } \ +} while (0) + #undef DB_NTOHL_COPYIN #define DB_NTOHL_COPYIN(env, i, p) do { \ u_int8_t *tmp; \ @@ -178,7 +213,7 @@ extern "C" { tmp[1] = *p++; \ tmp[0] = *p++; \ } else { \ - memcpy(&i, p, sizeof(u_int32_t)); \ + memcpy(&(i), p, sizeof(u_int32_t)); \ p = (u_int8_t *)p + sizeof(u_int32_t); \ } \ } while (0) @@ -191,11 +226,29 @@ extern "C" { tmp[1] = *p++; \ tmp[0] = *p++; \ } else { \ - memcpy(&i, p, sizeof(u_int16_t)); \ + memcpy(&(i), p, sizeof(u_int16_t)); \ p = (u_int8_t *)p + sizeof(u_int16_t); \ } \ } while (0) +#undef DB_HTONLL_COPYOUT +#define DB_HTONLL_COPYOUT(env, p, i) do { \ + u_int8_t *tmp; \ + tmp = (u_int8_t *)p; \ + if (F_ISSET(env, ENV_LITTLEENDIAN)) { \ + *tmp++ = ((u_int8_t *)&(i))[7]; \ + 
*tmp++ = ((u_int8_t *)&(i))[6]; \ + *tmp++ = ((u_int8_t *)&(i))[5]; \ + *tmp++ = ((u_int8_t *)&(i))[4]; \ + *tmp++ = ((u_int8_t *)&(i))[3]; \ + *tmp++ = ((u_int8_t *)&(i))[2]; \ + *tmp++ = ((u_int8_t *)&(i))[1]; \ + *tmp++ = ((u_int8_t *)&(i))[0]; \ + } else \ + memcpy(p, &(i), sizeof(u_int64_t)); \ + p = (u_int8_t *)p + sizeof(u_int64_t); \ +} while (0) + #undef DB_HTONL_COPYOUT #define DB_HTONL_COPYOUT(env, p, i) do { \ u_int8_t *tmp; \ @@ -206,7 +259,7 @@ extern "C" { *tmp++ = ((u_int8_t *)&(i))[1]; \ *tmp++ = ((u_int8_t *)&(i))[0]; \ } else \ - memcpy(p, &i, sizeof(u_int32_t)); \ + memcpy(p, &(i), sizeof(u_int32_t)); \ p = (u_int8_t *)p + sizeof(u_int32_t); \ } while (0) @@ -229,6 +282,13 @@ extern "C" { */ #define LOG_SWAPPED(env) !F_ISSET(env, ENV_LITTLEENDIAN) +#define LOGCOPY_64(env, x, p) do { \ + if (LOG_SWAPPED(env)) \ + P_64_COPYSWAP((p), (x)); \ + else \ + memcpy((x), (p), sizeof(u_int64_t)); \ +} while (0) + #define LOGCOPY_32(env, x, p) do { \ if (LOG_SWAPPED(env)) \ P_32_COPYSWAP((p), (x)); \ diff --git a/src/dbinc/db_upgrade.h b/src/dbinc/db_upgrade.h index 45fb624d..716594c9 100644 --- a/src/dbinc/db_upgrade.h +++ b/src/dbinc/db_upgrade.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -242,6 +242,123 @@ typedef struct hashhdr { /* Disk resident portion */ */ } HASHHDR; + +/************************************************************************ + BLOB RECORD LAYOUTS + ************************************************************************/ + +/* + * Hash BLOB record layout. + */ +typedef struct _hblob60 { + u_int8_t type; /* 00: Page type and delete flag. */ + u_int8_t encoding; /* 01: Encoding of blob file. */ + u_int8_t unused[2]; /* 02-03: Padding, unused. */ + u_int32_t id_lo; /* 04-07: Blob file identifier. 
*/ + u_int32_t id_hi; /* 08-11: Blob file identifier. */ + u_int32_t size_lo; /* 12-15: Blob file size. */ + u_int32_t size_hi; /* 16-19: Blob file size. */ + DB_LSN lsn; /* 20-27: LSN for blob file update. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + u_int32_t file_id_lo; /* 64-67: File directory lo. */ + u_int32_t file_id_hi; /* 68-71: File directory hi. */ + u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */ + u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */ +} HBLOB60; + +#define HBLOB60_SIZE (sizeof(HBLOB60)) + +/* + * Btree BLOB record layout. + */ +typedef struct _bblob60 { + db_indx_t len; /* 00-01: BBLOB_DSIZE. */ + u_int8_t type; /* 02: Page type and delete flag. */ + u_int8_t encoding; /* 03: Encoding of blob file. */ + u_int32_t id_lo; /* 04-07: Blob file identifier. */ + u_int32_t id_hi; /* 08-11: Blob file identifier. */ + u_int32_t size_lo; /* 12-15: Blob file size. */ + u_int32_t size_hi; /* 16-19: Blob file size. */ + DB_LSN lsn; /* 20-27: LSN for blob file update. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + u_int32_t file_id_lo; /* 64-67: File directory lo. */ + u_int32_t file_id_hi; /* 68-71: File directory hi. */ + u_int32_t sdb_id_lo; /* 72-75: Subdb that owns this blob. */ + u_int32_t sdb_id_hi; /* 76-79: Subdb that owns this blob. */ +} BBLOB60; + +#define BBLOB60_SIZE \ + ((u_int16_t)DB_ALIGN(sizeof(BBLOB60), sizeof(u_int32_t))) +/* + * Heap BLOB record layout. + */ +typedef struct _heapblob60 { + u_int8_t flags; /* 00: Flags describing record. */ + u_int8_t unused; /* 01: Padding. */ + u_int16_t size; /* 02-03: The size of the stored data piece. */ + u_int8_t encoding; /* 04: Encoding of blob file. */ + u_int8_t unused2[3]; /* 05-07: Padding, unused. */ + u_int32_t id_lo; /* 08-11: Blob file identifier. */ + u_int32_t id_hi; /* 12-15: Blob file identifier. */ + u_int32_t size_lo; /* 16-19: Blob file size. 
*/ + u_int32_t size_hi; /* 20-23: Blob file size. */ + u_int8_t unused3[4]; /* 24-27: Padding, unused. */ + u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ + u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ + DB_LSN lsn; /* 64-71: LSN for blob file update. */ + u_int32_t file_id_lo; /* 72-75: File directory lo. */ + u_int32_t file_id_hi; /* 76-79: File directory hi. */ +} HEAPBLOBHDR60; + +#define HEAPBLOBREC60_SIZE (sizeof(HEAPBLOBHDR60)) + +#define GET_BLOB60_FILE_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->file_id_lo, (p)->file_id_hi, o, ret); + +#define GET_BLOB60_SDB_ID(e, p, o, ret) \ + GET_LO_HI(e, (p)->sdb_id_lo, (p)->sdb_id_hi, o, ret); + +/* Return a uintmax_t version of blob_id. */ +#define GET_BLOB60_ID(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (p).id_hi; \ + (o) = (o) << 32; \ + (o) += (p).id_lo; \ + } else { \ + if ((p).id_hi > 0) { \ + __db_errx((e), DB_STR("0766", \ + "Blob identifier overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (p).id_lo; \ + } \ +} while (0); + +/* Return an off_t version of blob size. */ +#define GET_BLOB60_SIZE(e, p, o, ret) do { \ + DB_ASSERT((e), sizeof(o) <= 8); \ + if (sizeof(o) == 8) { \ + (o) = (p).size_hi; \ + (o) = (o) << 32; \ + (o) += (p).size_lo; \ + } else { \ + if ((p).size_hi > 0) { \ + __db_errx((e), DB_STR("0767", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + if ((p).size_lo > INT_MAX) { \ + __db_errx((e), DB_STR("0768", \ + "Blob size overflow.")); \ + (ret) = EINVAL; \ + } \ + (o) = (int32_t)(p).size_lo; \ + } \ +} while (0); + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/db_verify.h b/src/dbinc/db_verify.h index 68acbf6c..ea87680f 100644 --- a/src/dbinc/db_verify.h +++ b/src/dbinc/db_verify.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -120,9 +120,10 @@ struct __vrfy_dbinfo { #define SALVAGE_PRINTABLE 0x01 /* Output printable chars literally. */ #define SALVAGE_PRINTHEADER 0x02 /* Print the unknown-key header. */ #define SALVAGE_PRINTFOOTER 0x04 /* Print the unknown-key footer. */ -#define SALVAGE_HASSUBDBS 0x08 /* There are subdatabases to salvage. */ -#define VRFY_LEAFCHAIN_BROKEN 0x10 /* Lost one or more Btree leaf pgs. */ -#define VRFY_QMETA_SET 0x20 /* We've seen a QUEUE meta page and +#define SALVAGE_STREAM_BLOB 0x08 /* Currently streaming a blob. */ +#define SALVAGE_HASSUBDBS 0x10 /* There are subdatabases to salvage. */ +#define SALVAGE_LEAFCHAIN_BROKEN 0x20 /* Lost one or more Btree leaf pgs. */ +#define SALVAGE_QMETA_SET 0x40 /* We've seen a QUEUE meta page and set things up for it. */ u_int32_t flags; }; /* VRFY_DBINFO */ diff --git a/src/dbinc/debug.h b/src/dbinc/debug.h index a8da000d..5388b791 100644 --- a/src/dbinc/debug.h +++ b/src/dbinc/debug.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -36,7 +36,13 @@ extern "C" { #define DB_ASSERT(env, e) \ ((e) ? (void)0 : __db_assert(env, #e, __FILE__, __LINE__)) #else -#define DB_ASSERT(env, e) NOP_STATEMENT +#define DB_ASSERT(env, e) ((void)0) +#endif + +#if defined(HAVE_ERROR_HISTORY) +#define DB_DEBUG_MSG __db_debug_msg +#else +#define DB_DEBUG_MSG if (0) __db_debug_msg #endif /* @@ -55,10 +61,11 @@ extern "C" { * of structure fields whose only purpose is padding, as well as when heap * memory that was never initialized is written to disk. 
*/ +#define UMRW_SET(var) UMRW_SET_VALUE((var), 0) #ifdef UMRW -#define UMRW_SET(v) (v) = 0 +#define UMRW_SET_VALUE(var, value) (var) = (value) #else -#define UMRW_SET(v) NOP_STATEMENT +#define UMRW_SET_VALUE(var, value) NOP_STATEMENT #endif /* @@ -73,6 +80,34 @@ typedef enum { } db_error_set_t; /* + * Use these macros wherever an error condition is initially noticed, e.g., when + * setting a value to any of the user visible error return codes, whether + * defined by Berkeley DB or by the operating environment (EINVAL). + * They save the specific source of an instance of an error code, including the + * time, stack, db name, current LSN, etc. If the error turns out to be + * important, the deferred message text is added to the text produced by + * __db_err(), __db_errx, and __db_syserr(). The additional information can be + * useful for diagnosing the behavior of applications under error conditions. + * It is enabled by configuring with --enable-error_history. The current + * implementation requires pthreads' version of thread local storage. + */ +#ifdef HAVE_ERROR_HISTORY +#define USR_ERR(env, errcode) __db_diags((env), (errcode)) +#define DBC_ERR(dbc, errcode) __dbc_diags((dbc), (errcode)) +#define MUTEX_ERR(env, mutex, errcode) __mutex_diags((env), (mutex), (errcode)) +#define DISCARD_HISTORY(env) __db_deferred_discard() +/* Save at most 10KB of error history in an API call. Adjust this as desired. */ +#define DB_ERROR_HISTORY_SIZE (10 * 1024) +#else +#define USR_ERR(env, errcode) (errcode) +#define DBC_ERR(dbc, errcode) (errcode) +#define MUTEX_ERR(env, mutex, errcode) (errcode) +#define DISCARD_HISTORY(env) NOP_STATEMENT +/* No space is needed when error history is disabled. */ +#define DB_ERROR_HISTORY_SIZE 0 +#endif + +/* * Message handling.
Use a macro instead of a function because va_list * references to variadic arguments cannot be reset to the beginning of the * variadic argument list (and then rescanned), by functions other than the @@ -102,6 +137,7 @@ typedef enum { ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \ __db_errfile(dbenv, error, error_set, fmt, __ap); \ va_end(__ap); \ + DISCARD_HISTORY((dbenv)->env); \ } #else #define DB_REAL_ERR(dbenv, error, error_set, app_call, fmt) { \ @@ -127,6 +163,7 @@ typedef enum { ((app_call) || F_ISSET((dbenv)->env, ENV_NO_OUTPUT_SET)))) \ __db_errfile(env, error, error_set, fmt, __ap); \ va_end(__ap); \ + DISCARD_HISTORY(env); \ } #endif #if defined(STDC_HEADERS) || defined(__cplusplus) @@ -192,7 +229,7 @@ typedef enum { #define LOG_OP(C, T, O, K, A, F) { \ DB_LSN __lsn; \ DBT __op; \ - if (DBC_LOGGING((C))) { \ + if ((C)->dbp->log_filename != NULL && DBC_LOGGING((C))) { \ memset(&__op, 0, sizeof(__op)); \ __op.data = O; \ __op.size = (u_int32_t)strlen(O) + 1; \ diff --git a/src/dbinc/fop.h b/src/dbinc/fop.h index 94f27f9f..7ea62023 100644 --- a/src/dbinc/fop.h +++ b/src/dbinc/fop.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -23,6 +23,20 @@ extern "C" { (void)__memp_set_flags((D)->mpf, DB_MPOOL_NOFILE, 0); \ } while (0) +/* + * Never change the value of DB_FOP_CREATE (0x00000002), + * DB_FOP_APPEND (0x00000001), and DB_FOP_REDO(0x00000008), + * as those values are used in write_file logs. + */ +#define DB_FOP_APPEND 0x00000001 /* Appending to a file. */ +#define DB_FOP_CREATE 0x00000002 /* Creating the file. */ +#define DB_FOP_PARTIAL_LOG 0x00000004 /* Partial logging of file data. */ +#define DB_FOP_REDO 0x00000008 /* File operation can be redone. */ +#define DB_FOP_READONLY 0x00000010 /* File is read only. 
*/ +#define DB_FOP_WRITE 0x00000020 /* File is writeable. */ +#define DB_FOP_SYNC_WRITE 0x00000040 /* Sync file on each write. */ + + #include "dbinc_auto/fileops_auto.h" #include "dbinc_auto/fileops_ext.h" diff --git a/src/dbinc/globals.h b/src/dbinc/globals.h index 95e5c118..becd6365 100644 --- a/src/dbinc/globals.h +++ b/src/dbinc/globals.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -52,21 +52,27 @@ typedef struct __db_globals { char error_buf[40]; /* Error string buffer. */ - int uid_init; /* srand set in UID generator */ + int random_seeded; /* Has __os_srandom been called? */ - u_long rand_next; /* rand/srand value */ +#if defined(HAVE_RANDOM_R) + struct random_data random_data; /* srandom_r/random_r argument */ + char random_state[64]; /* random number state */ +#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM) + u_long rand_next; /* next rand value for clib/rand.c */ +#endif u_int32_t fid_serial; /* file id counter */ int db_errno; /* Errno value if not available */ - size_t num_active_pids; /* number of entries in active_pids */ - - size_t size_active_pids; /* allocated size of active_pids */ + char *saved_errstr; /* saved error string from backup */ - pid_t *active_pids; /* array active pids */ + char *time_format; /* strftime-format for printing dates */ - char *saved_errstr; /* saved error string from backup */ +#if defined(HAVE_ERROR_HISTORY) && defined(HAVE_PTHREAD_SELF) + pthread_key_t msgs_key; + pthread_once_t thread_once; +#endif /* Underlying OS interface jump table.*/ void (*j_assert) __P((const char *, const char *, int)); diff --git a/src/dbinc/hash.h b/src/dbinc/hash.h index f485128a..55a64f87 100644 --- a/src/dbinc/hash.h +++ b/src/dbinc/hash.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -56,7 +56,7 @@ typedef struct hash_t { u_int32_t h_nelem; /* Number of elements. */ /* Hash and compare functions. */ u_int32_t (*h_hash) __P((DB *, const void *, u_int32_t)); - int (*h_compare) __P((DB *, const DBT *, const DBT *)); + int (*h_compare) __P((DB *, const DBT *, const DBT *, size_t *)); } HASH; /* Cursor structure definitions. */ diff --git a/src/dbinc/heap.h b/src/dbinc/heap.h index ca3407e0..bb96ebec 100644 --- a/src/dbinc/heap.h +++ b/src/dbinc/heap.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. */ #ifndef _DB_HEAP_H_ @@ -26,7 +26,8 @@ struct __heap { /* Heap access method. */ db_pgno_t curregion; /* The region of the next insert. */ db_pgno_t maxpgno; /* Maximum page number of a fixed size heap. */ - int curpgindx; /* The last used offset in the region's space bitmap. */ + u_int32_t curpgindx; /* The last used offset in the + * region's space bitmap. */ }; struct __heap_cursor { diff --git a/src/dbinc/hmac.h b/src/dbinc/hmac.h index 2a495b17..f87965eb 100644 --- a/src/dbinc/hmac.h +++ b/src/dbinc/hmac.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/lock.h b/src/dbinc/lock.h index eab51832..298b8527 100644 --- a/src/dbinc/lock.h +++ b/src/dbinc/lock.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -37,7 +37,10 @@ extern "C" { */ #define LOCK_INVALID INVALID_ROFF #define LOCK_ISSET(lock) ((lock).off != LOCK_INVALID) -#define LOCK_INIT(lock) ((lock).off = LOCK_INVALID) +#define LOCK_INIT(lock) do { \ + (lock).off = LOCK_INVALID; \ + UMRW_SET_VALUE((lock).mode, DB_LOCK_NG); \ +} while(0) /* * Macro to identify a write lock for the purpose of counting locks @@ -66,8 +69,8 @@ extern "C" { typedef struct __db_lockregion { /* SHARED */ db_mutex_t mtx_region; /* Region mutex. */ - u_int32_t need_dd; /* flag for deadlock detector */ - u_int32_t detect; /* run dd on every conflict */ + u_int32_t need_dd; /* run dd on every conflict */ + u_int32_t detect; /* flag for deadlock detector */ db_timespec next_timeout; /* next time to expire a lock */ db_mutex_t mtx_dd; /* mutex for lock object dd list. */ db_mutex_t mtx_lockers; /* mutex for locker allocation. */ @@ -92,7 +95,7 @@ typedef struct __db_lockregion { /* SHARED */ u_int32_t lock_id; /* Current lock(er) id to allocate. */ u_int32_t cur_maxid; /* Current max lock(er) id. */ - u_int32_t nlockers; /* Current number of lockers. */ + u_int32_t nlockers; /* Current number of locker ids. */ int32_t nmodes; /* Number of modes in conflict table. */ DB_LOCK_STAT stat; /* stats about locking. */ } DB_LOCKREGION; @@ -157,12 +160,16 @@ struct __db_locker { /* SHARED */ db_timespec lk_expire; /* When current lock expires. */ db_timespec tx_expire; /* When this txn expires. */ db_timeout_t lk_timeout; /* How long do we let locks live. */ +#ifdef DIAGNOSTIC + roff_t prev_locker; /* The thread's previous dbth_locker. */ +#endif #define DB_LOCKER_DIRTY 0x0001 /* Has write locks. */ #define DB_LOCKER_INABORT 0x0002 /* Is aborting, don't abort again. */ #define DB_LOCKER_TIMEOUT 0x0004 /* Has timeout set. */ #define DB_LOCKER_FAMILY_LOCKER 0x0008 /* Part of a family of lockers. 
*/ #define DB_LOCKER_HANDLE_LOCKER 0x0010 /* Not associated with a thread. */ +#define DB_LOCKER_FREE 0x0020 /* Diag: it is on the free list. */ u_int32_t flags; }; diff --git a/src/dbinc/log.h b/src/dbinc/log.h index c4dea6fc..2e2929f0 100644 --- a/src/dbinc/log.h +++ b/src/dbinc/log.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -55,6 +55,8 @@ struct __fname { /* number of txn referencing + 1 for the db handle. */ u_int32_t txn_ref; + db_seq_t blob_file_id; /* BLOB file directory id. */ + #define DB_FNAME_CLOSED 0x01 /* DBP was closed. */ #define DB_FNAME_DURABLE 0x02 /* File is durable. */ #define DB_FNAME_INMEM 0x04 /* File is in memory. */ @@ -137,16 +139,18 @@ struct __db_log { ENV *env; /* Environment */ REGINFO reginfo; /* Region information. */ -#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */ -#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */ -#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */ -#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears +#define DBLOG_AUTOREMOVE 0x001 /* Autoremove log files. */ +#define DBLOG_BLOB 0x002 /* Full logging of blob data. */ +#define DBLOG_DIRECT 0x004 /* Do direct I/O on the log. */ +#define DBLOG_DSYNC 0x008 /* Set OS_DSYNC on the log. */ +#define DBLOG_FORCE_OPEN 0x010 /* Force the DB open even if it appears * to be deleted. */ -#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */ -#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */ -#define DBLOG_RECOVER 0x40 /* We are in recovery. */ -#define DBLOG_ZERO 0x80 /* Zero fill the log. */ -#define DBLOG_VERIFYING 0x100 /* The log is being verified. */ +#define DBLOG_INMEMORY 0x020 /* Logging is in memory. */ +#define DBLOG_NOSYNC 0x040 /* Don't sync log files during flush. 
*/ +#define DBLOG_OPENFILES 0x080 /* Prepared files need to be open. */ +#define DBLOG_RECOVER 0x100 /* We are in recovery. */ +#define DBLOG_ZERO 0x200 /* Zero fill the log. */ +#define DBLOG_VERIFYING 0x400 /* The log is being verified. */ u_int32_t flags; }; @@ -251,7 +255,8 @@ struct __log { /* SHARED */ * rather than by the region mutex. */ db_mutex_t mtx_flush; /* Mutex guarding flushing. */ - int32_t in_flush; /* Log flush in progress. */ + int32_t in_flush; /* Log flush in progress. */ + int32_t nosync; /* log_set_config(DB_LOG_NOSYNC) */ DB_LSN s_lsn; /* LSN of the last sync. */ DB_LOG_STAT stat; /* Log statistics. */ diff --git a/src/dbinc/log_verify.h b/src/dbinc/log_verify.h index fa90ace4..ec43c4d7 100644 --- a/src/dbinc/log_verify.h +++ b/src/dbinc/log_verify.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/mp.h b/src/dbinc/mp.h index 9a10c6d9..598ca366 100644 --- a/src/dbinc/mp.h +++ b/src/dbinc/mp.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -226,10 +226,15 @@ struct __mpool { /* SHARED */ #define DB_MEMP_SYNC_INTERRUPT 0x02 u_int32_t config_flags; - /* Free frozen buffer headers, protected by the region lock. */ + /* These MVCC fields are protected by the mpool region lock. */ + + /* This is the free list of BH_FROZEN_PAGEs, the frozen headers. */ SH_TAILQ_HEAD(__free_frozen) free_frozen; - /* Allocated blocks of frozen buffer headers. */ + /* + * This list of BH_FROZEN_ALLOCs contains all the BH_FROZEN_PAGEs, + * whether they are in free_frozen or busy (in a bh.vc version chain). 
+ */ SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen; }; @@ -550,9 +555,10 @@ struct __bh { /* SHARED */ #define BH_FROZEN 0x040 /* Frozen buffer: allocate & re-read. */ #define BH_TRASH 0x080 /* Page is garbage. */ #define BH_THAWED 0x100 /* Page was thawed. */ +#define BH_UNREACHABLE 0x200 /* Discard this defunct MVCC version. */ u_int16_t flags; - u_int32_t priority; /* Priority. */ + u_int32_t priority; /* Cache priority. */ SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ @@ -587,9 +593,12 @@ struct __bh_frozen_p { /* * BH_FROZEN_ALLOC -- - * Frozen buffer headers are allocated a page at a time in general. This - * structure is allocated at the beginning of the page so that the - * allocation chunks can be tracked and freed (for private environments). + * This structure is the container for one or more frozen buffer headers. + * Blocks of BH_FROZEN_PAGE structs are usually allocated a page at a time, + * though when an mpool is nearly full and a whole page isn't available + * there can be single-item blocks. BH_FROZEN_ALLOC is the block header + * allocated at the beginning of the chunk and is linked to the mpool's + * alloc_frozen so that the allocation chunks can be tracked and freed. */ struct __bh_frozen_a { SH_TAILQ_ENTRY links; @@ -602,33 +611,36 @@ struct __bh_frozen_a { (F_ISSET(PAGE_TO_BH(p), BH_DIRTY|BH_EXCLUSIVE) == (BH_DIRTY|BH_EXCLUSIVE)) #define BH_OWNER(env, bhp) \ - ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off)) + ((TXN_DETAIL *)R_ADDR(&(env)->tx_handle->reginfo, (bhp)->td_off)) #define BH_OWNED_BY(env, bhp, txn) ((txn) != NULL && \ - (bhp)->td_off != INVALID_ROFF && \ - (txn)->td == BH_OWNER(env, bhp)) + (bhp)->td_off != INVALID_ROFF && (txn)->td == BH_OWNER(env, bhp)) -#define VISIBLE_LSN(env, bhp) \ - (&BH_OWNER(env, bhp)->visible_lsn) +#define VISIBLE_LSN(env, bhp) (&BH_OWNER(env, bhp)->visible_lsn) /* - * Make a copy of the buffer's visible LSN, one field at a time. 
We rely on the - * 32-bit operations being atomic. The visible_lsn starts at MAX_LSN and is - * set during commit or abort to the current LSN. + * MVCC Versions are visible only to snapshot transactions whose read_lsn is at + * least as recent (large) as the buffer's lsn. Visibility checks must be made + * from newest to oldest along bhp.vc, stopping at the first visible one. + * Unversioned buffers (those with invalid td_off) are always visible. + * + * BH_VISIBLE() makes a copy of the buffer's visible LSN, one field at a time. + * We rely on the 32-bit operations being atomic. The visible_lsn starts at + * MAX_LSN and is set during commit or abort to the current LSN. * - * If we race with a commit / abort, we may see either the file or the offset + * If we race with a commit or abort, we may see either the file or the offset * still at UINT32_MAX, so vlsn is guaranteed to be in the future. That's OK, * since we had to take the log region lock to allocate the read LSN so we were * never going to see this buffer anyway. */ #define BH_VISIBLE(env, bhp, read_lsnp, vlsn) \ (bhp->td_off == INVALID_ROFF || \ - ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \ + ((vlsn).file = VISIBLE_LSN(env, bhp)->file, \ (vlsn).offset = VISIBLE_LSN(env, bhp)->offset, \ LOG_COMPARE((read_lsnp), &(vlsn)) >= 0)) #define BH_OBSOLETE(bhp, old_lsn, vlsn) (SH_CHAIN_HASNEXT(bhp, vc) ? \ - BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :\ + BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) : \ BH_VISIBLE(env, bhp, &(old_lsn), vlsn)) #define MVCC_SKIP_CURADJ(dbc, pgno) (dbc->txn != NULL && \ diff --git a/src/dbinc/mutex.h b/src/dbinc/mutex.h index b699142c..334d8f96 100644 --- a/src/dbinc/mutex.h +++ b/src/dbinc/mutex.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -24,10 +24,14 @@ extern "C" { #endif /* - * By default, spin 50 times per processor if fail to acquire a test-and-set - * mutex, we have anecdotal evidence it's a reasonable value. + * These specify the default spin parameters for test-and-set mutexes. A single + * processor system spins just once, a multiprocessor system spins 50 times per + * processor up to a default maximum of 200. This limit reduces excessive + * busy-waiting on machines with many hyperthreads. We have anecdotal evidence + * that these are reasonable default values. */ #define MUTEX_SPINS_PER_PROCESSOR 50 +#define MUTEX_SPINS_DEFAULT_MAX 200 /* * Mutexes are represented by unsigned, 32-bit integral values. As the @@ -163,13 +167,6 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #define __mutex_rdlock(a, b) __db_win32_mutex_readlock(a, b) #define __mutex_tryrdlock(a, b) __db_win32_mutex_tryreadlock(a, b) #endif -#elif defined(HAVE_MUTEX_FCNTL) -#define __mutex_init(a, b, c) __db_fcntl_mutex_init(a, b, c) -#define __mutex_lock(a, b) __db_fcntl_mutex_lock(a, b, 0) -#define __mutex_timedlock(a, b, c) __db_fcntl_lock(a, b, c) -#define __mutex_trylock(a, b) __db_fcntl_mutex_trylock(a, b) -#define __mutex_unlock(a, b) __db_fcntl_mutex_unlock(a, b) -#define __mutex_destroy(a, b) __db_fcntl_mutex_destroy(a, b) #else #define __mutex_init(a, b, c) __db_tas_mutex_init(a, b, c) #define __mutex_lock(a, b) __db_tas_mutex_lock(a, b, 0) @@ -184,9 +181,8 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #endif /* - * When there is no method to get a shared latch, fall back to - * implementing __mutex_rdlock() as getting an exclusive one. - * This occurs either when !HAVE_SHARED_LATCHES or HAVE_MUTEX_FCNTL. + * When there is no method to get a shared latch, fall back to implementing + * __mutex_rdlock() as an exclusive one. This may no longer be supported? 
*/ #ifndef __mutex_rdlock #define __mutex_rdlock(a, b) __mutex_lock(a, b) @@ -199,17 +195,25 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) * Lock/unlock a mutex. If the mutex was never required, the thread of * control can proceed without it. * - * We never fail to acquire or release a mutex without panicing. Simplify + * We rarely fail to acquire or release a mutex without panicking. Simplify * the macros to always return a panic value rather than saving the actual - * return value of the mutex routine. + * return value of the mutex routine. Use MUTEX_LOCK_RET() when the caller has + * a code path for a mutex failure, e.g., when cleaning up after a panic. */ #ifdef HAVE_MUTEX_SUPPORT #define MUTEX_LOCK(env, mutex) do { \ - if ((mutex) != MUTEX_INVALID && \ - __mutex_lock(env, mutex) != 0) \ + if ((mutex) != MUTEX_INVALID && __mutex_lock(env, mutex) != 0) \ return (DB_RUNRECOVERY); \ } while (0) +#define MUTEX_LOCK_RET(env, mutex) \ + ((mutex) == MUTEX_INVALID ? 0 : __mutex_lock(env, mutex)) + +/* + * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success, + * or possibly DB_RUNRECOVERY for failchk. + */ + /* * Always check the return value of MUTEX_TRYLOCK()! Expect 0 on success, * or DB_LOCK_NOTGRANTED, or possibly DB_RUNRECOVERY for failchk. @@ -217,9 +221,7 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) #define MUTEX_TRYLOCK(env, mutex) \ (((mutex) == MUTEX_INVALID) ? 0 : __mutex_trylock(env, mutex)) -/* - * Acquire a DB_MUTEX_SHARED "mutex" in shared mode. - */ +/* Acquire a latch (a DB_MUTEX_SHARED "mutex") in shared mode.
*/ #define MUTEX_READLOCK(env, mutex) do { \ if ((mutex) != MUTEX_INVALID && \ __mutex_rdlock(env, mutex) != 0) \ @@ -234,30 +236,68 @@ static inline int __db_pthread_mutex_tryreadlock(ENV *env, db_mutex_t mutex) return (DB_RUNRECOVERY); \ } while (0) -#define MUTEX_WAIT(env, mutex, duration) do { \ - int __ret; \ - if ((mutex) != MUTEX_INVALID && \ - (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \ - __ret != DB_TIMEOUT) \ - return (DB_RUNRECOVERY); \ +#define MUTEX_WAIT(env, mutex, duration) do { \ + int __ret; \ + if ((mutex) != MUTEX_INVALID && \ + (__ret = __mutex_timedlock(env, mutex, duration)) != 0 && \ + __ret != DB_TIMEOUT) \ + return (DB_RUNRECOVERY); \ } while (0) + +/* + * Check that a particular mutex is exclusively held at least by someone, not + * necessarily the current thread. + */ +#define MUTEX_IS_OWNED(env, mutex) \ + (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \ + F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \ + F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED)) #else /* * There are calls to lock/unlock mutexes outside of #ifdef's -- replace * the call with something the compiler can discard, but which will make - * if-then-else blocks work correctly. + * if-then-else blocks work correctly, and suppress unused variable messages. 
+ */ +#define MUTEX_LOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_LOCK_RET(env, mutex) ( env = (env), mutex = (mutex), 0) +#define MUTEX_TRYLOCK(env, mutex) ( env = (env), mutex = (mutex), 0) +#define MUTEX_READLOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_TRY_READLOCK(env, mutex) ( env = (env), mutex = (mutex), 0 ) +#define MUTEX_UNLOCK(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_REQUIRED(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_REQUIRED_READ(env, mutex) { env = (env); mutex = (mutex); } +#define MUTEX_WAIT(env, mutex, duration) { \ + (env) = (env); (mutex) = (mutex); (duration) = (duration); \ +} + +/* + * Every MUTEX_IS_OWNED() caller expects to own it. When there is no mutex + * support, act as if we have ownership. */ -#define MUTEX_LOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_TRYLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_READLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_TRY_READLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_UNLOCK(env, mutex) (mutex) = (mutex) -#define MUTEX_REQUIRED(env, mutex) (mutex) = (mutex) -#define MUTEX_REQUIRED_READ(env, mutex) (mutex) = (mutex) -#define MUTEX_WAIT(env, mutex, duration) (mutex) = (mutex) +#define MUTEX_IS_OWNED(env, mutex) 1 #endif /* + * Bulk initialization of mutexes in regions. + */ + +#define MUTEX_BULK_INIT(env, region, start, howmany) do { \ + DB_MUTEX *__mutexp; \ + db_mutex_t __i = start; \ + u_int32_t __n = howmany; \ + for (__mutexp = MUTEXP_SET(env, __i); \ + --__n > 0; \ + __mutexp = MUTEXP_SET(env, __i)) { \ + __mutexp->flags = 0; \ + __i = (F_ISSET(env, ENV_PRIVATE)) ? \ + ((uintptr_t)__mutexp + region->mutex_size) : __i + 1; \ + __mutexp->mutex_next_link = __i; \ + } \ + __mutexp->flags = 0; \ + __mutexp->mutex_next_link = MUTEX_INVALID; \ +} while (0) + +/* * Berkeley DB ports may require single-threading at places in the code. 
*/ #ifdef HAVE_MUTEX_VXWORKS diff --git a/src/dbinc/mutex_int.h b/src/dbinc/mutex_int.h index b9bccdf7..4a4468af 100644 --- a/src/dbinc/mutex_int.h +++ b/src/dbinc/mutex_int.h @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,14 @@ extern "C" { else \ RET_SET((pthread_mutex_lock(&(mutexp)->u.m.mutex)), ret); \ } while (0) +#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) do { \ + if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \ + RET_SET(pthread_rwlock_timedwrlock(&(mutexp)->u.rwlock, \ + (timespec)), ret); \ + else \ + RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \ + (timespec)), ret); \ +} while (0) #define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) do { \ if (F_ISSET(mutexp, DB_MUTEX_SHARED)) \ RET_SET((pthread_rwlock_trywrlock(&(mutexp)->u.rwlock)), \ @@ -84,6 +92,9 @@ extern "C" { #else #define RET_SET_PTHREAD_LOCK(mutexp, ret) \ RET_SET(pthread_mutex_lock(&(mutexp)->u.m.mutex), ret); +#define RET_SET_PTHREAD_TIMEDLOCK(mutexp, timespec, ret) \ + RET_SET(pthread_mutex_timedlock(&(mutexp)->u.m.mutex, \ + (timespec)), ret); #define RET_SET_PTHREAD_TRYLOCK(mutexp, ret) \ RET_SET(pthread_mutex_trylock(&(mutexp)->u.m.mutex), ret); #endif @@ -267,6 +278,11 @@ typedef abilock_t tsl_t; #include <sys/machlock.h> typedef lock_t tsl_t; +/* + * Solaris requires 8 byte alignment for pthread_mutex_t values. + */ +#define MUTEX_ALIGN 8 + /* * The functions are declared in <sys/machlock.h>, but under #ifdef KERNEL. * Re-declare them here to avoid warnings. 
@@ -778,6 +794,7 @@ MUTEX_SET(tsl_t *tsl) { static inline void MUTEX_UNSET(tsl_t *tsl) { __asm__ volatile( + " .set mips2 \n" " .set noreorder \n" " sync \n" " sw $0, %0 \n" @@ -892,15 +909,22 @@ struct __db_mutexmgr { REGINFO reginfo; /* Region information */ void *mutex_array; /* Base of the mutex array */ +#ifdef HAVE_FAILCHK_BROADCAST + /* + * The mutex lock functions wait for at most this long between checks + * for DB_MUTEX_OWNER_DEAD. This field needs no mutex protection. + */ + db_timeout_t failchk_polltime; +#endif }; /* Macros to lock/unlock the mutex region as a whole. */ -#define MUTEX_SYSTEM_LOCK(dbenv) \ - MUTEX_LOCK(dbenv, ((DB_MUTEXREGION *) \ - (dbenv)->mutex_handle->reginfo.primary)->mtx_region) -#define MUTEX_SYSTEM_UNLOCK(dbenv) \ - MUTEX_UNLOCK(dbenv, ((DB_MUTEXREGION *) \ - (dbenv)->mutex_handle->reginfo.primary)->mtx_region) +#define MUTEX_SYSTEM_LOCK(env) \ + MUTEX_LOCK(env, ((DB_MUTEXREGION *) \ + (env)->mutex_handle->reginfo.primary)->mtx_region) +#define MUTEX_SYSTEM_UNLOCK(env) \ + MUTEX_UNLOCK(env, ((DB_MUTEXREGION *) \ + (env)->mutex_handle->reginfo.primary)->mtx_region) /* * DB_MUTEXREGION -- @@ -927,6 +951,16 @@ typedef struct __db_mutexregion { /* SHARED */ } DB_MUTEXREGION; #ifdef HAVE_MUTEX_SUPPORT +/* + * MTX_DIAG turns on the recording of when and where a mutex was locked. It has + * a large impact, and should only be turned on when debugging mutexes. + */ +#define MUTEX_STACK_TEXT_SIZE 600 +typedef struct __mutex_history { /* SHARED */ + db_timespec when; + char stacktext[MUTEX_STACK_TEXT_SIZE]; +} MUTEX_HISTORY; + struct __db_mutex_t { /* SHARED */ /* Mutex. */ #ifdef MUTEX_FIELDS MUTEX_FIELDS /* Opaque thread mutex structures. */ @@ -959,9 +993,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ db_mutex_t mutex_next_link; /* Linked list of free mutexes. */ -#ifdef HAVE_STATISTICS int alloc_id; /* Allocation ID. */ +#ifdef HAVE_STATISTICS u_int32_t mutex_set_wait; /* Granted after wait. 
*/ u_int32_t mutex_set_nowait; /* Granted without waiting. */ #ifdef HAVE_SHARED_LATCHES @@ -973,7 +1007,9 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ u_int32_t hybrid_wakeup; /* for counting spurious wakeups */ #endif #endif - +#ifdef MUTEX_DIAG + MUTEX_HISTORY mutex_history; +#endif /* * A subset of the flag arguments for __mutex_alloc(). * @@ -992,19 +1028,6 @@ struct __db_mutex_t { /* SHARED */ /* Mutex. */ (indx) * \ ((DB_MUTEXREGION *)env->mutex_handle->reginfo.primary)->mutex_size)) -/* - * Check that a particular mutex is exclusively held at least by someone, not - * necessarily the current thread. - */ -#ifdef HAVE_MUTEX_SUPPORT -#define MUTEX_IS_OWNED(env, mutex) \ - (mutex == MUTEX_INVALID || !MUTEX_ON(env) || \ - F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || \ - F_ISSET(MUTEXP_SET(env, mutex), DB_MUTEX_LOCKED)) -#else -#define MUTEX_IS_OWNED(env, mutex) 0 -#endif - #if defined(HAVE_MUTEX_HYBRID) || defined(DB_WIN32) || \ (defined(HAVE_SHARED_LATCHES) && !defined(HAVE_MUTEX_PTHREADS)) #define MUTEXP_IS_BUSY(mutexp) \ diff --git a/src/dbinc/os.h b/src/dbinc/os.h index 2515e6ee..ea1fd2c4 100644 --- a/src/dbinc/os.h +++ b/src/dbinc/os.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/partition.h b/src/dbinc/partition.h index 09e42573..11cdfa6f 100644 --- a/src/dbinc/partition.h +++ b/src/dbinc/partition.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * $Id$ @@ -22,6 +22,7 @@ typedef struct __db_partition { u_int32_t (*callback) (DB *, DBT *); #define PART_CALLBACK 0x01 #define PART_RANGE 0x02 +#define PART_KEYS_SETUP 0x04 u_int32_t flags; } DB_PARTITION; @@ -36,7 +37,14 @@ typedef struct __part_internal { #ifdef HAVE_PARTITION #define PART_NAME "__dbp.%s.%03d" -#define PART_LEN (strlen("__dbp..")+3) +/* + * Currently we only support no more than 1000000 partitions. + * If the limit is changed, the PART_DIGITS and PART_MAXIMUM + * should be changed accordingly. + */ +#define PART_DIGITS 6 +#define PART_MAXIMUM 1000000 +#define PART_LEN (sizeof("__dbp..") + PART_DIGITS) #define PART_PREFIX "__dbp." #define IS_PARTITION_DB_FILE(name) (strncmp(name, PART_PREFIX, \ sizeof(PART_PREFIX) - 1) == 0) diff --git a/src/dbinc/perfmon.h b/src/dbinc/perfmon.h index c3b9b9fa..e89eba33 100644 --- a/src/dbinc/perfmon.h +++ b/src/dbinc/perfmon.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/qam.h b/src/dbinc/qam.h index 657c11e2..d18f91f3 100644 --- a/src/dbinc/qam.h +++ b/src/dbinc/qam.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/queue.h b/src/dbinc/queue.h index 5a62741a..c53941ab 100644 --- a/src/dbinc/queue.h +++ b/src/dbinc/queue.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1991, 1993 diff --git a/src/dbinc/region.h b/src/dbinc/region.h index ac0ff16f..070aff5f 100644 --- a/src/dbinc/region.h +++ b/src/dbinc/region.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -134,7 +134,10 @@ typedef enum { REGION_TYPE_LOG, REGION_TYPE_MPOOL, REGION_TYPE_MUTEX, - REGION_TYPE_TXN } reg_type_t; + REGION_TYPE_TXN, + /* This enum always must be the last, and is the largest valid type. */ + REGION_TYPE_MAX = REGION_TYPE_TXN +} reg_type_t; #define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or * Win16 segment identifiers. They are @@ -196,10 +199,10 @@ typedef struct __db_reg_env { /* SHARED */ /* - * The mtx_regenv mutex protects the environment reference count and - * memory allocation from the primary shared region (the crypto, thread - * control block and replication implementations allocate memory from - * the primary shared region). + * The mtx_regenv mutex protects the environment reference count, + * blob threshold and memory allocation from the primary shared region + * (the crypto, thread control block and replication implementations + * allocate memory from the primary shared region). * * The rest of the fields are initialized at creation time, and don't * need mutex protection. The flags, op_timestamp and rep_timestamp @@ -209,6 +212,7 @@ typedef struct __db_reg_env { /* SHARED */ */ db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */ u_int32_t refcnt; /* References to the environment. */ + u_int32_t blob_threshold; /* Environment wide blob threshold. */ u_int32_t region_cnt; /* Number of REGIONs. */ roff_t region_off; /* Offset of region array */ @@ -227,6 +231,8 @@ typedef struct __db_reg_env { /* SHARED */ time_t op_timestamp; /* Timestamp for operations. 
*/ time_t rep_timestamp; /* Timestamp for rep db handles. */ u_int32_t reg_panic; /* DB_REGISTER triggered panic */ + u_int32_t failure_panic; /* Failchk or mutex lock saw a crash. */ + char failure_symptom[DB_FAILURE_SYMPTOM_SIZE]; uintmax_t unused; /* The ALLOC_LAYOUT structure follows * the REGENV structure in memory and * contains uintmax_t fields. Force @@ -308,11 +314,14 @@ struct __db_reginfo_t { /* __env_region_attach IN parameters. */ /* * PANIC_ISSET, PANIC_CHECK: - * Check to see if the DB environment is dead. + * Check to see if the DB environment is dead. If the environment is still + * attached to its regions, look in the REGENV. Otherwise, check whether + * the region had the panic state set when this even detached from it. */ #define PANIC_ISSET(env) \ - ((env) != NULL && (env)->reginfo != NULL && \ - ((REGENV *)(env)->reginfo->primary)->panic != 0 && \ + ((env) != NULL && ((env)->reginfo != NULL ? \ + ((REGENV *)(env)->reginfo->primary)->panic != 0 : \ + F_ISSET(env, ENV_REMEMBER_PANIC)) && \ !F_ISSET((env)->dbenv, DB_ENV_NOPANIC)) #define PANIC_CHECK(env) \ diff --git a/src/dbinc/rep.h b/src/dbinc/rep.h index 75004239..f3bdf481 100644 --- a/src/dbinc/rep.h +++ b/src/dbinc/rep.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -19,6 +19,7 @@ extern "C" { * Names of client temp databases. */ #define REPFILEPREFIX "__db.rep" +#define REPBLOBNAME "__db.rep.blob.db" #define REPDBNAME "__db.rep.db" #define REPPAGENAME "__db.reppg.db" @@ -42,43 +43,58 @@ extern "C" { /* * Message types */ -#define REP_INVALID 0 /* Invalid message type. */ -#define REP_ALIVE 1 /* I am alive message. */ -#define REP_ALIVE_REQ 2 /* Request for alive messages. */ -#define REP_ALL_REQ 3 /* Request all log records greater than LSN. 
*/ -#define REP_BULK_LOG 4 /* Bulk transfer of log records. */ -#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */ -#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */ -#define REP_FILE 7 /* Page of a database file. NOTUSED */ -#define REP_FILE_FAIL 8 /* File requested does not exist. */ -#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */ -#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */ -#define REP_LOG 11 /* Log record. */ -#define REP_LOG_MORE 12 /* There are more log records to request. */ -#define REP_LOG_REQ 13 /* Request for a log record. */ -#define REP_MASTER_REQ 14 /* Who is the master */ -#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */ -#define REP_NEWFILE 16 /* Announce a log file change. */ -#define REP_NEWMASTER 17 /* Announces who the master is. */ -#define REP_NEWSITE 18 /* Announces that a site has heard from a new - * site; like NEWCLIENT, but indirect. A - * NEWCLIENT message comes directly from the new - * client while a NEWSITE comes indirectly from - * someone who heard about a NEWSITE. - */ -#define REP_PAGE 19 /* Database page. */ -#define REP_PAGE_FAIL 20 /* Requested page does not exist. */ -#define REP_PAGE_MORE 21 /* There are more pages to request. */ -#define REP_PAGE_REQ 22 /* Request for a database page. */ -#define REP_REREQUEST 23 /* Force rerequest. */ -#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/ -#define REP_UPDATE 25 /* Environment hotcopy information. */ -#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */ -#define REP_VERIFY 27 /* A log record for verification. */ -#define REP_VERIFY_FAIL 28 /* The client is outdated. */ -#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */ -#define REP_VOTE1 30 /* Send out your information for an election. */ -#define REP_VOTE2 31 /* Send a "you are master" vote. */ +#define REP_INVALID 0 /* Invalid message type. */ +#define REP_ALIVE 1 /* I am alive message. 
*/ +#define REP_ALIVE_REQ 2 /* Request for alive messages. */ +#define REP_ALL_REQ 3 /* Request all log records greater than + * LSN. */ +#define REP_BLOB_ALL_REQ 4 /* Request all the given blob files. */ +#define REP_BLOB_CHUNK 5 /* A piece of data contained in a blob + * file. */ +#define REP_BLOB_CHUNK_REQ 6 /* Request a piece of data from a blob + * file. */ +#define REP_BLOB_UPDATE 7 /* A list of blob files for a + * database. */ +#define REP_BLOB_UPDATE_REQ 8 /* Request blob files. */ +#define REP_BULK_LOG 9 /* Bulk transfer of log records. */ +#define REP_BULK_PAGE 10 /* Bulk transfer of pages. */ +#define REP_DUPMASTER 11 /* Duplicate master detected; + * propagate. */ +#define REP_FILE 12 /* Page of a database file. NOTUSED */ +#define REP_FILE_FAIL 13 /* File requested does not exist. */ +#define REP_FILE_REQ 14 /* Request for a database file. + * NOTUSED */ +#define REP_LEASE_GRANT 15 /* Client grants a lease to a master. */ +#define REP_LOG 16 /* Log record. */ +#define REP_LOG_MORE 17 /* There are more log records to + * request. */ +#define REP_LOG_REQ 18 /* Request for a log record. */ +#define REP_MASTER_REQ 19 /* Who is the master */ +#define REP_NEWCLIENT 20 /* Announces the presence of a new + * client. */ +#define REP_NEWFILE 21 /* Announce a log file change. */ +#define REP_NEWMASTER 22 /* Announces who the master is. */ +#define REP_NEWSITE 23 /* Announces that a site has heard from + * a new site; like NEWCLIENT, but + * indirect. A NEWCLIENT message comes + * directly from the new client while a + * NEWSITE comes indirectly from + * someone who heard about a NEWSITE.*/ +#define REP_PAGE 24 /* Database page. */ +#define REP_PAGE_FAIL 25 /* Requested page does not exist. */ +#define REP_PAGE_MORE 26 /* There are more pages to request. */ +#define REP_PAGE_REQ 27 /* Request for a database page. */ +#define REP_REREQUEST 28 /* Force rerequest. 
*/ +#define REP_START_SYNC 29 /* Tell client to begin syncing a ckp.*/ +#define REP_UPDATE 30 /* Environment hotcopy information. */ +#define REP_UPDATE_REQ 31 /* Request for hotcopy information. */ +#define REP_VERIFY 32 /* A log record for verification. */ +#define REP_VERIFY_FAIL 33 /* The client is outdated. */ +#define REP_VERIFY_REQ 34 /* Request for a log record to + * verify. */ +#define REP_VOTE1 35 /* Send out your information for an + * election. */ +#define REP_VOTE2 36 /* Send a "you are master" vote. */ /* * Maximum message number for conversion tables. Update this * value as the largest message number above increases. @@ -90,7 +106,7 @@ extern "C" { * NOTE: When changing messages above, the two tables for upgrade support * need adjusting. They are in rep_util.c. */ -#define REP_MAX_MSG 31 +#define REP_MAX_MSG 36 /* * This is the list of client-to-client requests messages. @@ -99,6 +115,8 @@ extern "C" { */ #define REP_MSG_REQ(rectype) \ (rectype == REP_ALL_REQ || \ + rectype == REP_BLOB_ALL_REQ || \ + rectype == REP_BLOB_CHUNK_REQ || \ rectype == REP_LOG_REQ || \ rectype == REP_PAGE_REQ || \ rectype == REP_VERIFY_REQ) @@ -125,6 +143,9 @@ extern "C" { #define DB_LOGVERSION_51 17 #define DB_LOGVERSION_52 18 #define DB_LOGVERSION_53 19 +#define DB_LOGVERSION_60 20 +#define DB_LOGVERSION_60p1 21 +#define DB_LOGVERSION_61 22 #define DB_LOGVERSION_MIN DB_LOGVERSION_44 #define DB_REPVERSION_INVALID 0 #define DB_REPVERSION_44 3 @@ -132,11 +153,12 @@ extern "C" { #define DB_REPVERSION_46 4 #define DB_REPVERSION_47 5 #define DB_REPVERSION_48 5 -#define DB_REPVERSION_50 5 #define DB_REPVERSION_51 5 #define DB_REPVERSION_52 6 #define DB_REPVERSION_53 7 -#define DB_REPVERSION DB_REPVERSION_53 +#define DB_REPVERSION_60 7 +#define DB_REPVERSION_61 8 +#define DB_REPVERSION DB_REPVERSION_61 #define DB_REPVERSION_MIN DB_REPVERSION_44 /* @@ -204,9 +226,20 @@ extern "C" { #define REP_INITVERSION 3 /* + * View/partial replication file name. + * The file is empty. 
It exists as a permanent indicator that this + * environment can never be master. + */ +#define REPVIEW "__db.rep.view" +#define IS_VIEW_SITE(env) \ + (REP_ON(env) && \ + ((env)->rep_handle->region->stat.st_view != 0)) + +/* * Database types for __rep_client_dbinit */ typedef enum { + REP_BLOB, /* Blob file database. */ REP_DB, /* Log record database. */ REP_PG /* Pg database. */ } repdb_t; @@ -239,7 +272,7 @@ typedef enum { typedef enum { SYNC_OFF, /* No recovery. */ SYNC_LOG, /* Recovery - log. */ - SYNC_PAGE, /* Recovery - pages. */ + SYNC_PAGE, /* Recovery - pages and blobs. */ SYNC_UPDATE, /* Recovery - update. */ SYNC_VERIFY /* Recovery - verify. */ } repsync_t; @@ -346,6 +379,17 @@ typedef struct __rep { /* SHARED */ u_int32_t first_vers; /* Log version of first log file. */ DB_LSN last_lsn; /* Latest LSN we need. */ /* These are protected by mtx_clientdb. */ + db_seq_t gap_bl_hi_id; /* Last id in the blob gap. */ + db_seq_t gap_bl_hi_sid; /* Last sid in the blob gap. */ + off_t gap_bl_hi_off; /* Last offset in the blob gap. */ + db_seq_t last_blob_id; /* Last id on the list to process. */ + db_seq_t last_blob_sid; /* Last sid on the list to process. */ + db_seq_t prev_blob_id; /* Previous last id on list. */ + db_seq_t prev_blob_sid; /* Previous last sid on list. */ + db_seq_t highest_id; /* Highest file id to request. */ + u_int32_t blob_more_files;/* More blob files to be processed. */ + int blob_sync; /* Currently handling blobs. */ + int blob_rereq; /* When to rereq a blob update msg. */ db_timespec last_pg_ts; /* Last page stored timestamp. */ db_pgno_t ready_pg; /* Next pg expected. */ db_pgno_t waiting_pg; /* First pg after gap. */ @@ -391,11 +435,13 @@ typedef struct __rep { /* SHARED */ roff_t siteinfo_off; /* Offset of site array region. */ u_int site_cnt; /* Array slots in use. */ u_int site_max; /* Total array slots allocated. */ + u_int sites_avail; /* Total number of available sites. */ int self_eid; /* Where to find the local site. 
*/ u_int siteinfo_seq; /* Number of updates to this info. */ u_int32_t min_log_file; /* Earliest log needed by repgroup. */ pid_t listener; + u_int listener_nthreads; /* # of msg threads in listener. */ int perm_policy; db_timeout_t ack_timeout; @@ -403,6 +449,11 @@ typedef struct __rep { /* SHARED */ db_timeout_t connection_retry_wait; db_timeout_t heartbeat_frequency; /* Max period between msgs. */ db_timeout_t heartbeat_monitor_timeout; + u_int32_t inqueue_max_gbytes; + u_int32_t inqueue_max_bytes; + u_int32_t inqueue_rz_gbytes; + u_int32_t inqueue_rz_bytes; + u_int32_t inqueue_full_event_on; #endif /* HAVE_REPLICATION_THREADS */ /* Statistics. */ @@ -419,12 +470,16 @@ typedef struct __rep { /* SHARED */ #define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */ #define REP_C_AUTOINIT 0x00002 /* Auto initialization. */ #define REP_C_AUTOROLLBACK 0x00004 /* Discard client txns: sync. */ -#define REP_C_BULK 0x00008 /* Bulk transfer. */ -#define REP_C_DELAYCLIENT 0x00010 /* Delay client sync-up. */ -#define REP_C_ELECTIONS 0x00020 /* Repmgr to use elections. */ -#define REP_C_INMEM 0x00040 /* In-memory replication. */ -#define REP_C_LEASE 0x00080 /* Leases configured. */ -#define REP_C_NOWAIT 0x00100 /* Immediate error return. */ +#define REP_C_AUTOTAKEOVER 0x00008 /* Auto listener take over. */ +#define REP_C_BULK 0x00010 /* Bulk transfer. */ +#define REP_C_DELAYCLIENT 0x00020 /* Delay client sync-up. */ +#define REP_C_ELECT_LOGLENGTH 0x00040 /* Log length wins election. */ +#define REP_C_ELECTIONS 0x00080 /* Repmgr to use elections. */ +#define REP_C_INMEM 0x00100 /* In-memory replication. */ +#define REP_C_LEASE 0x00200 /* Leases configured. */ +#define REP_C_NOWAIT 0x00400 /* Immediate error return. */ +#define REP_C_PREFMAS_CLIENT 0x00800 /* Preferred master client. */ +#define REP_C_PREFMAS_MASTER 0x01000 /* Preferred master site. */ u_int32_t config; /* Configuration flags. */ /* Election. 
*/ @@ -455,15 +510,17 @@ typedef struct __rep { /* SHARED */ #define REP_F_CLIENT 0x00000008 /* Client replica. */ #define REP_F_DELAY 0x00000010 /* Delaying client sync-up. */ #define REP_F_GROUP_ESTD 0x00000020 /* Rep group is established. */ -#define REP_F_INUPDREQ 0x00000040 /* Thread in rep_update_req. */ -#define REP_F_LEASE_EXPIRED 0x00000080 /* Leases guaranteed expired. */ -#define REP_F_MASTER 0x00000100 /* Master replica. */ -#define REP_F_MASTERELECT 0x00000200 /* Master elect. */ -#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */ -#define REP_F_NIMDBS_LOADED 0x00000800 /* NIMDBs are materialized. */ -#define REP_F_SKIPPED_APPLY 0x00001000 /* Skipped applying a record. */ -#define REP_F_START_CALLED 0x00002000 /* Rep_start called. */ -#define REP_F_SYS_DB_OP 0x00004000 /* Operation in progress. */ +#define REP_F_HOLD_GEN 0x00000040 /* PrefMas startup hold gen. */ +#define REP_F_INUPDREQ 0x00000080 /* Thread in rep_update_req. */ +#define REP_F_LEASE_EXPIRED 0x00000100 /* Leases guaranteed expired. */ +#define REP_F_MASTER 0x00000200 /* Master replica. */ +#define REP_F_MASTERELECT 0x00000400 /* Master elect. */ +#define REP_F_NEWFILE 0x00000800 /* Newfile in progress. */ +#define REP_F_NIMDBS_LOADED 0x00001000 /* NIMDBs are materialized. */ +#define REP_F_READONLY_MASTER 0x00002000 /* PrefMas readonly master. */ +#define REP_F_SKIPPED_APPLY 0x00004000 /* Skipped applying a record. */ +#define REP_F_START_CALLED 0x00008000 /* Rep_start called. */ +#define REP_F_SYS_DB_OP 0x00010000 /* Operation in progress. */ u_int32_t flags; } REP; @@ -525,7 +582,7 @@ do { \ /* * REP_F_EPHASE0 is not a *real* election phase. It is used for * master leases and allowing the client to find the master or - * expire its lease. However, EPHASE0 is cleared by __rep_elect_done. + * expire its lease. 
*/ #define IN_ELECTION(R) \ FLD_ISSET((R)->elect_flags, REP_E_PHASE1 | REP_E_PHASE2) @@ -594,6 +651,22 @@ do { \ } while (0) +/* Macros to determine current replication configuration options. */ +#define REP_CONFIG_IS_SET(env, flags) \ + (REP_ON(env) ? \ + FLD_ISSET(((env)->rep_handle->region)->config, flags) : \ + FLD_ISSET(((env)->rep_handle)->config, flags)) +#ifdef HAVE_REPLICATION_THREADS +#define PREFMAS_IS_SET(env) \ + (REP_CONFIG_IS_SET(env, \ + (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT))) +#else +#define PREFMAS_IS_SET(env) 0 +#endif +#define IS_PREFMAS_MODE(env) \ + (REP_ON(env) && PREFMAS_IS_SET(env) && \ + ((env)->rep_handle->region)->config_nsites < 3) + /* * Gap processing flags. These provide control over the basic * gap processing algorithm for some special cases. @@ -603,11 +676,28 @@ do { \ /* REREQUEST is a superset of FORCE. */ /* + * Internal options for rep_start_int(). These are used by preferred master + * mode to help coordinate between the sites during changes of master. + */ +#define REP_START_FORCE_ROLECHG 0x001 /* Force role change to advance gen. */ +#define REP_START_HOLD_CLIGEN 0x002 /* Hold client gen before doing + * lsnhist match. */ +#define REP_START_WAIT_LOCKMSG 0x004 /* Wait for REP_LOCKOUT_MSG. */ + +/* * Flags indicating what kind of record we want to back up to, in the log. */ -#define REP_REC_COMMIT 0x001 /* Most recent commit record. */ -#define REP_REC_PERM 0x002 /* Most recent perm record. */ +#define REP_REC_COMMIT 0x001 /* Most recent commit record. */ +#define REP_REC_PERM 0x002 /* Most recent perm record. */ /* PERM is a superset of COMMIT. */ +#define REP_REC_PERM_DEL 0x004 /* Most recent PERM, or fail if a + * file delete is found first. */ + +/* + * Permanent record types. + */ +#define IS_PERM_RECTYPE(rectype) \ + ((rectype) == DB___txn_regop || (rectype) == DB___txn_ckp) /* * Basic pre/post-amble processing. @@ -692,7 +782,7 @@ do { \ * machine instruction. 
A single 32-bit integer value is safe without a * mutex, but most other types of value should use a mutex. * - * Any use of a mutex must be inside a matched pair of ENV_ENTER() and + * Use of a db_mutex_t mutex must be inside a matched pair of ENV_ENTER() and * ENV_LEAVE() macros. This ensures that if a thread dies while holding * a lock (i.e. a mutex), recovery can clean it up so that it does not * indefinitely block other threads. @@ -727,6 +817,9 @@ struct __db_rep { /* * End of shared configuration information. */ + int (*partial) /* View/partial replication function. */ + __P((DB_ENV *, const char *, int *, u_int32_t)); + int (*send) /* Send function. */ __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)); @@ -745,6 +838,7 @@ struct __db_rep { DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */ DB *file_dbp; /* This file's page info. */ DBC *queue_dbc; /* Dbc for a queue file. */ + DB *blob_dbp; /* Blob file database. */ /* * Please change __rep_print_all (rep_stat.c) to track any changes made @@ -759,6 +853,7 @@ struct __db_rep { /* * Replication Framework (repmgr) per-process information. */ + int config_nthreads;/* Configured msg processing threads. */ u_int nthreads; /* Msg processing threads. */ u_int athreads; /* Space allocated for msg threads. */ u_int non_rep_th; /* Threads in GMDB or channel msgs. */ @@ -771,10 +866,13 @@ struct __db_rep { db_timeout_t connection_retry_wait; db_timeout_t heartbeat_frequency; /* Max period between msgs. */ db_timeout_t heartbeat_monitor_timeout; + u_int32_t inqueue_max_gbytes; + u_int32_t inqueue_max_bytes; /* Thread synchronization. */ REPMGR_RUNNABLE *selector, **messengers, **elect_threads; REPMGR_RUNNABLE *preferred_elect_thr; + REPMGR_RUNNABLE *takeover_thread; db_timespec repstart_time; mgr_mutex_t *mutex; cond_var_t check_election, gmdb_idle, msg_avail; @@ -799,12 +897,18 @@ struct __db_rep { CONNECTION_LIST connections; RETRY_Q_HEADER retries; /* Sites needing connection retry. 
*/ struct { - int size; + u_int32_t gbytes; + u_int32_t bytes; STAILQ_HEAD(__repmgr_q_header, __repmgr_message) header; } input_queue; socket_t listen_fd; db_timespec last_bcast; /* Time of last broadcast msg. */ + db_timespec last_hbeat; /* Time of last heartbeat (prefmas). */ + db_timespec l_listener_chk; /* Time to check local listener. */ + db_timeout_t l_listener_wait;/* Timeout to check local listener. */ + db_timespec m_listener_chk; /* Time to check master listener. */ + db_timeout_t m_listener_wait;/* Timeout to check master listener. */ /* * Status of repmgr. It is ready when repmgr is not yet started. It @@ -813,12 +917,15 @@ struct __db_rep { */ enum { ready, running, stopped } repmgr_status; int new_connection; /* Since last master seek attempt. */ + int demotion_pending; /* We're being demoted to a view. */ int takeover_pending; /* We've been elected master. */ + int rejoin_pending; /* Join group retry after rejection. */ int gmdb_busy; int client_intent; /* Will relinquish master role. */ int gmdb_dirty; int have_gmdb; int seen_repmsg; + int view_mismatch; /* View callback and gmdb don't match. */ /* * Flag to show what kind of transaction is currently in progress. @@ -854,6 +961,16 @@ struct __db_rep { u_int8_t *restored_list; size_t restored_list_length; + /* + * Preferred master mode indicator for a pending action. A + * master_switch is initiated when the preferred master site is + * ready to take over as master. A start_temp_master is initiated + * when the client site needs to start as the temporary master. + */ + enum { no_action, master_switch, start_temp_master } prefmas_pending; + /* The LSN at the very beginning of preferred master site startup. */ + DB_LSN prefmas_init_lsn; + /* Application's message dispatch call-back function. 
*/ void (*msg_dispatch) __P((DB_ENV *, DB_CHANNEL *, DBT *, u_int32_t, u_int32_t)); @@ -920,6 +1037,10 @@ struct __db_rep { } else if (!F_ISSET((env)->rep_handle, DBREP_APP_REPMGR)) \ F_SET((env)->rep_handle, DBREP_APP_BASEAPI); \ } while (0) +#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ + (db_rep)->l_listener_wait = timeout; \ + (db_rep)->m_listener_wait = 3 * timeout; \ +} while (0) #else /* @@ -935,6 +1056,9 @@ struct __db_rep { #define APP_SET_BASEAPI(env) do { \ ; \ } while (0) +#define ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout) do { \ + ; \ +} while (0) #endif /* HAVE_REPLICATION_THREADS */ /* @@ -945,22 +1069,27 @@ struct __db_rep { * compatibility with old versions, these values must be reserved explicitly in * the list of flag values (below) */ -#define DB_LOG_PERM_42_44 0x20 -#define DB_LOG_RESEND_42_44 0x40 -#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */ - -#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */ -#define REPCTL_FLUSH 0x02 /* Record should be flushed. */ -#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */ -#define REPCTL_INIT 0x08 /* Internal init message. */ -#define REPCTL_LEASE 0x10 /* Lease related message.. */ +#define DB_LOG_PERM_42_44 0x020 +#define DB_LOG_RESEND_42_44 0x040 +#define REPCTL_INIT_45 0x002 /* Back compatible flag value. */ + +/* + * Add new REPCTL flags to the end of this list to preserve compatibility + * with old versions. + */ +#define REPCTL_ELECTABLE 0x001 /* Upgraded client is electable. */ +#define REPCTL_FLUSH 0x002 /* Record should be flushed. */ +#define REPCTL_GROUP_ESTD 0x004 /* Message from site in a group. */ +#define REPCTL_INIT 0x008 /* Internal init message. */ +#define REPCTL_LEASE 0x010 /* Lease related message. */ /* * Skip over reserved values 0x20 * and 0x40, as explained above. */ -#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */ +#define REPCTL_LOG_END 0x080 /* Approximate end of group-wide log. 
*/ #define REPCTL_PERM DB_LOG_PERM_42_44 #define REPCTL_RESEND DB_LOG_RESEND_42_44 +#define REPCTL_INMEM_ONLY 0x100 /* In-memory databases only. */ /* * File info flags for internal init. The per-database (i.e., file) flag @@ -1094,6 +1223,20 @@ typedef struct { DBT *objs; } linfo_t; +/* + * Used to store information on the child transaction that opens a blob meta + * database. In partial replication processing the child transaction of the + * blob meta database must be delayed until after processing the child + * transaction that opens the database that owns the BMD. + */ +typedef struct { + db_seq_t blob_file_id; + DB_LSN lsn; + u_int32_t child; + void *next; + void *prev; +} DELAYED_BLOB_LIST; + #if defined(__cplusplus) } #endif diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h index d8fd199c..a38defa2 100644 --- a/src/dbinc/repmgr.h +++ b/src/dbinc/repmgr.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -47,20 +47,29 @@ extern "C" { * In protocol version one there were only three message types: 1, 2, and 3; so * 3 was the max. In protocol version 2 we introduced heartbeats, type 4. * (Protocol version 3 did not introduce any new message types.) In version 4 - * we introduced a few more new message types, the largest of which had value 7. + * we introduced a few more new message types, the largest of which had value 8. + * Protocol version 5 did not introduce any new message types, but changed + * the format of site info and membership data to support views. + * + * Protocol version 6 introduced preferred master mode, which added several + * new REPMGR_OWN messages. 
*/ #define REPMGR_MAX_V1_MSG_TYPE 3 #define REPMGR_MAX_V2_MSG_TYPE 4 #define REPMGR_MAX_V3_MSG_TYPE 4 #define REPMGR_MAX_V4_MSG_TYPE 8 +#define REPMGR_MAX_V5_MSG_TYPE 8 +#define REPMGR_MAX_V6_MSG_TYPE 8 #define HEARTBEAT_MIN_VERSION 2 #define CHANNEL_MIN_VERSION 4 #define CONN_COLLISION_VERSION 4 #define GM_MIN_VERSION 4 #define OWN_MIN_VERSION 4 +#define VIEW_MIN_VERSION 5 +#define PREFMAS_MIN_VERSION 6 /* The range of protocol versions we're willing to support. */ -#define DB_REPMGR_VERSION 4 +#define DB_REPMGR_VERSION 6 #define DB_REPMGR_MIN_VERSION 1 /* @@ -73,18 +82,30 @@ extern "C" { * Like the message format types, these message type values should be * permanently frozen. */ -#define REPMGR_CONNECT_REJECT 1 -#define REPMGR_GM_FAILURE 2 -#define REPMGR_GM_FORWARD 3 -#define REPMGR_JOIN_REQUEST 4 -#define REPMGR_JOIN_SUCCESS 5 -#define REPMGR_PARM_REFRESH 6 -#define REPMGR_REJOIN 7 -#define REPMGR_REMOVE_REQUEST 8 -#define REPMGR_REMOVE_SUCCESS 9 -#define REPMGR_RESOLVE_LIMBO 10 -#define REPMGR_SHARING 11 - +#define REPMGR_CONNECT_REJECT 1 +#define REPMGR_GM_FAILURE 2 +#define REPMGR_GM_FORWARD 3 +#define REPMGR_JOIN_REQUEST 4 +#define REPMGR_JOIN_SUCCESS 5 +#define REPMGR_PARM_REFRESH 6 +#define REPMGR_REJOIN 7 +#define REPMGR_REMOVE_REQUEST 8 +#define REPMGR_REMOVE_SUCCESS 9 +#define REPMGR_RESOLVE_LIMBO 10 +#define REPMGR_SHARING 11 +#define REPMGR_LSNHIST_REQUEST 12 +#define REPMGR_LSNHIST_RESPONSE 13 +#define REPMGR_PREFMAS_FAILURE 14 +#define REPMGR_PREFMAS_SUCCESS 15 +#define REPMGR_READONLY_MASTER 16 +#define REPMGR_READONLY_RESPONSE 17 +#define REPMGR_RESTART_CLIENT 18 + +/* Detect inconsistencies between view callback and site's gmdb. 
*/ +#define PARTICIPANT_TO_VIEW(db_rep, site) \ + ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) +#define VIEW_TO_PARTICIPANT(db_rep, site) \ + (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) struct __repmgr_connection; typedef struct __repmgr_connection REPMGR_CONNECTION; @@ -98,7 +119,8 @@ struct __cond_waiters_table; typedef struct __cond_waiters_table COND_WAITERS_TABLE; /* Current Group Membership DB format ID. */ -#define REPMGR_GMDB_FMT_VERSION 1 +#define REPMGR_GMDB_FMT_VERSION 2 +#define REPMGR_GMDB_FMT_MIN_VERSION 1 #ifdef DB_WIN32 typedef SOCKET socket_t; @@ -151,6 +173,17 @@ typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1]; #define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC) #define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC) +/* Default preferred master automatic configuration values. */ +#define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100)) +#define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75 +#define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200 + +/* Defaults for undocumented incoming queue maximum messages. */ +#define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE) +#define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85 + typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST; typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER; typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER; @@ -170,14 +203,20 @@ struct __repmgr_runnable { /* * Options governing requested behavior of election thread. */ -#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */ -#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */ -#define ELECT_F_IMMED 0x04 /* Start with immediate election. */ -#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */ -#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. 
*/ +#define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */ +#define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */ +#define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */ +#define ELECT_F_IMMED 0x08 /* Start with immediate election. */ +#define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */ +#define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */ u_int32_t flags; - int eid; /* For Connector thread. */ + /* For connector thread. */ + struct { + int eid; +#define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */ + u_int32_t flags; + } conn_th; /* * Args for other thread types can be added here in the future @@ -265,6 +304,7 @@ struct __queued_output { */ typedef struct __repmgr_message { STAILQ_ENTRY(__repmgr_message) entries; + size_t size; __repmgr_msg_hdr_args msg_hdr; union { struct { @@ -343,6 +383,7 @@ struct __repmgr_connection { #define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */ #define CONN_READY 6 /* Everything's fine. */ int state; + u_int32_t auto_takeover;/* Connection to remote listener candidate. */ /* * Input: while we're reading a message, we keep track of what phase @@ -464,6 +505,8 @@ typedef struct { SITEADDR addr; /* Unprocessed network address of site. */ u_int32_t config; /* Configuration flags: peer, helper, etc. */ u_int32_t status; /* Group membership status. */ + u_int32_t flags; /* Group membership flags. */ + u_int32_t listener_cand;/* Number of listener candidates of site. */ } SITEINFO; /* @@ -489,6 +532,42 @@ typedef struct { ((u_int)i) < db_rep->site_cnt; \ (int)(++(i)) == db_rep->self_eid ? ++(i) : i) +/* + * Enable replication manager auto listener takeover. + */ +#define HAVE_REPLICATION_LISTENER_TAKEOVER 1 + +/* Listener candidate, that is subordinate rep-aware process. 
*/ +#define IS_LISTENER_CAND(db_rep) \ + (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \ + IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running) + +/* + * The number of listener candidates for each remote site is maintained in + * the listener process and used in subordinate rep-aware processes. + */ +#define SET_LISTENER_CAND(cond, op) \ + do { \ + if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \ + !IS_SUBORDINATE(db_rep) && (cond)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + (sites[eid].listener_cand)op; \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + +#define CHECK_LISTENER_CAND(val, op, tval, fval) \ + do { \ + if (IS_LISTENER_CAND(db_rep)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + val = ((sites[eid].listener_cand)op) ? \ + (tval) : (fval); \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + struct __repmgr_site { repmgr_netaddr_t net_addr; @@ -499,12 +578,14 @@ struct __repmgr_site { * host/port network address is promised to be associated with the * locally known EID for the life of the environment. */ - u_int32_t membership; /* Status flags from GMDB. */ + u_int32_t membership; /* Status value from GMDB. */ + u_int32_t gmdb_flags; /* Flags from GMDB. */ u_int32_t config; /* Flags from site->set_config() */ /* * Everything below here is applicable only to remote sites. */ + u_int32_t max_ack_gen; /* Master generation for max_ack. */ DB_LSN max_ack; /* Best ack we've heard from this site. */ int ack_policy; /* Or 0 if unknown. */ u_int16_t alignment; /* Requirements for app channel msgs. */ @@ -604,11 +685,11 @@ struct __channel { * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and * (3) db_rep->connections. * - * 1. SITE->ref.conn points to our connection with the main process running - * at the given site, if such a connection exists. 
We may have initiated - * the connection to the site ourselves, or we may have received it as an - * incoming connection. Once it is established there is very little - * difference between those two cases. + * 1. SITE->ref.conn points to our connection with the listener process + * running at the given site, if such a connection exists. We may have + * initiated the connection to the site ourselves, or we may have received + * it as an incoming connection. Once it is established there is very + * little difference between those two cases. * * 2. SITE->sub_conns is a list of connections we have with subordinate * processes running at the given site. There can be any number of these @@ -694,6 +775,7 @@ struct __channel { */ #define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */ #define ELECTABLE_SITE 0x04 +#define REPMGR_AUTOTAKEOVER 0x08 /* Could become main connection. */ #define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */ /* @@ -719,13 +801,20 @@ typedef struct { * As with message formats, stored formats are defined in repmgr.msg. */ /* - * Flags for the Group Membership data portion of a record. Like message type - * codes, these values are frozen across releases, in order to avoid pointless - * churn. + * Status values for the Group Membership data portion of a record. Like + * message type codes, these values are frozen across releases, in order to + * avoid pointless churn. These values are mutually exclusive. */ #define SITE_ADDING 0x01 #define SITE_DELETING 0x02 #define SITE_PRESENT 0x04 +/* + * Flags for the Group Membership data portion of a record. These values are + * also frozen across releases. These values are bit fields and may be OR'ed + * together. + */ +#define SITE_VIEW 0x01 +#define SITE_JOIN_ELECTABLE 0x02 /* * Message types whose processing could take a long time. 
We're careful to @@ -755,9 +844,9 @@ typedef struct { * fraction of the code, it's a tiny fraction of the time: repmgr spends most of * its time in a call to select(), and as well a bit in calls into the Base * replication API. All of those release the mutex. - * Access to repmgr's shared list of site addresses is protected by - * another mutex: mtx_repmgr. And, when changing space allocation for that site - * list we conform to the convention of acquiring renv->mtx_regenv. These are + * Access to repmgr's shared values is protected by another mutex: + * mtx_repmgr. And, when changing space allocation for that site list + * we conform to the convention of acquiring renv->mtx_regenv. These are * less frequent of course. * When it's necessary to acquire more than one of these mutexes, the * ordering priority (or "lock ordering protocol") is: diff --git a/src/dbinc/shqueue.h b/src/dbinc/shqueue.h index 22464462..20e0fae7 100644 --- a/src/dbinc/shqueue.h +++ b/src/dbinc/shqueue.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -140,6 +140,17 @@ struct { \ ((struct type *)(((u_int8_t *)(elm)) + (elm)->field.sle_next))) /* + * __SH_LIST_WAS_EMPTY is private API. SH_LIST_FIRST is not thread-safe; + * the slh_first field could be evaluated multiple times if the optimizer + * does not eliminate the second load. __SH_LIST_WAS_EMPTY tests whether a + * prior call of SH_LIST_FIRSTP occurred while the list was empty; i.e., its + * relative offset was -1. It is thread-safe to call SH_LIST_FIRSTP and then + * test the resulting pointer with __SH_LIST_WAS_EMPTY. + */ +#define __SH_LIST_WAS_EMPTY(head, ptr) \ + ((u_int8_t *)(ptr) == (((u_int8_t *)(head)) + (-1))) + + /* *__SH_LIST_PREV_OFF is private API. 
It calculates the address of * the elm->field.sle_next member of a SH_LIST structure. All offsets * between elements are relative to that point in SH_LIST structures. diff --git a/src/dbinc/tcl_db.h b/src/dbinc/tcl_db.h index 4c56164f..99992467 100644 --- a/src/dbinc/tcl_db.h +++ b/src/dbinc/tcl_db.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -16,7 +16,7 @@ extern "C" { #define MSG_SIZE 100 /* Message size */ enum INFOTYPE { - I_AUX, I_DB, I_DBC, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN}; + I_AUX, I_DB, I_DBC, I_DBSTREAM, I_ENV, I_LOCK, I_LOGC, I_MP, I_NDBM, I_PG, I_SEQ, I_TXN}; #define MAX_ID 8 /* Maximum number of sub-id's we need */ #define DBTCL_PREP 64 /* Size of txn_recover preplist */ @@ -24,9 +24,11 @@ enum INFOTYPE { #define DBTCL_DBM 1 #define DBTCL_NDBM 2 -#define DBTCL_GETCLOCK 0 -#define DBTCL_GETLIMIT 1 -#define DBTCL_GETREQ 2 +#define DBTCL_GETCLOCK 0 +#define DBTCL_GETINQUEUE_MAX 1 +#define DBTCL_GETINQUEUE_REDZONE 2 +#define DBTCL_GETLIMIT 3 +#define DBTCL_GETREQ 4 #define DBTCL_MUT_ALIGN 0 #define DBTCL_MUT_INCR 1 @@ -36,9 +38,11 @@ enum INFOTYPE { /* * Data structure to record information about events that have occurred. Tcl - * command "env event_info" can retrieve the information. For now, we record - * only one occurrence per event type; "env event_info -clear" can be used to - * reset the info. + * command "env event_info" can retrieve all the information except the number + * of times, and "env event_count" can retrieve the number of times a specific + * event is fired. We added "env event_count" instead of merging the times + * information into "env event_info" to avoid breaking the existing tests. + * Tcl command "env event_info -clear" can be used to reset the info. 
* * Besides the bit flag that records the fact that an event type occurred, some * event types have associated "info" and we record that here too. When new @@ -47,16 +51,17 @@ enum INFOTYPE { * with the "env event_info" results. */ typedef struct dbtcl_event_info { - u_int32_t events; /* Bit flag on for each event fired. */ - int panic_error; - int newmaster_eid; - int added_eid; - int removed_eid; - pid_t attached_process; - int connected_eid; + u_int32_t events; /* Bit flag on for each event fired. */ + int panic_error; + int newmaster_eid; + int added_eid; + int removed_eid; + pid_t attached_process; + int connected_eid; DB_REPMGR_CONN_ERR conn_broken_info; DB_REPMGR_CONN_ERR conn_failed_try_info; - DB_LSN sync_point; + DB_LSN sync_point; + size_t count[32]; /* The number of times for each event. */ } DBTCL_EVENT_INFO; /* @@ -99,6 +104,7 @@ typedef struct dbtcl_info { DB_LOCK *lock; DB_LOGC *logc; DB_MPOOLFILE *mp; + DB_STREAM *dbsp; DB_TXN *txnp; void *anyp; } un; @@ -128,6 +134,7 @@ typedef struct dbtcl_info { Tcl_Obj *i_isalive; Tcl_Obj *i_part_callback; Tcl_Obj *i_rep_send; + Tcl_Obj *i_rep_view; Tcl_Obj *i_second_call; /* Environment ID for the i_rep_send callback. */ @@ -144,6 +151,7 @@ typedef struct dbtcl_info { #define i_anyp un.anyp #define i_dbp un.dbp #define i_dbcp un.dbcp +#define i_dbsp un.dbsp #define i_envp un.envp #define i_lock un.lock #define i_logc un.logc @@ -170,6 +178,8 @@ typedef struct dbtcl_info { #define i_dbdbcid i_otherid[0] +#define i_dbcdbsid i_otherid[0] + extern int __debug_on, __debug_print, __debug_stop, __debug_test; typedef struct dbtcl_global { @@ -202,6 +212,7 @@ extern DBTCL_GLOBAL __dbtcl_global; * functions this will typically go before the "free" function to free the * stat structure returned by DB. 
*/ +#ifdef HAVE_STATISTICS #define MAKE_STAT_LIST(s, v) do { \ result = _SetListElemInt(interp, res, (s), (long)(v)); \ if (result != TCL_OK) \ @@ -213,6 +224,11 @@ extern DBTCL_GLOBAL __dbtcl_global; if (result != TCL_OK) \ goto error; \ } while (0) +#else +/* These do-nothing versions streamline the code & reduce warning messages. */ +#define MAKE_STAT_LIST(s, v) if (0) goto error +#define MAKE_WSTAT_LIST(s, v) if (0) goto error +#endif /* * MAKE_STAT_LSN appends a {name {LSNfile LSNoffset}} pair to a result list @@ -257,13 +273,14 @@ extern DBTCL_GLOBAL __dbtcl_global; * This macro also assumes a label "error" to go to in the event of a Tcl * error. */ -#define MAKE_SITE_LIST(e, h, p, s, pr) do { \ - myobjc = 5; \ +#define MAKE_SITE_LIST(e, h, p, s, pr, vw) do { \ + myobjc = 6; \ myobjv[0] = Tcl_NewIntObj(e); \ myobjv[1] = Tcl_NewStringObj((h), (int)strlen(h)); \ myobjv[2] = Tcl_NewIntObj((int)p); \ myobjv[3] = Tcl_NewStringObj((s), (int)strlen(s)); \ myobjv[4] = Tcl_NewStringObj((pr), (int)strlen(pr)); \ + myobjv[5] = Tcl_NewStringObj((vw), (int)strlen(vw)); \ thislist = Tcl_NewListObj(myobjc, myobjv); \ result = Tcl_ListObjAppendElement(interp, res, thislist); \ if (result != TCL_OK) \ diff --git a/src/dbinc/txn.h b/src/dbinc/txn.h index 7cbae263..682d7c42 100644 --- a/src/dbinc/txn.h +++ b/src/dbinc/txn.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc/win_db.h b/src/dbinc/win_db.h index ba57cd1f..e22aba98 100644 --- a/src/dbinc/win_db.h +++ b/src/dbinc/win_db.h @@ -1,17 +1,21 @@ /*- - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * The following provides the information necessary to build Berkeley * DB on native Windows, and other Windows environments such as MinGW. */ /* - * Berkeley DB requires at least Windows 2000, tell Visual Studio of the - * requirement. + * Berkeley DB requires at least Windows 2000, and Windows XP if we are using + * Visual Studio 2012. Tell Visual Studio of the requirement. */ #ifndef _WIN32_WINNT +#if _MSC_VER >= 1700 +#define _WIN32_WINNT 0x0501 +#else #define _WIN32_WINNT 0x0500 #endif +#endif #ifndef DB_WINCE #include <sys/types.h> @@ -69,12 +73,46 @@ #endif #define getpid GetCurrentProcessId #define snprintf _snprintf +#ifndef strcasecmp #define strcasecmp _stricmp #define strncasecmp _strnicmp +#endif #define vsnprintf _vsnprintf #define h_errno WSAGetLastError() +#ifdef DB_WINCE +/* Macros used by setvbuf on WINCE */ +#ifndef _IOFBF +#define _IOFBF 0x0000 +#endif +#ifndef _IOLBF +#define _IOLBF 0x0040 +#endif +#ifndef _IONBF +#define _IONBF 0x0004 +#endif +/* The macros for time functions */ +#define freopen __ce_freopen +#define gmtime __ce_gmtime +#define mktime __ce_mktime +#define remove __ce_remove +#define SECSPERMIN 60 +#define MINSPERHOUR 60 +#define HOURSPERDAY 24 +#define DAYSPERWEEK 7 +#define DAYSPERNYEAR 365 +#define DAYSPERLYEAR 366 +#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) +#define SECSPERDAY ((long) SECSPERHOUR * HOURSPERDAY) +#define MONSPERYEAR 12 +#define TM_YEAR_BASE 1900 +#define TM_YEAR_EPOCH 1970 +#define isleap(y) ((((y) % 4) == 0 && ((y) % 100) != 0) || ((y) % 400) == 0) +extern const __DB_IMPORT unsigned int mon_lengths[][MONSPERYEAR]; +extern const __DB_IMPORT unsigned int year_lengths[]; +#endif + /* * Win32 does not have getopt. * diff --git a/src/dbinc/xa.h b/src/dbinc/xa.h index 7283c1ea..7b7e2cb0 100644 --- a/src/dbinc/xa.h +++ b/src/dbinc/xa.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbinc_auto/api_flags.in b/src/dbinc_auto/api_flags.in index 9727ede2..a10b6b62 100644 --- a/src/dbinc_auto/api_flags.in +++ b/src/dbinc_auto/api_flags.in @@ -36,6 +36,7 @@ #define DB_FLUSH 0x00000002 #define DB_FORCE 0x00000001 #define DB_FORCESYNC 0x00000001 +#define DB_FORCESYNCENV 0x00000002 #define DB_FOREIGN_ABORT 0x00000001 #define DB_FOREIGN_CASCADE 0x00000002 #define DB_FOREIGN_NULLIFY 0x00000004 @@ -53,8 +54,9 @@ #define DB_INIT_REP 0x00001000 #define DB_INIT_TXN 0x00002000 #define DB_INORDER 0x00000020 -#define DB_INTERNAL_PERSISTENT_DB 0x00001000 -#define DB_INTERNAL_TEMPORARY_DB 0x00002000 +#define DB_INTERNAL_BLOB_DB 0x00001000 +#define DB_INTERNAL_PERSISTENT_DB 0x00002000 +#define DB_INTERNAL_TEMPORARY_DB 0x00004000 #define DB_JOIN_NOSORT 0x00000001 #define DB_LEGACY 0x00000004 #define DB_LOCAL_SITE 0x00000008 @@ -67,12 +69,14 @@ #define DB_LOCK_SWITCH 0x00000020 #define DB_LOCK_UPGRADE 0x00000040 #define DB_LOG_AUTO_REMOVE 0x00000001 +#define DB_LOG_BLOB 0x00000002 #define DB_LOG_CHKPNT 0x00000001 #define DB_LOG_COMMIT 0x00000004 -#define DB_LOG_DIRECT 0x00000002 -#define DB_LOG_DSYNC 0x00000004 -#define DB_LOG_IN_MEMORY 0x00000008 +#define DB_LOG_DIRECT 0x00000004 +#define DB_LOG_DSYNC 0x00000008 +#define DB_LOG_IN_MEMORY 0x00000010 #define DB_LOG_NOCOPY 0x00000008 +#define DB_LOG_NOSYNC 0x00000020 #define DB_LOG_NOT_DURABLE 0x00000010 #define DB_LOG_NO_DATA 0x00000002 #define DB_LOG_VERIFY_CAF 0x00000001 @@ -84,7 +88,7 @@ #define DB_LOG_VERIFY_VERBOSE 0x00000040 #define DB_LOG_VERIFY_WARNING 0x00000080 #define DB_LOG_WRNOSYNC 0x00000020 -#define DB_LOG_ZERO 0x00000010 +#define DB_LOG_ZERO 0x00000040 #define DB_MPOOL_CREATE 0x00000001 #define DB_MPOOL_DIRTY 0x00000002 #define DB_MPOOL_DISCARD 0x00000001 @@ -102,17 +106,18 @@ #define DB_MUTEX_ALLOCATED 0x00000001 #define DB_MUTEX_LOCKED 0x00000002 #define DB_MUTEX_LOGICAL_LOCK 0x00000004 
+#define DB_MUTEX_OWNER_DEAD 0x00000020 #define DB_MUTEX_PROCESS_ONLY 0x00000008 #define DB_MUTEX_SELF_BLOCK 0x00000010 -#define DB_MUTEX_SHARED 0x00000020 -#define DB_NOERROR 0x00004000 +#define DB_MUTEX_SHARED 0x00000040 +#define DB_NOERROR 0x00008000 #define DB_NOFLUSH 0x00001000 #define DB_NOLOCKING 0x00002000 #define DB_NOMMAP 0x00000010 #define DB_NOORDERCHK 0x00000002 #define DB_NOPANIC 0x00004000 #define DB_NOSYNC 0x00000001 -#define DB_NO_AUTO_COMMIT 0x00008000 +#define DB_NO_AUTO_COMMIT 0x00010000 #define DB_NO_CHECKPOINT 0x00008000 #define DB_ODDFILESIZE 0x00000080 #define DB_ORDERCHKONLY 0x00000004 @@ -123,7 +128,7 @@ #define DB_PR_PAGE 0x00000010 #define DB_PR_RECOVERYTEST 0x00000020 #define DB_RDONLY 0x00000400 -#define DB_RDWRMASTER 0x00010000 +#define DB_RDWRMASTER 0x00020000 #define DB_READ_COMMITTED 0x00000400 #define DB_READ_UNCOMMITTED 0x00000200 #define DB_RECNUM 0x00000040 @@ -134,17 +139,20 @@ #define DB_RENUMBER 0x00000080 #define DB_REPMGR_CONF_2SITE_STRICT 0x00000001 #define DB_REPMGR_CONF_ELECTIONS 0x00000002 +#define DB_REPMGR_CONF_PREFMAS_CLIENT 0x00000004 +#define DB_REPMGR_CONF_PREFMAS_MASTER 0x00000008 #define DB_REPMGR_NEED_RESPONSE 0x00000001 #define DB_REPMGR_PEER 0x00000010 #define DB_REP_ANYWHERE 0x00000001 #define DB_REP_CLIENT 0x00000001 -#define DB_REP_CONF_AUTOINIT 0x00000004 -#define DB_REP_CONF_AUTOROLLBACK 0x00000008 -#define DB_REP_CONF_BULK 0x00000010 -#define DB_REP_CONF_DELAYCLIENT 0x00000020 -#define DB_REP_CONF_INMEM 0x00000040 -#define DB_REP_CONF_LEASE 0x00000080 -#define DB_REP_CONF_NOWAIT 0x00000100 +#define DB_REP_CONF_AUTOINIT 0x00000010 +#define DB_REP_CONF_AUTOROLLBACK 0x00000020 +#define DB_REP_CONF_BULK 0x00000040 +#define DB_REP_CONF_DELAYCLIENT 0x00000080 +#define DB_REP_CONF_ELECT_LOGLENGTH 0x00000100 +#define DB_REP_CONF_INMEM 0x00000200 +#define DB_REP_CONF_LEASE 0x00000400 +#define DB_REP_CONF_NOWAIT 0x00000800 #define DB_REP_ELECTION 0x00000004 #define DB_REP_MASTER 0x00000002 #define 
DB_REP_NOBUFFER 0x00000002 @@ -161,8 +169,9 @@ #define DB_SEQ_WRAP 0x00000008 #define DB_SEQ_WRAPPED 0x00000010 #define DB_SET_LOCK_TIMEOUT 0x00000001 -#define DB_SET_REG_TIMEOUT 0x00000004 -#define DB_SET_TXN_NOW 0x00000008 +#define DB_SET_MUTEX_FAILCHK_TIMEOUT 0x00000004 +#define DB_SET_REG_TIMEOUT 0x00000008 +#define DB_SET_TXN_NOW 0x00000010 #define DB_SET_TXN_TIMEOUT 0x00000002 #define DB_SHALLOW_DUP 0x00000100 #define DB_SNAPSHOT 0x00000200 @@ -188,7 +197,7 @@ #define DB_SYSTEM_MEM 0x00080000 #define DB_THREAD 0x00000020 #define DB_TIME_NOTGRANTED 0x00040000 -#define DB_TRUNCATE 0x00020000 +#define DB_TRUNCATE 0x00040000 #define DB_TXN_BULK 0x00000010 #define DB_TXN_FAMILY 0x00000040 #define DB_TXN_NOSYNC 0x00000001 @@ -206,23 +215,24 @@ #define DB_VERB_DEADLOCK 0x00000002 #define DB_VERB_FILEOPS 0x00000004 #define DB_VERB_FILEOPS_ALL 0x00000008 -#define DB_VERB_RECOVERY 0x00000010 -#define DB_VERB_REGISTER 0x00000020 -#define DB_VERB_REPLICATION 0x00000040 -#define DB_VERB_REPMGR_CONNFAIL 0x00000080 -#define DB_VERB_REPMGR_MISC 0x00000100 -#define DB_VERB_REP_ELECT 0x00000200 -#define DB_VERB_REP_LEASE 0x00000400 -#define DB_VERB_REP_MISC 0x00000800 -#define DB_VERB_REP_MSGS 0x00001000 -#define DB_VERB_REP_SYNC 0x00002000 -#define DB_VERB_REP_SYSTEM 0x00004000 -#define DB_VERB_REP_TEST 0x00008000 -#define DB_VERB_WAITSFOR 0x00010000 +#define DB_VERB_MVCC 0x00000010 +#define DB_VERB_RECOVERY 0x00000020 +#define DB_VERB_REGISTER 0x00000040 +#define DB_VERB_REPLICATION 0x00000080 +#define DB_VERB_REPMGR_CONNFAIL 0x00000100 +#define DB_VERB_REPMGR_MISC 0x00000200 +#define DB_VERB_REP_ELECT 0x00000400 +#define DB_VERB_REP_LEASE 0x00000800 +#define DB_VERB_REP_MISC 0x00001000 +#define DB_VERB_REP_MSGS 0x00002000 +#define DB_VERB_REP_SYNC 0x00004000 +#define DB_VERB_REP_SYSTEM 0x00008000 +#define DB_VERB_REP_TEST 0x00010000 +#define DB_VERB_WAITSFOR 0x00020000 #define DB_VERIFY 0x00000002 #define DB_VERIFY_PARTITION 0x00040000 #define DB_WRITECURSOR 0x00000010 
#define DB_WRITELOCK 0x00000020 -#define DB_WRITEOPEN 0x00040000 +#define DB_WRITEOPEN 0x00080000 #define DB_XA_CREATE 0x00000001 #define DB_YIELDCPU 0x00080000 diff --git a/src/dbinc_auto/blob_ext.h b/src/dbinc_auto/blob_ext.h new file mode 100644 index 00000000..3eac5c8d --- /dev/null +++ b/src/dbinc_auto/blob_ext.h @@ -0,0 +1,41 @@ +/* DO NOT EDIT: automatically built by dist/s_include. */ +#ifndef _blob_ext_h_ +#define _blob_ext_h_ + +#if defined(__cplusplus) +extern "C" { +#endif + +int __blob_file_create __P ((DBC *, DB_FH **, db_seq_t *)); +int __blob_file_close __P ((DBC *, DB_FH *, u_int32_t)); +int __blob_file_delete __P((DBC *, db_seq_t)); +int __blob_file_open __P((DB *, DB_FH **, db_seq_t, u_int32_t, int)); +int __blob_file_read __P((ENV *, DB_FH *, DBT *, off_t, u_int32_t)); +int __blob_file_write __P((DBC *, DB_FH *, DBT *, off_t, db_seq_t, off_t *, u_int32_t)); +int __blob_bulk __P((DBC *, u_int32_t, db_seq_t, u_int8_t *)); +int __blob_get __P((DBC *, DBT *, db_seq_t, off_t, void **, u_int32_t *)); +int __blob_put __P(( DBC *, DBT *, db_seq_t *, off_t *size, DB_LSN *)); +int __blob_repl __P((DBC *, DBT *, db_seq_t, db_seq_t *,off_t *)); +int __blob_del __P((DBC *, db_seq_t)); +int __db_stream_init __P((DBC *, DB_STREAM **, u_int32_t)); +int __db_stream_close_int __P ((DB_STREAM *)); +int __blob_make_sub_dir __P((ENV *, char **, db_seq_t, db_seq_t)); +int __blob_make_meta_fname __P((ENV *, DB *, char **)); +int __blob_get_dir __P((DB *, char **)); +int __blob_generate_dir_ids __P((DB *, DB_TXN *, db_seq_t *)); +int __blob_generate_id __P((DB *, DB_TXN *, db_seq_t *)); +int __blob_highest_id __P((DB *, DB_TXN *, db_seq_t *)); +void __blob_calculate_dirs __P((db_seq_t, char *, int *, int *)); +int __blob_id_to_path __P((ENV *, const char *, db_seq_t, char **)); +int __blob_str_to_id __P((ENV *, const char **, db_seq_t *)); +int __blob_path_to_dir_ids __P((ENV *, const char *, db_seq_t *, db_seq_t *)); +int __blob_salvage __P((ENV *, db_seq_t, off_t, 
size_t, db_seq_t, db_seq_t, DBT *)); +int __blob_vrfy __P((ENV *, db_seq_t, off_t, db_seq_t, db_seq_t, db_pgno_t, u_int32_t)); +int __blob_del_hierarchy __P((ENV *)); +int __blob_del_all __P((DB *, DB_TXN *, int)); +int __blob_copy_all __P((DB*, const char *, u_int32_t)); + +#if defined(__cplusplus) +} +#endif +#endif /* !_blob_ext_h_ */ diff --git a/src/dbinc_auto/btree_ext.h b/src/dbinc_auto/btree_ext.h index c90f5b80..bdd95750 100644 --- a/src/dbinc_auto/btree_ext.h +++ b/src/dbinc_auto/btree_ext.h @@ -8,11 +8,11 @@ extern "C" { int __bam_compact_int __P((DBC *, DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *)); int __bam_compact_opd __P((DBC *, db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *)); -int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *)); -int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *)); -int __bam_defcmp __P((DB *, const DBT *, const DBT *)); +int __bam_truncate_ipages __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *)); +int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, int (*)(DB *, const DBT *, const DBT *, size_t *), int *, size_t *)); +int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *)); size_t __bam_defpfx __P((DB *, const DBT *, const DBT *)); -int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *)); +int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *, size_t *)); int __bam_defcompress __P((DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *)); int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)); int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t)); @@ -52,7 +52,7 @@ int __bam_db_create __P((DB *)); int __bam_db_close __P((DB *)); void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *)); int __bam_set_flags __P((DB *, u_int32_t *flagsp)); -int __bam_set_bt_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *))); +int __bam_set_bt_compare __P((DB 
*, int (*)(DB *, const DBT *, const DBT *, size_t *))); int __bam_set_bt_compress __P((DB *, int (*)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); int __bam_get_bt_minkey __P((DB *, u_int32_t *)); void __bam_copy_config __P((DB *, DB*, u_int32_t)); @@ -115,6 +115,8 @@ int __bam_traverse __P((DBC *, db_lockmode_t, db_pgno_t, int (*)(DBC *, PAGE *, int __bam_30_btreemeta __P((DB *, char *, u_int8_t *)); int __bam_31_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __bam_31_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); +int __bam_60_btreemeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); +int __bam_60_lbtree __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, db_pgno_t, u_int32_t)); int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); diff --git a/src/dbinc_auto/common_ext.h b/src/dbinc_auto/common_ext.h index ac16e9db..1a94d3a1 100644 --- a/src/dbinc_auto/common_ext.h +++ b/src/dbinc_auto/common_ext.h @@ -25,6 +25,7 @@ int __db_pgfmt __P((ENV *, db_pgno_t)); #ifdef DIAGNOSTIC void __db_assert __P((ENV *, const char *, const char *, int)); #endif +void __env_panic_event __P((ENV *, int)); int __env_panic_msg __P((ENV *)); int __env_panic __P((ENV *, int)); char *__db_unknown_error __P((int)); @@ -33,9 +34,10 @@ void __db_err __P((const ENV *, int, const char *, ...)) __attribute__ ((__forma void __db_errx __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3))); void __db_errcall __P((const DB_ENV *, int, db_error_set_t, const char *, va_list)); void __db_errfile __P((const DB_ENV *, int, db_error_set_t, const char *, va_list)); -void __db_msgadd __P((ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4))); -void 
__db_msgadd_ap __P((ENV *, DB_MSGBUF *, const char *, va_list)); +void __db_msgadd __P((const ENV *, DB_MSGBUF *, const char *, ...)) __attribute__ ((__format__ (__printf__, 3, 4))); +void __db_msgadd_ap __P((const ENV *, DB_MSGBUF *, const char *, va_list)); void __db_msg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3))); +void __db_debug_msg __P((const ENV *, const char *, ...)); void __db_repmsg __P((const ENV *, const char *, ...)) __attribute__ ((__format__ (__printf__, 2, 3))); int __db_unknown_flag __P((ENV *, char *, u_int32_t)); int __db_unknown_type __P((ENV *, char *, DBTYPE)); @@ -50,6 +52,24 @@ int __db_check_lsn __P((ENV *, DB_LSN *, DB_LSN *)); int __db_rdonly __P((const ENV *, const char *)); int __db_space_err __P((const DB *)); int __db_failed __P((const ENV *, const char *, pid_t, db_threadid_t)); +int __env_failure_remember __P((const ENV *, const char *)); +#ifdef HAVE_ERROR_HISTORY +void __db_thread_init __P((void)); +#endif +#ifdef HAVE_ERROR_HISTORY +int __db_diags __P((const ENV *, int)); +#endif +#ifdef HAVE_ERROR_HISTORY +DB_MSGBUF *__db_deferred_get __P((void)); +#endif +#ifdef HAVE_ERROR_HISTORY +void __db_deferred_discard __P((void)); +#endif +#ifdef HAVE_ERROR_HISTORY +int __db_remember_context __P((const ENV *, DB_MSGBUF *, int)); +#endif +char * __db_ctimespec __P((const db_timespec *, char *)); +char *__db_fmt_quote __P((char *, size_t, const char *)); int __db_getlong __P((DB_ENV *, const char *, char *, long, long, long *)); int __db_getulong __P((DB_ENV *, const char *, char *, u_long, u_long, u_long *)); void __db_idspace __P((u_int32_t *, int, u_int32_t *, u_int32_t *)); diff --git a/src/dbinc_auto/db_ext.h b/src/dbinc_auto/db_ext.h index de2a6ce4..719fc0c5 100644 --- a/src/dbinc_auto/db_ext.h +++ b/src/dbinc_auto/db_ext.h @@ -62,14 +62,19 @@ int __db_merge_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __db_pgno_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int 
__db_init_print __P((ENV *, DB_DISTAB *)); int __db_dbbackup_pp __P((DB_ENV *, const char *, const char *, u_int32_t)); -int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t)); -int __db_backup __P((DB_ENV *, const char *, u_int32_t)); +int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t, u_int32_t, const char *)); +int backup_data_copy __P(( DB_ENV *, const char *, const char *, const char *, int)); +int __db_backup_pp __P((DB_ENV *, const char *, u_int32_t)); int __dbc_close __P((DBC *)); int __dbc_destroy __P((DBC *)); int __dbc_cmp __P((DBC *, DBC *, int *)); int __dbc_count __P((DBC *, db_recno_t *)); int __dbc_del __P((DBC *, u_int32_t)); int __dbc_idel __P((DBC *, u_int32_t)); +int __dbc_db_stream __P((DBC *, DB_STREAM **, u_int32_t)); +int __dbc_get_blob_id __P((DBC *, db_seq_t *)); +int __dbc_get_blob_size __P((DBC *, off_t *)); +int __dbc_set_blob_size __P((DBC *, off_t)); #ifdef HAVE_COMPRESSION int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t)); #endif @@ -93,15 +98,16 @@ u_int32_t __db_partsize __P((u_int32_t, DBT *)); #ifdef DIAGNOSTIC void __db_check_skeyset __P((DB *, DBT *)); #endif +int __dbc_diags __P((DBC *, int)); int __cdsgroup_begin __P((ENV *, DB_TXN **)); int __cdsgroup_begin_pp __P((DB_ENV *, DB_TXN **)); int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *)); -int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int)); -int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *)); -int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t)); +int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *)); +int __db_truncate_overflow __P((DBC *, db_pgno_t, PAGE **, DB_COMPACT *, int *)); +int __db_truncate_root __P((DBC *, PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *)); int __db_find_free __P((DBC *, u_int32_t, u_int32_t, db_pgno_t, db_pgno_t *)); int 
__db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t)); -int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *)); +int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *, int *)); int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *)); int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *)); int __db_decrypt_pg __P((ENV *, DB *, PAGE *)); @@ -185,6 +191,10 @@ int __db_has_pagelock __P((ENV *, DB_LOCKER *, DB_MPOOLFILE *, PAGE *, db_lockmo int __db_lput __P((DBC *, DB_LOCK *)); int __db_create_internal __P((DB **, ENV *, u_int32_t)); int __dbh_am_chk __P((DB *, u_int32_t)); +int __db_get_blob_threshold __P((DB *, u_int32_t *)); +int __db_set_blob_threshold __P((DB *, u_int32_t, u_int32_t)); +int __db_blobs_enabled __P((DB *)); +int __db_set_dup_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int __db_get_flags __P((DB *, u_int32_t *)); int __db_set_flags __P((DB *, u_int32_t)); int __db_get_lorder __P((DB *, int *)); @@ -197,12 +207,13 @@ int __db_init_subdb __P((DB *, DB *, const char *, DB_THREAD_INFO *, DB_TXN *)); int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t)); int __db_meta_setup __P((ENV *, DB *, const char *, DBMETA *, u_int32_t, u_int32_t)); int __db_reopen __P((DBC *)); +int __db_alloc_dbt __P((ENV *, DBT *, u_int32_t, u_int32_t *, u_int32_t *, void **, u_int32_t *)); int __db_goff __P((DBC *, DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); int __db_poff __P((DBC *, const DBT *, db_pgno_t *)); int __db_ovref __P((DBC *, db_pgno_t)); int __db_doff __P((DBC *, db_pgno_t)); -int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *)); -int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *), int *)); +int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, int (*)(DB *, const DBT *, const DBT *, size_t *), int *, size_t *)); +int __db_coff __P((DBC *, const DBT *, const DBT *, int (*)(DB *, const DBT *, const DBT *, size_t 
*), int *)); int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); int __db_vrfy_ovfl_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t)); int __db_safe_goff __P((DB *, VRFY_DBINFO *, db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t)); @@ -221,11 +232,12 @@ int __db_name_to_val __P((FN const *, char *)); const char *__db_pagetype_to_string __P((u_int32_t)); int __db_dump_pp __P((DB *, const char *, int (*)(void *, const void *), void *, int, int)); int __db_dump __P((DB *, const char *, int (*)(void *, const void *), void *, int, int)); -int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int)); +int __db_prdbt __P((DBT *, int, const char *, void *, int (*)(void *, const void *), int, int, int)); int __db_prheader __P((DB *, const char *, int, int, void *, int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t)); int __db_prfooter __P((void *, int (*)(void *, const void *))); int __db_pr_callback __P((void *, const void *)); const char * __db_dbtype_to_string __P((DBTYPE)); +char *__db_tohex __P((const void *, size_t, char *)); int __db_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __db_addrem_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __db_big_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); @@ -263,6 +275,8 @@ int __db_rename_pp __P((DB *, const char *, const char *, const char *, u_int32_ int __db_rename_int __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *, u_int32_t)); int __db_ret __P((DBC *, PAGE *, u_int32_t, DBT *, void **, u_int32_t *)); int __db_retcopy __P((ENV *, DBT *, void *, u_int32_t, void **, u_int32_t *)); +int __db_dbt_clone __P((ENV *, DBT *, const DBT *)); +int __db_dbt_clone_free __P((ENV *, DBT *)); int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t)); int __env_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, int)); int __env_lsn_reset_pp __P((DB_ENV *, 
const char *, u_int32_t)); @@ -329,6 +343,7 @@ int __part_fileid_reset __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, i int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); int __part_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t)); int __part_rename __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, const char *)); +int __partc_dup __P((DBC *, DBC *)); int __part_verify __P((DB *, VRFY_DBINFO *, const char *, void *, int (*)(void *, const void *), u_int32_t)); int __part_testdocopy __P((DB *, const char *)); int __db_no_partition __P((ENV *)); diff --git a/src/dbinc_auto/dbreg_auto.h b/src/dbinc_auto/dbreg_auto.h index 63ad0cd3..22f1e84c 100644 --- a/src/dbinc_auto/dbreg_auto.h +++ b/src/dbinc_auto/dbreg_auto.h @@ -3,6 +3,28 @@ #ifndef __dbreg_AUTO_H #define __dbreg_AUTO_H #include "dbinc/log.h" +#define DB___dbreg_register_42 2 +typedef struct ___dbreg_register_42_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + u_int32_t opcode; + DBT name; + DBT uid; + int32_t fileid; + DBTYPE ftype; + db_pgno_t meta_pgno; + u_int32_t id; +} __dbreg_register_42_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_42_desc[]; +static inline int __dbreg_register_42_read(ENV *env, + void *data, __dbreg_register_42_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __dbreg_register_42_desc, sizeof(__dbreg_register_42_args), (void**)arg)); +} #define DB___dbreg_register 2 typedef struct ___dbreg_register_args { u_int32_t type; @@ -15,22 +37,25 @@ typedef struct ___dbreg_register_args { DBTYPE ftype; db_pgno_t meta_pgno; u_int32_t id; + u_int32_t blob_fid_lo; + u_int32_t blob_fid_hi; } __dbreg_register_args; extern __DB_IMPORT DB_LOG_RECSPEC __dbreg_register_desc[]; static inline int __dbreg_register_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, u_int32_t opcode, const DBT *name, const DBT *uid, int32_t fileid, DBTYPE ftype, - db_pgno_t 
meta_pgno, u_int32_t id) + db_pgno_t meta_pgno, u_int32_t id, u_int32_t blob_fid_lo, u_int32_t blob_fid_hi) { return (__log_put_record(env, NULL, txnp, ret_lsnp, flags, DB___dbreg_register, 0, sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) + sizeof(u_int32_t) + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(uid) + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) + - sizeof(u_int32_t), + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t), __dbreg_register_desc, - opcode, name, uid, fileid, ftype, meta_pgno, id)); + opcode, name, uid, fileid, ftype, meta_pgno, id, blob_fid_lo, + blob_fid_hi)); } static inline int __dbreg_register_read(ENV *env, diff --git a/src/dbinc_auto/dbreg_ext.h b/src/dbinc_auto/dbreg_ext.h index 0f495c33..421c7989 100644 --- a/src/dbinc_auto/dbreg_ext.h +++ b/src/dbinc_auto/dbreg_ext.h @@ -20,9 +20,11 @@ int __dbreg_failchk __P((ENV *)); int __dbreg_log_close __P((ENV *, FNAME *, DB_TXN *, u_int32_t)); int __dbreg_log_id __P((DB *, DB_TXN *, int32_t, int)); int __dbreg_init_recover __P((ENV *, DB_DISTAB *)); +int __dbreg_register_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __dbreg_init_print __P((ENV *, DB_DISTAB *)); int __dbreg_register_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __dbreg_register_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __dbreg_stat_print __P((ENV *, u_int32_t)); void __dbreg_print_fname __P((ENV *, FNAME *)); int __dbreg_add_dbentry __P((ENV *, DB_LOG *, DB *, int32_t)); @@ -36,8 +38,9 @@ int __dbreg_invalidate_files __P((ENV *, int)); int __dbreg_id_to_db __P((ENV *, DB_TXN *, DB **, int32_t, int)); int __dbreg_id_to_fname __P((DB_LOG *, int32_t, int, FNAME **)); int __dbreg_fid_to_fname __P((DB_LOG *, u_int8_t *, int, FNAME **)); +int __dbreg_blob_file_to_fname __P((DB_LOG *, db_seq_t, int, FNAME **)); int __dbreg_get_name __P((ENV *, u_int8_t *, char **, char **)); -int 
__dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t)); +int __dbreg_do_open __P((ENV *, DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, void *, u_int32_t, u_int32_t, db_seq_t)); int __dbreg_lazy_id __P((DB *)); #if defined(__cplusplus) diff --git a/src/dbinc_auto/env_ext.h b/src/dbinc_auto/env_ext.h index 55dbcba4..7df61ea9 100644 --- a/src/dbinc_auto/env_ext.h +++ b/src/dbinc_auto/env_ext.h @@ -36,9 +36,13 @@ void __db_env_destroy __P((DB_ENV *)); int __env_get_alloc __P((DB_ENV *, void *(**)(size_t), void *(**)(void *, size_t), void (**)(void *))); int __env_set_alloc __P((DB_ENV *, void *(*)(size_t), void *(*)(void *, size_t), void (*)(void *))); int __env_get_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t *)); +int __env_get_blob_threshold_pp __P ((DB_ENV *, u_int32_t *)); +int __env_get_blob_threshold_int __P ((ENV *, u_int32_t *)); +int __env_set_blob_threshold __P((DB_ENV *, u_int32_t, u_int32_t)); int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t)); int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t)); +int __env_set_blob_dir __P((DB_ENV *, const char *)); int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *)); int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t)); void __env_map_flags __P((const FLAG_MAP *, u_int, u_int32_t *, u_int32_t *)); @@ -91,6 +95,7 @@ void __env_panic_set __P((ENV *, int)); int __env_ref_increment __P((ENV *)); int __env_ref_decrement __P((ENV *)); int __env_ref_get __P((DB_ENV *, u_int32_t *)); +int __env_region_cleanup __P((ENV *)); int __env_detach __P((ENV *, int)); int __env_remove_env __P((ENV *)); int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t)); @@ -102,6 +107,7 @@ int __envreg_xunlock __P((ENV *)); int __envreg_isalive __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t)); u_int32_t __env_struct_sig __P((void)); 
int __env_stat_print_pp __P((DB_ENV *, u_int32_t)); +int __env_print_thread __P((ENV *)); void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t)); void __db_print_fileid __P((ENV *, u_int8_t *, const char *)); void __db_dl __P((ENV *, const char *, u_long)); @@ -119,6 +125,18 @@ int __repmgr_get_ack_policy __P((DB_ENV *, int *)); int __repmgr_set_ack_policy __P((DB_ENV *, int)); #endif #ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *)); +#endif +#ifndef HAVE_REPLICATION_THREADS int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t)); #endif #ifndef HAVE_REPLICATION_THREADS @@ -128,10 +146,10 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **)); int __repmgr_local_site __P((DB_ENV *, DB_SITE **)); #endif #ifndef HAVE_REPLICATION_THREADS -int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); +int __repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); #endif #ifndef HAVE_REPLICATION_THREADS -int __repmgr_start __P((DB_ENV *, int, u_int32_t)); +int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t)); #endif #ifndef HAVE_REPLICATION_THREADS int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t)); diff --git a/src/dbinc_auto/fileops_auto.h b/src/dbinc_auto/fileops_auto.h index 59385c88..3894c23d 100644 --- a/src/dbinc_auto/fileops_auto.h +++ b/src/dbinc_auto/fileops_auto.h @@ -21,6 +21,25 @@ static inline int __fop_create_42_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_create_42_desc, sizeof(__fop_create_42_args), (void**)arg)); } +#define DB___fop_create_60 143 
+typedef struct ___fop_create_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT name; + DBT dirname; + u_int32_t appname; + u_int32_t mode; +} __fop_create_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_create_60_desc[]; +static inline int __fop_create_60_read(ENV *env, + void *data, __fop_create_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_create_60_desc, sizeof(__fop_create_60_args), (void**)arg)); +} #define DB___fop_create 143 typedef struct ___fop_create_args { u_int32_t type; @@ -53,6 +72,24 @@ static inline int __fop_create_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_create_desc, sizeof(__fop_create_args), (void**)arg)); } +#define DB___fop_remove_60 144 +typedef struct ___fop_remove_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT name; + DBT fid; + u_int32_t appname; +} __fop_remove_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_remove_60_desc[]; +static inline int __fop_remove_60_read(ENV *env, + void *data, __fop_remove_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_remove_60_desc, sizeof(__fop_remove_60_args), (void**)arg)); +} #define DB___fop_remove 144 typedef struct ___fop_remove_args { u_int32_t type; @@ -105,6 +142,29 @@ static inline int __fop_write_42_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_write_42_desc, sizeof(__fop_write_42_args), (void**)arg)); } +#define DB___fop_write_60 145 +typedef struct ___fop_write_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT name; + DBT dirname; + u_int32_t appname; + u_int32_t pgsize; + db_pgno_t pageno; + u_int32_t offset; + DBT page; + u_int32_t flag; +} __fop_write_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_60_desc[]; +static inline int __fop_write_60_read(ENV *env, + void *data, __fop_write_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, 
__fop_write_60_desc, sizeof(__fop_write_60_args), (void**)arg)); +} #define DB___fop_write 145 typedef struct ___fop_write_args { u_int32_t type; @@ -143,6 +203,66 @@ static inline int __fop_write_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_write_desc, sizeof(__fop_write_args), (void**)arg)); } +#define DB___fop_write_file_60 86 +typedef struct ___fop_write_file_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT name; + DBT dirname; + u_int32_t appname; + u_int32_t offset_lo; + u_int32_t offset_hi; + DBT old_data; + DBT new_data; + u_int32_t flag; +} __fop_write_file_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_file_60_desc[]; +static inline int __fop_write_file_60_read(ENV *env, + void *data, __fop_write_file_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_write_file_60_desc, sizeof(__fop_write_file_60_args), (void**)arg)); +} +#define DB___fop_write_file 86 +typedef struct ___fop_write_file_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT name; + DBT dirname; + u_int32_t appname; + u_int64_t offset; + DBT old_data; + DBT new_data; + u_int32_t flag; +} __fop_write_file_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_write_file_desc[]; +static inline int +__fop_write_file_log(ENV *env, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, + const DBT *name, const DBT *dirname, u_int32_t appname, u_int64_t offset, const DBT *old_data, + const DBT *new_data, u_int32_t flag) +{ + return (__log_put_record(env, NULL, txnp, ret_lsnp, + flags, DB___fop_write_file, 0, + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) + + LOG_DBT_SIZE(name) + LOG_DBT_SIZE(dirname) + sizeof(u_int32_t) + + sizeof(u_int64_t) + LOG_DBT_SIZE(old_data) + LOG_DBT_SIZE(new_data) + + sizeof(u_int32_t), + __fop_write_file_desc, + name, dirname, appname, offset, old_data, new_data, flag)); +} + +static inline int __fop_write_file_read(ENV *env, + void *data, __fop_write_file_args 
**arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_write_file_desc, sizeof(__fop_write_file_args), (void**)arg)); +} #define DB___fop_rename_42 146 #define DB___fop_rename_noundo_46 150 typedef struct ___fop_rename_42_args { @@ -171,6 +291,35 @@ static inline int __fop_rename_noundo_46_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_rename_noundo_46_desc, sizeof(__fop_rename_42_args), (void**)arg)); } +#define DB___fop_rename_60 146 +#define DB___fop_rename_noundo_60 150 +typedef struct ___fop_rename_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT oldname; + DBT newname; + DBT dirname; + DBT fileid; + u_int32_t appname; +} __fop_rename_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_60_desc[]; +static inline int __fop_rename_60_read(ENV *env, + void *data, __fop_rename_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_rename_60_desc, sizeof(__fop_rename_60_args), (void**)arg)); +} +extern __DB_IMPORT DB_LOG_RECSPEC __fop_rename_noundo_60_desc[]; +static inline int __fop_rename_noundo_60_read(ENV *env, + void *data, __fop_rename_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_rename_noundo_60_desc, sizeof(__fop_rename_60_args), (void**)arg)); +} #define DB___fop_rename 146 #define DB___fop_rename_noundo 150 typedef struct ___fop_rename_args { @@ -226,6 +375,26 @@ static inline int __fop_rename_noundo_read(ENV *env, return (__log_read_record(env, NULL, NULL, data, __fop_rename_noundo_desc, sizeof(__fop_rename_args), (void**)arg)); } +#define DB___fop_file_remove_60 141 +typedef struct ___fop_file_remove_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + DBT real_fid; + DBT tmp_fid; + DBT name; + u_int32_t appname; + u_int32_t child; +} __fop_file_remove_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __fop_file_remove_60_desc[]; +static inline int __fop_file_remove_60_read(ENV *env, + 
void *data, __fop_file_remove_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + NULL, NULL, data, __fop_file_remove_60_desc, sizeof(__fop_file_remove_60_args), (void**)arg)); +} #define DB___fop_file_remove 141 typedef struct ___fop_file_remove_args { u_int32_t type; diff --git a/src/dbinc_auto/fileops_ext.h b/src/dbinc_auto/fileops_ext.h index 0aa6c1e1..89306183 100644 --- a/src/dbinc_auto/fileops_ext.h +++ b/src/dbinc_auto/fileops_ext.h @@ -8,35 +8,51 @@ extern "C" { int __fop_init_recover __P((ENV *, DB_DISTAB *)); int __fop_create_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_create_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_create_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_remove_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_rename_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_file_remove_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_init_print __P((ENV *, DB_DISTAB *)); int __fop_create __P((ENV *, DB_TXN *, DB_FH **, const char *, const char **, APPNAME, int, u_int32_t)); int __fop_remove __P((ENV *, DB_TXN *, u_int8_t *, const char *, const char **, APPNAME, u_int32_t)); int __fop_write __P((ENV *, DB_TXN *, const char *, const 
char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t, u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t)); +int __fop_write_file __P((ENV *, DB_TXN *, const char *, const char *, APPNAME, DB_FH *, off_t, void *, size_t, u_int32_t)); int __fop_rename __P((ENV *, DB_TXN *, const char *, const char *, const char **, u_int8_t *, APPNAME, int, u_int32_t)); int __fop_create_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_create_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_create_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_remove_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_noundo_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_rename_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_rename_noundo_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_42_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_noundo_46_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_file_remove_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_file_remove_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_lock_handle __P((ENV *, DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t)); int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, DB_TXN *, const char *, int, u_int32_t, u_int32_t *)); int __fop_subdb_setup 
__P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, int, u_int32_t)); int __fop_remove_setup __P((DB *, DB_TXN *, const char *, u_int32_t)); int __fop_read_meta __P((ENV *, const char *, u_int8_t *, size_t, DB_FH *, int, size_t *)); -int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *)); -int __fop_dbrename __P((DB *, const char *, const char *)); +int __fop_dummy __P((DB *, DB_TXN *, const char *, const char *, APPNAME)); +int __fop_dbrename __P((DB *, const char *, const char *, APPNAME)); #if defined(__cplusplus) } diff --git a/src/dbinc_auto/hash_ext.h b/src/dbinc_auto/hash_ext.h index e83fe817..4d7c2e9c 100644 --- a/src/dbinc_auto/hash_ext.h +++ b/src/dbinc_auto/hash_ext.h @@ -57,7 +57,7 @@ int __ham_return_meta __P((DBC *, u_int32_t, DBMETA **)); int __ham_db_create __P((DB *)); int __ham_db_close __P((DB *)); int __ham_get_h_ffactor __P((DB *, u_int32_t *)); -int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *))); +int __ham_set_h_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *, size_t *))); int __ham_get_h_nelem __P((DB *, u_int32_t *)); void __ham_copy_config __P((DB *, DB*, u_int32_t)); int __ham_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char * name, db_pgno_t, u_int32_t)); @@ -116,6 +116,8 @@ int __ham_31_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __ham_31_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __ham_46_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __ham_46_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); +int __ham_60_hashmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); +int __ham_60_hash __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *, db_pgno_t, u_int32_t)); int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t)); diff --git 
a/src/dbinc_auto/heap_auto.h b/src/dbinc_auto/heap_auto.h index bf288627..f91cacfe 100644 --- a/src/dbinc_auto/heap_auto.h +++ b/src/dbinc_auto/heap_auto.h @@ -26,7 +26,7 @@ __heap_addrem_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags, const DBT *hdr, const DBT *dbt, DB_LSN * pagelsn) { return (__log_put_record((dbp)->env, dbp, txnp, ret_lsnp, - flags, DB___heap_addrem, 0, + flags, DB___heap_addrem, 1, sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN) + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_int32_t) + LOG_DBT_SIZE(hdr) + @@ -42,6 +42,52 @@ static inline int __heap_addrem_read(ENV *env, return (__log_read_record(env, dbpp, td, data, __heap_addrem_desc, sizeof(__heap_addrem_args), (void**)arg)); } +#define DB___heap_addrem_60 151 +typedef struct ___heap_addrem_60_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + u_int32_t opcode; + int32_t fileid; + db_pgno_t pgno; + u_int32_t indx; + u_int32_t nbytes; + DBT hdr; + DBT dbt; + DB_LSN pagelsn; +} __heap_addrem_60_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_60_desc[]; +static inline int __heap_addrem_60_read(ENV *env, + DB **dbpp, void *td, void *data, __heap_addrem_60_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + dbpp, td, data, __heap_addrem_60_desc, sizeof(__heap_addrem_60_args), (void**)arg)); +} +#define DB___heap_addrem_50 151 +typedef struct ___heap_addrem_50_args { + u_int32_t type; + DB_TXN *txnp; + DB_LSN prev_lsn; + u_int32_t opcode; + int32_t fileid; + db_pgno_t pgno; + u_int32_t indx; + u_int32_t nbytes; + DBT hdr; + DBT dbt; + DB_LSN pagelsn; +} __heap_addrem_50_args; + +extern __DB_IMPORT DB_LOG_RECSPEC __heap_addrem_50_desc[]; +static inline int __heap_addrem_50_read(ENV *env, + DB **dbpp, void *td, void *data, __heap_addrem_50_args **arg) +{ + *arg = NULL; + return (__log_read_record(env, + dbpp, td, data, __heap_addrem_50_desc, sizeof(__heap_addrem_50_args), (void**)arg)); +} #define 
DB___heap_pg_alloc 152 typedef struct ___heap_pg_alloc_args { u_int32_t type; diff --git a/src/dbinc_auto/heap_ext.h b/src/dbinc_auto/heap_ext.h index 8bc24b61..e886d6c9 100644 --- a/src/dbinc_auto/heap_ext.h +++ b/src/dbinc_auto/heap_ext.h @@ -15,6 +15,8 @@ int __heapc_gsplit __P((DBC *, DBT *, void **, u_int32_t *)); int __heapc_refresh __P((DBC *)); int __heap_init_recover __P((ENV *, DB_DISTAB *)); int __heap_addrem_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __heap_addrem_60_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __heap_addrem_50_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_meta_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_page_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); @@ -39,6 +41,8 @@ int __heap_addrem_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_pg_alloc_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_meta_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_page_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __heap_addrem_60_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __heap_addrem_50_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_truncate __P((DBC *, u_int32_t *)); int __heap_stat __P((DBC *, void *, u_int32_t)); int __heap_stat_print __P((DBC *, u_int32_t)); @@ -46,6 +50,8 @@ void __heap_print_cursor __P((DBC *)); int __heap_stat_callback __P((DBC *, PAGE *, void *, int *)); int __heap_traverse __P((DBC *, int (*)(DBC *, PAGE *, void *, int *), void *)); int __db_no_heap_am __P((ENV *)); +int __heap_60_heapmeta __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); +int __heap_60_heap __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); int __heap_vrfy_meta __P((DB *, VRFY_DBINFO *, HEAPMETA *, db_pgno_t, u_int32_t)); int __heap_vrfy __P((DB 
*, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); int __heap_vrfy_structure __P((DB *, VRFY_DBINFO *, u_int32_t)); diff --git a/src/dbinc_auto/int_def.in b/src/dbinc_auto/int_def.in index dce2831c..5042dfd0 100644 --- a/src/dbinc_auto/int_def.in +++ b/src/dbinc_auto/int_def.in @@ -85,13 +85,18 @@ #define __db_init_print __db_init_print@DB_VERSION_UNIQUE_NAME@ #define __db_dbbackup_pp __db_dbbackup_pp@DB_VERSION_UNIQUE_NAME@ #define __db_dbbackup __db_dbbackup@DB_VERSION_UNIQUE_NAME@ -#define __db_backup __db_backup@DB_VERSION_UNIQUE_NAME@ +#define backup_data_copy backup_data_copy@DB_VERSION_UNIQUE_NAME@ +#define __db_backup_pp __db_backup_pp@DB_VERSION_UNIQUE_NAME@ #define __dbc_close __dbc_close@DB_VERSION_UNIQUE_NAME@ #define __dbc_destroy __dbc_destroy@DB_VERSION_UNIQUE_NAME@ #define __dbc_cmp __dbc_cmp@DB_VERSION_UNIQUE_NAME@ #define __dbc_count __dbc_count@DB_VERSION_UNIQUE_NAME@ #define __dbc_del __dbc_del@DB_VERSION_UNIQUE_NAME@ #define __dbc_idel __dbc_idel@DB_VERSION_UNIQUE_NAME@ +#define __dbc_db_stream __dbc_db_stream@DB_VERSION_UNIQUE_NAME@ +#define __dbc_get_blob_id __dbc_get_blob_id@DB_VERSION_UNIQUE_NAME@ +#define __dbc_get_blob_size __dbc_get_blob_size@DB_VERSION_UNIQUE_NAME@ +#define __dbc_set_blob_size __dbc_set_blob_size@DB_VERSION_UNIQUE_NAME@ #ifdef HAVE_COMPRESSION #define __dbc_bulk_del __dbc_bulk_del@DB_VERSION_UNIQUE_NAME@ #endif @@ -115,6 +120,7 @@ #ifdef DIAGNOSTIC #define __db_check_skeyset __db_check_skeyset@DB_VERSION_UNIQUE_NAME@ #endif +#define __dbc_diags __dbc_diags@DB_VERSION_UNIQUE_NAME@ #define __cdsgroup_begin __cdsgroup_begin@DB_VERSION_UNIQUE_NAME@ #define __cdsgroup_begin_pp __cdsgroup_begin_pp@DB_VERSION_UNIQUE_NAME@ #define __db_compact_int __db_compact_int@DB_VERSION_UNIQUE_NAME@ @@ -207,6 +213,10 @@ #define __db_lput __db_lput@DB_VERSION_UNIQUE_NAME@ #define __db_create_internal __db_create_internal@DB_VERSION_UNIQUE_NAME@ #define __dbh_am_chk __dbh_am_chk@DB_VERSION_UNIQUE_NAME@ +#define __db_get_blob_threshold 
__db_get_blob_threshold@DB_VERSION_UNIQUE_NAME@ +#define __db_set_blob_threshold __db_set_blob_threshold@DB_VERSION_UNIQUE_NAME@ +#define __db_blobs_enabled __db_blobs_enabled@DB_VERSION_UNIQUE_NAME@ +#define __db_set_dup_compare __db_set_dup_compare@DB_VERSION_UNIQUE_NAME@ #define __db_get_flags __db_get_flags@DB_VERSION_UNIQUE_NAME@ #define __db_set_flags __db_set_flags@DB_VERSION_UNIQUE_NAME@ #define __db_get_lorder __db_get_lorder@DB_VERSION_UNIQUE_NAME@ @@ -219,6 +229,7 @@ #define __db_chk_meta __db_chk_meta@DB_VERSION_UNIQUE_NAME@ #define __db_meta_setup __db_meta_setup@DB_VERSION_UNIQUE_NAME@ #define __db_reopen __db_reopen@DB_VERSION_UNIQUE_NAME@ +#define __db_alloc_dbt __db_alloc_dbt@DB_VERSION_UNIQUE_NAME@ #define __db_goff __db_goff@DB_VERSION_UNIQUE_NAME@ #define __db_poff __db_poff@DB_VERSION_UNIQUE_NAME@ #define __db_ovref __db_ovref@DB_VERSION_UNIQUE_NAME@ @@ -248,6 +259,7 @@ #define __db_prfooter __db_prfooter@DB_VERSION_UNIQUE_NAME@ #define __db_pr_callback __db_pr_callback@DB_VERSION_UNIQUE_NAME@ #define __db_dbtype_to_string __db_dbtype_to_string@DB_VERSION_UNIQUE_NAME@ +#define __db_tohex __db_tohex@DB_VERSION_UNIQUE_NAME@ #define __db_addrem_recover __db_addrem_recover@DB_VERSION_UNIQUE_NAME@ #define __db_addrem_42_recover __db_addrem_42_recover@DB_VERSION_UNIQUE_NAME@ #define __db_big_recover __db_big_recover@DB_VERSION_UNIQUE_NAME@ @@ -285,6 +297,8 @@ #define __db_rename_int __db_rename_int@DB_VERSION_UNIQUE_NAME@ #define __db_ret __db_ret@DB_VERSION_UNIQUE_NAME@ #define __db_retcopy __db_retcopy@DB_VERSION_UNIQUE_NAME@ +#define __db_dbt_clone __db_dbt_clone@DB_VERSION_UNIQUE_NAME@ +#define __db_dbt_clone_free __db_dbt_clone_free@DB_VERSION_UNIQUE_NAME@ #define __env_fileid_reset_pp __env_fileid_reset_pp@DB_VERSION_UNIQUE_NAME@ #define __env_fileid_reset __env_fileid_reset@DB_VERSION_UNIQUE_NAME@ #define __env_lsn_reset_pp __env_lsn_reset_pp@DB_VERSION_UNIQUE_NAME@ @@ -351,6 +365,7 @@ #define __part_key_range 
__part_key_range@DB_VERSION_UNIQUE_NAME@ #define __part_remove __part_remove@DB_VERSION_UNIQUE_NAME@ #define __part_rename __part_rename@DB_VERSION_UNIQUE_NAME@ +#define __partc_dup __partc_dup@DB_VERSION_UNIQUE_NAME@ #define __part_verify __part_verify@DB_VERSION_UNIQUE_NAME@ #define __part_testdocopy __part_testdocopy@DB_VERSION_UNIQUE_NAME@ #define __db_no_partition __db_no_partition@DB_VERSION_UNIQUE_NAME@ @@ -361,6 +376,34 @@ #define __partition_init __partition_init@DB_VERSION_UNIQUE_NAME@ #define __part_fileid_reset __part_fileid_reset@DB_VERSION_UNIQUE_NAME@ #define __partition_set_dirs __partition_set_dirs@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_create __blob_file_create@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_close __blob_file_close@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_delete __blob_file_delete@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_open __blob_file_open@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_read __blob_file_read@DB_VERSION_UNIQUE_NAME@ +#define __blob_file_write __blob_file_write@DB_VERSION_UNIQUE_NAME@ +#define __blob_bulk __blob_bulk@DB_VERSION_UNIQUE_NAME@ +#define __blob_get __blob_get@DB_VERSION_UNIQUE_NAME@ +#define __blob_put __blob_put@DB_VERSION_UNIQUE_NAME@ +#define __blob_repl __blob_repl@DB_VERSION_UNIQUE_NAME@ +#define __blob_del __blob_del@DB_VERSION_UNIQUE_NAME@ +#define __db_stream_init __db_stream_init@DB_VERSION_UNIQUE_NAME@ +#define __db_stream_close_int __db_stream_close_int@DB_VERSION_UNIQUE_NAME@ +#define __blob_make_sub_dir __blob_make_sub_dir@DB_VERSION_UNIQUE_NAME@ +#define __blob_make_meta_fname __blob_make_meta_fname@DB_VERSION_UNIQUE_NAME@ +#define __blob_get_dir __blob_get_dir@DB_VERSION_UNIQUE_NAME@ +#define __blob_generate_dir_ids __blob_generate_dir_ids@DB_VERSION_UNIQUE_NAME@ +#define __blob_generate_id __blob_generate_id@DB_VERSION_UNIQUE_NAME@ +#define __blob_highest_id __blob_highest_id@DB_VERSION_UNIQUE_NAME@ +#define __blob_calculate_dirs __blob_calculate_dirs@DB_VERSION_UNIQUE_NAME@ 
+#define __blob_id_to_path __blob_id_to_path@DB_VERSION_UNIQUE_NAME@ +#define __blob_str_to_id __blob_str_to_id@DB_VERSION_UNIQUE_NAME@ +#define __blob_path_to_dir_ids __blob_path_to_dir_ids@DB_VERSION_UNIQUE_NAME@ +#define __blob_salvage __blob_salvage@DB_VERSION_UNIQUE_NAME@ +#define __blob_vrfy __blob_vrfy@DB_VERSION_UNIQUE_NAME@ +#define __blob_del_hierarchy __blob_del_hierarchy@DB_VERSION_UNIQUE_NAME@ +#define __blob_del_all __blob_del_all@DB_VERSION_UNIQUE_NAME@ +#define __blob_copy_all __blob_copy_all@DB_VERSION_UNIQUE_NAME@ #define __bam_compact_int __bam_compact_int@DB_VERSION_UNIQUE_NAME@ #define __bam_compact_opd __bam_compact_opd@DB_VERSION_UNIQUE_NAME@ #define __bam_truncate_ipages __bam_truncate_ipages@DB_VERSION_UNIQUE_NAME@ @@ -470,6 +513,8 @@ #define __bam_30_btreemeta __bam_30_btreemeta@DB_VERSION_UNIQUE_NAME@ #define __bam_31_btreemeta __bam_31_btreemeta@DB_VERSION_UNIQUE_NAME@ #define __bam_31_lbtree __bam_31_lbtree@DB_VERSION_UNIQUE_NAME@ +#define __bam_60_btreemeta __bam_60_btreemeta@DB_VERSION_UNIQUE_NAME@ +#define __bam_60_lbtree __bam_60_lbtree@DB_VERSION_UNIQUE_NAME@ #define __bam_vrfy_meta __bam_vrfy_meta@DB_VERSION_UNIQUE_NAME@ #define __ram_vrfy_leaf __ram_vrfy_leaf@DB_VERSION_UNIQUE_NAME@ #define __bam_vrfy __bam_vrfy@DB_VERSION_UNIQUE_NAME@ @@ -628,6 +673,7 @@ #ifdef DIAGNOSTIC #define __db_assert __db_assert@DB_VERSION_UNIQUE_NAME@ #endif +#define __env_panic_event __env_panic_event@DB_VERSION_UNIQUE_NAME@ #define __env_panic_msg __env_panic_msg@DB_VERSION_UNIQUE_NAME@ #define __env_panic __env_panic@DB_VERSION_UNIQUE_NAME@ #define __db_unknown_error __db_unknown_error@DB_VERSION_UNIQUE_NAME@ @@ -639,6 +685,7 @@ #define __db_msgadd __db_msgadd@DB_VERSION_UNIQUE_NAME@ #define __db_msgadd_ap __db_msgadd_ap@DB_VERSION_UNIQUE_NAME@ #define __db_msg __db_msg@DB_VERSION_UNIQUE_NAME@ +#define __db_debug_msg __db_debug_msg@DB_VERSION_UNIQUE_NAME@ #define __db_repmsg __db_repmsg@DB_VERSION_UNIQUE_NAME@ #define __db_unknown_flag 
__db_unknown_flag@DB_VERSION_UNIQUE_NAME@ #define __db_unknown_type __db_unknown_type@DB_VERSION_UNIQUE_NAME@ @@ -653,6 +700,24 @@ #define __db_rdonly __db_rdonly@DB_VERSION_UNIQUE_NAME@ #define __db_space_err __db_space_err@DB_VERSION_UNIQUE_NAME@ #define __db_failed __db_failed@DB_VERSION_UNIQUE_NAME@ +#define __env_failure_remember __env_failure_remember@DB_VERSION_UNIQUE_NAME@ +#ifdef HAVE_ERROR_HISTORY +#define __db_thread_init __db_thread_init@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef HAVE_ERROR_HISTORY +#define __db_diags __db_diags@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef HAVE_ERROR_HISTORY +#define __db_deferred_get __db_deferred_get@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef HAVE_ERROR_HISTORY +#define __db_deferred_discard __db_deferred_discard@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef HAVE_ERROR_HISTORY +#define __db_remember_context __db_remember_context@DB_VERSION_UNIQUE_NAME@ +#endif +#define __db_ctimespec __db_ctimespec@DB_VERSION_UNIQUE_NAME@ +#define __db_fmt_quote __db_fmt_quote@DB_VERSION_UNIQUE_NAME@ #define __db_getlong __db_getlong@DB_VERSION_UNIQUE_NAME@ #define __db_getulong __db_getulong@DB_VERSION_UNIQUE_NAME@ #define __db_idspace __db_idspace@DB_VERSION_UNIQUE_NAME@ @@ -709,11 +774,14 @@ #define __dbreg_failchk __dbreg_failchk@DB_VERSION_UNIQUE_NAME@ #define __dbreg_log_close __dbreg_log_close@DB_VERSION_UNIQUE_NAME@ #define __dbreg_log_id __dbreg_log_id@DB_VERSION_UNIQUE_NAME@ +#define __dbreg_register_42_desc __dbreg_register_42_desc@DB_VERSION_UNIQUE_NAME@ #define __dbreg_register_desc __dbreg_register_desc@DB_VERSION_UNIQUE_NAME@ #define __dbreg_init_recover __dbreg_init_recover@DB_VERSION_UNIQUE_NAME@ +#define __dbreg_register_42_print __dbreg_register_42_print@DB_VERSION_UNIQUE_NAME@ #define __dbreg_register_print __dbreg_register_print@DB_VERSION_UNIQUE_NAME@ #define __dbreg_init_print __dbreg_init_print@DB_VERSION_UNIQUE_NAME@ #define __dbreg_register_recover __dbreg_register_recover@DB_VERSION_UNIQUE_NAME@ +#define 
__dbreg_register_42_recover __dbreg_register_42_recover@DB_VERSION_UNIQUE_NAME@ #define __dbreg_stat_print __dbreg_stat_print@DB_VERSION_UNIQUE_NAME@ #define __dbreg_print_fname __dbreg_print_fname@DB_VERSION_UNIQUE_NAME@ #define __dbreg_add_dbentry __dbreg_add_dbentry@DB_VERSION_UNIQUE_NAME@ @@ -727,6 +795,7 @@ #define __dbreg_id_to_db __dbreg_id_to_db@DB_VERSION_UNIQUE_NAME@ #define __dbreg_id_to_fname __dbreg_id_to_fname@DB_VERSION_UNIQUE_NAME@ #define __dbreg_fid_to_fname __dbreg_fid_to_fname@DB_VERSION_UNIQUE_NAME@ +#define __dbreg_blob_file_to_fname __dbreg_blob_file_to_fname@DB_VERSION_UNIQUE_NAME@ #define __dbreg_get_name __dbreg_get_name@DB_VERSION_UNIQUE_NAME@ #define __dbreg_do_open __dbreg_do_open@DB_VERSION_UNIQUE_NAME@ #define __dbreg_lazy_id __dbreg_lazy_id@DB_VERSION_UNIQUE_NAME@ @@ -760,9 +829,13 @@ #define __env_get_alloc __env_get_alloc@DB_VERSION_UNIQUE_NAME@ #define __env_set_alloc __env_set_alloc@DB_VERSION_UNIQUE_NAME@ #define __env_get_memory_init __env_get_memory_init@DB_VERSION_UNIQUE_NAME@ +#define __env_get_blob_threshold_pp __env_get_blob_threshold_pp@DB_VERSION_UNIQUE_NAME@ +#define __env_get_blob_threshold_int __env_get_blob_threshold_int@DB_VERSION_UNIQUE_NAME@ +#define __env_set_blob_threshold __env_set_blob_threshold@DB_VERSION_UNIQUE_NAME@ #define __env_set_memory_init __env_set_memory_init@DB_VERSION_UNIQUE_NAME@ #define __env_get_memory_max __env_get_memory_max@DB_VERSION_UNIQUE_NAME@ #define __env_set_memory_max __env_set_memory_max@DB_VERSION_UNIQUE_NAME@ +#define __env_set_blob_dir __env_set_blob_dir@DB_VERSION_UNIQUE_NAME@ #define __env_get_encrypt_flags __env_get_encrypt_flags@DB_VERSION_UNIQUE_NAME@ #define __env_set_encrypt __env_set_encrypt@DB_VERSION_UNIQUE_NAME@ #define __env_map_flags __env_map_flags@DB_VERSION_UNIQUE_NAME@ @@ -815,6 +888,7 @@ #define __env_ref_increment __env_ref_increment@DB_VERSION_UNIQUE_NAME@ #define __env_ref_decrement __env_ref_decrement@DB_VERSION_UNIQUE_NAME@ #define __env_ref_get 
__env_ref_get@DB_VERSION_UNIQUE_NAME@ +#define __env_region_cleanup __env_region_cleanup@DB_VERSION_UNIQUE_NAME@ #define __env_detach __env_detach@DB_VERSION_UNIQUE_NAME@ #define __env_remove_env __env_remove_env@DB_VERSION_UNIQUE_NAME@ #define __env_region_attach __env_region_attach@DB_VERSION_UNIQUE_NAME@ @@ -826,6 +900,7 @@ #define __envreg_isalive __envreg_isalive@DB_VERSION_UNIQUE_NAME@ #define __env_struct_sig __env_struct_sig@DB_VERSION_UNIQUE_NAME@ #define __env_stat_print_pp __env_stat_print_pp@DB_VERSION_UNIQUE_NAME@ +#define __env_print_thread __env_print_thread@DB_VERSION_UNIQUE_NAME@ #define __db_print_fh __db_print_fh@DB_VERSION_UNIQUE_NAME@ #define __db_print_fileid __db_print_fileid@DB_VERSION_UNIQUE_NAME@ #define __db_dl __db_dl@DB_VERSION_UNIQUE_NAME@ @@ -843,6 +918,18 @@ #define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS #define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS @@ -852,10 +939,10 @@ #define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS -#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS -#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@ +#define 
__repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS #define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@ @@ -876,39 +963,63 @@ #define __repmgr_init_recover __repmgr_init_recover@DB_VERSION_UNIQUE_NAME@ #endif #define __fop_create_42_desc __fop_create_42_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_create_60_desc __fop_create_60_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_create_desc __fop_create_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_remove_60_desc __fop_remove_60_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_remove_desc __fop_remove_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_write_42_desc __fop_write_42_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_60_desc __fop_write_60_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_write_desc __fop_write_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_60_desc __fop_write_file_60_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_desc __fop_write_file_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_42_desc __fop_rename_42_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_noundo_46_desc __fop_rename_noundo_46_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_60_desc __fop_rename_60_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_noundo_60_desc __fop_rename_noundo_60_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_desc __fop_rename_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_noundo_desc __fop_rename_noundo_desc@DB_VERSION_UNIQUE_NAME@ +#define __fop_file_remove_60_desc __fop_file_remove_60_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_file_remove_desc __fop_file_remove_desc@DB_VERSION_UNIQUE_NAME@ #define __fop_init_recover __fop_init_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_create_42_print __fop_create_42_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_create_60_print __fop_create_60_print@DB_VERSION_UNIQUE_NAME@ #define __fop_create_print __fop_create_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_remove_60_print 
__fop_remove_60_print@DB_VERSION_UNIQUE_NAME@ #define __fop_remove_print __fop_remove_print@DB_VERSION_UNIQUE_NAME@ #define __fop_write_42_print __fop_write_42_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_60_print __fop_write_60_print@DB_VERSION_UNIQUE_NAME@ #define __fop_write_print __fop_write_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_60_print __fop_write_file_60_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_print __fop_write_file_print@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_42_print __fop_rename_42_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_60_print __fop_rename_60_print@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_print __fop_rename_print@DB_VERSION_UNIQUE_NAME@ +#define __fop_file_remove_60_print __fop_file_remove_60_print@DB_VERSION_UNIQUE_NAME@ #define __fop_file_remove_print __fop_file_remove_print@DB_VERSION_UNIQUE_NAME@ #define __fop_init_print __fop_init_print@DB_VERSION_UNIQUE_NAME@ #define __fop_create __fop_create@DB_VERSION_UNIQUE_NAME@ #define __fop_remove __fop_remove@DB_VERSION_UNIQUE_NAME@ #define __fop_write __fop_write@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file __fop_write_file@DB_VERSION_UNIQUE_NAME@ #define __fop_rename __fop_rename@DB_VERSION_UNIQUE_NAME@ #define __fop_create_recover __fop_create_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_create_60_recover __fop_create_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_create_42_recover __fop_create_42_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_remove_recover __fop_remove_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_remove_60_recover __fop_remove_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_write_recover __fop_write_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_60_recover __fop_write_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_write_42_recover __fop_write_42_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_recover __fop_write_file_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_60_recover 
__fop_write_file_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_recover __fop_rename_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_noundo_recover __fop_rename_noundo_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_60_recover __fop_rename_60_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_noundo_60_recover __fop_rename_noundo_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_42_recover __fop_rename_42_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_noundo_46_recover __fop_rename_noundo_46_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_file_remove_recover __fop_file_remove_recover@DB_VERSION_UNIQUE_NAME@ +#define __fop_file_remove_60_recover __fop_file_remove_60_recover@DB_VERSION_UNIQUE_NAME@ #define __fop_lock_handle __fop_lock_handle@DB_VERSION_UNIQUE_NAME@ #define __fop_file_setup __fop_file_setup@DB_VERSION_UNIQUE_NAME@ #define __fop_subdb_setup __fop_subdb_setup@DB_VERSION_UNIQUE_NAME@ @@ -1041,6 +1152,8 @@ #define __ham_31_hash __ham_31_hash@DB_VERSION_UNIQUE_NAME@ #define __ham_46_hashmeta __ham_46_hashmeta@DB_VERSION_UNIQUE_NAME@ #define __ham_46_hash __ham_46_hash@DB_VERSION_UNIQUE_NAME@ +#define __ham_60_hashmeta __ham_60_hashmeta@DB_VERSION_UNIQUE_NAME@ +#define __ham_60_hash __ham_60_hash@DB_VERSION_UNIQUE_NAME@ #define __ham_vrfy_meta __ham_vrfy_meta@DB_VERSION_UNIQUE_NAME@ #define __ham_vrfy __ham_vrfy@DB_VERSION_UNIQUE_NAME@ #define __ham_vrfy_structure __ham_vrfy_structure@DB_VERSION_UNIQUE_NAME@ @@ -1055,11 +1168,15 @@ #define __heapc_gsplit __heapc_gsplit@DB_VERSION_UNIQUE_NAME@ #define __heapc_refresh __heapc_refresh@DB_VERSION_UNIQUE_NAME@ #define __heap_addrem_desc __heap_addrem_desc@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_60_desc __heap_addrem_60_desc@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_50_desc __heap_addrem_50_desc@DB_VERSION_UNIQUE_NAME@ #define __heap_pg_alloc_desc __heap_pg_alloc_desc@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_meta_desc __heap_trunc_meta_desc@DB_VERSION_UNIQUE_NAME@ 
#define __heap_trunc_page_desc __heap_trunc_page_desc@DB_VERSION_UNIQUE_NAME@ #define __heap_init_recover __heap_init_recover@DB_VERSION_UNIQUE_NAME@ #define __heap_addrem_print __heap_addrem_print@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_60_print __heap_addrem_60_print@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_50_print __heap_addrem_50_print@DB_VERSION_UNIQUE_NAME@ #define __heap_pg_alloc_print __heap_pg_alloc_print@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_meta_print __heap_trunc_meta_print@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_page_print __heap_trunc_page_print@DB_VERSION_UNIQUE_NAME@ @@ -1084,6 +1201,8 @@ #define __heap_pg_alloc_recover __heap_pg_alloc_recover@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_meta_recover __heap_trunc_meta_recover@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_page_recover __heap_trunc_page_recover@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_60_recover __heap_addrem_60_recover@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_50_recover __heap_addrem_50_recover@DB_VERSION_UNIQUE_NAME@ #define __heap_truncate __heap_truncate@DB_VERSION_UNIQUE_NAME@ #define __heap_stat __heap_stat@DB_VERSION_UNIQUE_NAME@ #define __heap_stat_print __heap_stat_print@DB_VERSION_UNIQUE_NAME@ @@ -1091,6 +1210,8 @@ #define __heap_stat_callback __heap_stat_callback@DB_VERSION_UNIQUE_NAME@ #define __heap_traverse __heap_traverse@DB_VERSION_UNIQUE_NAME@ #define __db_no_heap_am __db_no_heap_am@DB_VERSION_UNIQUE_NAME@ +#define __heap_60_heapmeta __heap_60_heapmeta@DB_VERSION_UNIQUE_NAME@ +#define __heap_60_heap __heap_60_heap@DB_VERSION_UNIQUE_NAME@ #define __heap_vrfy_meta __heap_vrfy_meta@DB_VERSION_UNIQUE_NAME@ #define __heap_vrfy __heap_vrfy@DB_VERSION_UNIQUE_NAME@ #define __heap_vrfy_structure __heap_vrfy_structure@DB_VERSION_UNIQUE_NAME@ @@ -1129,6 +1250,7 @@ #define __lock_addfamilylocker __lock_addfamilylocker@DB_VERSION_UNIQUE_NAME@ #define __lock_freelocker __lock_freelocker@DB_VERSION_UNIQUE_NAME@ #define __lock_familyremove 
__lock_familyremove@DB_VERSION_UNIQUE_NAME@ +#define __lock_local_locker_invalidate __lock_local_locker_invalidate@DB_VERSION_UNIQUE_NAME@ #define __lock_fix_list __lock_fix_list@DB_VERSION_UNIQUE_NAME@ #define __lock_get_list __lock_get_list@DB_VERSION_UNIQUE_NAME@ #define __lock_list_print __lock_list_print@DB_VERSION_UNIQUE_NAME@ @@ -1154,6 +1276,7 @@ #define __lock_set_env_timeout __lock_set_env_timeout@DB_VERSION_UNIQUE_NAME@ #define __lock_open __lock_open@DB_VERSION_UNIQUE_NAME@ #define __lock_env_refresh __lock_env_refresh@DB_VERSION_UNIQUE_NAME@ +#define __lock_region_detach __lock_region_detach@DB_VERSION_UNIQUE_NAME@ #define __lock_region_mutex_count __lock_region_mutex_count@DB_VERSION_UNIQUE_NAME@ #define __lock_region_mutex_max __lock_region_mutex_max@DB_VERSION_UNIQUE_NAME@ #define __lock_region_max __lock_region_max@DB_VERSION_UNIQUE_NAME@ @@ -1162,6 +1285,7 @@ #define __lock_stat_print_pp __lock_stat_print_pp@DB_VERSION_UNIQUE_NAME@ #define __lock_stat_print __lock_stat_print@DB_VERSION_UNIQUE_NAME@ #define __lock_printlock __lock_printlock@DB_VERSION_UNIQUE_NAME@ +#define __lock_dump_locker __lock_dump_locker@DB_VERSION_UNIQUE_NAME@ #define __lock_set_timeout __lock_set_timeout@DB_VERSION_UNIQUE_NAME@ #define __lock_set_timeout_internal __lock_set_timeout_internal@DB_VERSION_UNIQUE_NAME@ #define __lock_inherit_timeout __lock_inherit_timeout@DB_VERSION_UNIQUE_NAME@ @@ -1169,6 +1293,7 @@ #define __lock_lhash __lock_lhash@DB_VERSION_UNIQUE_NAME@ #define __lock_nomem __lock_nomem@DB_VERSION_UNIQUE_NAME@ #define __log_open __log_open@DB_VERSION_UNIQUE_NAME@ +#define __log_region_detach __log_region_detach@DB_VERSION_UNIQUE_NAME@ #define __log_find __log_find@DB_VERSION_UNIQUE_NAME@ #define __log_valid __log_valid@DB_VERSION_UNIQUE_NAME@ #define __log_env_refresh __log_env_refresh@DB_VERSION_UNIQUE_NAME@ @@ -1234,6 +1359,7 @@ #define __log_file_pp __log_file_pp@DB_VERSION_UNIQUE_NAME@ #define __log_name __log_name@DB_VERSION_UNIQUE_NAME@ #define 
__log_rep_put __log_rep_put@DB_VERSION_UNIQUE_NAME@ +#define __log_rep_write __log_rep_write@DB_VERSION_UNIQUE_NAME@ #define __log_put_record_pp __log_put_record_pp@DB_VERSION_UNIQUE_NAME@ #define __log_put_record __log_put_record@DB_VERSION_UNIQUE_NAME@ #define __log_stat_pp __log_stat_pp@DB_VERSION_UNIQUE_NAME@ @@ -1277,6 +1403,7 @@ #define __db_merge_verify __db_merge_verify@DB_VERSION_UNIQUE_NAME@ #define __db_pgno_verify __db_pgno_verify@DB_VERSION_UNIQUE_NAME@ #define __dbreg_register_verify __dbreg_register_verify@DB_VERSION_UNIQUE_NAME@ +#define __dbreg_register_42_verify __dbreg_register_42_verify@DB_VERSION_UNIQUE_NAME@ #define __bam_split_verify __bam_split_verify@DB_VERSION_UNIQUE_NAME@ #define __bam_split_42_verify __bam_split_42_verify@DB_VERSION_UNIQUE_NAME@ #define __bam_rsplit_verify __bam_rsplit_verify@DB_VERSION_UNIQUE_NAME@ @@ -1291,12 +1418,19 @@ #define __bam_relink_43_verify __bam_relink_43_verify@DB_VERSION_UNIQUE_NAME@ #define __bam_merge_44_verify __bam_merge_44_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_create_42_verify __fop_create_42_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_create_60_verify __fop_create_60_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_create_verify __fop_create_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_remove_60_verify __fop_remove_60_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_remove_verify __fop_remove_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_write_42_verify __fop_write_42_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_60_verify __fop_write_60_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_write_verify __fop_write_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_60_verify __fop_write_file_60_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_write_file_verify __fop_write_file_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_42_verify __fop_rename_42_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_rename_60_verify __fop_rename_60_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_rename_verify 
__fop_rename_verify@DB_VERSION_UNIQUE_NAME@ +#define __fop_file_remove_60_verify __fop_file_remove_60_verify@DB_VERSION_UNIQUE_NAME@ #define __fop_file_remove_verify __fop_file_remove_verify@DB_VERSION_UNIQUE_NAME@ #define __ham_insdel_verify __ham_insdel_verify@DB_VERSION_UNIQUE_NAME@ #define __ham_newpage_verify __ham_newpage_verify@DB_VERSION_UNIQUE_NAME@ @@ -1312,6 +1446,7 @@ #define __ham_curadj_verify __ham_curadj_verify@DB_VERSION_UNIQUE_NAME@ #define __ham_chgpg_verify __ham_chgpg_verify@DB_VERSION_UNIQUE_NAME@ #define __heap_addrem_verify __heap_addrem_verify@DB_VERSION_UNIQUE_NAME@ +#define __heap_addrem_60_verify __heap_addrem_60_verify@DB_VERSION_UNIQUE_NAME@ #define __heap_pg_alloc_verify __heap_pg_alloc_verify@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_meta_verify __heap_trunc_meta_verify@DB_VERSION_UNIQUE_NAME@ #define __heap_trunc_page_verify __heap_trunc_page_verify@DB_VERSION_UNIQUE_NAME@ @@ -1363,6 +1498,7 @@ #define __del_txn_pages __del_txn_pages@DB_VERSION_UNIQUE_NAME@ #define __is_ancestor_txn __is_ancestor_txn@DB_VERSION_UNIQUE_NAME@ #define __return_txn_pages __return_txn_pages@DB_VERSION_UNIQUE_NAME@ +#define __memp_bh_unreachable __memp_bh_unreachable@DB_VERSION_UNIQUE_NAME@ #define __memp_alloc __memp_alloc@DB_VERSION_UNIQUE_NAME@ #define __memp_free __memp_free@DB_VERSION_UNIQUE_NAME@ #define __memp_backup_open __memp_backup_open@DB_VERSION_UNIQUE_NAME@ @@ -1375,6 +1511,7 @@ #define __memp_bhfree __memp_bhfree@DB_VERSION_UNIQUE_NAME@ #define __memp_fget_pp __memp_fget_pp@DB_VERSION_UNIQUE_NAME@ #define __memp_fget __memp_fget@DB_VERSION_UNIQUE_NAME@ +#define __memp_find_obsolete_version __memp_find_obsolete_version@DB_VERSION_UNIQUE_NAME@ #define __memp_fcreate_pp __memp_fcreate_pp@DB_VERSION_UNIQUE_NAME@ #define __memp_fcreate __memp_fcreate@DB_VERSION_UNIQUE_NAME@ #define __memp_set_clear_len __memp_set_clear_len@DB_VERSION_UNIQUE_NAME@ @@ -1385,6 +1522,7 @@ #define __memp_get_ftype __memp_get_ftype@DB_VERSION_UNIQUE_NAME@ #define 
__memp_set_ftype __memp_set_ftype@DB_VERSION_UNIQUE_NAME@ #define __memp_set_lsn_offset __memp_set_lsn_offset@DB_VERSION_UNIQUE_NAME@ +#define __memp_set_maxpgno __memp_set_maxpgno@DB_VERSION_UNIQUE_NAME@ #define __memp_get_pgcookie __memp_get_pgcookie@DB_VERSION_UNIQUE_NAME@ #define __memp_set_pgcookie __memp_set_pgcookie@DB_VERSION_UNIQUE_NAME@ #define __memp_get_priority __memp_get_priority@DB_VERSION_UNIQUE_NAME@ @@ -1432,10 +1570,12 @@ #define __memp_bh_freeze __memp_bh_freeze@DB_VERSION_UNIQUE_NAME@ #define __memp_bh_thaw __memp_bh_thaw@DB_VERSION_UNIQUE_NAME@ #define __memp_open __memp_open@DB_VERSION_UNIQUE_NAME@ +#define __memp_region_detach __memp_region_detach@DB_VERSION_UNIQUE_NAME@ #define __memp_init __memp_init@DB_VERSION_UNIQUE_NAME@ #define __memp_max_regions __memp_max_regions@DB_VERSION_UNIQUE_NAME@ #define __memp_region_mutex_count __memp_region_mutex_count@DB_VERSION_UNIQUE_NAME@ #define __memp_env_refresh __memp_env_refresh@DB_VERSION_UNIQUE_NAME@ +#define __memp_region_bhfree __memp_region_bhfree@DB_VERSION_UNIQUE_NAME@ #define __memp_register_pp __memp_register_pp@DB_VERSION_UNIQUE_NAME@ #define __memp_register __memp_register@DB_VERSION_UNIQUE_NAME@ #define __memp_get_bucket __memp_get_bucket@DB_VERSION_UNIQUE_NAME@ @@ -1460,13 +1600,13 @@ #define __mutex_alloc_int __mutex_alloc_int@DB_VERSION_UNIQUE_NAME@ #define __mutex_free __mutex_free@DB_VERSION_UNIQUE_NAME@ #define __mutex_free_int __mutex_free_int@DB_VERSION_UNIQUE_NAME@ +#define __mutex_died __mutex_died@DB_VERSION_UNIQUE_NAME@ #define __mutex_refresh __mutex_refresh@DB_VERSION_UNIQUE_NAME@ -#define __mut_failchk __mut_failchk@DB_VERSION_UNIQUE_NAME@ -#define __db_fcntl_mutex_init __db_fcntl_mutex_init@DB_VERSION_UNIQUE_NAME@ -#define __db_fcntl_mutex_lock __db_fcntl_mutex_lock@DB_VERSION_UNIQUE_NAME@ -#define __db_fcntl_mutex_trylock __db_fcntl_mutex_trylock@DB_VERSION_UNIQUE_NAME@ -#define __db_fcntl_mutex_unlock __db_fcntl_mutex_unlock@DB_VERSION_UNIQUE_NAME@ -#define 
__db_fcntl_mutex_destroy __db_fcntl_mutex_destroy@DB_VERSION_UNIQUE_NAME@ +#define __mutex_record_lock __mutex_record_lock@DB_VERSION_UNIQUE_NAME@ +#define __mutex_record_unlock __mutex_record_unlock@DB_VERSION_UNIQUE_NAME@ +#define __mutex_record_print __mutex_record_print@DB_VERSION_UNIQUE_NAME@ +#define __mutex_failchk __mutex_failchk@DB_VERSION_UNIQUE_NAME@ +#define __mutex_failchk_thread __mutex_failchk_thread@DB_VERSION_UNIQUE_NAME@ #define __mutex_alloc_pp __mutex_alloc_pp@DB_VERSION_UNIQUE_NAME@ #define __mutex_free_pp __mutex_free_pp@DB_VERSION_UNIQUE_NAME@ #define __mutex_lock_pp __mutex_lock_pp@DB_VERSION_UNIQUE_NAME@ @@ -1481,6 +1621,9 @@ #define __mutex_set_max __mutex_set_max@DB_VERSION_UNIQUE_NAME@ #define __mutex_get_tas_spins __mutex_get_tas_spins@DB_VERSION_UNIQUE_NAME@ #define __mutex_set_tas_spins __mutex_set_tas_spins@DB_VERSION_UNIQUE_NAME@ +#ifdef HAVE_ERROR_HISTORY +#define __mutex_diags __mutex_diags@DB_VERSION_UNIQUE_NAME@ +#endif #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) #define __atomic_inc __atomic_inc@DB_VERSION_UNIQUE_NAME@ #endif @@ -1503,6 +1646,7 @@ #define __db_pthread_mutex_unlock __db_pthread_mutex_unlock@DB_VERSION_UNIQUE_NAME@ #define __db_pthread_mutex_destroy __db_pthread_mutex_destroy@DB_VERSION_UNIQUE_NAME@ #define __mutex_open __mutex_open@DB_VERSION_UNIQUE_NAME@ +#define __mutex_region_detach __mutex_region_detach@DB_VERSION_UNIQUE_NAME@ #define __mutex_env_refresh __mutex_env_refresh@DB_VERSION_UNIQUE_NAME@ #define __mutex_resource_return __mutex_resource_return@DB_VERSION_UNIQUE_NAME@ #define __mutex_stat_pp __mutex_stat_pp@DB_VERSION_UNIQUE_NAME@ @@ -1512,6 +1656,7 @@ #define __mutex_print_debug_stats __mutex_print_debug_stats@DB_VERSION_UNIQUE_NAME@ #define __mutex_set_wait_info __mutex_set_wait_info@DB_VERSION_UNIQUE_NAME@ #define __mutex_clear __mutex_clear@DB_VERSION_UNIQUE_NAME@ +#define __mutex_describe __mutex_describe@DB_VERSION_UNIQUE_NAME@ #define __db_tas_mutex_init 
__db_tas_mutex_init@DB_VERSION_UNIQUE_NAME@ #define __db_tas_mutex_lock __db_tas_mutex_lock@DB_VERSION_UNIQUE_NAME@ #define __db_tas_mutex_trylock __db_tas_mutex_trylock@DB_VERSION_UNIQUE_NAME@ @@ -1582,6 +1727,7 @@ #define __os_concat_path __os_concat_path@DB_VERSION_UNIQUE_NAME@ #define __os_id __os_id@DB_VERSION_UNIQUE_NAME@ #define __os_rename __os_rename@DB_VERSION_UNIQUE_NAME@ +#define __os_rmdir __os_rmdir@DB_VERSION_UNIQUE_NAME@ #define __os_isroot __os_isroot@DB_VERSION_UNIQUE_NAME@ #define __db_rpath __db_rpath@DB_VERSION_UNIQUE_NAME@ #define __os_io __os_io@DB_VERSION_UNIQUE_NAME@ @@ -1590,16 +1736,37 @@ #define __os_physwrite __os_physwrite@DB_VERSION_UNIQUE_NAME@ #define __os_seek __os_seek@DB_VERSION_UNIQUE_NAME@ #define __os_stack __os_stack@DB_VERSION_UNIQUE_NAME@ +#define __os_stack_top __os_stack_top@DB_VERSION_UNIQUE_NAME@ +#define __os_stack_text __os_stack_text@DB_VERSION_UNIQUE_NAME@ +#define __os_stack_save __os_stack_save@DB_VERSION_UNIQUE_NAME@ +#define __os_stack_msgadd __os_stack_msgadd@DB_VERSION_UNIQUE_NAME@ #define __os_exists __os_exists@DB_VERSION_UNIQUE_NAME@ #define __os_ioinfo __os_ioinfo@DB_VERSION_UNIQUE_NAME@ #define __os_tmpdir __os_tmpdir@DB_VERSION_UNIQUE_NAME@ #define __os_truncate __os_truncate@DB_VERSION_UNIQUE_NAME@ #define __os_unique_id __os_unique_id@DB_VERSION_UNIQUE_NAME@ +#define __os_srandom __os_srandom@DB_VERSION_UNIQUE_NAME@ +#define __os_random __os_random@DB_VERSION_UNIQUE_NAME@ #define __os_unlink __os_unlink@DB_VERSION_UNIQUE_NAME@ #define __os_yield __os_yield@DB_VERSION_UNIQUE_NAME@ #ifdef HAVE_QNX #define __os_qnx_region_open __os_qnx_region_open@DB_VERSION_UNIQUE_NAME@ #endif +#ifdef DB_WINCE +#define __ce_freopen __ce_freopen@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef DB_WINCE +#define __ce_gmtime __ce_gmtime@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef DB_WINCE +#define localtime localtime@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef DB_WINCE +#define __ce_mktime __ce_mktime@DB_VERSION_UNIQUE_NAME@ +#endif +#ifdef 
DB_WINCE +#define __ce_remove __ce_remove@DB_VERSION_UNIQUE_NAME@ +#endif #define __os_is_winnt __os_is_winnt@DB_VERSION_UNIQUE_NAME@ #define __os_cpu_count __os_cpu_count@DB_VERSION_UNIQUE_NAME@ #ifdef HAVE_REPLICATION_THREADS @@ -1673,6 +1840,8 @@ #define __rep_egen_unmarshal __rep_egen_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __rep_fileinfo_marshal __rep_fileinfo_marshal@DB_VERSION_UNIQUE_NAME@ #define __rep_fileinfo_unmarshal __rep_fileinfo_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_fileinfo_v7_marshal __rep_fileinfo_v7_marshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_fileinfo_v7_unmarshal __rep_fileinfo_v7_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __rep_fileinfo_v6_marshal __rep_fileinfo_v6_marshal@DB_VERSION_UNIQUE_NAME@ #define __rep_fileinfo_v6_unmarshal __rep_fileinfo_v6_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __rep_grant_info_marshal __rep_grant_info_marshal@DB_VERSION_UNIQUE_NAME@ @@ -1691,13 +1860,29 @@ #define __rep_lsn_hist_key_unmarshal __rep_lsn_hist_key_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __rep_lsn_hist_data_marshal __rep_lsn_hist_data_marshal@DB_VERSION_UNIQUE_NAME@ #define __rep_lsn_hist_data_unmarshal __rep_lsn_hist_data_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update_req_marshal __rep_blob_update_req_marshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update_req_unmarshal __rep_blob_update_req_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update_marshal __rep_blob_update_marshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update_unmarshal __rep_blob_update_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_file_marshal __rep_blob_file_marshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_file_unmarshal __rep_blob_file_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_chunk_marshal __rep_blob_chunk_marshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_chunk_unmarshal __rep_blob_chunk_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_chunk_req_marshal __rep_blob_chunk_req_marshal@DB_VERSION_UNIQUE_NAME@ 
+#define __rep_blob_chunk_req_unmarshal __rep_blob_chunk_req_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __rep_update_req __rep_update_req@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update_req __rep_blob_update_req@DB_VERSION_UNIQUE_NAME@ #define __rep_page_req __rep_page_req@DB_VERSION_UNIQUE_NAME@ #define __rep_update_setup __rep_update_setup@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_update __rep_blob_update@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_allreq __rep_blob_allreq@DB_VERSION_UNIQUE_NAME@ #define __rep_bulk_page __rep_bulk_page@DB_VERSION_UNIQUE_NAME@ #define __rep_page __rep_page@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_chunk __rep_blob_chunk@DB_VERSION_UNIQUE_NAME@ #define __rep_init_cleanup __rep_init_cleanup@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_chunk_req __rep_blob_chunk_req@DB_VERSION_UNIQUE_NAME@ #define __rep_pggap_req __rep_pggap_req@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_rereq __rep_blob_rereq@DB_VERSION_UNIQUE_NAME@ #define __rep_finfo_alloc __rep_finfo_alloc@DB_VERSION_UNIQUE_NAME@ #define __rep_remove_init_file __rep_remove_init_file@DB_VERSION_UNIQUE_NAME@ #define __rep_reset_init __rep_reset_init@DB_VERSION_UNIQUE_NAME@ @@ -1727,24 +1912,32 @@ #define __rep_start_int __rep_start_int@DB_VERSION_UNIQUE_NAME@ #define __rep_open_sysdb __rep_open_sysdb@DB_VERSION_UNIQUE_NAME@ #define __rep_client_dbinit __rep_client_dbinit@DB_VERSION_UNIQUE_NAME@ +#define __rep_blob_cmp __rep_blob_cmp@DB_VERSION_UNIQUE_NAME@ +#define __rep_offset_cmp __rep_offset_cmp@DB_VERSION_UNIQUE_NAME@ #define __rep_get_limit __rep_get_limit@DB_VERSION_UNIQUE_NAME@ #define __rep_set_limit __rep_set_limit@DB_VERSION_UNIQUE_NAME@ #define __rep_set_nsites_pp __rep_set_nsites_pp@DB_VERSION_UNIQUE_NAME@ #define __rep_set_nsites_int __rep_set_nsites_int@DB_VERSION_UNIQUE_NAME@ #define __rep_get_nsites __rep_get_nsites@DB_VERSION_UNIQUE_NAME@ -#define __rep_set_priority __rep_set_priority@DB_VERSION_UNIQUE_NAME@ +#define __rep_set_priority_pp 
__rep_set_priority_pp@DB_VERSION_UNIQUE_NAME@ +#define __rep_set_priority_int __rep_set_priority_int@DB_VERSION_UNIQUE_NAME@ #define __rep_get_priority __rep_get_priority@DB_VERSION_UNIQUE_NAME@ -#define __rep_set_timeout __rep_set_timeout@DB_VERSION_UNIQUE_NAME@ +#define __rep_set_timeout_pp __rep_set_timeout_pp@DB_VERSION_UNIQUE_NAME@ +#define __rep_set_timeout_int __rep_set_timeout_int@DB_VERSION_UNIQUE_NAME@ #define __rep_get_timeout __rep_get_timeout@DB_VERSION_UNIQUE_NAME@ #define __rep_get_request __rep_get_request@DB_VERSION_UNIQUE_NAME@ #define __rep_set_request __rep_set_request@DB_VERSION_UNIQUE_NAME@ +#define __rep_set_view __rep_set_view@DB_VERSION_UNIQUE_NAME@ +#define __rep_call_partial __rep_call_partial@DB_VERSION_UNIQUE_NAME@ #define __rep_set_transport_pp __rep_set_transport_pp@DB_VERSION_UNIQUE_NAME@ #define __rep_set_transport_int __rep_set_transport_int@DB_VERSION_UNIQUE_NAME@ #define __rep_get_clockskew __rep_get_clockskew@DB_VERSION_UNIQUE_NAME@ #define __rep_set_clockskew __rep_set_clockskew@DB_VERSION_UNIQUE_NAME@ -#define __rep_flush __rep_flush@DB_VERSION_UNIQUE_NAME@ +#define __rep_flush_pp __rep_flush_pp@DB_VERSION_UNIQUE_NAME@ +#define __rep_flush_int __rep_flush_int@DB_VERSION_UNIQUE_NAME@ #define __rep_sync __rep_sync@DB_VERSION_UNIQUE_NAME@ #define __rep_txn_applied __rep_txn_applied@DB_VERSION_UNIQUE_NAME@ +#define __rep_read_lsn_history __rep_read_lsn_history@DB_VERSION_UNIQUE_NAME@ #define __rep_process_message_pp __rep_process_message_pp@DB_VERSION_UNIQUE_NAME@ #define __rep_process_message_int __rep_process_message_int@DB_VERSION_UNIQUE_NAME@ #define __rep_apply __rep_apply@DB_VERSION_UNIQUE_NAME@ @@ -1760,6 +1953,7 @@ #define __rep_closefiles __rep_closefiles@DB_VERSION_UNIQUE_NAME@ #define __rep_write_egen __rep_write_egen@DB_VERSION_UNIQUE_NAME@ #define __rep_write_gen __rep_write_gen@DB_VERSION_UNIQUE_NAME@ +#define __rep_check_view __rep_check_view@DB_VERSION_UNIQUE_NAME@ #define __rep_stat_pp 
__rep_stat_pp@DB_VERSION_UNIQUE_NAME@ #define __rep_stat_print_pp __rep_stat_print_pp@DB_VERSION_UNIQUE_NAME@ #define __rep_stat_print __rep_stat_print@DB_VERSION_UNIQUE_NAME@ @@ -1798,6 +1992,8 @@ #define __rep_get_maxpermlsn __rep_get_maxpermlsn@DB_VERSION_UNIQUE_NAME@ #define __rep_is_internal_rep_file __rep_is_internal_rep_file@DB_VERSION_UNIQUE_NAME@ #define __rep_get_datagen __rep_get_datagen@DB_VERSION_UNIQUE_NAME@ +#define __rep_become_readonly_master __rep_become_readonly_master@DB_VERSION_UNIQUE_NAME@ +#define __rep_get_lsnhist_data __rep_get_lsnhist_data@DB_VERSION_UNIQUE_NAME@ #define __rep_verify __rep_verify@DB_VERSION_UNIQUE_NAME@ #define __rep_verify_fail __rep_verify_fail@DB_VERSION_UNIQUE_NAME@ #define __rep_verify_req __rep_verify_req@DB_VERSION_UNIQUE_NAME@ @@ -1827,6 +2023,8 @@ #define __repmgr_membership_key_unmarshal __repmgr_membership_key_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_membership_data_marshal __repmgr_membership_data_marshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_membership_data_unmarshal __repmgr_membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_v4membership_data_marshal __repmgr_v4membership_data_marshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_v4membership_data_unmarshal __repmgr_v4membership_data_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_member_metadata_marshal __repmgr_member_metadata_marshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_member_metadata_unmarshal __repmgr_member_metadata_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_gm_fwd_marshal __repmgr_gm_fwd_marshal@DB_VERSION_UNIQUE_NAME@ @@ -1835,21 +2033,34 @@ #define __repmgr_membr_vers_unmarshal __repmgr_membr_vers_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_site_info_marshal __repmgr_site_info_marshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_site_info_unmarshal __repmgr_site_info_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_v4site_info_marshal __repmgr_v4site_info_marshal@DB_VERSION_UNIQUE_NAME@ +#define 
__repmgr_v4site_info_unmarshal __repmgr_v4site_info_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_connect_reject_marshal __repmgr_connect_reject_marshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_connect_reject_unmarshal __repmgr_connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_v4connect_reject_marshal __repmgr_v4connect_reject_marshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_v4connect_reject_unmarshal __repmgr_v4connect_reject_unmarshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_lsnhist_match_marshal __repmgr_lsnhist_match_marshal@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_lsnhist_match_unmarshal __repmgr_lsnhist_match_unmarshal@DB_VERSION_UNIQUE_NAME@ #define __repmgr_member_print __repmgr_member_print@DB_VERSION_UNIQUE_NAME@ #define __repmgr_init_print __repmgr_init_print@DB_VERSION_UNIQUE_NAME@ #define __repmgr_init_election __repmgr_init_election@DB_VERSION_UNIQUE_NAME@ #define __repmgr_claim_victory __repmgr_claim_victory@DB_VERSION_UNIQUE_NAME@ #define __repmgr_turn_on_elections __repmgr_turn_on_elections@DB_VERSION_UNIQUE_NAME@ -#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_start_int __repmgr_start_int@DB_VERSION_UNIQUE_NAME@ #define __repmgr_valid_config __repmgr_valid_config@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_prefmas_auto_config __repmgr_prefmas_auto_config@DB_VERSION_UNIQUE_NAME@ #define __repmgr_autostart __repmgr_autostart@DB_VERSION_UNIQUE_NAME@ #define __repmgr_start_selector __repmgr_start_selector@DB_VERSION_UNIQUE_NAME@ #define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@ #define __repmgr_stop __repmgr_stop@DB_VERSION_UNIQUE_NAME@ #define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@ #define __repmgr_get_ack_policy __repmgr_get_ack_policy@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#define 
__repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_set_incoming_queue_redzone __repmgr_set_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@ #define __repmgr_env_create __repmgr_env_create@DB_VERSION_UNIQUE_NAME@ #define __repmgr_env_destroy __repmgr_env_destroy@DB_VERSION_UNIQUE_NAME@ #define __repmgr_stop_threads __repmgr_stop_threads@DB_VERSION_UNIQUE_NAME@ @@ -1870,7 +2081,8 @@ #define __repmgr_get_site_address __repmgr_get_site_address@DB_VERSION_UNIQUE_NAME@ #define __repmgr_get_eid __repmgr_get_eid@DB_VERSION_UNIQUE_NAME@ #define __repmgr_get_config __repmgr_get_config@DB_VERSION_UNIQUE_NAME@ -#define __repmgr_site_config __repmgr_site_config@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_config_pp __repmgr_site_config_pp@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_config_int __repmgr_site_config_int@DB_VERSION_UNIQUE_NAME@ #define __repmgr_site_close __repmgr_site_close@DB_VERSION_UNIQUE_NAME@ #define __repmgr_msg_thread __repmgr_msg_thread@DB_VERSION_UNIQUE_NAME@ #define __repmgr_send_err_resp __repmgr_send_err_resp@DB_VERSION_UNIQUE_NAME@ @@ -1930,7 +2142,6 @@ #define __repmgr_queue_destroy __repmgr_queue_destroy@DB_VERSION_UNIQUE_NAME@ #define __repmgr_queue_get __repmgr_queue_get@DB_VERSION_UNIQUE_NAME@ #define __repmgr_queue_put __repmgr_queue_put@DB_VERSION_UNIQUE_NAME@ -#define __repmgr_queue_size __repmgr_queue_size@DB_VERSION_UNIQUE_NAME@ #define __repmgr_member_recover __repmgr_member_recover@DB_VERSION_UNIQUE_NAME@ #define __repmgr_select_thread __repmgr_select_thread@DB_VERSION_UNIQUE_NAME@ #define __repmgr_bow_out __repmgr_bow_out@DB_VERSION_UNIQUE_NAME@ @@ -1938,6 +2149,7 @@ #define __repmgr_compute_timeout __repmgr_compute_timeout@DB_VERSION_UNIQUE_NAME@ #define 
__repmgr_connected_master __repmgr_connected_master@DB_VERSION_UNIQUE_NAME@ #define __repmgr_check_timeouts __repmgr_check_timeouts@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_refresh_selector __repmgr_refresh_selector@DB_VERSION_UNIQUE_NAME@ #define __repmgr_first_try_connections __repmgr_first_try_connections@DB_VERSION_UNIQUE_NAME@ #define __repmgr_send_v1_handshake __repmgr_send_v1_handshake@DB_VERSION_UNIQUE_NAME@ #define __repmgr_read_from_site __repmgr_read_from_site@DB_VERSION_UNIQUE_NAME@ @@ -1949,7 +2161,8 @@ #define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@ #define __repmgr_stat_print_pp __repmgr_stat_print_pp@DB_VERSION_UNIQUE_NAME@ #define __repmgr_stat_print __repmgr_stat_print@DB_VERSION_UNIQUE_NAME@ -#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_list_int __repmgr_site_list_int@DB_VERSION_UNIQUE_NAME@ #ifndef HAVE_REPLICATION_THREADS #define __repmgr_close __repmgr_close@DB_VERSION_UNIQUE_NAME@ #endif @@ -1960,6 +2173,18 @@ #define __repmgr_set_ack_policy __repmgr_set_ack_policy@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_max __repmgr_get_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_set_incoming_queue_max __repmgr_set_incoming_queue_max@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_redzone __repmgr_get_incoming_queue_redzone@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS +#define __repmgr_get_incoming_queue_fullevent __repmgr_get_incoming_queue_fullevent@DB_VERSION_UNIQUE_NAME@ +#endif +#ifndef HAVE_REPLICATION_THREADS #define __repmgr_site __repmgr_site@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS @@ -1969,10 +2194,10 @@ #define __repmgr_local_site __repmgr_local_site@DB_VERSION_UNIQUE_NAME@ #endif #ifndef 
HAVE_REPLICATION_THREADS -#define __repmgr_site_list __repmgr_site_list@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_site_list_pp __repmgr_site_list_pp@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS -#define __repmgr_start __repmgr_start@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_start_pp __repmgr_start_pp@DB_VERSION_UNIQUE_NAME@ #endif #ifndef HAVE_REPLICATION_THREADS #define __repmgr_stat_pp __repmgr_stat_pp@DB_VERSION_UNIQUE_NAME@ @@ -2023,7 +2248,14 @@ #define __repmgr_failchk __repmgr_failchk@DB_VERSION_UNIQUE_NAME@ #define __repmgr_master_is_known __repmgr_master_is_known@DB_VERSION_UNIQUE_NAME@ #define __repmgr_stable_lsn __repmgr_stable_lsn@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_make_request_conn __repmgr_make_request_conn@DB_VERSION_UNIQUE_NAME@ #define __repmgr_send_sync_msg __repmgr_send_sync_msg@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_read_own_msg __repmgr_read_own_msg@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_prefmas_connected __repmgr_prefmas_connected@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_restart_site_as_client __repmgr_restart_site_as_client@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_make_site_readonly_master __repmgr_make_site_readonly_master@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_lsnhist_match __repmgr_lsnhist_match@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_prefmas_get_wait __repmgr_prefmas_get_wait@DB_VERSION_UNIQUE_NAME@ #define __repmgr_marshal_member_list __repmgr_marshal_member_list@DB_VERSION_UNIQUE_NAME@ #define __repmgr_refresh_membership __repmgr_refresh_membership@DB_VERSION_UNIQUE_NAME@ #define __repmgr_reload_gmdb __repmgr_reload_gmdb@DB_VERSION_UNIQUE_NAME@ @@ -2040,10 +2272,15 @@ #define __repmgr_bcast_parm_refresh __repmgr_bcast_parm_refresh@DB_VERSION_UNIQUE_NAME@ #define __repmgr_chg_prio __repmgr_chg_prio@DB_VERSION_UNIQUE_NAME@ #define __repmgr_bcast_own_msg __repmgr_bcast_own_msg@DB_VERSION_UNIQUE_NAME@ +#define __repmgr_bcast_member_list __repmgr_bcast_member_list@DB_VERSION_UNIQUE_NAME@ #define 
__seq_stat __seq_stat@DB_VERSION_UNIQUE_NAME@ #define __seq_stat_print __seq_stat_print@DB_VERSION_UNIQUE_NAME@ #define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@ #define __db_get_seq_flags_fn __db_get_seq_flags_fn@DB_VERSION_UNIQUE_NAME@ +#define __seq_open __seq_open@DB_VERSION_UNIQUE_NAME@ +#define __seq_initial_value __seq_initial_value@DB_VERSION_UNIQUE_NAME@ +#define __seq_get __seq_get@DB_VERSION_UNIQUE_NAME@ +#define __seq_close __seq_close@DB_VERSION_UNIQUE_NAME@ #define bdb_HCommand bdb_HCommand@DB_VERSION_UNIQUE_NAME@ #if DB_DBM_HSEARCH != 0 #define bdb_NdbmOpen bdb_NdbmOpen@DB_VERSION_UNIQUE_NAME@ @@ -2057,9 +2294,12 @@ #define tcl_CompactStat tcl_CompactStat@DB_VERSION_UNIQUE_NAME@ #define tcl_rep_send tcl_rep_send@DB_VERSION_UNIQUE_NAME@ #define dbc_Cmd dbc_Cmd@DB_VERSION_UNIQUE_NAME@ +#define dbstream_Cmd dbstream_Cmd@DB_VERSION_UNIQUE_NAME@ #define env_Cmd env_Cmd@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvRemove tcl_EnvRemove@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvClose tcl_EnvClose@DB_VERSION_UNIQUE_NAME@ +#define tcl_EnvBackup tcl_EnvBackup@DB_VERSION_UNIQUE_NAME@ +#define tcl_EnvDbBackup tcl_EnvDbBackup@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvIdReset tcl_EnvIdReset@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvLsnReset tcl_EnvLsnReset@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvVerbose tcl_EnvVerbose@DB_VERSION_UNIQUE_NAME@ @@ -2069,6 +2309,7 @@ #define tcl_EnvGetEncryptFlags tcl_EnvGetEncryptFlags@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvSetErrfile tcl_EnvSetErrfile@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvSetMsgfile tcl_EnvSetMsgfile@DB_VERSION_UNIQUE_NAME@ +#define tcl_EnvCloseMsgfile tcl_EnvCloseMsgfile@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvSetErrpfx tcl_EnvSetErrpfx@DB_VERSION_UNIQUE_NAME@ #define tcl_EnvStatPrint tcl_EnvStatPrint@DB_VERSION_UNIQUE_NAME@ #define _NewInfo _NewInfo@DB_VERSION_UNIQUE_NAME@ @@ -2111,9 +2352,11 @@ #define tcl_LogPut tcl_LogPut@DB_VERSION_UNIQUE_NAME@ #define tcl_LogStat tcl_LogStat@DB_VERSION_UNIQUE_NAME@ 
#define tcl_LogStatPrint tcl_LogStatPrint@DB_VERSION_UNIQUE_NAME@ +#define tcl_LogVerify tcl_LogVerify@DB_VERSION_UNIQUE_NAME@ #define logc_Cmd logc_Cmd@DB_VERSION_UNIQUE_NAME@ #define tcl_LogConfig tcl_LogConfig@DB_VERSION_UNIQUE_NAME@ #define tcl_LogGetConfig tcl_LogGetConfig@DB_VERSION_UNIQUE_NAME@ +#define tcl_LogSetMax tcl_LogSetMax@DB_VERSION_UNIQUE_NAME@ #define _MpInfoDelete _MpInfoDelete@DB_VERSION_UNIQUE_NAME@ #define tcl_MpSync tcl_MpSync@DB_VERSION_UNIQUE_NAME@ #define tcl_MpTrickle tcl_MpTrickle@DB_VERSION_UNIQUE_NAME@ @@ -2121,6 +2364,7 @@ #define tcl_MpStat tcl_MpStat@DB_VERSION_UNIQUE_NAME@ #define tcl_MpStatPrint tcl_MpStatPrint@DB_VERSION_UNIQUE_NAME@ #define tcl_Mutex tcl_Mutex@DB_VERSION_UNIQUE_NAME@ +#define tcl_MutexFailchkTimeout tcl_MutexFailchkTimeout@DB_VERSION_UNIQUE_NAME@ #define tcl_MutFree tcl_MutFree@DB_VERSION_UNIQUE_NAME@ #define tcl_MutGet tcl_MutGet@DB_VERSION_UNIQUE_NAME@ #define tcl_MutLock tcl_MutLock@DB_VERSION_UNIQUE_NAME@ @@ -2227,6 +2471,7 @@ #define __txn_get_prepared __txn_get_prepared@DB_VERSION_UNIQUE_NAME@ #define __txn_openfiles __txn_openfiles@DB_VERSION_UNIQUE_NAME@ #define __txn_open __txn_open@DB_VERSION_UNIQUE_NAME@ +#define __txn_region_detach __txn_region_detach@DB_VERSION_UNIQUE_NAME@ #define __txn_findlastckp __txn_findlastckp@DB_VERSION_UNIQUE_NAME@ #define __txn_env_refresh __txn_env_refresh@DB_VERSION_UNIQUE_NAME@ #define __txn_region_mutex_count __txn_region_mutex_count@DB_VERSION_UNIQUE_NAME@ @@ -2234,7 +2479,7 @@ #define __txn_region_size __txn_region_size@DB_VERSION_UNIQUE_NAME@ #define __txn_region_max __txn_region_max@DB_VERSION_UNIQUE_NAME@ #define __txn_id_set __txn_id_set@DB_VERSION_UNIQUE_NAME@ -#define __txn_oldest_reader __txn_oldest_reader@DB_VERSION_UNIQUE_NAME@ +#define __txn_get_readers __txn_get_readers@DB_VERSION_UNIQUE_NAME@ #define __txn_add_buffer __txn_add_buffer@DB_VERSION_UNIQUE_NAME@ #define __txn_remove_buffer __txn_remove_buffer@DB_VERSION_UNIQUE_NAME@ #define __txn_stat_pp 
__txn_stat_pp@DB_VERSION_UNIQUE_NAME@ diff --git a/src/dbinc_auto/lock_ext.h b/src/dbinc_auto/lock_ext.h index d5981e18..3d2c37a3 100644 --- a/src/dbinc_auto/lock_ext.h +++ b/src/dbinc_auto/lock_ext.h @@ -28,10 +28,11 @@ int __lock_id_free_pp __P((DB_ENV *, u_int32_t)); int __lock_id_free __P((ENV *, DB_LOCKER *)); int __lock_id_set __P((ENV *, u_int32_t, u_int32_t)); int __lock_getlocker __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **)); -int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_LOCKER **)); +int __lock_getlocker_int __P((DB_LOCKTAB *, u_int32_t, int, DB_THREAD_INFO *, DB_LOCKER **)); int __lock_addfamilylocker __P((ENV *, u_int32_t, u_int32_t, u_int32_t)); int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *)); int __lock_familyremove __P((DB_LOCKTAB *, DB_LOCKER *)); +int __lock_local_locker_invalidate __P((ENV *, db_mutex_t)); int __lock_fix_list __P((ENV *, DBT *, u_int32_t)); int __lock_get_list __P((ENV *, DB_LOCKER *, u_int32_t, db_lockmode_t, DBT *)); void __lock_list_print __P((ENV *, DB_MSGBUF *, DBT *)); @@ -57,6 +58,7 @@ int __lock_get_env_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t)); int __lock_set_env_timeout __P((DB_ENV *, db_timeout_t, u_int32_t)); int __lock_open __P((ENV *)); int __lock_env_refresh __P((ENV *)); +int __lock_region_detach __P((ENV *, DB_LOCKTAB *)); u_int32_t __lock_region_mutex_count __P((ENV *)); u_int32_t __lock_region_mutex_max __P((ENV *)); size_t __lock_region_max __P((ENV *)); @@ -65,6 +67,7 @@ int __lock_stat_pp __P((DB_ENV *, DB_LOCK_STAT **, u_int32_t)); int __lock_stat_print_pp __P((DB_ENV *, u_int32_t)); int __lock_stat_print __P((ENV *, u_int32_t)); void __lock_printlock __P((DB_LOCKTAB *, DB_MSGBUF *mbp, struct __db_lock *, int)); +int __lock_dump_locker __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *)); int __lock_set_timeout __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t)); int __lock_set_timeout_internal __P((ENV *, DB_LOCKER *, db_timeout_t, u_int32_t)); int __lock_inherit_timeout 
__P((ENV *, DB_LOCKER *, DB_LOCKER *)); diff --git a/src/dbinc_auto/log_ext.h b/src/dbinc_auto/log_ext.h index dde6742d..769643fa 100644 --- a/src/dbinc_auto/log_ext.h +++ b/src/dbinc_auto/log_ext.h @@ -7,6 +7,7 @@ extern "C" { #endif int __log_open __P((ENV *)); +int __log_region_detach __P((ENV *, DB_LOG *)); int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *)); int __log_valid __P((DB_LOG *, u_int32_t, int, DB_FH **, u_int32_t, logfile_validity *, u_int32_t *)); int __log_env_refresh __P((ENV *)); @@ -72,6 +73,7 @@ int __log_flush_int __P((DB_LOG *, const DB_LSN *, int)); int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t)); int __log_name __P((DB_LOG *, u_int32_t, char **, DB_FH **, u_int32_t)); int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t)); +int __log_rep_write __P((ENV *)); int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...)); int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, ...)); int __log_stat_pp __P((DB_ENV *, DB_LOG_STAT **, u_int32_t)); @@ -115,6 +117,7 @@ int __db_relink_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __db_merge_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __db_pgno_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __dbreg_register_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __dbreg_register_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __bam_split_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __bam_split_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __bam_rsplit_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); @@ -129,12 +132,19 @@ int __bam_rcuradj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __bam_relink_43_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __bam_merge_44_verify __P((ENV *, DBT 
*, DB_LSN *, db_recops, void *)); int __fop_create_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_create_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_create_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_remove_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_write_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_write_file_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_rename_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __fop_file_remove_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __ham_insdel_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __ham_newpage_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); @@ -150,6 +160,7 @@ int __ham_contract_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __ham_curadj_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __ham_chgpg_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_addrem_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); +int __heap_addrem_60_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_pg_alloc_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_meta_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __heap_trunc_page_verify __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); diff --git 
a/src/dbinc_auto/mp_ext.h b/src/dbinc_auto/mp_ext.h index d142b584..3f5a397b 100644 --- a/src/dbinc_auto/mp_ext.h +++ b/src/dbinc_auto/mp_ext.h @@ -6,6 +6,7 @@ extern "C" { #endif +int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int)); int __memp_alloc __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); void __memp_free __P((REGINFO *, void *)); int __memp_backup_open __P((ENV *, DB_MPOOLFILE *, const char *, const char *, u_int32_t, DB_FH **, void**)); @@ -18,6 +19,7 @@ int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int)); int __memp_bhfree __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t)); int __memp_fget_pp __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *)); int __memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *)); +int __memp_find_obsolete_version __P((ENV *, BH *, DB_MPOOL_HASH *, BH **)); int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); int __memp_fcreate __P((ENV *, DB_MPOOLFILE **)); int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); @@ -28,6 +30,7 @@ int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int)); int __memp_get_ftype __P((DB_MPOOLFILE *, int *)); int __memp_set_ftype __P((DB_MPOOLFILE *, int)); int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); +void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t)); int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *)); int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *)); @@ -75,10 +78,12 @@ int __memp_skip_curadj __P((DBC *, db_pgno_t)); int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, int *)); int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, BH *)); int __memp_open __P((ENV *, int)); +int __memp_region_detach __P((ENV *, DB_MPOOL *)); int __memp_init __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int)); u_int32_t __memp_max_regions __P((ENV *)); 
u_int32_t __memp_region_mutex_count __P((ENV *)); int __memp_env_refresh __P((ENV *)); +int __memp_region_bhfree __P((REGINFO *)); int __memp_register_pp __P((DB_ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); int __memp_register __P((ENV *, int, int (*)(DB_ENV *, db_pgno_t, void *, DBT *), int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); int __memp_get_bucket __P((ENV *, MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *)); diff --git a/src/dbinc_auto/mutex_ext.h b/src/dbinc_auto/mutex_ext.h index 1a2a1b2b..673c18d0 100644 --- a/src/dbinc_auto/mutex_ext.h +++ b/src/dbinc_auto/mutex_ext.h @@ -10,13 +10,13 @@ int __mutex_alloc __P((ENV *, int, u_int32_t, db_mutex_t *)); int __mutex_alloc_int __P((ENV *, int, int, u_int32_t, db_mutex_t *)); int __mutex_free __P((ENV *, db_mutex_t *)); int __mutex_free_int __P((ENV *, int, db_mutex_t *)); +int __mutex_died __P((ENV *, db_mutex_t)); int __mutex_refresh __P((ENV *, db_mutex_t)); -int __mut_failchk __P((ENV *)); -int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t)); -int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t)); -int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t)); -int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t)); -int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t)); +int __mutex_record_lock __P((ENV *, db_mutex_t, MUTEX_ACTION, MUTEX_STATE **)); +int __mutex_record_unlock __P((ENV *, db_mutex_t)); +int __mutex_record_print __P((ENV *, DB_THREAD_INFO *)); +int __mutex_failchk __P((ENV *)); +int __mutex_failchk_thread __P((ENV *, DB_THREAD_INFO *)); int __mutex_alloc_pp __P((DB_ENV *, u_int32_t, db_mutex_t *)); int __mutex_free_pp __P((DB_ENV *, db_mutex_t)); int __mutex_lock_pp __P((DB_ENV *, db_mutex_t)); @@ -31,6 +31,9 @@ int __mutex_get_max __P((DB_ENV *, u_int32_t *)); int __mutex_set_max __P((DB_ENV *, u_int32_t)); int __mutex_get_tas_spins __P((DB_ENV *, u_int32_t *)); int __mutex_set_tas_spins __P((DB_ENV *, 
u_int32_t)); +#ifdef HAVE_ERROR_HISTORY +int __mutex_diags __P((ENV *, db_mutex_t, int)); +#endif #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) atomic_value_t __atomic_inc __P((ENV *, db_atomic_t *)); #endif @@ -53,6 +56,7 @@ int __db_hybrid_mutex_suspend __P((ENV *, db_mutex_t, db_timespec *, int)); int __db_pthread_mutex_unlock __P((ENV *, db_mutex_t)); int __db_pthread_mutex_destroy __P((ENV *, db_mutex_t)); int __mutex_open __P((ENV *, int)); +int __mutex_region_detach __P((ENV *, DB_MUTEXMGR *)); int __mutex_env_refresh __P((ENV *)); void __mutex_resource_return __P((ENV *, REGINFO *)); int __mutex_stat_pp __P((DB_ENV *, DB_MUTEX_STAT **, u_int32_t)); @@ -62,6 +66,7 @@ void __mutex_print_debug_single __P((ENV *, const char *, db_mutex_t, u_int32_t) void __mutex_print_debug_stats __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t)); void __mutex_set_wait_info __P((ENV *, db_mutex_t, uintmax_t *, uintmax_t *)); void __mutex_clear __P((ENV *, db_mutex_t)); +char *__mutex_describe __P((ENV *, db_mutex_t, char *)); int __db_tas_mutex_init __P((ENV *, db_mutex_t, u_int32_t)); int __db_tas_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t)); int __db_tas_mutex_trylock __P((ENV *, db_mutex_t)); diff --git a/src/dbinc_auto/os_ext.h b/src/dbinc_auto/os_ext.h index a0a7b791..26cf2127 100644 --- a/src/dbinc_auto/os_ext.h +++ b/src/dbinc_auto/os_ext.h @@ -6,7 +6,7 @@ extern "C" { #endif -void __os_abort __P((ENV *)); +void __os_abort __P((const ENV *)); int __os_abspath __P((const char *)); #if defined(HAVE_REPLICATION_THREADS) int __os_getaddrinfo __P((ENV *, const char *, u_int, const char *, const ADDRINFO *, ADDRINFO **)); @@ -18,12 +18,12 @@ int __os_umalloc __P((ENV *, size_t, void *)); int __os_urealloc __P((ENV *, size_t, void *)); void __os_ufree __P((ENV *, void *)); int __os_strdup __P((ENV *, const char *, void *)); -int __os_calloc __P((ENV *, size_t, size_t, void *)); -int __os_malloc __P((ENV *, size_t, void *)); -int __os_realloc __P((ENV *, 
size_t, void *)); -void __os_free __P((ENV *, void *)); +int __os_calloc __P((const ENV *, size_t, size_t, void *)); +int __os_malloc __P((const ENV *, size_t, void *)); +int __os_realloc __P((const ENV *, size_t, void *)); +void __os_free __P((const ENV *, void *)); void *__ua_memcpy __P((void *, const void *, size_t)); -void __os_gettime __P((ENV *, db_timespec *, int)); +void __os_gettime __P((const ENV *, db_timespec *, int)); int __os_fs_notzero __P((void)); int __os_support_direct_io __P((void)); int __os_support_db_register __P((void)); @@ -54,6 +54,7 @@ int __os_open __P((ENV *, const char *, u_int32_t, u_int32_t, int, DB_FH **)); int __os_concat_path __P((char *, size_t, const char *, const char *)); void __os_id __P((DB_ENV *, pid_t *, db_threadid_t*)); int __os_rename __P((ENV *, const char *, const char *, u_int32_t)); +int __os_rmdir __P((ENV *, const char *)); int __os_isroot __P((void)); char *__db_rpath __P((const char *)); int __os_io __P((ENV *, int, DB_FH *, db_pgno_t, u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *)); @@ -61,17 +62,38 @@ int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *)); int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *)); int __os_physwrite __P((ENV *, DB_FH *, void *, size_t, size_t *)); int __os_seek __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t)); -void __os_stack __P((ENV *)); +void __os_stack __P((const ENV *)); +void __os_stack_top __P((const ENV *, unsigned, unsigned)); +void __os_stack_text __P((const ENV *, char *, size_t, unsigned, unsigned)); +int __os_stack_save __P((const ENV *, unsigned, void **)); +void __os_stack_msgadd __P((const ENV *, DB_MSGBUF *, unsigned, unsigned, void **)); int __os_exists __P((ENV *, const char *, int *)); int __os_ioinfo __P((ENV *, const char *, DB_FH *, u_int32_t *, u_int32_t *, u_int32_t *)); int __os_tmpdir __P((ENV *, u_int32_t)); -int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t)); +int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, 
u_int32_t, off_t)); void __os_unique_id __P((ENV *, u_int32_t *)); +void __os_srandom __P((u_int)); +u_int __os_random __P((void)); int __os_unlink __P((ENV *, const char *, int)); void __os_yield __P((ENV *, u_long, u_long)); #ifdef HAVE_QNX int __os_qnx_region_open __P((ENV *, const char *, int, int, DB_FH **)); #endif +#ifdef DB_WINCE +FILE * __ce_freopen __P((const char *, const char *, FILE *)); +#endif +#ifdef DB_WINCE +struct tm * __ce_gmtime __P((const time_t *)); +#endif +#ifdef DB_WINCE +struct tm * localtime __P((const time_t *)); +#endif +#ifdef DB_WINCE +time_t __ce_mktime __P((struct tm *)); +#endif +#ifdef DB_WINCE +int __ce_remove __P((const char *path)); +#endif int __os_is_winnt __P((void)); u_int32_t __os_cpu_count __P((void)); #ifdef HAVE_REPLICATION_THREADS diff --git a/src/dbinc_auto/rep_automsg.h b/src/dbinc_auto/rep_automsg.h index 584040cf..f52c8907 100644 --- a/src/dbinc_auto/rep_automsg.h +++ b/src/dbinc_auto/rep_automsg.h @@ -32,7 +32,7 @@ typedef struct ___rep_egen_args { u_int32_t egen; } __rep_egen_args; -#define __REP_FILEINFO_SIZE 40 +#define __REP_FILEINFO_SIZE 48 typedef struct ___rep_fileinfo_args { u_int32_t pgsize; db_pgno_t pgno; @@ -44,8 +44,24 @@ typedef struct ___rep_fileinfo_args { DBT uid; DBT info; DBT dir; + u_int32_t blob_fid_lo; + u_int32_t blob_fid_hi; } __rep_fileinfo_args; +#define __REP_FILEINFO_V7_SIZE 40 +typedef struct ___rep_fileinfo_v7_args { + u_int32_t pgsize; + db_pgno_t pgno; + db_pgno_t max_pgno; + u_int32_t filenum; + u_int32_t finfo_flags; + u_int32_t type; + u_int32_t db_flags; + DBT uid; + DBT info; + DBT dir; +} __rep_fileinfo_v7_args; + #define __REP_FILEINFO_V6_SIZE 36 typedef struct ___rep_fileinfo_v6_args { u_int32_t pgsize; @@ -116,5 +132,46 @@ typedef struct ___rep_lsn_hist_data_args { u_int32_t hist_nsec; } __rep_lsn_hist_data_args; -#define __REP_MAXMSG_SIZE 40 +#define __REP_BLOB_UPDATE_REQ_SIZE 32 +typedef struct ___rep_blob_update_req_args { + u_int64_t blob_fid; + u_int64_t blob_sid; + 
u_int64_t blob_id; + u_int64_t highest_id; +} __rep_blob_update_req_args; + +#define __REP_BLOB_UPDATE_SIZE 24 +typedef struct ___rep_blob_update_args { + u_int64_t blob_fid; + u_int64_t highest_id; + u_int32_t flags; + u_int32_t num_blobs; +} __rep_blob_update_args; + +#define __REP_BLOB_FILE_SIZE 24 +typedef struct ___rep_blob_file_args { + u_int64_t blob_sid; + u_int64_t blob_id; + u_int64_t blob_size; +} __rep_blob_file_args; + +#define __REP_BLOB_CHUNK_SIZE 40 +typedef struct ___rep_blob_chunk_args { + u_int32_t flags; + u_int64_t blob_fid; + u_int64_t blob_sid; + u_int64_t blob_id; + u_int64_t offset; + DBT data; +} __rep_blob_chunk_args; + +#define __REP_BLOB_CHUNK_REQ_SIZE 32 +typedef struct ___rep_blob_chunk_req_args { + u_int64_t blob_fid; + u_int64_t blob_sid; + u_int64_t blob_id; + u_int64_t offset; +} __rep_blob_chunk_req_args; + +#define __REP_MAXMSG_SIZE 48 #endif diff --git a/src/dbinc_auto/rep_ext.h b/src/dbinc_auto/rep_ext.h index 89bdc797..97740acf 100644 --- a/src/dbinc_auto/rep_ext.h +++ b/src/dbinc_auto/rep_ext.h @@ -14,6 +14,8 @@ int __rep_egen_marshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, size_t int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *, u_int8_t *, size_t, u_int8_t **)); int __rep_fileinfo_marshal __P((ENV *, u_int32_t, __rep_fileinfo_args *, u_int8_t *, size_t, size_t *)); int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **)); +int __rep_fileinfo_v7_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v7_args *, u_int8_t *, size_t, size_t *)); +int __rep_fileinfo_v7_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v7_args **, u_int8_t *, size_t, u_int8_t **)); int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *)); int __rep_fileinfo_v6_unmarshal __P((ENV *, u_int32_t, __rep_fileinfo_v6_args **, u_int8_t *, size_t, u_int8_t **)); int __rep_grant_info_marshal __P((ENV *, __rep_grant_info_args *, u_int8_t *, 
size_t, size_t *)); @@ -32,13 +34,29 @@ void __rep_lsn_hist_key_marshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t int __rep_lsn_hist_key_unmarshal __P((ENV *, __rep_lsn_hist_key_args *, u_int8_t *, size_t, u_int8_t **)); void __rep_lsn_hist_data_marshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *)); int __rep_lsn_hist_data_unmarshal __P((ENV *, __rep_lsn_hist_data_args *, u_int8_t *, size_t, u_int8_t **)); +void __rep_blob_update_req_marshal __P((ENV *, __rep_blob_update_req_args *, u_int8_t *)); +int __rep_blob_update_req_unmarshal __P((ENV *, __rep_blob_update_req_args *, u_int8_t *, size_t, u_int8_t **)); +void __rep_blob_update_marshal __P((ENV *, __rep_blob_update_args *, u_int8_t *)); +int __rep_blob_update_unmarshal __P((ENV *, __rep_blob_update_args *, u_int8_t *, size_t, u_int8_t **)); +void __rep_blob_file_marshal __P((ENV *, __rep_blob_file_args *, u_int8_t *)); +int __rep_blob_file_unmarshal __P((ENV *, __rep_blob_file_args *, u_int8_t *, size_t, u_int8_t **)); +void __rep_blob_chunk_marshal __P((ENV *, __rep_blob_chunk_args *, u_int8_t *)); +int __rep_blob_chunk_unmarshal __P((ENV *, __rep_blob_chunk_args *, u_int8_t *, size_t, u_int8_t **)); +void __rep_blob_chunk_req_marshal __P((ENV *, __rep_blob_chunk_req_args *, u_int8_t *)); +int __rep_blob_chunk_req_unmarshal __P((ENV *, __rep_blob_chunk_req_args *, u_int8_t *, size_t, u_int8_t **)); int __rep_update_req __P((ENV *, __rep_control_args *)); +int __rep_blob_update_req __P((ENV *, DB_THREAD_INFO *, DBT *)); int __rep_page_req __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); int __rep_update_setup __P((ENV *, int, __rep_control_args *, DBT *, time_t, DB_LSN *)); +int __rep_blob_update __P((ENV *, int, DB_THREAD_INFO *, DBT *)); +int __rep_blob_allreq __P((ENV *, int, DBT *)); int __rep_bulk_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); int __rep_page __P((ENV *, DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); +int __rep_blob_chunk __P((ENV 
*, int, DB_THREAD_INFO *, DBT *)); int __rep_init_cleanup __P((ENV *, REP *, int)); +int __rep_blob_chunk_req __P((ENV *, int, DBT *)); int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *, u_int32_t)); +int __rep_blob_rereq __P((ENV *, REP *)); int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *, __rep_fileinfo_args **)); int __rep_remove_init_file __P((ENV *)); int __rep_reset_init __P((ENV *)); @@ -65,27 +83,35 @@ void __rep_env_destroy __P((DB_ENV *)); int __rep_get_config __P((DB_ENV *, u_int32_t, int *)); int __rep_set_config __P((DB_ENV *, u_int32_t, int)); int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t)); -int __rep_start_int __P((ENV *, DBT *, u_int32_t)); +int __rep_start_int __P((ENV *, DBT *, u_int32_t, u_int32_t)); int __rep_open_sysdb __P((ENV *, DB_THREAD_INFO *, DB_TXN *, const char *, u_int32_t, DB **)); int __rep_client_dbinit __P((ENV *, int, repdb_t)); +int __rep_blob_cmp __P((DB *, const DBT *, const DBT *, size_t *)); +int __rep_offset_cmp __P((DB *, const DBT *, const DBT *, size_t *)); int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *)); int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t)); int __rep_set_nsites_pp __P((DB_ENV *, u_int32_t)); int __rep_set_nsites_int __P((ENV *, u_int32_t)); int __rep_get_nsites __P((DB_ENV *, u_int32_t *)); -int __rep_set_priority __P((DB_ENV *, u_int32_t)); +int __rep_set_priority_pp __P((DB_ENV *, u_int32_t)); +int __rep_set_priority_int __P((ENV *, u_int32_t)); int __rep_get_priority __P((DB_ENV *, u_int32_t *)); -int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t)); +int __rep_set_timeout_pp __P((DB_ENV *, int, db_timeout_t)); +int __rep_set_timeout_int __P((ENV *, int, db_timeout_t)); int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *)); int __rep_get_request __P((DB_ENV *, db_timeout_t *, db_timeout_t *)); int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t)); +int __rep_set_view __P((DB_ENV *, int (*)(DB_ENV *, const char *, int *, u_int32_t))); 
+int __rep_call_partial __P((ENV *, const char *, int *, u_int32_t, DELAYED_BLOB_LIST **)); int __rep_set_transport_pp __P((DB_ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t))); int __rep_set_transport_int __P((ENV *, int, int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t))); int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *)); int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t)); -int __rep_flush __P((DB_ENV *)); +int __rep_flush_pp __P((DB_ENV *)); +int __rep_flush_int __P((ENV *)); int __rep_sync __P((DB_ENV *, u_int32_t)); int __rep_txn_applied __P((ENV *, DB_THREAD_INFO *, DB_COMMIT_INFO *, db_timeout_t)); +int __rep_read_lsn_history __P((ENV *, DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t, __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t, int)); int __rep_process_message_pp __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *)); int __rep_process_message_int __P((ENV *, DBT *, DBT *, int, DB_LSN *)); int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, DBT *, DB_LSN *, int *, DB_LSN *)); @@ -101,6 +127,7 @@ int __rep_preclose __P((ENV *)); int __rep_closefiles __P((ENV *)); int __rep_write_egen __P((ENV *, REP *, u_int32_t)); int __rep_write_gen __P((ENV *, REP *, u_int32_t)); +int __rep_check_view __P((ENV *)); int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t)); int __rep_stat_print_pp __P((DB_ENV *, u_int32_t)); int __rep_stat_print __P((ENV *, u_int32_t)); @@ -139,6 +166,8 @@ int __rep_log_backup __P((ENV *, DB_LOGC *, DB_LSN *, u_int32_t)); int __rep_get_maxpermlsn __P((ENV *, DB_LSN *)); int __rep_is_internal_rep_file __P((char *)); int __rep_get_datagen __P((ENV *, u_int32_t *)); +int __rep_become_readonly_master __P((ENV *, u_int32_t *, DB_LSN *)); +int __rep_get_lsnhist_data __P((ENV *, DB_THREAD_INFO *, u_int32_t, __rep_lsn_hist_data_args *)); int __rep_verify __P((ENV *, __rep_control_args *, DBT *, int, time_t)); int 
__rep_verify_fail __P((ENV *, __rep_control_args *)); int __rep_verify_req __P((ENV *, __rep_control_args *, int)); diff --git a/src/dbinc_auto/repmgr_automsg.h b/src/dbinc_auto/repmgr_automsg.h index 1b2b928c..17e467e9 100644 --- a/src/dbinc_auto/repmgr_automsg.h +++ b/src/dbinc_auto/repmgr_automsg.h @@ -72,11 +72,17 @@ typedef struct ___repmgr_membership_key_args { u_int16_t port; } __repmgr_membership_key_args; -#define __REPMGR_MEMBERSHIP_DATA_SIZE 4 +#define __REPMGR_MEMBERSHIP_DATA_SIZE 8 typedef struct ___repmgr_membership_data_args { + u_int32_t status; u_int32_t flags; } __repmgr_membership_data_args; +#define __REPMGR_V4MEMBERSHIP_DATA_SIZE 4 +typedef struct ___repmgr_v4membership_data_args { + u_int32_t flags; +} __repmgr_v4membership_data_args; + #define __REPMGR_MEMBER_METADATA_SIZE 8 typedef struct ___repmgr_member_metadata_args { u_int32_t format; @@ -96,18 +102,41 @@ typedef struct ___repmgr_membr_vers_args { u_int32_t gen; } __repmgr_membr_vers_args; -#define __REPMGR_SITE_INFO_SIZE 10 +#define __REPMGR_SITE_INFO_SIZE 14 typedef struct ___repmgr_site_info_args { DBT host; u_int16_t port; + u_int32_t status; u_int32_t flags; } __repmgr_site_info_args; -#define __REPMGR_CONNECT_REJECT_SIZE 8 +#define __REPMGR_V4SITE_INFO_SIZE 10 +typedef struct ___repmgr_v4site_info_args { + DBT host; + u_int16_t port; + u_int32_t flags; +} __repmgr_v4site_info_args; + +#define __REPMGR_CONNECT_REJECT_SIZE 12 typedef struct ___repmgr_connect_reject_args { u_int32_t version; u_int32_t gen; + u_int32_t status; } __repmgr_connect_reject_args; -#define __REPMGR_MAXMSG_SIZE 12 +#define __REPMGR_V4CONNECT_REJECT_SIZE 8 +typedef struct ___repmgr_v4connect_reject_args { + u_int32_t version; + u_int32_t gen; +} __repmgr_v4connect_reject_args; + +#define __REPMGR_LSNHIST_MATCH_SIZE 24 +typedef struct ___repmgr_lsnhist_match_args { + DB_LSN lsn; + u_int32_t hist_sec; + u_int32_t hist_nsec; + DB_LSN next_gen_lsn; +} __repmgr_lsnhist_match_args; + +#define __REPMGR_MAXMSG_SIZE 24 
#endif diff --git a/src/dbinc_auto/repmgr_ext.h b/src/dbinc_auto/repmgr_ext.h index b1237950..3ff59ffe 100644 --- a/src/dbinc_auto/repmgr_ext.h +++ b/src/dbinc_auto/repmgr_ext.h @@ -29,6 +29,8 @@ int __repmgr_membership_key_marshal __P((ENV *, __repmgr_membership_key_args *, int __repmgr_membership_key_unmarshal __P((ENV *, __repmgr_membership_key_args *, u_int8_t *, size_t, u_int8_t **)); void __repmgr_membership_data_marshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *)); int __repmgr_membership_data_unmarshal __P((ENV *, __repmgr_membership_data_args *, u_int8_t *, size_t, u_int8_t **)); +void __repmgr_v4membership_data_marshal __P((ENV *, __repmgr_v4membership_data_args *, u_int8_t *)); +int __repmgr_v4membership_data_unmarshal __P((ENV *, __repmgr_v4membership_data_args *, u_int8_t *, size_t, u_int8_t **)); void __repmgr_member_metadata_marshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *)); int __repmgr_member_metadata_unmarshal __P((ENV *, __repmgr_member_metadata_args *, u_int8_t *, size_t, u_int8_t **)); int __repmgr_gm_fwd_marshal __P((ENV *, __repmgr_gm_fwd_args *, u_int8_t *, size_t, size_t *)); @@ -37,21 +39,34 @@ void __repmgr_membr_vers_marshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_ int __repmgr_membr_vers_unmarshal __P((ENV *, __repmgr_membr_vers_args *, u_int8_t *, size_t, u_int8_t **)); int __repmgr_site_info_marshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, size_t *)); int __repmgr_site_info_unmarshal __P((ENV *, __repmgr_site_info_args *, u_int8_t *, size_t, u_int8_t **)); +int __repmgr_v4site_info_marshal __P((ENV *, __repmgr_v4site_info_args *, u_int8_t *, size_t, size_t *)); +int __repmgr_v4site_info_unmarshal __P((ENV *, __repmgr_v4site_info_args *, u_int8_t *, size_t, u_int8_t **)); void __repmgr_connect_reject_marshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *)); int __repmgr_connect_reject_unmarshal __P((ENV *, __repmgr_connect_reject_args *, u_int8_t *, size_t, u_int8_t **)); 
+void __repmgr_v4connect_reject_marshal __P((ENV *, __repmgr_v4connect_reject_args *, u_int8_t *)); +int __repmgr_v4connect_reject_unmarshal __P((ENV *, __repmgr_v4connect_reject_args *, u_int8_t *, size_t, u_int8_t **)); +void __repmgr_lsnhist_match_marshal __P((ENV *, __repmgr_lsnhist_match_args *, u_int8_t *)); +int __repmgr_lsnhist_match_unmarshal __P((ENV *, __repmgr_lsnhist_match_args *, u_int8_t *, size_t, u_int8_t **)); int __repmgr_member_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); int __repmgr_init_print __P((ENV *, DB_DISTAB *)); int __repmgr_init_election __P((ENV *, u_int32_t)); int __repmgr_claim_victory __P((ENV *)); int __repmgr_turn_on_elections __P((ENV *)); -int __repmgr_start __P((DB_ENV *, int, u_int32_t)); +int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t)); +int __repmgr_start_int __P((ENV *, int, u_int32_t)); int __repmgr_valid_config __P((ENV *, u_int32_t)); +int __repmgr_prefmas_auto_config __P((DB_ENV *, u_int32_t *)); int __repmgr_autostart __P((ENV *)); int __repmgr_start_selector __P((ENV *)); int __repmgr_close __P((ENV *)); int __repmgr_stop __P((ENV *)); int __repmgr_set_ack_policy __P((DB_ENV *, int)); int __repmgr_get_ack_policy __P((DB_ENV *, int *)); +int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t)); +int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); +void __repmgr_set_incoming_queue_redzone __P((void *, u_int32_t, u_int32_t)); +int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *)); +int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *)); int __repmgr_env_create __P((ENV *, DB_REP *)); void __repmgr_env_destroy __P((ENV *, DB_REP *)); int __repmgr_stop_threads __P((ENV *)); @@ -72,12 +87,13 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **)); int __repmgr_get_site_address __P((DB_SITE *, const char **, u_int *)); int __repmgr_get_eid __P((DB_SITE *, int *)); int __repmgr_get_config __P((DB_SITE *, u_int32_t, 
u_int32_t *)); -int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t)); +int __repmgr_site_config_pp __P((DB_SITE *, u_int32_t, u_int32_t)); +int __repmgr_site_config_int __P((DB_SITE *, u_int32_t, u_int32_t)); int __repmgr_site_close __P((DB_SITE *)); void *__repmgr_msg_thread __P((void *)); int __repmgr_send_err_resp __P((ENV *, CHANNEL *, int)); int __repmgr_handle_event __P((ENV *, u_int32_t, void *)); -int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t)); +int __repmgr_update_membership __P((ENV *, DB_THREAD_INFO *, int, u_int32_t, u_int32_t)); int __repmgr_set_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *, u_int32_t)); int __repmgr_setup_gmdb_op __P((ENV *, DB_THREAD_INFO *, DB_TXN **, u_int32_t)); int __repmgr_cleanup_gmdb_op __P((ENV *, int)); @@ -132,7 +148,6 @@ int __repmgr_select_loop __P((ENV *)); int __repmgr_queue_destroy __P((ENV *)); int __repmgr_queue_get __P((ENV *, REPMGR_MESSAGE **, REPMGR_RUNNABLE *)); int __repmgr_queue_put __P((ENV *, REPMGR_MESSAGE *)); -int __repmgr_queue_size __P((ENV *)); int __repmgr_member_recover __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); void *__repmgr_select_thread __P((void *)); int __repmgr_bow_out __P((ENV *)); @@ -140,6 +155,7 @@ int __repmgr_accept __P((ENV *)); int __repmgr_compute_timeout __P((ENV *, db_timespec *)); REPMGR_SITE *__repmgr_connected_master __P((ENV *)); int __repmgr_check_timeouts __P((ENV *)); +int __repmgr_refresh_selector __P((ENV *)); int __repmgr_first_try_connections __P((ENV *)); int __repmgr_send_v1_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t)); int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *)); @@ -151,7 +167,8 @@ int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *)); int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t)); int __repmgr_stat_print_pp __P((DB_ENV *, u_int32_t)); int __repmgr_stat_print __P((ENV *, u_int32_t)); -int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); +int 
__repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); +int __repmgr_site_list_int __P((ENV *, u_int *, DB_REPMGR_SITE **)); #ifndef HAVE_REPLICATION_THREADS int __repmgr_close __P((ENV *)); #endif @@ -162,6 +179,18 @@ int __repmgr_get_ack_policy __P((DB_ENV *, int *)); int __repmgr_set_ack_policy __P((DB_ENV *, int)); #endif #ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, u_int32_t)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, u_int32_t *, u_int32_t *)); +#endif +#ifndef HAVE_REPLICATION_THREADS +int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, int *)); +#endif +#ifndef HAVE_REPLICATION_THREADS int __repmgr_site __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t)); #endif #ifndef HAVE_REPLICATION_THREADS @@ -171,10 +200,10 @@ int __repmgr_site_by_eid __P((DB_ENV *, int, DB_SITE **)); int __repmgr_local_site __P((DB_ENV *, DB_SITE **)); #endif #ifndef HAVE_REPLICATION_THREADS -int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); +int __repmgr_site_list_pp __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); #endif #ifndef HAVE_REPLICATION_THREADS -int __repmgr_start __P((DB_ENV *, int, u_int32_t)); +int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t)); #endif #ifndef HAVE_REPLICATION_THREADS int __repmgr_stat_pp __P((DB_ENV *, DB_REPMGR_STAT **, u_int32_t)); @@ -213,8 +242,8 @@ int __repmgr_thread_failure __P((ENV *, int)); char *__repmgr_format_eid_loc __P((DB_REP *, REPMGR_CONNECTION *, char *)); char *__repmgr_format_site_loc __P((REPMGR_SITE *, char *)); char *__repmgr_format_addr_loc __P((repmgr_netaddr_t *, char *)); -int __repmgr_repstart __P((ENV *, u_int32_t)); -int __repmgr_become_master __P((ENV *)); +int __repmgr_repstart __P((ENV *, u_int32_t, u_int32_t)); +int __repmgr_become_master __P((ENV *, 
u_int32_t)); int __repmgr_each_connection __P((ENV *, CONNECTION_ACTION, void *, int)); int __repmgr_open __P((ENV *, void *)); int __repmgr_join __P((ENV *, void *)); @@ -225,9 +254,16 @@ int __repmgr_init_new_sites __P((ENV *, int, int)); int __repmgr_failchk __P((ENV *)); int __repmgr_master_is_known __P((ENV *)); int __repmgr_stable_lsn __P((ENV *, DB_LSN *)); +int __repmgr_make_request_conn __P((ENV *, repmgr_netaddr_t *, REPMGR_CONNECTION **)); int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t, u_int8_t *, u_int32_t)); -int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *)); -int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t)); +int __repmgr_read_own_msg __P((ENV *, REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *)); +int __repmgr_prefmas_connected __P((ENV *)); +int __repmgr_restart_site_as_client __P((ENV *, int)); +int __repmgr_make_site_readonly_master __P((ENV *, int, u_int32_t *, DB_LSN *)); +int __repmgr_lsnhist_match __P((ENV *, DB_THREAD_INFO *, int, int *)); +int __repmgr_prefmas_get_wait __P((ENV *, u_int32_t *, u_long *)); +int __repmgr_marshal_member_list __P((ENV *, u_int32_t, u_int8_t **, size_t *)); +int __repmgr_refresh_membership __P((ENV *, u_int8_t *, size_t, u_int32_t)); int __repmgr_reload_gmdb __P((ENV *)); int __repmgr_gmdb_version_cmp __P((ENV *, u_int32_t, u_int32_t)); int __repmgr_init_save __P((ENV *, DBT *)); @@ -238,10 +274,11 @@ void __repmgr_print_conn_err __P((ENV *, repmgr_netaddr_t *, int)); int __repmgr_become_client __P((ENV *)); REPMGR_SITE *__repmgr_lookup_site __P((ENV *, const char *, u_int)); int __repmgr_find_site __P((ENV *, const char *, u_int, int *)); -int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t)); +int __repmgr_set_membership __P((ENV *, const char *, u_int, u_int32_t, u_int32_t)); int __repmgr_bcast_parm_refresh __P((ENV *)); int __repmgr_chg_prio __P((ENV *, u_int32_t, u_int32_t)); int __repmgr_bcast_own_msg __P((ENV *, u_int32_t, 
u_int8_t *, size_t)); +int __repmgr_bcast_member_list __P((ENV *)); #if defined(__cplusplus) } diff --git a/src/dbinc_auto/sequence_ext.h b/src/dbinc_auto/sequence_ext.h index a2c114cf..8f8b8473 100644 --- a/src/dbinc_auto/sequence_ext.h +++ b/src/dbinc_auto/sequence_ext.h @@ -10,6 +10,10 @@ int __seq_stat __P((DB_SEQUENCE *, DB_SEQUENCE_STAT **, u_int32_t)); int __seq_stat_print __P((DB_SEQUENCE *, u_int32_t)); const FN * __db_get_seq_flags_fn __P((void)); const FN * __db_get_seq_flags_fn __P((void)); +int __seq_open __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t)); +int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t)); +int __seq_get __P((DB_SEQUENCE *, DB_TXN *, u_int32_t, db_seq_t *, u_int32_t)); +int __seq_close __P((DB_SEQUENCE *, u_int32_t)); #if defined(__cplusplus) } diff --git a/src/dbinc_auto/tcl_ext.h b/src/dbinc_auto/tcl_ext.h index 8b076c8b..4ea037c0 100644 --- a/src/dbinc_auto/tcl_ext.h +++ b/src/dbinc_auto/tcl_ext.h @@ -19,9 +19,12 @@ int db_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*)); int tcl_CompactStat __P((Tcl_Interp *, DBTCL_INFO *)); int tcl_rep_send __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)); int dbc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*)); +int dbstream_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*)); int env_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*)); int tcl_EnvRemove __P((Tcl_Interp *, int, Tcl_Obj * CONST*)); int tcl_EnvClose __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *)); +int tcl_EnvBackup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); +int tcl_EnvDbBackup __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_EnvIdReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_EnvLsnReset __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_EnvVerbose __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *)); @@ -30,7 +33,8 @@ int tcl_EnvSetFlags __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *)); int 
tcl_EnvTest __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_EnvGetEncryptFlags __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); void tcl_EnvSetErrfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *)); -void tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *)); +int tcl_EnvSetMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *)); +int tcl_EnvCloseMsgfile __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *)); int tcl_EnvSetErrpfx __P((Tcl_Interp *, DB_ENV *, DBTCL_INFO *, char *)); int tcl_EnvStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); DBTCL_INFO *_NewInfo __P((Tcl_Interp *, void *, char *, enum INFOTYPE)); @@ -73,9 +77,11 @@ int tcl_LogGet __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_LogPut __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_LogStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_LogStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); +int tcl_LogVerify __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int logc_Cmd __P((ClientData, Tcl_Interp *, int, Tcl_Obj * CONST*)); int tcl_LogConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *, Tcl_Obj *)); int tcl_LogGetConfig __P((Tcl_Interp *, DB_ENV *, Tcl_Obj *)); +int tcl_LogSetMax __P((Tcl_Interp *, DB_ENV *,Tcl_Obj *,u_int32_t *,u_int32_t *)); void _MpInfoDelete __P((Tcl_Interp *, DBTCL_INFO *)); int tcl_MpSync __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_MpTrickle __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); @@ -83,6 +89,7 @@ int tcl_Mp __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *, DBTCL_INFO *)); int tcl_MpStat __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_MpStatPrint __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_Mutex __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); +int tcl_MutexFailchkTimeout __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int tcl_MutFree __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); int 
tcl_MutGet __P((Tcl_Interp *, DB_ENV *, int)); int tcl_MutLock __P((Tcl_Interp *, int, Tcl_Obj * CONST*, DB_ENV *)); diff --git a/src/dbinc_auto/txn_ext.h b/src/dbinc_auto/txn_ext.h index 7c21455f..2fbcd147 100644 --- a/src/dbinc_auto/txn_ext.h +++ b/src/dbinc_auto/txn_ext.h @@ -60,6 +60,7 @@ int __txn_recover __P((ENV *, DB_PREPLIST *, long, long *, u_int32_t)); int __txn_get_prepared __P((ENV *, XID *, DB_PREPLIST *, long, long *, u_int32_t)); int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int)); int __txn_open __P((ENV *)); +int __txn_region_detach __P((ENV *, DB_TXNMGR *)); int __txn_findlastckp __P((ENV *, DB_LSN *, DB_LSN *)); int __txn_env_refresh __P((ENV *)); u_int32_t __txn_region_mutex_count __P((ENV *)); @@ -67,7 +68,7 @@ u_int32_t __txn_region_mutex_max __P((ENV *)); size_t __txn_region_size __P((ENV *)); size_t __txn_region_max __P((ENV *)); int __txn_id_set __P((ENV *, u_int32_t, u_int32_t)); -int __txn_oldest_reader __P((ENV *, DB_LSN *)); +int __txn_get_readers __P((ENV *, DB_LSN **, int *)); int __txn_add_buffer __P((ENV *, TXN_DETAIL *)); int __txn_remove_buffer __P((ENV *, TXN_DETAIL *, db_mutex_t)); int __txn_stat_pp __P((DB_ENV *, DB_TXN_STAT **, u_int32_t)); diff --git a/src/dbreg/dbreg.c b/src/dbreg/dbreg.c index 5067edac..99a80959 100644 --- a/src/dbreg/dbreg.c +++ b/src/dbreg/dbreg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/log.h" #include "dbinc/txn.h" @@ -171,6 +172,7 @@ __dbreg_setup(dbp, fname, dname, create_txnid) F_SET(fnp, DBREG_EXCL); fnp->txn_ref = 1; fnp->mutex = dbp->mutex; + fnp->blob_file_id = dbp->blob_file_id; dbp->log_filename = fnp; @@ -722,7 +724,7 @@ __dbreg_failchk(env) MUTEX_LOCK(env, lp->mtx_filelist); for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); fnp != NULL; fnp = nnp) { nnp = SH_TAILQ_NEXT(fnp, q, __fname); - if (dbenv->is_alive(dbenv, + if (dbenv->is_alive(dbenv, fnp->pid, unused, DB_MUTEX_PROCESS_ONLY)) continue; MUTEX_LOCK(env, fnp->mutex); @@ -773,6 +775,7 @@ __dbreg_log_close(env, fnp, txn, op) DB_LOG *dblp; DB_LSN r_unused; int ret; + u_int32_t blob_file_lo, blob_file_hi; dblp = env->lg_handle; ret = 0; @@ -788,10 +791,12 @@ __dbreg_log_close(env, fnp, txn, op) memset(&fid_dbt, 0, sizeof(fid_dbt)); fid_dbt.data = fnp->ufid; fid_dbt.size = DB_FILE_ID_LEN; + SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi); if ((ret = __dbreg_register_log(env, txn, &r_unused, F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE, op, dbtp, &fid_dbt, fnp->id, - fnp->s_type, fnp->meta_pgno, TXN_INVALID)) != 0) { + fnp->s_type, fnp->meta_pgno, TXN_INVALID, blob_file_lo, + blob_file_hi)) != 0) { /* * We are trying to close, but the log write failed. * Unfortunately, close needs to plow forward, because @@ -958,6 +963,7 @@ __dbreg_log_id(dbp, txn, id, needlock) LOG *lp; u_int32_t op; int i, ret; + u_int32_t blob_file_lo, blob_file_hi; env = dbp->env; dblp = env->lg_handle; @@ -996,14 +1002,16 @@ __dbreg_log_id(dbp, txn, id, needlock) fid_dbt.size = DB_FILE_ID_LEN; op = !F_ISSET(dbp, DB_AM_OPEN_CALLED) ? DBREG_PREOPEN : - (F_ISSET(dbp, DB_AM_INMEM) ? + (F_ISSET(dbp, DB_AM_INMEM) ? (F2_ISSET(dbp, DB2_AM_EXCL) ? DBREG_XREOPEN : DBREG_REOPEN): (F2_ISSET(dbp, DB2_AM_EXCL) ? 
DBREG_XOPEN : DBREG_OPEN)); + SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi); ret = __dbreg_register_log(env, txn, &unused, F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0, op | F_ISSET(fnp, DB_FNAME_DBREG_MASK), r_name.size == 0 ? NULL : &r_name, &fid_dbt, id, - fnp->s_type, fnp->meta_pgno, fnp->create_txnid); + fnp->s_type, fnp->meta_pgno, fnp->create_txnid, + blob_file_lo, blob_file_hi); if (needlock) MUTEX_UNLOCK(env, lp->mtx_filelist); diff --git a/src/dbreg/dbreg.src b/src/dbreg/dbreg.src index c7740d63..3187bc4f 100644 --- a/src/dbreg/dbreg.src +++ b/src/dbreg/dbreg.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -26,7 +26,7 @@ INCLUDE * ftype: database type * id: transaction id of the subtransaction that created the fs object */ -BEGIN register 42 2 +BEGIN_COMPAT register 42 2 DBOP opcode u_int32_t lu DBT name DBT s DBT uid DBT s @@ -35,3 +35,26 @@ ARG ftype DBTYPE lx ARG meta_pgno db_pgno_t lu ARG id u_int32_t lx END + +/* + * Used for registering name/id translations at open or close. 
+ * opcode: register or unregister + * name: file name + * fileid: unique file id + * ftype: file type + * ftype: database type + * id: transaction id of the subtransaction that created the fs object + * blob_fid_lo/hi: The blob file directory id + */ +BEGIN register 61 2 +DBOP opcode u_int32_t lu +DBT name DBT s +DBT uid DBT s +ARG fileid int32_t ld +ARG ftype DBTYPE lx +ARG meta_pgno db_pgno_t lu +ARG id u_int32_t lx +ARG blob_fid_lo u_int32_t lu +ARG blob_fid_hi u_int32_t lu +END + diff --git a/src/dbreg/dbreg_auto.c b/src/dbreg/dbreg_auto.c index a26e5527..3d9f01c7 100644 --- a/src/dbreg/dbreg_auto.c +++ b/src/dbreg/dbreg_auto.c @@ -8,6 +8,16 @@ #include "dbinc/db_am.h" #include "dbinc/txn.h" +DB_LOG_RECSPEC __dbreg_register_42_desc[] = { + {LOGREC_DBOP, SSZ(__dbreg_register_42_args, opcode), "opcode", ""}, + {LOGREC_DBT, SSZ(__dbreg_register_42_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__dbreg_register_42_args, uid), "uid", ""}, + {LOGREC_ARG, SSZ(__dbreg_register_42_args, fileid), "fileid", "%ld"}, + {LOGREC_ARG, SSZ(__dbreg_register_42_args, ftype), "ftype", "%lx"}, + {LOGREC_ARG, SSZ(__dbreg_register_42_args, meta_pgno), "meta_pgno", "%lu"}, + {LOGREC_ARG, SSZ(__dbreg_register_42_args, id), "id", "%lx"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __dbreg_register_desc[] = { {LOGREC_DBOP, SSZ(__dbreg_register_args, opcode), "opcode", ""}, {LOGREC_DBT, SSZ(__dbreg_register_args, name), "name", ""}, @@ -16,6 +26,8 @@ DB_LOG_RECSPEC __dbreg_register_desc[] = { {LOGREC_ARG, SSZ(__dbreg_register_args, ftype), "ftype", "%lx"}, {LOGREC_ARG, SSZ(__dbreg_register_args, meta_pgno), "meta_pgno", "%lu"}, {LOGREC_ARG, SSZ(__dbreg_register_args, id), "id", "%lx"}, + {LOGREC_ARG, SSZ(__dbreg_register_args, blob_fid_lo), "blob_fid_lo", "%lu"}, + {LOGREC_ARG, SSZ(__dbreg_register_args, blob_fid_hi), "blob_fid_hi", "%lu"}, {LOGREC_Done, 0, "", ""} }; /* diff --git a/src/dbreg/dbreg_autop.c b/src/dbreg/dbreg_autop.c index ea43addd..931bc2d9 100644 --- 
a/src/dbreg/dbreg_autop.c +++ b/src/dbreg/dbreg_autop.c @@ -10,6 +10,23 @@ #include "dbinc/txn.h" /* + * PUBLIC: int __dbreg_register_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__dbreg_register_42_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__dbreg_register_42", __dbreg_register_42_desc, info)); +} + +/* * PUBLIC: int __dbreg_register_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ diff --git a/src/dbreg/dbreg_rec.c b/src/dbreg/dbreg_rec.c index 1b387bb7..066efa03 100644 --- a/src/dbreg/dbreg_rec.c +++ b/src/dbreg/dbreg_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 @@ -37,12 +37,16 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_am.h" #include "dbinc/txn.h" static int __dbreg_open_file __P((ENV *, DB_TXN *, __dbreg_register_args *, void *)); +static int __dbreg_register_recover_int + __P((ENV *, DBT *, db_recops, void *, __dbreg_register_args *)); + /* * PUBLIC: int __dbreg_register_recover * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); @@ -56,21 +60,97 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info) void *info; { __dbreg_register_args *argp; + int ret; + + argp = NULL; + + if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0) + goto out; + + ret = __dbreg_register_recover_int(env, dbtp, op, info, argp); + + if (ret == 0) + *lsnp = argp->prev_lsn; +out: if (argp != NULL) + __os_free(env, argp); + return (ret); +} + +/* + * PUBLIC: int __dbreg_register_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + 
*/ +int +__dbreg_register_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __dbreg_register_42_args *argp; + __dbreg_register_args arg; + int ret; + + argp = NULL; + if ((ret = __dbreg_register_42_read(env, dbtp->data, &argp)) != 0) + goto err; + + /* + * Databases before 6.0 cannot support blobs, so the blob_fid is 0. + * After 6.0 they can support blobs, so it is possible it has a non-0 + * blob_fid, but since logging that value in dbreg_register + * is only used in replication, and replication does not support blobs + * until 6.1, this is safe. + */ + memcpy(&arg, argp, sizeof(__dbreg_register_42_args)); + arg.blob_fid_lo = 0; + arg.blob_fid_hi = 0; + + ret = __dbreg_register_recover_int(env, dbtp, op, info, &arg); + + if (ret == 0) + *lsnp = argp->prev_lsn; +err: if (argp != NULL) + __os_free(env, argp); + return (ret); +} + +/* + * Internal register recovery function for both the 42 log version and the + * 61 log version. + */ +static int +__dbreg_register_recover_int(env, dbtp, op, info, argp) + ENV *env; + DBT *dbtp; + db_recops op; + void *info; + __dbreg_register_args *argp; +{ DB_ENTRY *dbe; DB_LOG *dblp; DB *dbp; u_int32_t opcode, status; int do_close, do_open, do_rem, ret, t_ret; +#ifdef HAVE_REPLICATION + DB_REP *db_rep; + DELAYED_BLOB_LIST *dbl; + int view_partial; + + dbl = NULL; +#endif dblp = env->lg_handle; dbp = NULL; + ret = 0; #ifdef DEBUG_RECOVER REC_PRINT(__dbreg_register_print); +#else + COMPQUIET(dbtp, NULL); #endif do_open = do_close = 0; - if ((ret = __dbreg_register_read(env, dbtp->data, &argp)) != 0) - goto out; opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK); switch (opcode) { @@ -123,12 +203,54 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info) } if (do_open) { +#ifdef HAVE_REPLICATION + /* + * Partial replication may apply at this time. Invoke + * the callback if several conditions are met: + * - We are a view. + * - This is the OPENFILES pass of recovery. 
+ * - The file is not a BDB owned database. + * - The dbreg operation is a create (id != TXN_INVALID). + * + * If the file is to be skipped, then we have to TXN_IGNORE + * the txnlist for that create operation. + */ + if (IS_VIEW_SITE(env) && op == DB_TXN_OPENFILES && + (!IS_DB_FILE(argp->name.data) || + IS_BLOB_META(argp->name.data)) && + argp->id != TXN_INVALID) { + db_rep = env->rep_handle; + /* + * Once a view, always a view. Must have set + * a callback already. + */ + if (db_rep->partial == NULL) { + __db_errx(env, DB_STR("1592", + "Must set a view callback.")); + ret = EINVAL; + goto out; + } + if ((ret = __rep_call_partial(env, + argp->name.data, &view_partial, 0, &dbl)) != 0) + goto out; + DB_ASSERT(env, dbl == NULL); + + /* + * If this should not be replicated, then set + * the child txnlist to TXN_IGNORE. + */ + if (view_partial == 0 && + (ret = __db_txnlist_update(env, info, + argp->id, TXN_IGNORE, NULL, &status, 1)) != 0) + goto out; + } +#endif /* * We must open the db even if the meta page is not * yet written as we may be creating subdatabase. */ - if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT - && opcode != DBREG_XCHKPNT) + if (op == DB_TXN_OPENFILES && opcode != DBREG_CHKPNT && + opcode != DBREG_XCHKPNT) F_SET(dblp, DBLOG_FORCE_OPEN); /* @@ -205,7 +327,7 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info) if (dbe->dbp == NULL && !dbe->deleted) { /* No valid entry here. Nothing to do. */ MUTEX_UNLOCK(env, dblp->mtx_dbreg); - goto done; + goto out; } /* We have either an open entry or a deleted entry. 
*/ @@ -273,11 +395,7 @@ __dbreg_register_recover(env, dbtp, lsnp, op, info) } } } -done: if (ret == 0) - *lsnp = argp->prev_lsn; -out: if (argp != NULL) - __os_free(env, argp); - return (ret); +out: return (ret); } /* @@ -296,11 +414,13 @@ __dbreg_open_file(env, txn, argp, info) DB *dbp; DB_ENTRY *dbe; DB_LOG *dblp; + db_seq_t blob_file_id; u_int32_t id, opcode, status; int ret; dblp = env->lg_handle; opcode = FLD_ISSET(argp->opcode, DBREG_OP_MASK); + ret = 0; /* * When we're opening, we have to check that the name we are opening @@ -336,7 +456,7 @@ __dbreg_open_file(env, txn, argp, info) * bit and try to open it again. */ if ((dbp = dbe->dbp) != NULL) { - if (opcode == DBREG_REOPEN || + if (opcode == DBREG_REOPEN || opcode == DBREG_XREOPEN || !F_ISSET(dbp, DB_AM_OPEN_CALLED) || dbp->meta_pgno != argp->meta_pgno || @@ -393,7 +513,11 @@ reopen: txn->mgrp = env->tx_handle; } - return (__dbreg_do_open(env, - txn, dblp, argp->uid.data, argp->name.data, argp->ftype, - argp->fileid, argp->meta_pgno, info, argp->id, opcode)); + GET_LO_HI(env, + argp->blob_fid_lo, argp->blob_fid_hi, blob_file_id, ret); + if (ret != 0) + return (ret); + return (__dbreg_do_open(env, txn, dblp, argp->uid.data, + argp->name.data, argp->ftype, argp->fileid, + argp->meta_pgno, info, argp->id, opcode, blob_file_id)); } diff --git a/src/dbreg/dbreg_stat.c b/src/dbreg/dbreg_stat.c index 6dfb3869..ad4bbdc2 100644 --- a/src/dbreg/dbreg_stat.c +++ b/src/dbreg/dbreg_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/dbreg/dbreg_util.c b/src/dbreg/dbreg_util.c index 80de4d91..0d483f93 100644 --- a/src/dbreg/dbreg_util.c +++ b/src/dbreg/dbreg_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_am.h" #include "dbinc/fop.h" @@ -103,6 +104,7 @@ __dbreg_log_files(env, opcode) LOG *lp; u_int32_t lopcode; int ret; + u_int32_t blob_file_hi, blob_file_lo; dblp = env->lg_handle; lp = dblp->reginfo.primary; @@ -137,11 +139,12 @@ __dbreg_log_files(env, opcode) lopcode = opcode; if ( opcode == DBREG_CHKPNT && F_ISSET(fnp, DBREG_EXCL)) lopcode = DBREG_XCHKPNT; + SET_LO_HI_VAR(fnp->blob_file_id, blob_file_lo, blob_file_hi); if ((ret = __dbreg_register_log(env, NULL, &r_unused, F_ISSET(fnp, DB_FNAME_DURABLE) ? 0 : DB_LOG_NOT_DURABLE, lopcode | F_ISSET(fnp, DB_FNAME_DBREG_MASK), dbtp, &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno, - TXN_INVALID)) != 0) + TXN_INVALID, blob_file_lo, blob_file_hi)) != 0) break; } @@ -429,7 +432,7 @@ __dbreg_id_to_db(env, txn, dbpp, ndx, tryopen) if ((ret = __dbreg_do_open(env, txn, dblp, fname->ufid, name, fname->s_type, ndx, fname->meta_pgno, NULL, TXN_INVALID, F_ISSET(fname, DB_FNAME_INMEM) ? - DBREG_REOPEN : DBREG_OPEN)) != 0) + DBREG_REOPEN : DBREG_OPEN, fname->blob_file_id)) != 0) return (ret); *dbpp = dblp->dbentry[ndx].dbp; @@ -540,6 +543,53 @@ __dbreg_fid_to_fname(dblp, fid, have_lock, fnamep) } /* + * __dbreg_blob_file_to_fname -- + * Traverse the shared-memory list of database file names, looking for + * the entry that matches the passed blob file id. Returns 0 on success; + * -1 on error. 
+ * + * PUBLIC: int __dbreg_blob_file_to_fname + * PUBLIC: __P((DB_LOG *, db_seq_t, int, FNAME **)); + */ +int +__dbreg_blob_file_to_fname(dblp, blob_file_id, have_lock, fnamep) + DB_LOG *dblp; + db_seq_t blob_file_id; + int have_lock; + FNAME **fnamep; +{ + ENV *env; + FNAME *fnp; + LOG *lp; + int ret; + + env = dblp->env; + lp = dblp->reginfo.primary; + + ret = -1; + + /* + * If blob_file is 0 then blobs are not enabled and the value is not + * unique. + */ + if (blob_file_id == 0) + return (ret); + + if (!have_lock) + MUTEX_LOCK(env, lp->mtx_filelist); + SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) + if (fnp->blob_file_id == blob_file_id) { + *fnamep = fnp; + ret = 0; + break; + } + if (!have_lock) + MUTEX_UNLOCK(env, lp->mtx_filelist); + + return (ret); +} + +/* * __dbreg_get_name * * Interface to get name of registered files. This is mainly diagnostic @@ -577,14 +627,14 @@ __dbreg_get_name(env, fid, fnamep, dnamep) * is not protected by the thread mutex. * PUBLIC: int __dbreg_do_open __P((ENV *, * PUBLIC: DB_TXN *, DB_LOG *, u_int8_t *, char *, DBTYPE, - * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t)); + * PUBLIC: int32_t, db_pgno_t, void *, u_int32_t, u_int32_t, db_seq_t)); */ int -__dbreg_do_open(env, - txn, lp, uid, name, ftype, ndx, meta_pgno, info, id, opcode) +__dbreg_do_open(env, txn, + dblp, uid, name, ftype, ndx, meta_pgno, info, id, opcode, blob_file_id) ENV *env; DB_TXN *txn; - DB_LOG *lp; + DB_LOG *dblp; u_int8_t *uid; char *name; DBTYPE ftype; @@ -592,6 +642,7 @@ __dbreg_do_open(env, db_pgno_t meta_pgno; void *info; u_int32_t id, opcode; + db_seq_t blob_file_id; { DB *dbp; u_int32_t cstat, ret_stat; @@ -604,7 +655,7 @@ __dbreg_do_open(env, try_inmem = 0; retry_inmem: - if ((ret = __db_create_internal(&dbp, lp->env, 0)) != 0) + if ((ret = __db_create_internal(&dbp, dblp->env, 0)) != 0) return (ret); /* @@ -700,7 +751,7 @@ err: if (cstat == TXN_UNEXPECTED) * handling those cases specially, above. 
*/ if (try_inmem == 0 && - opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN && + opcode != DBREG_PREOPEN && opcode != DBREG_REOPEN && opcode != DBREG_XREOPEN) { if ((ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0) return (ret); @@ -725,6 +776,7 @@ err: if (cstat == TXN_UNEXPECTED) * we are closing a non-existent file and need to mark * it as deleted. */ + dbp->blob_file_id = blob_file_id; if (dbp->log_filename == NULL && (ret = __dbreg_setup(dbp, name, NULL, id)) != 0) return (ret); @@ -736,7 +788,8 @@ not_right: return (ret == 0 ? t_ret : ret); /* Add this file as deleted. */ - if ((t_ret = __dbreg_add_dbentry(env, lp, NULL, ndx)) != 0 && ret == 0) + if ((t_ret = __dbreg_add_dbentry(env, dblp, NULL, ndx)) != 0 && + ret == 0) ret = t_ret; return (ret); } diff --git a/src/env/env_alloc.c b/src/env/env_alloc.c index 700bfb27..9c8fd046 100644 --- a/src/env/env_alloc.c +++ b/src/env/env_alloc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/env/env_backup.c b/src/env/env_backup.c index 9c79dbb4..2940f44b 100644 --- a/src/env/env_backup.c +++ b/src/env/env_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/env/env_config.c b/src/env/env_config.c index 57496909..56cebb63 100644 --- a/src/env/env_config.c +++ b/src/env/env_config.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -84,8 +84,10 @@ static const CFG_DESC config_descs[] = { { "rep_set_clockskew", CFG_2UINT, __rep_set_clockskew }, { "rep_set_limit", CFG_2UINT, __rep_set_limit }, { "rep_set_nsites", CFG_UINT, __rep_set_nsites_pp }, - { "rep_set_priority", CFG_UINT, __rep_set_priority }, + { "rep_set_priority", CFG_UINT, __rep_set_priority_pp }, { "rep_set_request", CFG_2UINT, __rep_set_request }, + { "set_blob_dir", CFG_STRING, __env_set_blob_dir }, + { "set_blob_threshold", CFG_2UINT, __env_set_blob_threshold }, { "set_cache_max", CFG_2UINT, __memp_set_cache_max }, { "set_create_dir", CFG_STRING, __env_set_create_dir }, { "set_data_dir", CFG_STRING, __env_set_data_dir }, @@ -133,11 +135,16 @@ static const FN config_rep_config[] = { { DB_REP_CONF_AUTOROLLBACK, "db_rep_conf_autorollback" }, { DB_REP_CONF_BULK, "db_rep_conf_bulk" }, { DB_REP_CONF_DELAYCLIENT, "db_rep_conf_delayclient" }, + { DB_REP_CONF_ELECT_LOGLENGTH, "db_rep_conf_elect_loglength" }, { DB_REP_CONF_INMEM, "db_rep_conf_inmem" }, { DB_REP_CONF_LEASE, "db_rep_conf_lease" }, { DB_REP_CONF_NOWAIT, "db_rep_conf_nowait" }, { DB_REPMGR_CONF_2SITE_STRICT, "db_repmgr_conf_2site_strict" }, { DB_REPMGR_CONF_ELECTIONS, "db_repmgr_conf_elections" }, + { DB_REPMGR_CONF_PREFMAS_CLIENT, + "db_repmgr_conf_prefmas_client" }, + { DB_REPMGR_CONF_PREFMAS_MASTER, + "db_repmgr_conf_prefmas_master" }, { 0, NULL } }; @@ -198,7 +205,9 @@ static const FN config_set_flags_forlog[] = { { DB_LOG_DIRECT, "db_direct_log" }, { DB_LOG_DSYNC, "db_dsync_log" }, { DB_LOG_AUTO_REMOVE, "db_log_autoremove" }, + { DB_LOG_BLOB, "db_log_blob" }, { DB_LOG_IN_MEMORY, "db_log_inmemory" }, + { DB_LOG_NOSYNC, "db_log_nosync" }, { 0, NULL } }; @@ -206,7 +215,9 @@ static const FN config_log_set_config[] = { { DB_LOG_DIRECT, "db_log_direct" }, { DB_LOG_DSYNC, "db_log_dsync" }, { DB_LOG_AUTO_REMOVE, "db_log_auto_remove" }, + { DB_LOG_BLOB, "db_log_blob" }, { DB_LOG_IN_MEMORY, "db_log_in_memory" }, + { DB_LOG_NOSYNC, "db_log_nosync" }, { DB_LOG_ZERO, 
"db_log_zero" }, { 0, NULL } }; @@ -237,6 +248,7 @@ static const FN config_set_verbose[] = { { DB_VERB_DEADLOCK, "db_verb_deadlock" }, { DB_VERB_FILEOPS, "db_verb_fileops" }, { DB_VERB_FILEOPS_ALL, "db_verb_fileops_all" }, + { DB_VERB_MVCC, "db_verb_mvcc" }, { DB_VERB_RECOVERY, "db_verb_recovery" }, { DB_VERB_REGISTER, "db_verb_register" }, { DB_VERB_REPLICATION, "db_verb_replication" }, @@ -462,7 +474,7 @@ format: __db_errx(env, DB_STR_A("1584", if ((lv1 = __db_name_to_val(config_rep_timeout, argv[1])) == -1) goto format; CFG_GET_UINT32(argv[2], &uv2); - return (__rep_set_timeout(dbenv, lv1, (db_timeout_t)uv2)); + return (__rep_set_timeout_pp(dbenv, lv1, (db_timeout_t)uv2)); } /* repmgr_set_ack_policy db_repmgr_acks_XXX */ @@ -475,6 +487,15 @@ format: __db_errx(env, DB_STR_A("1584", return (__repmgr_set_ack_policy(dbenv, lv1)); } + if (strcasecmp(argv[0], "repmgr_set_incoming_queue_max") == 0) { + if (nf != 3) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + CFG_GET_UINT32(argv[2], &uv2); + return (__repmgr_set_incoming_queue_max( + dbenv, (u_int32_t)uv1, (u_int32_t)uv2)); + } + /* * Configure name/value pairs of config information for a site (local or * remote). @@ -503,7 +524,7 @@ format: __db_errx(env, DB_STR_A("1584", uv2 = 0; else CFG_GET_UINT32(argv[i + 1], &uv2); - if ((ret = __repmgr_site_config(site, + if ((ret = __repmgr_site_config_int(site, (u_int32_t)lv1, (u_int32_t)uv2)) != 0) break; } @@ -630,6 +651,15 @@ format: __db_errx(env, DB_STR_A("1584", dbenv, DB_REGION_INIT, lv1 == 0 ? 
0 : 1)); } + /* set_mutex_failchk_timeout <unsigned timeout> */ + if (strcasecmp(argv[0], "set_mutex_failchk_timeout") == 0) { + if (nf != 2) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + return (__env_set_timeout( + dbenv, (u_int32_t)uv1, DB_SET_MUTEX_FAILCHK_TIMEOUT)); + } + /* set_reg_timeout <unsigned timeout> */ if (strcasecmp(argv[0], "set_reg_timeout") == 0) { if (nf != 2) diff --git a/src/env/env_failchk.c b/src/env/env_failchk.c index 05752f07..ad9bed0b 100644 --- a/src/env/env_failchk.c +++ b/src/env/env_failchk.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -22,9 +22,26 @@ static int __env_in_api __P((ENV *)); static void __env_clear_state __P((ENV *)); /* + * When failchk broadcast is enabled continue after the first error, to try to + * find all of them; without broadcasting stop at the first failure. + */ +#ifdef HAVE_FAILCHK_BROADCAST +#define FAILCHK_PROCESS_ERROR(t_ret, ret) \ + if ((t_ret) != 0 && (ret) == 0) (ret) = (t_ret) +#else +#define FAILCHK_PROCESS_ERROR(t_ret, ret) \ + if (((ret) = (t_ret)) != 0) goto err +#endif + +/* * __env_failchk_pp -- * ENV->failchk pre/post processing. * + * Single process failchk continues after recoverable failures but stops as + * soon as recovery is required. Broadcast failchks continue even after + * DB_RUNRECOVERY failures are detected, to maximize the possibility to + * wake up processes blocked on dead resources, e.g. mutexes. 
+ * * PUBLIC: int __env_failchk_pp __P((DB_ENV *, u_int32_t)); */ int @@ -46,7 +63,7 @@ __env_failchk_pp(dbenv, flags) */ if (!ALIVE_ON(env)) { __db_errx(env, DB_STR("1503", - "DB_ENV->failchk requires DB_ENV->is_alive be configured")); + "DB_ENV->failchk requires DB_ENV->is_alive be configured")); return (EINVAL); } @@ -59,10 +76,14 @@ __env_failchk_pp(dbenv, flags) ENV_LEAVE(env, ip); return (ret); } + /* * __env_failchk_int -- * Process the subsystem failchk routines * + * The FAILCHK_PROCESS_ERROR macro (defined at the top of this file) + * differs between the broadcast and single process versions of failchk. + * * PUBLIC: int __env_failchk_int __P((DB_ENV *)); */ int @@ -70,42 +91,52 @@ __env_failchk_int(dbenv) DB_ENV *dbenv; { ENV *env; - int ret; + int ret, t_ret; env = dbenv->env; + ret = 0; F_SET(dbenv, DB_ENV_FAILCHK); /* - * We check for dead threads in the API first as this would be likely - * to hang other things we try later, like locks and transactions. + * We check for dead threads in the API first as this would likely + * hang other things we try later, like locks and transactions. 
*/ - if ((ret = __env_in_api(env)) != 0) + if ((ret = __env_in_api(env)) != 0) { + __db_err(env, ret, "__env_in_api"); goto err; + } - if (LOCKING_ON(env) && (ret = __lock_failchk(env)) != 0) - goto err; + if (LOCKING_ON(env) && (t_ret = __lock_failchk(env)) != 0) + FAILCHK_PROCESS_ERROR(t_ret, ret); - if (TXN_ON(env) && - ((ret = __txn_failchk(env)) != 0 || - (ret = __dbreg_failchk(env)) != 0)) - goto err; + if (TXN_ON(env) && ret == 0 && ((t_ret = __txn_failchk(env)) != 0 || + (t_ret = __dbreg_failchk(env)) != 0)) + FAILCHK_PROCESS_ERROR(t_ret, ret); - if ((ret = __memp_failchk(env)) != 0) - goto err; + if ((t_ret = __memp_failchk(env)) != 0) + FAILCHK_PROCESS_ERROR(t_ret, ret); #ifdef HAVE_REPLICATION_THREADS - if (REP_ON(env) && (ret = __repmgr_failchk(env)) != 0) - goto err; + if (REP_ON(env) && (t_ret = __repmgr_failchk(env)) != 0) + FAILCHK_PROCESS_ERROR(t_ret, ret); #endif - /* Mark any dead blocked threads as dead. */ - __env_clear_state(env); +err: #ifdef HAVE_MUTEX_SUPPORT - ret = __mut_failchk(env); + if ((t_ret = __mutex_failchk(env)) != 0 && ret == 0) + ret = t_ret; #endif -err: F_CLR(dbenv, DB_ENV_FAILCHK); + /* Any dead blocked thread slots are no longer needed; allow reuse. */ + if (ret == 0) + __env_clear_state(env); + if (ret == DB_RUNRECOVERY) { + /* Announce a panic; avoid __env_panic()'s diag core dump. 
*/ + __env_panic_set(env, 1); + __env_panic_event(env, ret); + } + F_CLR(dbenv, DB_ENV_FAILCHK); return (ret); } @@ -312,7 +343,8 @@ __env_in_api(env) REGINFO *infop; THREAD_INFO *thread; u_int32_t i; - int unpin, ret; + pid_t pid; + int unpin, ret, t_ret; if ((htab = env->thr_hashtab) == NULL) return (EINVAL); @@ -322,10 +354,13 @@ __env_in_api(env) renv = infop->primary; thread = R_ADDR(infop, renv->thread_off); unpin = 0; + ret = 0; for (i = 0; i < env->thr_nbucket; i++) SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) { + pid = ip->dbth_pid; if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE || + ip->dbth_state == THREAD_BLOCKED_DEAD || (ip->dbth_state == THREAD_OUT && thread->thr_count < thread->thr_max)) continue; @@ -341,26 +376,63 @@ __env_in_api(env) ip->dbth_state = THREAD_SLOT_NOT_IN_USE; continue; } - return (__db_failed(env, DB_STR("1507", + /* + * The above tests are not atomic, so it is possible that + * the process pointed by ip has changed during the tests. + * In particular, if the process pointed by ip when is_alive + * was executed terminated normally, a new process may reuse + * the same ip structure and change its dbth_state before the + * next two tests were performed. Therefore, we need to test + * here that all four tests above are done on the same process. + * If the process pointed by ip changed, all tests are invalid + * and can be ignored. + * Similarly, it's also possible for two processes racing to + * change the dbth_state of the same ip structure. For example, + * both process A and B reach the above test for the same + * terminated process C where C's dbth_state is THREAD_OUT. + * If A goes into the 'if' block and changes C's dbth_state to + * THREAD_SLOT_NOT_IN_USE before B checks the condition, B + * would incorrectly fail the test and run into this line. + * Therefore, we need to check C's dbth_state again and fail + * the db only if C's dbth_state is indeed THREAD_ACTIVE. 
+ */ + if (ip->dbth_state != THREAD_ACTIVE || ip->dbth_pid != pid) + continue; + __os_gettime(env, &ip->dbth_failtime, 0); + t_ret = __db_failed(env, DB_STR("1507", "Thread died in Berkeley DB library"), - ip->dbth_pid, ip->dbth_tid)); + ip->dbth_pid, ip->dbth_tid); + if (ret == 0) + ret = t_ret; + /* + * Classic failchk stop after one dead thread in the + * api, but broadcasting looks for all. + */ +#ifndef HAVE_FAILCHK_BROADCAST + return (ret); +#endif } if (unpin == 0) - return (0); + return (ret); for (i = 0; i < env->thr_nbucket; i++) SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) if (ip->dbth_state == THREAD_BLOCKED_DEAD && - (ret = __memp_unpin_buffers(env, ip)) != 0) + (t_ret = __memp_unpin_buffers(env, ip)) != 0) { + if (ret == 0) + ret = t_ret; +#ifndef HAVE_FAILCHK_BROADCAST return (ret); +#endif + } - return (0); + return (ret); } /* * __env_clear_state -- - * Look for threads which died while blockedi and clear them.. + * Look for threads which died while blocked and clear them.. */ static void __env_clear_state(env) @@ -441,6 +513,9 @@ __env_set_state(env, ipp, state) #endif } + /* A failchk thread must not block on a lock -- that would be faulty. */ + if (state == THREAD_BLOCKED && ip != NULL) + DB_ASSERT(env, ip->dbth_state != THREAD_FAILCHK); /* * If ipp is not null, return the thread control block if found. * Check to ensure the thread of control has been registered. 
@@ -457,7 +532,9 @@ __env_set_state(env, ipp, state) *ipp = NULL; ret = 0; - if (ip == NULL) { + if (ip != NULL) + ip->dbth_state = state; + else { infop = env->reginfo; renv = infop->primary; thread = R_ADDR(infop, renv->thread_off); @@ -503,11 +580,13 @@ __env_set_state(env, ipp, state) init: ip->dbth_pid = id.pid; ip->dbth_tid = id.tid; ip->dbth_state = state; + for (indx = 0; indx != MUTEX_STATE_MAX; indx++) + ip->dbth_latches[indx].mutex = MUTEX_INVALID; SH_TAILQ_INIT(&ip->dbth_xatxn); } MUTEX_UNLOCK(env, renv->mtx_regenv); - } else - ip->dbth_state = state; + } + *ipp = ip; DB_ASSERT(env, ret == 0); @@ -535,7 +614,7 @@ __env_thread_id_string(dbenv, pid, tid, buf) #ifdef UINT64_FMT char fmt[20]; - snprintf(fmt, sizeof(fmt), "%s/%s", UINT64_FMT, UINT64_FMT); + snprintf(fmt, sizeof(fmt), "%s/%s", INT64_FMT, UINT64_FMT); snprintf(buf, DB_THREADID_STRLEN, fmt, (u_int64_t)pid, (u_int64_t)(uintptr_t)tid); #else diff --git a/src/env/env_file.c b/src/env/env_file.c index b102404d..d6e29b21 100644 --- a/src/env/env_file.c +++ b/src/env/env_file.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -12,7 +12,7 @@ /* * __db_file_extend -- - * Initialize a regular file by writing the last page of the file. + * Initialize or extend a regular file by writing to its last page. * * PUBLIC: int __db_file_extend __P((ENV *, DB_FH *, size_t)); */ @@ -27,7 +27,19 @@ __db_file_extend(env, fhp, size) u_int32_t relative; int ret; char buf; +#ifdef HAVE_MMAP_EXTEND + unsigned pagesize; + /* + * Round up size to the VM pagesize. If it isn't aligned, then the bytes + * ending the mapping might have no corresponding backing location on + * disk, and could be silently lost when the process exits. 
[#23290] + */ + if (F_ISSET(fhp, DB_FH_REGION)) { + pagesize = (unsigned)getpagesize(); + size = DB_ALIGN(size, pagesize); + } +#endif buf = '\0'; /* * Extend the file by writing the last page. If the region is >4Gb, diff --git a/src/env/env_globals.c b/src/env/env_globals.c index 955e6738..2d665661 100644 --- a/src/env/env_globals.c +++ b/src/env/env_globals.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -31,14 +31,21 @@ DB_GLOBALS __db_global_values = { "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", /* db_line */ { 0 }, /* error_buf */ - 0, /* uid_init */ - 0, /* rand_next */ + 0, /* random_seeded */ +#if defined(HAVE_RANDOM_R) + { 0 }, /* random_r random_data */ + { 0 }, /* random_r state */ +#elif !defined(HAVE_RAND) && !defined(HAVE_RANDOM) + 0, /* rand/srand value */ +#endif 0, /* fid_serial */ 0, /* db_errno */ - 0, /* num_active_pids */ - 0, /* size_active_pids */ - NULL, /* active_pids */ NULL, /* saved_errstr */ + "%m/%d %H:%M:%S", /* strftime format for dates */ +#if defined(HAVE_ERROR_HISTORY) + 0, /* thread local msgs_key */ + PTHREAD_ONCE_INIT, /* pthread_once initializer */ +#endif NULL, /* j_assert */ NULL, /* j_close */ NULL, /* j_dirfree */ diff --git a/src/env/env_method.c b/src/env/env_method.c index 63deacea..c246febc 100644 --- a/src/env/env_method.c +++ b/src/env/env_method.c @@ -1,9 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* - * $Id: env_method.c,v dabaaeb7d839 2010/08/03 17:28:53 mike $ + * $Id$ */ #include "db_config.h" @@ -40,6 +40,7 @@ static int __env_get_app_dispatch __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops))); static int __env_set_app_dispatch __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops))); +static int __env_get_blob_dir __P((DB_ENV *, const char **)); static int __env_set_event_notify __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *))); static int __env_get_feedback __P((DB_ENV *, void (**)(DB_ENV *, int, int))); @@ -81,6 +82,11 @@ db_env_create(dbenvpp, flags) if (flags != 0) return (EINVAL); +#ifdef HAVE_ERROR_HISTORY + /* Call thread local storage initializer at least once per process. */ + __db_thread_init(); +#endif + /* Allocate the DB_ENV and ENV structures -- we always have both. */ if ((ret = __os_calloc(NULL, 1, sizeof(DB_ENV), &dbenv)) != 0) return (ret); @@ -159,7 +165,7 @@ __db_env_init(dbenv) */ /* DB_ENV PUBLIC HANDLE LIST BEGIN */ dbenv->add_data_dir = __env_add_data_dir; - dbenv->backup = __db_backup; + dbenv->backup = __db_backup_pp; dbenv->dbbackup = __db_dbbackup_pp; dbenv->cdsgroup_begin = __cdsgroup_begin_pp; dbenv->close = __env_close_pp; @@ -175,6 +181,8 @@ __db_env_init(dbenv) dbenv->get_cachesize = __memp_get_cachesize; dbenv->get_backup_callbacks = __env_get_backup_callbacks; dbenv->get_backup_config = __env_get_backup_config; + dbenv->get_blob_dir = __env_get_blob_dir; + dbenv->get_blob_threshold = __env_get_blob_threshold_pp; dbenv->get_create_dir = __env_get_create_dir; dbenv->get_data_dirs = __env_get_data_dirs; dbenv->get_data_len = __env_get_data_len; @@ -269,7 +277,7 @@ __db_env_init(dbenv) dbenv->open = __env_open_pp; dbenv->remove = __env_remove; dbenv->rep_elect = __rep_elect_pp; - dbenv->rep_flush = __rep_flush; + dbenv->rep_flush = __rep_flush_pp; dbenv->rep_get_clockskew = __rep_get_clockskew; dbenv->rep_get_config = __rep_get_config; dbenv->rep_get_limit = __rep_get_limit; @@ -282,29 +290,34 @@ 
__db_env_init(dbenv) dbenv->rep_set_config = __rep_set_config; dbenv->rep_set_limit = __rep_set_limit; dbenv->rep_set_nsites = __rep_set_nsites_pp; - dbenv->rep_set_priority = __rep_set_priority; + dbenv->rep_set_priority = __rep_set_priority_pp; dbenv->rep_set_request = __rep_set_request; - dbenv->rep_set_timeout = __rep_set_timeout; + dbenv->rep_set_timeout = __rep_set_timeout_pp; dbenv->rep_set_transport = __rep_set_transport_pp; + dbenv->rep_set_view = __rep_set_view; dbenv->rep_start = __rep_start_pp; dbenv->rep_stat = __rep_stat_pp; dbenv->rep_stat_print = __rep_stat_print_pp; dbenv->rep_sync = __rep_sync; dbenv->repmgr_channel = __repmgr_channel; dbenv->repmgr_get_ack_policy = __repmgr_get_ack_policy; + dbenv->repmgr_get_incoming_queue_max = __repmgr_get_incoming_queue_max; dbenv->repmgr_local_site = __repmgr_local_site; dbenv->repmgr_msg_dispatch = __repmgr_set_msg_dispatch; dbenv->repmgr_set_ack_policy = __repmgr_set_ack_policy; + dbenv->repmgr_set_incoming_queue_max = __repmgr_set_incoming_queue_max; dbenv->repmgr_site = __repmgr_site; dbenv->repmgr_site_by_eid = __repmgr_site_by_eid; - dbenv->repmgr_site_list = __repmgr_site_list; - dbenv->repmgr_start = __repmgr_start; + dbenv->repmgr_site_list = __repmgr_site_list_pp; + dbenv->repmgr_start = __repmgr_start_pp; dbenv->repmgr_stat = __repmgr_stat_pp; dbenv->repmgr_stat_print = __repmgr_stat_print_pp; dbenv->set_alloc = __env_set_alloc; dbenv->set_app_dispatch = __env_set_app_dispatch; dbenv->set_backup_callbacks = __env_set_backup_callbacks; dbenv->set_backup_config = __env_set_backup_config; + dbenv->set_blob_dir = __env_set_blob_dir; + dbenv->set_blob_threshold = __env_set_blob_threshold; dbenv->set_cache_max = __memp_set_cache_max; dbenv->set_cachesize = __memp_set_cachesize; dbenv->set_create_dir = __env_set_create_dir; @@ -370,10 +383,11 @@ __db_env_init(dbenv) dbenv->thread_id = __os_id; dbenv->thread_id_string = __env_thread_id_string; + dbenv->mutex_failchk_timeout = US_PER_SEC; + env = 
dbenv->env; __os_id(NULL, &env->pid_cache, NULL); - env->db_ref = 0; env->log_verify_wrap = __log_verify_wrap; env->data_len = ENV_DEF_DATA_LEN; TAILQ_INIT(&env->fdlist); @@ -561,6 +575,97 @@ __env_get_memory_init(dbenv, type, countp) } /* + * __env_get_blob_threshold_pp -- + * Get the blob threshold for the environment. Any data item larger + * than the blob threshold is automatically saved as a blob file. + * + * PUBLIC: int __env_get_blob_threshold_pp + * PUBLIC: __P ((DB_ENV *, u_int32_t *)); + */ +int +__env_get_blob_threshold_pp(dbenv, bytes) + DB_ENV *dbenv; + u_int32_t *bytes; +{ + ENV *env; + DB_THREAD_INFO *ip; + int ret; + + env = dbenv->env; + + ENV_ENTER(env, ip); + ret = __env_get_blob_threshold_int(env, bytes); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __env_get_blob_threshold_int -- + * Get the blob threshold for the environment. Any data item larger + * than the blob threshold is automatically saved as a blob file. + * + * PUBLIC: int __env_get_blob_threshold_int + * PUBLIC: __P ((ENV *, u_int32_t *)); + */ +int +__env_get_blob_threshold_int(env, bytes) + ENV *env; + u_int32_t *bytes; +{ + REGENV *renv; + REGINFO *infop; + + if (F_ISSET(env, ENV_OPEN_CALLED)) { + infop = env->reginfo; + renv = infop->primary; + MUTEX_LOCK(env, renv->mtx_regenv); + *bytes = renv->blob_threshold; + MUTEX_UNLOCK(env, renv->mtx_regenv); + } else + *bytes = env->dbenv->blob_threshold; + + return (0); +} + +/* + * __env_set_blob_threshold -- + * Set the default blob threshold for the environment. Any data item larger + * than the blob threshold is automatically saved as a blob file. 
+ * + * PUBLIC: int __env_set_blob_threshold __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__env_set_blob_threshold(dbenv, bytes, flags) + DB_ENV *dbenv; + u_int32_t bytes; + u_int32_t flags; +{ + ENV *env; + REGENV *renv; + REGINFO *infop; + DB_THREAD_INFO *ip; + + env = dbenv->env; + + if (__db_fchk(dbenv->env, "DB_ENV->set_blob_threshold", flags, 0) != 0) + return (EINVAL); + + if (F_ISSET(env, ENV_OPEN_CALLED)) { + infop = env->reginfo; + renv = infop->primary; + ENV_ENTER(env, ip); + MUTEX_LOCK(env, renv->mtx_regenv); + renv->blob_threshold = bytes; + MUTEX_UNLOCK(env, renv->mtx_regenv); + ENV_LEAVE(env, ip); + } else + dbenv->blob_threshold = bytes; + + return (0); +} + +/* * __env_set_memory_init -- * DB_ENV->set_memory_init. * @@ -697,6 +802,43 @@ __env_set_app_dispatch(dbenv, app_dispatch) } /* + * __env_set_blob_dir -- + * API to allow the user to override the default blob file + * root directory. Must be set if blobs are enabled and an + * unnamed environment is created. + * + * PUBLIC: int __env_set_blob_dir __P((DB_ENV *, const char *)); + */ +int +__env_set_blob_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_blob_dir"); + + if (dbenv->db_blob_dir != NULL) + __os_free(env, dbenv->db_blob_dir); + return (__os_strdup(env, dir, &dbenv->db_blob_dir)); +} + +/* + * __env_get_blob_dir -- + * Get the blob file root directory. + */ +static int +__env_get_blob_dir(dbenv, dirp) + DB_ENV *dbenv; + const char **dirp; +{ + *dirp = dbenv->db_blob_dir; + return (0); +} + +/* * __env_get_encrypt_flags -- * {DB_ENV,DB}->get_encrypt_flags. * @@ -1061,6 +1203,10 @@ __env_set_backup(env, on) return (EINVAL); } + /* + * This code does not need env_rep_enter for the checkpoint because + * it can only happen if there is an active bulk txn existing. 
+ */ if (needs_checkpoint && (ret = __txn_checkpoint(env, 0, 0, 0))) return (ret); return (0); @@ -1244,6 +1390,11 @@ __env_set_data_len(dbenv, data_len) DB_ENV *dbenv; u_int32_t data_len; { + if (data_len == 0) { + __db_errx(dbenv->env, DB_STR("1593", +"Maximum number of bytes to display for each key/data item can not be 0.")); + return (EINVAL); + } dbenv->env->data_len = data_len; return (0); @@ -1720,6 +1871,7 @@ __env_get_verbose(dbenv, which, onoffp) case DB_VERB_DEADLOCK: case DB_VERB_FILEOPS: case DB_VERB_FILEOPS_ALL: + case DB_VERB_MVCC: case DB_VERB_RECOVERY: case DB_VERB_REGISTER: case DB_VERB_REPLICATION: @@ -1758,6 +1910,7 @@ __env_set_verbose(dbenv, which, on) case DB_VERB_DEADLOCK: case DB_VERB_FILEOPS: case DB_VERB_FILEOPS_ALL: + case DB_VERB_MVCC: case DB_VERB_RECOVERY: case DB_VERB_REGISTER: case DB_VERB_REPLICATION: @@ -1888,9 +2041,15 @@ __env_get_timeout(dbenv, timeoutp, flags) int ret; ret = 0; - if (flags == DB_SET_REG_TIMEOUT) { + if (flags == DB_SET_REG_TIMEOUT) *timeoutp = dbenv->envreg_timeout; - } else + else if (flags == DB_SET_MUTEX_FAILCHK_TIMEOUT) +#ifdef HAVE_FAILCHK_BROADCAST + *timeoutp = dbenv->mutex_failchk_timeout; +#else + ret = USR_ERR(dbenv->env, DB_OPNOTSUP); +#endif + else ret = __lock_get_env_timeout(dbenv, timeoutp, flags); return (ret); } @@ -1912,6 +2071,12 @@ __env_set_timeout(dbenv, timeout, flags) ret = 0; if (flags == DB_SET_REG_TIMEOUT) dbenv->envreg_timeout = timeout; + else if (flags == DB_SET_MUTEX_FAILCHK_TIMEOUT) +#ifdef HAVE_FAILCHK_BROADCAST + dbenv->mutex_failchk_timeout = timeout; +#else + ret = USR_ERR(dbenv->env, DB_OPNOTSUP); +#endif else ret = __lock_set_env_timeout(dbenv, timeout, flags); return (ret); diff --git a/src/env/env_name.c b/src/env/env_name.c index a3a0b371..d0dd5635 100644 --- a/src/env/env_name.c +++ b/src/env/env_name.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" static int __db_fullpath __P((ENV *, const char *, const char *, int, int, char **)); @@ -122,7 +123,7 @@ __db_appname(env, appname, file, dirp, namep) { DB_ENV *dbenv; char **ddp; - const char *dir; + const char *blob_dir, *dir; int ret; dbenv = env->dbenv; @@ -141,6 +142,8 @@ __db_appname(env, appname, file, dirp, namep) /* * DB_APP_NONE: * DB_HOME/file + * DB_APP_BLOB: + * DB_HOME/DB_BLOB_DIR/file * DB_APP_DATA: * DB_HOME/DB_DATA_DIR/file * DB_APP_LOG: @@ -151,6 +154,12 @@ __db_appname(env, appname, file, dirp, namep) switch (appname) { case DB_APP_NONE: break; + case DB_APP_BLOB: + if (dbenv != NULL && dbenv->db_blob_dir != NULL) + dir = dbenv->db_blob_dir; + else + dir = BLOB_DEFAULT_DIR; + break; case DB_APP_RECOVER: case DB_APP_DATA: /* @@ -164,6 +173,13 @@ __db_appname(env, appname, file, dirp, namep) /* Second, look in the environment home directory. */ DB_CHECKFILE(file, NULL, 1, 0, namep, dirp); + /* Third, check the blob directory. */ + if (dbenv != NULL && dbenv->db_blob_dir != NULL) + blob_dir = dbenv->db_blob_dir; + else + blob_dir = BLOB_DEFAULT_DIR; + DB_CHECKFILE(file, blob_dir, 1, 0, namep, dirp); + /* * Otherwise, we're going to create. Use the specified * directory unless we're in recovery and it doesn't exist. diff --git a/src/env/env_open.c b/src/env/env_open.c index 7eddca3a..85189369 100644 --- a/src/env/env_open.c +++ b/src/env/env_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -107,10 +107,16 @@ __env_open_pp(dbenv, db_home, flags, mode) __db_errx(env, DB_STR("1589", "DB_PRIVATE is not " "supported by 64-bit applications in " "mixed-size-addressing mode")); - return (EINVAL); - } + return (EINVAL); + } #endif + if (LF_ISSET(DB_PRIVATE) && PREFMAS_IS_SET(env)) { + __db_errx(env, DB_STR("1594", "DB_PRIVATE is not " + "supported in Replication Manager preferred master mode")); + return (EINVAL); + } + return (__env_open(dbenv, db_home, flags, mode)); } @@ -129,12 +135,20 @@ __env_open(dbenv, db_home, flags, mode) { DB_THREAD_INFO *ip; ENV *env; - u_int32_t orig_flags; - int register_recovery, ret, t_ret; + u_int32_t orig_flags, retry_flags; + int recovery_failed, register_recovery, ret, t_ret; + char *old_passwd; + size_t old_passwd_len; + u_int32_t old_encrypt_flags; ip = NULL; env = dbenv->env; + recovery_failed = 1; register_recovery = 0; + retry_flags = 0; + old_passwd = NULL; + old_passwd_len = 0; + old_encrypt_flags = 0; /* Initial configuration. 
*/ if ((ret = __env_config(dbenv, db_home, &flags, mode)) != 0) @@ -171,13 +185,27 @@ __env_open(dbenv, db_home, flags, mode) dbenv->is_alive = __envreg_isalive; } - if ((ret = - __envreg_register(env, &register_recovery, flags)) != 0) + /* + * Backup the current key, because it would be consumed by + * __envreg_register below + */ + if (dbenv->passwd != NULL) { + if ((ret = __os_strdup(env, dbenv->passwd, &old_passwd)) != 0) + goto err; + old_passwd_len = dbenv->passwd_len; + (void)__env_get_encrypt_flags(dbenv, &old_encrypt_flags); + } + + F_SET(dbenv, DB_ENV_NOPANIC); + ret = __envreg_register(env, &register_recovery, flags); + dbenv->flags = orig_flags; + if (ret != 0) goto err; if (register_recovery) { if (!LF_ISSET(DB_RECOVER)) { __db_errx(env, DB_STR("1567", "The DB_RECOVER flag was not specified, and recovery is needed")); + recovery_failed = 0; ret = DB_RUNRECOVERY; goto err; } @@ -197,16 +225,27 @@ __env_open(dbenv, db_home, flags, mode) * want to remove files left over for any reason, from any session. */ retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) + if ( #ifdef HAVE_REPLICATION - if ((ret = __rep_reset_init(env)) != 0 || - (ret = __env_remove_env(env)) != 0 || -#else - if ((ret = __env_remove_env(env)) != 0 || + (ret = __rep_reset_init(env)) != 0 || #endif - (ret = __env_refresh(dbenv, orig_flags, 0)) != 0) + (ret = __env_remove_env(env)) != 0 || + (ret = __env_refresh(dbenv, + orig_flags | retry_flags, 0)) != 0) goto err; - if ((ret = __env_attach_regions(dbenv, flags, orig_flags, 1)) != 0) + /* Restore the database key.
*/ + if (LF_ISSET(DB_REGISTER) && old_passwd != NULL) { + ret = __env_set_encrypt(dbenv, old_passwd, old_encrypt_flags); + memset(old_passwd, 0xff, old_passwd_len - 1); + __os_free(env, old_passwd); + if (ret != 0) + goto err; + } + + DB_ASSERT(env, ret == 0); + if ((ret = __env_attach_regions(dbenv, + flags, orig_flags | retry_flags, 1)) != 0) goto err; /* @@ -216,8 +255,18 @@ retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) */ if (LF_ISSET(DB_FAILCHK) && !register_recovery) { ENV_ENTER(env, ip); - if ((ret = __env_failchk_int(dbenv)) != 0) + /* + * Set the thread state so that any waiting for a potentially + * dead thread will call is_alive() in order to avoid hanging. + */ + FAILCHK_THREAD(env, ip); + ret = __env_failchk_int(dbenv); + if (ret != 0) { + __db_err(env, ret, + DB_STR("1595", + "failchk crash after clean registry")); goto err; + } ENV_LEAVE(env, ip); } @@ -230,12 +279,12 @@ err: if (ret != 0) * processes can now proceed. * * If recovery failed, unregister now and let another process - * clean up. + * clean up and run recovery. */ if (ret == 0 && (t_ret = __envreg_xunlock(env)) != 0) ret = t_ret; if (ret != 0) - (void)__envreg_unregister(env, 1); + (void)__envreg_unregister(env, recovery_failed); } /* @@ -247,7 +296,11 @@ err: if (ret != 0) */ if (ret == DB_RUNRECOVERY && !register_recovery && !LF_ISSET(DB_RECOVER) && LF_ISSET(DB_REGISTER)) { + if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) + __db_msg(env, DB_STR("1596", + "env_open DB_REGISTER w/o RECOVER panic: trying w/recovery")); LF_SET(DB_RECOVER); + retry_flags = DB_ENV_NOPANIC; goto retry; } @@ -304,6 +357,9 @@ __env_open_arg(dbenv, flags) "replication requires transaction support")); return (EINVAL); } + if ((ret = + __log_set_config_int(dbenv, DB_LOG_BLOB, 1, 1)) != 0) + return (ret); } if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) { if ((ret = __db_fcchk(env, @@ -349,30 +405,6 @@ __env_open_arg(dbenv, flags) } #endif -#ifdef HAVE_MUTEX_FCNTL - /* - * !!! 
- * We need a file descriptor for fcntl(2) locking. We use the file - * handle from the REGENV file for this purpose. - * - * Since we may be using shared memory regions, e.g., shmget(2), and - * not a mapped-in regular file, the backing file may be only a few - * bytes in length. So, this depends on the ability to call fcntl to - * lock file offsets much larger than the actual physical file. I - * think that's safe -- besides, very few systems actually need this - * kind of support, SunOS is the only one still in wide use of which - * I'm aware. - * - * The error case is if an application lacks spinlocks and wants to be - * threaded. That doesn't work because fcntl will lock the underlying - * process, including all its threads. - */ - if (F_ISSET(env, ENV_THREAD)) { - __db_errx(env, DB_STR("1578", - "architecture lacks fast mutexes: applications cannot be threaded")); - return (EINVAL); - } -#endif return (ret); } @@ -506,7 +538,7 @@ __env_close_pp(dbenv, flags) { DB_THREAD_INFO *ip; ENV *env; - int rep_check, ret, t_ret; + int ret, t_ret; u_int32_t close_flags, flags_orig; env = dbenv->env; @@ -517,65 +549,75 @@ __env_close_pp(dbenv, flags) * Validate arguments, but as a DB_ENV handle destructor, we can't * fail. */ - if (flags != 0 && flags != DB_FORCESYNC && - (t_ret = __db_ferr(env, "DB_ENV->close", 0)) != 0 && ret == 0) - ret = t_ret; +#undef OKFLAGS +#define OKFLAGS (DB_FORCESYNC | DB_FORCESYNCENV) + + ret = __db_fchk(env, "DB_ENV->close", flags, OKFLAGS); #define DBENV_FORCESYNC 0x00000001 #define DBENV_CLOSE_REPCHECK 0x00000010 - if (flags == DB_FORCESYNC) + if (LF_ISSET(DB_FORCESYNC)) close_flags |= DBENV_FORCESYNC; + if (LF_ISSET(DB_FORCESYNCENV)) + F_SET(env, ENV_FORCESYNCENV); + + /* + * Call __env_close() to clean up resources even though the open + * didn't fully succeed. + * */ + if (!F_ISSET(env, ENV_OPEN_CALLED)) + goto do_close; /* * If the environment has panic'd, all we do is try and discard * the important resources. 
*/ if (PANIC_ISSET(env)) { + /* + * Temporarily set no panic so we do not trigger the + * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwrite thus + * allowing the unregister to happen correctly. + */ + flags_orig = dbenv->flags; + F_SET(dbenv, DB_ENV_NOPANIC); + ENV_ENTER(env, ip); /* clean up from registry file */ - if (dbenv->registry != NULL) { - /* - * Temporarily set no panic so we do not trigger the - * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwr - * thus allowing the unregister to happen correctly. - */ - flags_orig = F_ISSET(dbenv, DB_ENV_NOPANIC); - F_SET(dbenv, DB_ENV_NOPANIC); + if (dbenv->registry != NULL) (void)__envreg_unregister(env, 0); - dbenv->registry = NULL; - if (!flags_orig) - F_CLR(dbenv, DB_ENV_NOPANIC); - } /* Close all underlying threads and sockets. */ - if (IS_ENV_REPLICATED(env)) - (void)__repmgr_close(env); + (void)__repmgr_close(env); /* Close all underlying file handles. */ (void)__file_handle_cleanup(env); + ENV_LEAVE(env, ip); + + dbenv->flags = flags_orig; + (void)__env_region_cleanup(env); - PANIC_CHECK(env); + return (__env_panic_msg(env)); } ENV_ENTER(env, ip); - rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; - if (rep_check) { #ifdef HAVE_REPLICATION_THREADS - /* - * Shut down Replication Manager threads first of all. This - * must be done before __env_rep_enter to avoid a deadlock that - * could occur if repmgr's background threads try to do a rep - * operation that needs __rep_lockout. - */ - if ((t_ret = __repmgr_close(env)) != 0 && ret == 0) - ret = t_ret; + /* + * Shut down Replication Manager threads first of all. This + * must be done before __env_rep_enter to avoid a deadlock that + * could occur if repmgr's background threads try to do a rep + * operation that needs __rep_lockout. 
+ */ + if ((t_ret = __repmgr_close(env)) != 0 && ret == 0) + ret = t_ret; #endif + if (IS_ENV_REPLICATED(env)) { if ((t_ret = __env_rep_enter(env, 0)) != 0 && ret == 0) ret = t_ret; + if (ret == 0) + close_flags |= DBENV_CLOSE_REPCHECK; } - if (rep_check) - close_flags |= DBENV_CLOSE_REPCHECK; +do_close: if ((t_ret = __env_close(dbenv, close_flags)) != 0 && ret == 0) ret = t_ret; @@ -640,8 +682,11 @@ __env_close(dbenv, flags) t_ret = dbp->alt_close(dbp, close_flags); else t_ret = __db_close(dbp, NULL, close_flags); - if (t_ret != 0 && ret == 0) - ret = t_ret; + if (t_ret != 0) { + if (ret == 0) + ret = t_ret; + break; + } } /* @@ -661,10 +706,8 @@ __env_close(dbenv, flags) #endif /* If we're registered, clean up. */ - if (dbenv->registry != NULL) { + if (dbenv->registry != NULL) (void)__envreg_unregister(env, 0); - dbenv->registry = NULL; - } /* Check we've closed all underlying file handles. */ if ((t_ret = __file_handle_cleanup(env)) != 0 && ret == 0) @@ -680,6 +723,9 @@ __env_close(dbenv, flags) if (dbenv->db_md_dir != NULL) __os_free(env, dbenv->db_md_dir); dbenv->db_md_dir = NULL; + if (dbenv->db_blob_dir != NULL) + __os_free(env, dbenv->db_blob_dir); + dbenv->db_blob_dir = NULL; if (dbenv->db_data_dir != NULL) { for (p = dbenv->db_data_dir; *p != NULL; ++p) __os_free(env, *p); @@ -761,9 +807,7 @@ __env_refresh(dbenv, orig_flags, rep_check) ret = t_ret; } - /* Discard the DB_ENV, ENV handle mutexes. */ - if ((t_ret = __mutex_free(env, &dbenv->mtx_db_env)) != 0 && ret == 0) - ret = t_ret; + /* Discard the ENV handle mutex. 
*/ if ((t_ret = __mutex_free(env, &env->mtx_env)) != 0 && ret == 0) ret = t_ret; @@ -936,17 +980,38 @@ __file_handle_cleanup(env) ENV *env; { DB_FH *fhp; + DB_MPOOL *dbmp; + u_int i; - if (TAILQ_FIRST(&env->fdlist) == NULL) + if (TAILQ_EMPTY(&env->fdlist)) return (0); - __db_errx(env, DB_STR("1581", - "File handles still open at environment close")); + __db_errx(env, + DB_STR("1581", "File handles still open at environment close")); while ((fhp = TAILQ_FIRST(&env->fdlist)) != NULL) { - __db_errx(env, DB_STR_A("1582", "Open file handle: %s", "%s"), - fhp->name); - (void)__os_closehandle(env, fhp); + __db_errx(env, + DB_STR_A("1582", "Open file handle: %s", "%s"), fhp->name); + if (__os_closehandle(env, fhp) != 0) + break; } + if (env->lockfhp != NULL) + env->lockfhp = NULL; + /* Invalidate saved pointers to the regions' files: all are closed. */ + if (env->reginfo != NULL) + env->reginfo->fhp = NULL; + if (env->lg_handle != NULL) + env->lg_handle->reginfo.fhp = NULL; + if (env->lk_handle != NULL) + env->lk_handle->reginfo.fhp = NULL; +#ifdef HAVE_MUTEX_SUPPORT + if (env->mutex_handle != NULL) + env->mutex_handle->reginfo.fhp = NULL; +#endif + if (env->tx_handle != NULL) + env->tx_handle->reginfo.fhp = NULL; + if ((dbmp = env->mp_handle) != NULL && dbmp->reginfo != NULL) + for (i = 0; i < env->dbenv->mp_ncache; ++i) + dbmp->reginfo[i].fhp = NULL; return (EINVAL); } @@ -1109,11 +1174,9 @@ __env_attach_regions(dbenv, flags, orig_flags, retry_ok) goto err; /* - * Initialize the handle mutexes. + * Initialize the handle mutex. */ if ((ret = __mutex_alloc(env, - MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbenv->mtx_db_env)) != 0 || - (ret = __mutex_alloc(env, MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &env->mtx_env)) != 0) goto err; @@ -1125,8 +1188,15 @@ __env_attach_regions(dbenv, flags, orig_flags, retry_ok) goto err; rep_check = IS_ENV_REPLICATED(env) ? 
1 : 0; - if (rep_check && (ret = __env_rep_enter(env, 0)) != 0) + if (rep_check && (ret = __env_rep_enter(env, 0)) != 0) { + /* + * If we get an error we didn't increment handle_cnt, + * so we don't want to decrement it later. Turn off + * rep_check here. + */ + rep_check = 0; goto err; + } if (LF_ISSET(DB_INIT_MPOOL)) { if ((ret = __memp_open(env, create_ok)) != 0) diff --git a/src/env/env_recover.c b/src/env/env_recover.c index 9636554a..fb7ddee7 100644 --- a/src/env/env_recover.c +++ b/src/env/env_recover.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -18,17 +18,15 @@ #include "dbinc/qam.h" #include "dbinc/txn.h" -#ifndef lint -static const char copyright[] = - "Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.\n"; -#endif - static int __db_log_corrupt __P((ENV *, DB_LSN *)); static int __env_init_rec_42 __P((ENV *)); static int __env_init_rec_43 __P((ENV *)); static int __env_init_rec_46 __P((ENV *)); static int __env_init_rec_47 __P((ENV *)); static int __env_init_rec_48 __P((ENV *)); +static int __env_init_rec_53 __P((ENV *)); +static int __env_init_rec_60 __P((ENV *)); +static int __env_init_rec_60p1 __P((ENV *)); static int __log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *)); static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int)); @@ -632,6 +630,12 @@ err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) dbenv->tx_timestamp = 0; + /* + * Failure means that the env has panicked. Disable locking so that the + * env can close without its mutexes calls causing additional panics. 
+ */ + if (ret != 0) + F_SET(env->dbenv, DB_ENV_NOLOCKING); F_CLR(env->lg_handle, DBLOG_RECOVER); F_CLR(region, TXN_IN_RECOVERY); @@ -690,7 +694,8 @@ __lsn_diff(low, high, current, max, is_forward) * is trying to sync up with a master whose max LSN is less than this * client's max lsn; we want to roll back everything after that. * - * Find the latest checkpoint whose ckp_lsn is less than the max lsn. + * Find the latest checkpoint less than or equal to max lsn and + * return the ckp_lsn from that checkpoint. */ static int __log_backup(env, logc, max_lsn, start_lsn) @@ -713,10 +718,11 @@ __log_backup(env, logc, max_lsn, start_lsn) return (ret); /* * Follow checkpoints through the log until - * we find one with a ckp_lsn less than - * or equal max_lsn. + * we find one less than or equal max_lsn. + * Then return the ckp_lsn from that checkpoint as it + * is our earliest outstanding txn needed. */ - if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) { + if (LOG_COMPARE(&lsn, max_lsn) <= 0) { *start_lsn = ckp_args->ckp_lsn; break; } @@ -727,7 +733,7 @@ __log_backup(env, logc, max_lsn, start_lsn) * done. Break with DB_NOTFOUND. */ if (IS_ZERO_LSN(lsn)) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); break; } __os_free(env, ckp_args); @@ -880,6 +886,9 @@ __db_log_corrupt(env, lsnp) /* * __env_init_rec -- * + * Install recover functions in the environment. Whenever this is updated, + * corresponding changes are needed by db_printlog's env_init_print(). + * * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t)); */ int @@ -924,6 +933,29 @@ __env_init_rec(env, version) * oldest revision that applies must be used. Therefore we override * the recovery functions in reverse log version order. */ + if (version == DB_LOGVERSION) + goto done; + + /* DB_LOGVERSION_61 add the blob file id to the dbreg logs. 
*/ + if (version > DB_LOGVERSION_60p1) + goto done; + if ((ret = __env_init_rec_60p1(env)) != 0) + goto err; + + /* + * DB_LOGVERSION_60p1 changed the two u_int32_t offset fields in the + * log for fop_write_file into a single int64. + */ + if (version > DB_LOGVERSION_60) + goto done; + if ((ret = __env_init_rec_60(env)) != 0) + goto err; + + /* DB_LOGVERSION_53 changed the heap addrem log record. */ + if (version > DB_LOGVERSION_53) + goto done; + if ((ret = __env_init_rec_53(env)) != 0) + goto err; /* * DB_LOGVERSION_53 is a strict superset of DB_LOGVERSION_50. * So, only check > DB_LOGVERSION_48p2. If/When log records are @@ -931,6 +963,8 @@ __env_init_rec(env, version) */ if (version > DB_LOGVERSION_48p2) goto done; + if (version >= DB_LOGVERSION_50) + goto done; if ((ret = __env_init_rec_48(env)) != 0) goto err; /* @@ -1091,3 +1125,77 @@ __env_init_rec_48(env) err: return (ret); } + +static int +__env_init_rec_53(env) + ENV *env; +{ + int ret; + +#ifdef HAVE_HEAP + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __heap_addrem_50_recover, DB___heap_addrem_50)) != 0) + goto err; +#else + COMPQUIET(env, NULL); + COMPQUIET(ret, 0); + goto err; +#endif +err: + return (ret); +} + +static int +__env_init_rec_60(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_create_60_recover, DB___fop_create_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_remove_60_recover, DB___fop_remove_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_rename_60_recover, DB___fop_rename_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_rename_noundo_60_recover, DB___fop_rename_noundo_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_file_remove_60_recover, DB___fop_file_remove_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + 
__fop_write_60_recover, DB___fop_write_60)) != 0) + goto err; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_write_file_60_recover, DB___fop_write_file_60)) != 0) + goto err; +err: + return (ret); +} + +static int +__env_init_rec_60p1(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __dbreg_register_42_recover, DB___dbreg_register_42)) != 0) + goto err; +#ifdef HAVE_HEAP + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __heap_addrem_60_recover, DB___heap_addrem_60)) != 0) + goto err; +#endif +err: + return (ret); +} diff --git a/src/env/env_region.c b/src/env/env_region.c index 113bea21..cf7085b7 100644 --- a/src/env/env_region.c +++ b/src/env/env_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -90,8 +90,11 @@ loop: renv = NULL; * it's actually a creation or not, and we'll have to fall-back to a * join if it's not a create. */ - if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL) + if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL) { + DB_DEBUG_MSG(env, "env_attach: creating %s", + F_ISSET(env, ENV_PRIVATE) ? "private" : "user map func"); goto creation; + } /* * Try to create the file, if we have the authority. We have to ensure @@ -179,14 +182,15 @@ loop: renv = NULL; * something in the region file other than meta-data and that * shouldn't happen. 
*/ - if (size < sizeof(ref)) + if (size < sizeof(ref)) { + DB_DEBUG_MSG(env, "region size %d is too small", (int)size); goto retry; - else { + } else { if (size == sizeof(ref)) F_SET(env, ENV_SYSTEM_MEM); else if (F_ISSET(env, ENV_SYSTEM_MEM)) { - ret = EINVAL; + ret = USR_ERR(env, EINVAL); __db_err(env, ret, DB_STR_A("1535", "%s: existing environment not created in system memory", "%s"), infop->name); @@ -197,6 +201,7 @@ loop: renv = NULL; nrw < (size_t)sizeof(rbuf) || (ret = __os_seek(env, env->lockfhp, 0, 0, rbuf.region_off)) != 0) { + ret = USR_ERR(env, ret); __db_err(env, ret, DB_STR_A("1536", "%s: unable to read region info", "%s"), infop->name); @@ -207,7 +212,8 @@ loop: renv = NULL; if ((ret = __os_read(env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) { if (ret == 0) - ret = EIO; + ret = USR_ERR(env, EIO); + (void)USR_ERR(env, ret); __db_err(env, ret, DB_STR_A("1537", "%s: unable to read system-memory information", "%s"), infop->name); @@ -218,18 +224,16 @@ loop: renv = NULL; segid = ref.segid; } -#ifndef HAVE_MUTEX_FCNTL /* - * If we're not doing fcntl locking, we can close the file handle. We - * no longer need it and the less contact between the buffer cache and - * the VM, the better. + * We no longer need the file handle; the less contact between the + * buffer cache and the VM, the better. */ (void)__os_closehandle(env, env->lockfhp); env->lockfhp = NULL; -#endif /* Call the region join routine to acquire the region. 
*/ memset(&tregion, 0, sizeof(tregion)); + tregion.type = REGION_TYPE_ENV; tregion.size = (roff_t)size; tregion.max = (roff_t)max; tregion.segid = segid; @@ -257,15 +261,15 @@ user_map_functions: "Program version %d.%d doesn't match environment version %d.%d", "%d %d %d %d"), DB_VERSION_MAJOR, DB_VERSION_MINOR, renv->majver, renv->minver); - ret = DB_VERSION_MISMATCH; + ret = USR_ERR(env, DB_VERSION_MISMATCH); } else - ret = EINVAL; + ret = USR_ERR(env, EINVAL); goto err; } if (renv->signature != signature) { __db_errx(env, DB_STR("1539", "Build signature doesn't match environment")); - ret = DB_VERSION_MISMATCH; + ret = USR_ERR(env, DB_VERSION_MISMATCH); goto err; } @@ -287,8 +291,16 @@ user_map_functions: ret = __env_panic_msg(env); goto err; } - if (renv->magic != DB_REGION_MAGIC) + if (renv->magic != DB_REGION_MAGIC) { + DB_DEBUG_MSG(env, + "attach sees bad region magic 0x%lx", (u_long)renv->magic); goto retry; + } + + if (dbenv->blob_threshold != 0 && + renv->blob_threshold != dbenv->blob_threshold) + __db_msg(env, DB_STR("1591", +"Warning: Ignoring blob_threshold size when joining environment")); /* * Get a reference to the underlying REGION information for this @@ -329,7 +341,7 @@ user_map_functions: if (*init_flagsp != 0) { __db_errx(env, DB_STR("1540", "configured environment flags incompatible with existing environment")); - ret = EINVAL; + ret = USR_ERR(env, EINVAL); goto err; } *init_flagsp = renv->init_flags; @@ -437,6 +449,8 @@ creation: renv->minver = (u_int32_t)minver; renv->patchver = (u_int32_t)patchver; renv->signature = signature; + renv->failure_panic = 0; + renv->failure_symptom[0] = '\0'; (void)time(&renv->timestamp); __os_unique_id(env, &renv->envid); @@ -447,6 +461,8 @@ creation: */ renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp; + renv->blob_threshold = dbenv->blob_threshold; + /* * Set up the region array. 
We use an array rather than a linked list * as we have to traverse this list after failure in some cases, and @@ -513,17 +529,14 @@ find_err: __db_errx(env, DB_STR_A("1544", } } -#ifndef HAVE_MUTEX_FCNTL /* - * If we're not doing fcntl locking, we can close the file handle. We - * no longer need it and the less contact between the buffer cache and - * the VM, the better. + * We no longer need the file handle and the less contact between the + * buffer cache and the VM, the better. */ if (env->lockfhp != NULL) { (void)__os_closehandle(env, env->lockfhp); env->lockfhp = NULL; } -#endif /* Everything looks good, we're done. */ env->reginfo = infop; @@ -550,7 +563,7 @@ retry: /* Close any open file handle. */ (void)__env_sys_detach(env, infop, F_ISSET(infop, REGION_CREATE)); - if (rp != NULL && F_ISSET(env, DB_PRIVATE)) + if (rp != NULL && F_ISSET(env, ENV_PRIVATE)) __env_alloc_free(infop, rp); } @@ -674,8 +687,23 @@ __env_panic_set(env, on) ENV *env; int on; { - if (env != NULL && env->reginfo != NULL) - ((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0; + REGENV *renv; + + if (env != NULL && env->reginfo != NULL) { + /* + * Remember it in the process' env as well, so that the + * panic-ness is still known on exit from the final close. + */ + renv = env->reginfo->primary; + if (on) { + F_SET(env, ENV_REMEMBER_PANIC); + if (F_ISSET(env->dbenv, DB_ENV_FAILCHK)) + renv->failure_panic = 1; + } + else + F_CLR(env, ENV_REMEMBER_PANIC); + renv->panic = on ? 1 : 0; + } } /* @@ -775,6 +803,31 @@ __env_ref_get(dbenv, countp) } /* + * __env_region_cleanup -- + * Detach from any regions, e.g., when closing after a panic. 
+ * + * PUBLIC: int __env_region_cleanup __P((ENV *)); + */ +int +__env_region_cleanup(env) + ENV *env; +{ + if (env->reginfo != NULL) { +#ifdef HAVE_MUTEX_SUPPORT + (void)__lock_region_detach(env, env->lk_handle); + (void)__mutex_region_detach(env, env->mutex_handle); +#endif + (void)__log_region_detach(env, env->lg_handle); + (void)__memp_region_detach(env, env->mp_handle); + (void)__txn_region_detach(env, env->tx_handle); + (void)__env_detach(env, 0); + /* Remember the panic state after detaching. */ + F_SET(env, ENV_REMEMBER_PANIC); + } + return (0); +} + +/* * __env_detach -- * Detach from the environment. * @@ -796,9 +849,7 @@ __env_detach(env, destroy) /* Close the locking file handle. */ if (env->lockfhp != NULL) { - if ((t_ret = - __os_closehandle(env, env->lockfhp)) != 0 && ret == 0) - ret = t_ret; + ret = __os_closehandle(env, env->lockfhp); env->lockfhp = NULL; } @@ -1249,13 +1300,13 @@ __env_sys_attach(env, infop, rp) __db_errx(env, DB_STR_A("1548", "region size %lu is too large; maximum is %lu", "%lu %lu"), (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX); - return (EINVAL); + return (USR_ERR(env, EINVAL)); } if (rp->max > DB_REGIONSIZE_MAX) { __db_errx(env, DB_STR_A("1549", "region max %lu is too large; maximum is %lu", "%lu %lu"), (u_long)rp->max, (u_long)DB_REGIONSIZE_MAX); - return (EINVAL); + return (USR_ERR(env, EINVAL)); } #endif @@ -1281,7 +1332,7 @@ __env_sys_attach(env, infop, rp) "architecture does not support locks inside process-local (malloc) memory")); __db_errx(env, DB_STR("1551", "application may not specify both DB_PRIVATE and DB_THREAD")); - return (EINVAL); + return (USR_ERR(env, EINVAL)); } #endif if ((ret = __os_malloc( @@ -1310,7 +1361,7 @@ __env_sys_attach(env, infop, rp) "region memory was not correctly aligned")); (void)__env_sys_detach(env, infop, F_ISSET(infop, REGION_CREATE)); - return (EINVAL); + return (USR_ERR(env, EINVAL)); } return (0); @@ -1402,7 +1453,7 @@ __env_des_get(env, env_infop, infop, rpp) * the region, fail. 
The caller generates any error message. */ if (!F_ISSET(infop, REGION_CREATE_OK)) - return (ENOENT); + return (USR_ERR(env, ENOENT)); /* * If we didn't find a region and don't have room to create the region @@ -1411,7 +1462,7 @@ __env_des_get(env, env_infop, infop, rpp) if (empty_slot == NULL) { __db_errx(env, DB_STR("1553", "no room remaining for additional REGIONs")); - return (ENOENT); + return (USR_ERR(env, ENOENT)); } /* diff --git a/src/env/env_register.c b/src/env/env_register.c index 7475444d..731ddd1f 100644 --- a/src/env/env_register.c +++ b/src/env/env_register.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -30,6 +30,7 @@ static int __envreg_add __P((ENV *, int *, u_int32_t)); static int __envreg_pid_compare __P((const void *, const void *)); static int __envreg_create_active_pid __P((ENV *, char *)); +static int __envreg_add_active_pid __P((ENV*, char *)); /* * Support for portable, multi-process database environment locking, based on @@ -137,7 +138,7 @@ __envreg_register(env, need_recoveryp, flags) if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) __db_msg(env, DB_STR_A("1524", - "%lu: register environment", "%lu"), (u_long)pid); + "%lu: register environment", "%lu"), (u_long)pid); /* Build the path name and open the registry file. */ if ((ret = __db_appname(env, @@ -176,7 +177,6 @@ __envreg_register(env, need_recoveryp, flags) /* Register this process. */ if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0) goto err; - /* * Release our exclusive lock if we don't need to run recovery. If * we need to run recovery, ENV->open will call back into register @@ -186,8 +186,7 @@ __envreg_register(env, need_recoveryp, flags) goto err; if (0) { -err: *need_recoveryp = 0; - +err: /* * !!! * Closing the file handle must release all of our locks. 
@@ -196,7 +195,6 @@ err: *need_recoveryp = 0; (void)__os_closehandle(env, dbenv->registry); dbenv->registry = NULL; } - if (pp != NULL) __os_free(env, pp); @@ -222,11 +220,11 @@ __envreg_add(env, need_recoveryp, flags) size_t nr, nw; u_int lcnt; u_int32_t bytes, mbytes, orig_flags; - int need_recovery, ret, t_ret; + int need_failchk, ret, t_ret; char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10]; dbenv = env->dbenv; - need_recovery = 0; + need_failchk = t_ret = 0; COMPQUIET(dead, 0); COMPQUIET(p, NULL); ip = NULL; @@ -269,7 +267,7 @@ kill_all: /* * registering. */ if (nr != PID_LEN) { - need_recovery = 1; + need_failchk = 1; break; } @@ -299,7 +297,7 @@ kill_all: /* } #if DB_ENVREG_KILL_ALL - if (need_recovery) { + if (need_failchk) { pid = (pid_t)strtoul(buf, NULL, 10); (void)kill(pid, SIGKILL); @@ -318,7 +316,7 @@ kill_all: /* __db_msg(env, DB_STR_A("1530", "%02u: %s: FAILED", "%02u %s"), lcnt, p); - need_recovery = 1; + need_failchk = 1; dead = pos; #if DB_ENVREG_KILL_ALL goto kill_all; @@ -331,16 +329,27 @@ kill_all: /* "%02u: %s: LOCKED", "%02u %s"), lcnt, p); } + /* Check for a panic; if so there's no need to call failchk. */ + if (__env_attach(env, NULL, 0, 0) != 0) + goto sig_proc; + infop = env->reginfo; + renv = infop->primary; + *need_recoveryp = renv->panic != 0; + (void)__env_detach(env, 0); + if (*need_recoveryp) + return (0); + /* - * If we have to perform recovery... + * If we have to perform failchk... * * Mark all slots empty. Registry ignores empty slots we can't lock, * so it doesn't matter if any of the processes are in the middle of * exiting Berkeley DB -- they'll discard their lock when they exit. 
*/ - if (need_recovery) { + if (need_failchk) { if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) - __db_msg(env, "%lu: recovery required", (u_long)pid); + __db_msg(env, + "%lu: failchk recovery required", (u_long)pid); if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) { if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) @@ -352,13 +361,14 @@ kill_all: /* env, pid_buf)) != 0) goto sig_proc; - /* The environment will already exist, so we do not + /* + * The environment will already exist, so we do not * want DB_CREATE set, nor do we want any recovery at * this point. No need to put values back as flags is * passed in by value. Save original dbenv flags in * case we need to recover/remove existing environment. * Set DB_ENV_FAILCHK before attach to help ensure we - * dont block on a mutex held by the dead process. + * don't block on a mutex held by the dead process. */ LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL); orig_flags = dbenv->flags; @@ -367,44 +377,53 @@ kill_all: /* if ((ret = __env_attach_regions( dbenv, flags, orig_flags, 0)) != 0) goto sig_proc; - if ((t_ret = - __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 && - ret == 0) + if ((t_ret = __env_set_state(env, + &ip, THREAD_FAILCHK)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = - __env_failchk_int(dbenv)) != 0 && ret == 0) + if (ret == 0 && (t_ret = __env_failchk_int(dbenv)) != 0) ret = t_ret; + if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) + __db_msg(env, + "%lu: failchk returned %d, ret is %d", + (u_long)pid, t_ret, ret); /* Free active pid array if used. */ if (LF_ISSET(DB_FAILCHK_ISALIVE)) { - DB_GLOBAL(num_active_pids) = 0; - DB_GLOBAL(size_active_pids) = 0; - __os_free( env, DB_GLOBAL(active_pids)); + env->num_active_pids = 0; + env->size_active_pids = 0; + __os_free(env, env->active_pids); + env->active_pids = NULL; } /* Detach from environment and deregister thread. 
*/ - if ((t_ret = - __env_refresh(dbenv, orig_flags, 0)) != 0 && - ret == 0) + if ((t_ret = __env_refresh(dbenv, + orig_flags, 0)) != 0 && ret == 0) ret = t_ret; + F_CLR(env, ENV_OPEN_CALLED); + if (ret == 0) { if ((ret = __os_seek(env, dbenv->registry, - 0, 0,(u_int32_t)dead)) != 0 || + 0, 0, (u_int32_t)dead)) != 0 || (ret = __os_write(env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0) return (ret); - need_recovery = 0; + need_failchk = 0; goto add; } } /* If we can't attach, then we cannot set DB_REGISTER panic. */ -sig_proc: if (__env_attach(env, NULL, 0, 0) == 0) { +sig_proc: + if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) + __db_msg(env, "%lu: sig_proc attaching errs %s/ret %s", + (u_long)pid, db_strerror(t_ret), db_strerror(ret)); + if (__env_attach(env, NULL, 0, 0) == 0) { infop = env->reginfo; renv = infop->primary; - /* Indicate DB_REGSITER panic. Also, set environment - * panic as this is the panic trigger mechanism in - * the code that everything looks for. + /* + * Indicate DB_REGISTER panic. Also, set (or re-set) + * environment panic as this is the panic trigger + * mechanism in the code that everything looks for. */ renv->reg_panic = 1; renv->panic = 1; @@ -484,7 +503,7 @@ add: if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) } } - if (need_recovery) + if (need_failchk) *need_recoveryp = 1; return (ret); @@ -543,8 +562,9 @@ __envreg_unregister(env, recovery_failed) * also releasing our slot lock, we could race. That can't happen, I * don't think. 
*/ -err: if ((t_ret = - __os_closehandle(env, dbenv->registry)) != 0 && ret == 0) +err: + if (dbenv->registry != NULL && + (t_ret = __os_closehandle(env, dbenv->registry)) != 0 && ret == 0) ret = t_ret; dbenv->registry = NULL; @@ -610,6 +630,10 @@ __envreg_isalive(dbenv, pid, tid, flags ) db_threadid_t tid; u_int32_t flags; { + ENV *env; + + env = dbenv->env; + /* in this case we really do not care about tid, simply for lint */ DB_THREADID_INIT(tid); @@ -617,15 +641,14 @@ __envreg_isalive(dbenv, pid, tid, flags ) if (!((flags == 0) || (flags == DB_MUTEX_PROCESS_ONLY))) return (EINVAL); - if (DB_GLOBAL(active_pids) == NULL || - DB_GLOBAL(num_active_pids) == 0 || dbenv == NULL) + if (env->active_pids == NULL || env->num_active_pids == 0) return (0); /* * bsearch returns a pointer to an entry in active_pids if a match * is found on pid, else no match found it returns NULL. This * routine will return a 1 if a match is found, else a 0. */ - if (bsearch(&pid, DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids), + if (bsearch(&pid, env->active_pids, env->num_active_pids, sizeof(pid_t), __envreg_pid_compare)) return 1; @@ -635,7 +658,8 @@ __envreg_isalive(dbenv, pid, tid, flags ) /* * __envreg_create_active_pid -- * Create array of pids, if need more room in array then double size. - * Only add active pids from DB_REGISTER file into array. + * Only add active pids from DB_REGISTER file into array. The given + * active my_pid is also added into array. 
*/ static int __envreg_create_active_pid(env, my_pid) @@ -646,8 +670,7 @@ __envreg_create_active_pid(env, my_pid) char buf[PID_LEN + 10]; int ret; off_t pos; - pid_t pid, *tmparray; - size_t tmpsize, nr; + size_t nr; u_int lcnt; dbenv = env->dbenv; @@ -655,6 +678,15 @@ __envreg_create_active_pid(env, my_pid) ret = 0; /* + * The process getting here has not been added to the DB_REGISTER + * file yet, so include it as the first item in array + */ + if (env->num_active_pids == 0) { + if ((ret = __envreg_add_active_pid(env, my_pid)) != 0) + return (ret); + } + + /* * Walk through DB_REGISTER file, we grab pid entries that are locked * as those represent processes that are still alive. Ignore empty * slots, or those that are unlocked. @@ -678,53 +710,50 @@ __envreg_create_active_pid(env, my_pid) if ((ret = REGISTRY_UNLOCK(env, pos)) != 0) return (ret); } else { - /* first, check to make sure we have room in arrary */ - if (DB_GLOBAL(num_active_pids) + 1 > - DB_GLOBAL(size_active_pids)) { - tmpsize = - DB_GLOBAL(size_active_pids) * sizeof(pid_t); - - /* start with 512, then double if must grow */ - tmpsize = tmpsize>0 ? 
tmpsize*2 : 512; - if ((ret = __os_malloc - (env, tmpsize, &tmparray )) != 0) - return (ret); - - /* if array exists, then copy and free */ - if (DB_GLOBAL(active_pids)) { - memcpy( tmparray, - DB_GLOBAL(active_pids), - DB_GLOBAL(num_active_pids) * - sizeof(pid_t)); - __os_free( env, DB_GLOBAL(active_pids)); - } - - DB_GLOBAL(active_pids) = tmparray; - DB_GLOBAL(size_active_pids) = tmpsize; - - /* - * The process getting here has not been added - * to the DB_REGISTER file yet, so include it - * as the first item in array - */ - if (DB_GLOBAL(num_active_pids) == 0) { - pid = (pid_t)strtoul(my_pid, NULL, 10); - DB_GLOBAL(active_pids) - [DB_GLOBAL(num_active_pids)++] = pid; - } - } - - /* insert into array */ - pid = (pid_t)strtoul(buf, NULL, 10); - DB_GLOBAL(active_pids) - [DB_GLOBAL(num_active_pids)++] = pid; - + if ((ret = __envreg_add_active_pid(env, buf)) != 0) + return (ret); } } /* lets sort the array to allow for binary search in isalive func */ - qsort(DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids), + qsort(env->active_pids, env->num_active_pids, sizeof(pid_t), __envreg_pid_compare); return (ret); } + +/* + * __envreg_add_active_pid -- + * Add an active pid into array, if need more room in array then double size. + */ +static int +__envreg_add_active_pid(env, pid) + ENV *env; + char *pid; +{ + int ret; + size_t tmpsize; + + ret = 0; + + /* first, check to make sure we have room in arrary */ + if (env->num_active_pids + 1 > + env->size_active_pids) { + tmpsize = + env->size_active_pids * sizeof(pid_t); + + /* start with 512, then double if must grow */ + tmpsize = tmpsize > 0 ? 
tmpsize * 2 : 512; + if ((ret = __os_realloc + (env, tmpsize, &(env->active_pids) )) != 0) + return (ret); + + env->size_active_pids = tmpsize / sizeof(pid_t); + } + + /* insert into array */ + env->active_pids + [env->num_active_pids++] = (pid_t)strtoul(pid, NULL, 10); + + return (0); +} diff --git a/src/env/env_sig.c b/src/env/env_sig.c index 6d127f85..57e64228 100644 --- a/src/env/env_sig.c +++ b/src/env/env_sig.c @@ -28,9 +28,9 @@ * shared memory. */ #ifdef HAVE_MIXED_SIZE_ADDRESSING -#define __STRUCTURE_COUNT 41 +#define __STRUCTURE_COUNT 48 #else -#define __STRUCTURE_COUNT (41 + 104) +#define __STRUCTURE_COUNT (48 + 108) #endif /* @@ -66,7 +66,11 @@ __env_struct_sig() __ADD(__db_h_stat); __ADD(__db_heap_stat); __ADD(__db_qam_stat); +#ifdef HAVE_MUTEX_SUPPORT + __ADD(__mutex_state); +#endif __ADD(__db_thread_info); + __ADD(__env_thread_info); __ADD(__db_lockregion); __ADD(__sh_dbt); __ADD(__db_lockobj); @@ -82,6 +86,9 @@ __env_struct_sig() __ADD(__db_mutexregion); #endif #ifdef HAVE_MUTEX_SUPPORT + __ADD(__mutex_history); +#endif +#ifdef HAVE_MUTEX_SUPPORT __ADD(__db_mutex_t); #endif __ADD(__db_reg_env); @@ -92,6 +99,10 @@ __env_struct_sig() #ifndef HAVE_MIXED_SIZE_ADDRESSING __ADD(__db_dbt); +#ifdef HAVE_MUTEX_SUPPORT + __ADD(__db_event_mutex_died_info); +#endif + __ADD(__db_event_failchk_info); __ADD(__db_lockreq); __ADD(__db_log_cursor); __ADD(__log_rec_spec); @@ -113,6 +124,7 @@ __env_struct_sig() __ADD(__cq_fq); __ADD(__cq_aq); __ADD(__cq_jq); + __ADD(__db_stream); __ADD(__db_heap_rid); __ADD(__dbc); __ADD(__key_range); @@ -125,7 +137,6 @@ __env_struct_sig() __ADD(__fn); __ADD(__db_msgbuf); __ADD(__pin_list); - __ADD(__env_thread_info); __ADD(__flag_map); __ADD(__db_backup_handle); __ADD(__env); diff --git a/src/env/env_stat.c b/src/env/env_stat.c index 9bc3fe7e..094d0545 100644 --- a/src/env/env_stat.c +++ b/src/env/env_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -21,11 +21,9 @@ static int __env_print_dbenv_all __P((ENV *, u_int32_t)); static int __env_print_env_all __P((ENV *, u_int32_t)); static int __env_print_fh __P((ENV *)); static int __env_print_stats __P((ENV *, u_int32_t)); -static int __env_print_thread __P((ENV *)); static int __env_stat_print __P((ENV *, u_int32_t)); static char *__env_thread_state_print __P((DB_THREAD_STATE)); -static const char * - __reg_type __P((reg_type_t)); +static const char * __reg_type __P((reg_type_t)); /* * __env_stat_print_pp -- @@ -146,7 +144,6 @@ __env_stat_print(env, flags) /* * __env_print_stats -- * Display the default environment statistics. - * */ static int __env_print_stats(env, flags) @@ -186,6 +183,10 @@ __env_print_stats(env, flags) (u_long)0, (u_long)0, (u_long)infop->rp->size); __db_dlbytes(env, "Maximum region size", (u_long)0, (u_long)0, (u_long)infop->rp->max); + STAT_LONG("Process failure detected", renv->failure_panic); + if (renv->failure_symptom[0] != '\0') + __db_msg(env, + "%s:\tFirst failure symptom", renv->failure_symptom); return (0); } @@ -267,8 +268,6 @@ __env_print_dbenv_all(env, flags) __db_msg(env, "%s", DB_GLOBAL(db_line)); STAT_POINTER("ENV", dbenv->env); - __mutex_print_debug_single( - env, "DB_ENV handle mutex", dbenv->mtx_db_env, flags); STAT_ISSET("Errcall", dbenv->db_errcall); STAT_ISSET("Errfile", dbenv->db_errfile); STAT_STRING("Errpfx", dbenv->db_errpfx); @@ -286,6 +285,7 @@ __env_print_dbenv_all(env, flags) STAT_ISSET("ThreadId", dbenv->thread_id); STAT_ISSET("ThreadIdString", dbenv->thread_id_string); + STAT_STRING("Blob dir", dbenv->db_blob_dir); STAT_STRING("Log dir", dbenv->db_log_dir); STAT_STRING("Metadata dir", dbenv->db_md_dir); STAT_STRING("Tmp dir", dbenv->db_tmp_dir); @@ -304,6 +304,8 @@ __env_print_dbenv_all(env, flags) STAT_ISSET("Password", 
dbenv->passwd); + STAT_ULONG("Blob threshold", dbenv->blob_threshold); + STAT_ISSET("App private", dbenv->app_private); STAT_ISSET("Api1 internal", dbenv->api1_internal); STAT_ISSET("Api2 internal", dbenv->api2_internal); @@ -314,6 +316,7 @@ __env_print_dbenv_all(env, flags) STAT_ULONG("Mutex cnt", dbenv->mutex_cnt); STAT_ULONG("Mutex inc", dbenv->mutex_inc); STAT_ULONG("Mutex tas spins", dbenv->mutex_tas_spins); + STAT_LONG("Mutex failchk timeout", dbenv->mutex_failchk_timeout); STAT_ISSET("Lock conflicts", dbenv->lk_conflicts); STAT_LONG("Lock modes", dbenv->lk_modes); @@ -356,6 +359,7 @@ __env_print_dbenv_all(env, flags) __db_prflags(env, NULL, dbenv->flags, db_env_fn, NULL, "\tPublic environment flags"); + COMPQUIET(flags, 0); return (0); } @@ -507,6 +511,8 @@ __env_thread_state_print(state) return ("blocked and dead"); case THREAD_OUT: return ("out"); + case THREAD_VERIFY: + return ("verify"); default: return ("unknown"); } @@ -516,14 +522,17 @@ __env_thread_state_print(state) /* * __env_print_thread -- * Display the thread block state. 
+ * + * PUBLIC: int __env_print_thread __P((ENV *)); */ -static int +int __env_print_thread(env) ENV *env; { BH *bhp; DB_ENV *dbenv; DB_HASHTAB *htab; + DB_LOCKER *locker; DB_MPOOL *dbmp; DB_THREAD_INFO *ip; PIN_LIST *list, *lp; @@ -532,6 +541,7 @@ __env_print_thread(env) THREAD_INFO *thread; u_int32_t i; char buf[DB_THREADID_STRLEN]; + char time_buf[CTIME_BUFLEN]; dbenv = env->dbenv; @@ -561,6 +571,10 @@ __env_print_thread(env) dbenv->thread_id_string( dbenv, ip->dbth_pid, ip->dbth_tid, buf), __env_thread_state_print(ip->dbth_state)); + if (timespecisset(&ip->dbth_failtime)) + __db_msg(env, "Crashed at %s", + __db_ctimespec(&ip->dbth_failtime, + time_buf)); list = R_ADDR(env->reginfo, ip->dbth_pinlist); for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) { if (lp->b_ref == INVALID_ROFF) @@ -570,6 +584,18 @@ __env_print_thread(env) __db_msg(env, "\t\tpins: %lu", (u_long)bhp->pgno); } + if (ip->dbth_local_locker != INVALID_ROFF) { + locker = (DB_LOCKER *) + R_ADDR(&env->lk_handle->reginfo, + ip->dbth_local_locker); + __db_msg(env, "\t\tcached locker %lx mtx %lu", + (u_long)locker->id, + (u_long)locker->mtx_locker); + + } +#ifdef HAVE_MUTEX_SUPPORT + (void)__mutex_record_print(env, ip); +#endif } return (0); } @@ -846,6 +872,7 @@ __reg_type(t) return ("Transaction"); case INVALID_REGION_TYPE: return ("Invalid"); + /*lint -e{787} */ } return ("Unknown"); } diff --git a/src/fileops/fileops.src b/src/fileops/fileops.src index cdb6af27..3cb874b7 100644 --- a/src/fileops/fileops.src +++ b/src/fileops/fileops.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -30,7 +30,14 @@ ARG appname u_int32_t lu ARG mode u_int32_t o END -BEGIN create 48 143 +BEGIN_COMPAT create 60 143 +DBT name DBT s +DBT dirname DBT s +ARG appname u_int32_t lu +ARG mode u_int32_t o +END + +BEGIN create 60p1 143 DBT name DBT s DBT dirname DBT s ARG appname u_int32_t lu @@ -43,7 +50,13 @@ END * name: name in the file system * appname: indicates if the name needs to go through __db_appname */ -BEGIN remove 42 144 +BEGIN_COMPAT remove 60 144 +DBT name DBT s +DBT fid DBT s +ARG appname u_int32_t lu +END + +BEGIN remove 60p1 144 DBT name DBT s DBT fid DBT s ARG appname u_int32_t lu @@ -71,7 +84,18 @@ DBT page DBT s ARG flag u_int32_t lu END -BEGIN write 48 145 +BEGIN_COMPAT write 60 145 +DBT name DBT s +DBT dirname DBT s +ARG appname u_int32_t lu +ARG pgsize u_int32_t lu +ARG pageno db_pgno_t lu +ARG offset u_int32_t lu +DBT page DBT s +ARG flag u_int32_t lu +END + +BEGIN write 60p1 145 DBT name DBT s DBT dirname DBT s ARG appname u_int32_t lu @@ -83,6 +107,42 @@ ARG flag u_int32_t lu END /* + * write_file: log the writing of data into a file. + * + * name: file containing the data. + * appname: indicates if the name needs to go through __db_appname + * offset_lo: offset in the file, low part of a 64 bit integer. + * offset_hi: offset in the file, high part of a 64 bit integer. + * old_data: Data being overwritten, if there is any + * new_data: Data being written to the file. + * flag: DB_FOP_APPEND (0x00000001), DB_FOP_CREATE (0x00000002) and + * DB_FOP_REDO (0x00000008). Used to tell how the operation can be + * undone, truncating in the case of append and deleting the file in + * the case of create, and whether enough information was logged so + * that the operation can be redone. 
+ */ +BEGIN_COMPAT write_file 60 86 +DBT name DBT s +DBT dirname DBT s +ARG appname u_int32_t lu +ARG offset_lo u_int32_t lu +ARG offset_hi u_int32_t lu +DBT old_data DBT s +DBT new_data DBT s +ARG flag u_int32_t lu +END + +BEGIN write_file 60p1 86 +DBT name DBT s +DBT dirname DBT s +ARG appname u_int32_t lu +LONGARG offset u_int64_t llu +DBT old_data DBT s +DBT new_data DBT s +ARG flag u_int32_t lu +END + +/* * rename: move a file from one name to another. * The appname value indicates if this is a path name that should be used * directly (i.e., no interpretation) or if it is a pathname that should @@ -105,8 +165,17 @@ DBT fileid DBT s ARG appname u_int32_t lu END -BEGIN rename 48 146 -DUPLICATE rename_noundo 46 150 +BEGIN_COMPAT rename 60 146 +DUPLICATE rename_noundo 60 150 +DBT oldname DBT s +DBT newname DBT s +DBT dirname DBT s +DBT fileid DBT s +ARG appname u_int32_t lu +END + +BEGIN rename 60p1 146 +DUPLICATE rename_noundo 60p1 150 DBT oldname DBT s DBT newname DBT s DBT dirname DBT s @@ -128,7 +197,15 @@ END * child: The transaction that removed or renamed the file. 
*/ */ -BEGIN file_remove 42 141 +BEGIN_COMPAT file_remove 60 141 +DBT real_fid DBT s +DBT tmp_fid DBT s +DBT name DBT s +ARG appname u_int32_t lu +ARG child u_int32_t lx +END + +BEGIN file_remove 60p1 141 DBT real_fid DBT s DBT tmp_fid DBT s DBT name DBT s diff --git a/src/fileops/fileops_auto.c b/src/fileops/fileops_auto.c index 0db619a5..eff1377b 100644 --- a/src/fileops/fileops_auto.c +++ b/src/fileops/fileops_auto.c @@ -14,6 +14,13 @@ DB_LOG_RECSPEC __fop_create_42_desc[] = { {LOGREC_ARG, SSZ(__fop_create_42_args, mode), "mode", "%o"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_create_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_create_60_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__fop_create_60_args, dirname), "dirname", ""}, + {LOGREC_ARG, SSZ(__fop_create_60_args, appname), "appname", "%lu"}, + {LOGREC_ARG, SSZ(__fop_create_60_args, mode), "mode", "%o"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_create_desc[] = { {LOGREC_DBT, SSZ(__fop_create_args, name), "name", ""}, {LOGREC_DBT, SSZ(__fop_create_args, dirname), "dirname", ""}, @@ -21,6 +28,12 @@ DB_LOG_RECSPEC __fop_create_desc[] = { {LOGREC_ARG, SSZ(__fop_create_args, mode), "mode", "%o"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_remove_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_remove_60_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__fop_remove_60_args, fid), "fid", ""}, + {LOGREC_ARG, SSZ(__fop_remove_60_args, appname), "appname", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_remove_desc[] = { {LOGREC_DBT, SSZ(__fop_remove_args, name), "name", ""}, {LOGREC_DBT, SSZ(__fop_remove_args, fid), "fid", ""}, @@ -37,6 +50,17 @@ DB_LOG_RECSPEC __fop_write_42_desc[] = { {LOGREC_ARG, SSZ(__fop_write_42_args, flag), "flag", "%lu"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_write_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_write_60_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__fop_write_60_args, dirname), "dirname", ""}, + {LOGREC_ARG, SSZ(__fop_write_60_args, appname), 
"appname", "%lu"}, + {LOGREC_ARG, SSZ(__fop_write_60_args, pgsize), "pgsize", "%lu"}, + {LOGREC_ARG, SSZ(__fop_write_60_args, pageno), "pageno", "%lu"}, + {LOGREC_ARG, SSZ(__fop_write_60_args, offset), "offset", "%lu"}, + {LOGREC_DBT, SSZ(__fop_write_60_args, page), "page", ""}, + {LOGREC_ARG, SSZ(__fop_write_60_args, flag), "flag", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_write_desc[] = { {LOGREC_DBT, SSZ(__fop_write_args, name), "name", ""}, {LOGREC_DBT, SSZ(__fop_write_args, dirname), "dirname", ""}, @@ -48,6 +72,27 @@ DB_LOG_RECSPEC __fop_write_desc[] = { {LOGREC_ARG, SSZ(__fop_write_args, flag), "flag", "%lu"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_write_file_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_write_file_60_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__fop_write_file_60_args, dirname), "dirname", ""}, + {LOGREC_ARG, SSZ(__fop_write_file_60_args, appname), "appname", "%lu"}, + {LOGREC_ARG, SSZ(__fop_write_file_60_args, offset_lo), "offset_lo", "%lu"}, + {LOGREC_ARG, SSZ(__fop_write_file_60_args, offset_hi), "offset_hi", "%lu"}, + {LOGREC_DBT, SSZ(__fop_write_file_60_args, old_data), "old_data", ""}, + {LOGREC_DBT, SSZ(__fop_write_file_60_args, new_data), "new_data", ""}, + {LOGREC_ARG, SSZ(__fop_write_file_60_args, flag), "flag", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; +DB_LOG_RECSPEC __fop_write_file_desc[] = { + {LOGREC_DBT, SSZ(__fop_write_file_args, name), "name", ""}, + {LOGREC_DBT, SSZ(__fop_write_file_args, dirname), "dirname", ""}, + {LOGREC_ARG, SSZ(__fop_write_file_args, appname), "appname", "%lu"}, + {LOGREC_LONGARG, SSZ(__fop_write_file_args, offset), "offset", ""}, + {LOGREC_DBT, SSZ(__fop_write_file_args, old_data), "old_data", ""}, + {LOGREC_DBT, SSZ(__fop_write_file_args, new_data), "new_data", ""}, + {LOGREC_ARG, SSZ(__fop_write_file_args, flag), "flag", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_rename_42_desc[] = { {LOGREC_DBT, SSZ(__fop_rename_42_args, oldname), "oldname", ""}, {LOGREC_DBT, 
SSZ(__fop_rename_42_args, newname), "newname", ""}, @@ -62,6 +107,22 @@ DB_LOG_RECSPEC __fop_rename_noundo_46_desc[] = { {LOGREC_ARG, SSZ(__fop_rename_42_args, appname), "appname", "%lu"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_rename_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_rename_60_args, oldname), "oldname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, newname), "newname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, dirname), "dirname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, fileid), "fileid", ""}, + {LOGREC_ARG, SSZ(__fop_rename_60_args, appname), "appname", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; +DB_LOG_RECSPEC __fop_rename_noundo_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_rename_60_args, oldname), "oldname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, newname), "newname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, dirname), "dirname", ""}, + {LOGREC_DBT, SSZ(__fop_rename_60_args, fileid), "fileid", ""}, + {LOGREC_ARG, SSZ(__fop_rename_60_args, appname), "appname", "%lu"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_rename_desc[] = { {LOGREC_DBT, SSZ(__fop_rename_args, oldname), "oldname", ""}, {LOGREC_DBT, SSZ(__fop_rename_args, newname), "newname", ""}, @@ -78,6 +139,14 @@ DB_LOG_RECSPEC __fop_rename_noundo_desc[] = { {LOGREC_ARG, SSZ(__fop_rename_args, appname), "appname", "%lu"}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __fop_file_remove_60_desc[] = { + {LOGREC_DBT, SSZ(__fop_file_remove_60_args, real_fid), "real_fid", ""}, + {LOGREC_DBT, SSZ(__fop_file_remove_60_args, tmp_fid), "tmp_fid", ""}, + {LOGREC_DBT, SSZ(__fop_file_remove_60_args, name), "name", ""}, + {LOGREC_ARG, SSZ(__fop_file_remove_60_args, appname), "appname", "%lu"}, + {LOGREC_ARG, SSZ(__fop_file_remove_60_args, child), "child", "%lx"}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __fop_file_remove_desc[] = { {LOGREC_DBT, SSZ(__fop_file_remove_args, real_fid), "real_fid", ""}, {LOGREC_DBT, SSZ(__fop_file_remove_args, tmp_fid), "tmp_fid", ""}, @@ -106,6 +175,9 
@@ __fop_init_recover(env, dtabp) __fop_write_recover, DB___fop_write)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, + __fop_write_file_recover, DB___fop_write_file)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, __fop_rename_recover, DB___fop_rename)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, diff --git a/src/fileops/fileops_autop.c b/src/fileops/fileops_autop.c index 6e271a17..784aa1d0 100644 --- a/src/fileops/fileops_autop.c +++ b/src/fileops/fileops_autop.c @@ -27,6 +27,23 @@ __fop_create_42_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_create_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_create_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_create_60", __fop_create_60_desc, info)); +} + +/* * PUBLIC: int __fop_create_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -44,6 +61,23 @@ __fop_create_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_remove_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_remove_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_remove_60", __fop_remove_60_desc, info)); +} + +/* * PUBLIC: int __fop_remove_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -78,6 +112,23 @@ __fop_write_42_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_write_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + 
COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_write_60", __fop_write_60_desc, info)); +} + +/* * PUBLIC: int __fop_write_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -95,6 +146,40 @@ __fop_write_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_write_file_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_file_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_write_file_60", __fop_write_file_60_desc, info)); +} + +/* + * PUBLIC: int __fop_write_file_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_file_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_write_file", __fop_write_file_desc, info)); +} + +/* * PUBLIC: int __fop_rename_42_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -112,6 +197,23 @@ __fop_rename_42_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_rename_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_rename_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_rename_60", __fop_rename_60_desc, info)); +} + +/* * PUBLIC: int __fop_rename_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -129,6 +231,23 @@ __fop_rename_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __fop_file_remove_60_print __P((ENV *, DBT *, + * PUBLIC: DB_LSN *, db_recops, void *)); + */ +int +__fop_file_remove_60_print(env, 
dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__fop_file_remove_60", __fop_file_remove_60_desc, info)); +} + +/* * PUBLIC: int __fop_file_remove_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -165,6 +284,9 @@ __fop_init_print(env, dtabp) __fop_write_print, DB___fop_write)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, + __fop_write_file_print, DB___fop_write_file)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, __fop_rename_print, DB___fop_rename)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, diff --git a/src/fileops/fop_basic.c b/src/fileops/fop_basic.c index d6c707f2..c1280d76 100644 --- a/src/fileops/fop_basic.c +++ b/src/fileops/fop_basic.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -253,6 +253,220 @@ err: if (local_open && } /* + * Used to reduce the maximum amount of data that will be logged at a time. + * Large writes are logged as a series of smaller writes to prevent a + * single log from being larger than the log buffer or a log file. + */ +#define LOG_OVERWRITE_MULTIPLIER 0.75 +#define LOG_REDO_MULTIPLIER 0.75 +#define LOG_OVERWRITE_REDO_MULTIPLIER 0.33 + +/* + * __fop_write_file + * + * Write "size" bytes from "buf" to file "name" beginning at offset "off." + * dirname is the directory in which the file is stored, fhp the file + * handle to write too, and flags contains whether this is creating or + * appending data, which changes how the data is logged. + * The other __fop_write is designed for writing pages to databases, this + * function writes generic data to files, usually blob files. 
+ * + * PUBLIC: int __fop_write_file __P((ENV *, DB_TXN *, + * PUBLIC: const char *, const char *, APPNAME, DB_FH *, + * PUBLIC: off_t, void *, size_t, u_int32_t)); + */ +int +__fop_write_file(env, txn, + name, dirname, appname, fhp, off, buf, size, flags) + ENV *env; + DB_TXN *txn; + const char *name, *dirname; + APPNAME appname; + DB_FH *fhp; + off_t off; + void *buf; + size_t size; + u_int32_t flags; +{ + DBT new_data, old_data, namedbt, dirdbt; + DB_LOG *dblp; + DB_LSN lsn; + off_t cur_off; + int local_open, ret, t_ret; + size_t cur_size, nbytes, tmp_size; + u_int32_t lflags, lgbuf_size, lgsize, lgfile_size; + char *real_name; + void *cur_ptr; + + ret = local_open = 0; + real_name = NULL; + lflags = 0; + memset(&new_data, 0, sizeof(new_data)); + memset(&old_data, 0, sizeof(old_data)); + ZERO_LSN(lsn); + + if (fhp == NULL) { + /* File isn't open; we need to reopen it. */ + if ((ret = __db_appname(env, + appname, name, &dirname, &real_name)) != 0) + return (ret); + + if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) + goto err; + local_open = 1; + } + + if (DBENV_LOGGING(env) +#if !defined(DEBUG_WOP) + && txn != NULL +#endif + ) { + DB_INIT_DBT(namedbt, name, strlen(name) + 1); + if (dirname != NULL) + DB_INIT_DBT(dirdbt, dirname, strlen(dirname) + 1); + else + memset(&dirdbt, 0, sizeof(dirdbt)); + /* + * If the write is larger than the log buffer or file size, + * then log it as a set of smaller writes. + */ + cur_off = off; + cur_ptr = buf; + cur_size = size; + dblp = env->lg_handle; + LOG_SYSTEM_LOCK(env); + lgfile_size = ((LOG *)dblp->reginfo.primary)->log_nsize; + LOG_SYSTEM_UNLOCK(env); + if ((ret = __log_get_lg_bsize(env->dbenv, &lgbuf_size)) != 0) + goto err; + + if (lgfile_size > lgbuf_size) + lgsize = lgbuf_size; + else + lgsize = lgfile_size; + + /* + * Parial logging only logs enough data to undo an operation. + */ + if (LF_ISSET(DB_FOP_PARTIAL_LOG)) { + /* No data needs to be logged for append and create. 
*/ + if (LF_ISSET(DB_FOP_APPEND | DB_FOP_CREATE)) { + lflags |= + flags & (DB_FOP_APPEND | DB_FOP_CREATE); + cur_size = 0; + goto log; + } else { + /* + * Writting in the middle of the blob requires + * logging the data being overwritten. + */ + lgsize = (u_int32_t) + (lgsize * LOG_OVERWRITE_MULTIPLIER); + } + } else { + /* Log that the operation can be redone from logs. */ + lflags |= DB_FOP_REDO; + /* Just log the new data for append and create */ + if (LF_ISSET(DB_FOP_APPEND | DB_FOP_CREATE)) { + lgsize = (u_int32_t) + (lgsize * LOG_REDO_MULTIPLIER); + lflags |= flags & + (DB_FOP_APPEND | DB_FOP_CREATE); + } else { + /* + * Writting in the middle of the blob requires + * logging both the old and new data. + */ + lgsize = (u_int32_t) + (lgsize * LOG_OVERWRITE_REDO_MULTIPLIER); + } + } + + while (cur_size > 0) { + new_data.data = cur_ptr; + if (cur_size > lgsize) { + new_data.size = lgsize; + cur_size -= lgsize; + } else { + new_data.size = (u_int32_t)cur_size; + cur_size = 0; + } + cur_ptr = (unsigned char *)cur_ptr + new_data.size; + /* + * If not creating or appending the file, then + * the data being overwritten needs to be read + * in so it can be written back in on abort. + */ + if (!(lflags & (DB_FOP_CREATE | DB_FOP_APPEND))) { + DB_ASSERT(env, old_data.data == NULL || + new_data.size <= old_data.size); + old_data.size = new_data.size; + if (old_data.data == NULL) { + if ((ret = __os_malloc(env, + old_data.size, + &old_data.data)) != 0) + goto err; + } + if ((ret = __os_seek( + env, fhp, 0, 0, cur_off)) != 0) + goto err; + if ((ret = __os_read(env, fhp, old_data.data, + old_data.size, &nbytes)) != 0) + goto err; + } +log: tmp_size = new_data.size; + /* + * No need to log the new data if this operation + * cannot be redone from logs. 
+ */ + if (!(lflags & DB_FOP_REDO)) + memset(&new_data, 0, sizeof(new_data)); + if ((ret = __fop_write_file_log( + env, txn, &lsn, flags, &namedbt, &dirdbt, + (u_int32_t)appname, (u_int64_t)cur_off, + &old_data, &new_data, lflags)) != 0) + goto err; + cur_off += tmp_size; + } + /* + * If not creating, we have to flush the logs so that they + * will be available to undo internal writes and appends in case + * of a crash. + */ + if (!(LF_ISSET(DB_FOP_CREATE)) && + txn != NULL && !F_ISSET(txn, TXN_NOSYNC)) + if ((ret = __log_flush(env, &lsn)) != 0) + goto err; + } + + /* Seek to offset. */ + if ((ret = __os_seek(env, fhp, 0, 0, off)) != 0) + goto err; + + /* Now do the write. */ + if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0) + goto err; + + if (nbytes != size) { + __db_errx(env, DB_STR_A("0238", + "Error wrote %lld bytes to file %s instead of %lld .", + "%lld %s %lld"), + (long long)nbytes, name, (long long)size); + goto err; + } + +err: if (local_open && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + + if (real_name != NULL) + __os_free(env, real_name); + if (old_data.data != NULL) + __os_free(env, old_data.data); + return (ret); +} + +/* * __fop_rename -- * Change a file's name. * diff --git a/src/fileops/fop_rec.c b/src/fileops/fop_rec.c index 52d6175d..71a81ad6 100644 --- a/src/fileops/fop_rec.c +++ b/src/fileops/fop_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,16 +9,63 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/fop.h" #include "dbinc/db_am.h" #include "dbinc/mp.h" #include "dbinc/txn.h" +typedef enum { + DB_APP53_NONE=0, /* No type (region). */ + DB_APP53_DATA, /* Data file. */ + DB_APP53_LOG, /* Log file. 
*/ + DB_APP53_META, /* Persistent metadata file. */ + DB_APP53_RECOVER, /* We are in recovery. */ + DB_APP53_TMP /* Temporary file. */ +} APPNAME53; + +static APPNAME __fop_convert_appname __P((ENV *, APPNAME53)); +static int __fop_create_recover_int __P((ENV *, char *, db_recops, int)); static int __fop_rename_recover_int __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int)); +static int __fop_rename_60_recover_int + __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int)); static int __fop_rename_42_recover_int __P((ENV *, DBT *, DB_LSN *, db_recops, void *, int)); +static int __fop_write_file_recover_int + __P((ENV *, db_recops, + APPNAME, u_int32_t, DBT *, DBT *, DBT *, DBT *, off_t, DB_TXN *)); + +/* + * The APPNAME enumermation was changed in 6.0 to include DB_APP_BLOB. APPNAME + * is used by the log records __fop_create, __fop_write, and __fop_rename. + * __fop_write_file also includes an APPNAME field, but that record was created + * in 6.0. + */ +static APPNAME +__fop_convert_appname(env, appname) + ENV *env; + APPNAME53 appname; +{ + switch(appname) + { + case DB_APP53_NONE: + return (DB_APP_NONE); + case DB_APP53_DATA: + return (DB_APP_DATA); + case DB_APP53_LOG: + return (DB_APP_LOG); + case DB_APP53_META: + return (DB_APP_META); + case DB_APP53_RECOVER: + return (DB_APP_RECOVER); + case DB_APP53_TMP: + return (DB_APP_TMP); + } + DB_ASSERT(env, 0); + return (DB_APP_NONE); +} /* * The transactional guarantees Berkeley DB provides for file @@ -50,6 +97,85 @@ static int __fop_rename_42_recover_int * it does not apply. 
*/ +static int +__fop_create_recover_int(env, real_name, op, mode) + ENV *env; + char *real_name; + db_recops op; + int mode; +{ + DB_FH *fhp; + DBMETA *meta; + u_int8_t mbuf[DBMETASIZE]; + int ret; + char *path; +#ifdef HAVE_REPLICATION + DELAYED_BLOB_LIST *dbl; + int view_partial; + + dbl = NULL; +#endif + meta = (DBMETA *)mbuf; + ret = 0; + + if (DB_UNDO(op)) { + /* + * If the file was opened in mpool, we must mark it as + * dead via nameop which will also unlink the file. + */ + if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) { + if (__fop_read_meta(env, + real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && + __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0) { + if ((ret = __memp_nameop(env, + meta->uid, NULL, real_name, NULL, 0)) != 0) + goto out; + } else { + (void)__os_closehandle(env, fhp); + goto do_unlink; + } + (void)__os_closehandle(env, fhp); + } else +do_unlink: (void)__os_unlink(env, real_name, 0); + } else if (DB_REDO(op)) { + path = real_name; +#ifdef DB_WIN32 + /* + * Absolute paths on windows can result in it creating a + * "C" or "D" directory in the working directory. + */ + if (__os_abspath(real_name)) + path += 2; +#endif + +#ifdef HAVE_REPLICATION + /* + * Prevent replication of blob files if their owning database + * is not replicated. + */ + if (IS_VIEW_SITE(env) && IS_BLOB_FILE(path)) { + if ((ret = __rep_call_partial(env, + path, &view_partial, 0, &dbl)) != 0) + goto out; + DB_ASSERT(env, dbl == NULL); + if (view_partial == 0) + goto out; + } +#endif + /* Blob directories might not exist yet. */ + if (__os_exists(env, real_name, NULL) != 0 && + (ret = __db_mkpath(env, path)) != 0) + goto out; + + if ((ret = __os_open(env, real_name, + 0, DB_OSO_CREATE, mode, &fhp)) == 0) + (void)__os_closehandle(env, fhp); + else + goto out; + } +out: return (ret); +} + /* * __fop_create_recover -- * Recovery function for create. 
@@ -66,9 +192,6 @@ __fop_create_recover(env, dbtp, lsnp, op, info) void *info; { __fop_create_args *argp; - DB_FH *fhp; - DBMETA *meta; - u_int8_t mbuf[DBMETASIZE]; int ret; char *real_name; const char *dirname; @@ -78,7 +201,6 @@ __fop_create_recover(env, dbtp, lsnp, op, info) real_name = NULL; REC_PRINT(__fop_create_print); REC_NOOP_INTRO(__fop_create_read); - meta = (DBMETA *)mbuf; if (argp->dirname.size == 0) dirname = NULL; @@ -90,32 +212,60 @@ __fop_create_recover(env, dbtp, lsnp, op, info) (const char *)argp->name.data, &dirname, &real_name)) != 0) goto out; - if (DB_UNDO(op)) { - /* - * If the file was opened in mpool, we must mark it as - * dead via nameop which will also unlink the file. - */ - if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) { - if (__fop_read_meta(env, - real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && - __db_chk_meta(env, NULL, meta, 1) == 0) { - if ((ret = __memp_nameop(env, - meta->uid, NULL, real_name, NULL, 0)) != 0) - goto out; - } else { - (void)__os_closehandle(env, fhp); - goto do_unlink; - } - (void)__os_closehandle(env, fhp); - } else -do_unlink: (void)__os_unlink(env, real_name, 0); - } else if (DB_REDO(op)) { - if ((ret = __os_open(env, real_name, 0, - DB_OSO_CREATE, (int)argp->mode, &fhp)) == 0) - (void)__os_closehandle(env, fhp); - else - goto out; - } + if ((ret = __fop_create_recover_int( + env, real_name, op, (int)argp->mode)) != 0) + goto out; + + *lsnp = argp->prev_lsn; + +out: if (real_name != NULL) + __os_free(env, real_name); + + REC_NOOP_CLOSE; +} + +/* + * __fop_create_60_recover -- + * Recovery function for create. 
+ * + * PUBLIC: int __fop_create_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_create_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_create_60_args *argp; + APPNAME appname; + int ret; + char *real_name; + const char *dirname; + + COMPQUIET(info, NULL); + + real_name = NULL; + REC_PRINT(__fop_create_60_print); + REC_NOOP_INTRO(__fop_create_60_read); + + if (argp->dirname.size == 0) + dirname = NULL; + else + dirname = (const char *)argp->dirname.data; + + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + + if ((ret = __db_appname(env, + appname == DB_APP_DATA ? DB_APP_RECOVER : appname, + (const char *)argp->name.data, &dirname, &real_name)) != 0) + goto out; + + if ((ret = __fop_create_recover_int( + env, real_name, op, (int)argp->mode)) != 0) + goto out; *lsnp = argp->prev_lsn; @@ -144,6 +294,7 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info) DB_FH *fhp; DBMETA *meta; u_int8_t mbuf[DBMETASIZE]; + APPNAME appname; int ret; char *real_name; @@ -153,8 +304,9 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info) REC_PRINT(__fop_create_print); REC_NOOP_INTRO(__fop_create_read); meta = (DBMETA *)mbuf; + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); - if ((ret = __db_appname(env, (APPNAME)argp->appname, + if ((ret = __db_appname(env, appname, (const char *)argp->name.data, NULL, &real_name)) != 0) goto out; @@ -166,7 +318,7 @@ __fop_create_42_recover(env, dbtp, lsnp, op, info) if (__os_open(env, real_name, 0, 0, 0, &fhp) == 0) { if (__fop_read_meta(env, real_name, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && - __db_chk_meta(env, NULL, meta, 1) == 0) { + __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0) { if ((ret = __memp_nameop(env, meta->uid, NULL, real_name, NULL, 0)) != 0) goto out; @@ -232,6 +384,49 @@ out: if (real_name != NULL) } /* + * __fop_remove_60_recover -- + * Recovery function for remove. 
+ * + * PUBLIC: int __fop_remove_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_remove_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_remove_60_args *argp; + APPNAME appname; + int ret; + char *real_name; + + COMPQUIET(info, NULL); + + real_name = NULL; + REC_PRINT(__fop_remove_60_print); + REC_NOOP_INTRO(__fop_remove_60_read); + + /* Map the logged APPNAME53 value to the current APPNAME enumeration. */ appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + + if ((ret = __db_appname(env, appname, + (const char *)argp->name.data, NULL, &real_name)) != 0) + goto out; + + /* It is OK if the file is not there, so the nameop result is ignored. */ + if (DB_REDO(op)) + (void)__memp_nameop(env, + (u_int8_t *)argp->fid.data, NULL, real_name, NULL, 0); + + *lsnp = argp->prev_lsn; +out: if (real_name != NULL) + __os_free(env, real_name); + REC_NOOP_CLOSE; +} + +/* * __fop_write_recover -- * Recovery function for writechunk. * @@ -251,6 +446,15 @@ __fop_write_recover(env, dbtp, lsnp, op, info) COMPQUIET(info, NULL); +#ifndef HAVE_64BIT_TYPES + COMPQUIET(dbtp, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, 0); + __db_errx(env, DB_STR("0243", + "Blobs require 64 integer compiler support.")); + return (DB_OPNOTSUP); +#endif + REC_PRINT(__fop_write_print); REC_NOOP_INTRO(__fop_write_read); @@ -272,6 +476,48 @@ __fop_write_recover(env, dbtp, lsnp, op, info) } /* + * __fop_write_60_recover -- + * Recovery function for writechunk.
+ * + * PUBLIC: int __fop_write_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_write_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_write_60_args *argp; + APPNAME appname; + int ret; + + COMPQUIET(info, NULL); + + REC_PRINT(__fop_write_60_print); + REC_NOOP_INTRO(__fop_write_60_read); + + ret = 0; + if (DB_UNDO(op)) + DB_ASSERT(env, argp->flag != 0); + else if (DB_REDO(op)) { + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + ret = __fop_write(env, + argp->txnp, argp->name.data, + argp->dirname.size == 0 ? NULL : argp->dirname.data, + appname == DB_APP_DATA ? DB_APP_RECOVER : appname, + NULL, argp->pgsize, argp->pageno, argp->offset, + argp->page.data, argp->page.size, argp->flag, 0); + } + + if (ret == 0) + *lsnp = argp->prev_lsn; + REC_NOOP_CLOSE; +} + +/* * __fop_write_42_recover -- * Recovery function for writechunk. * @@ -287,6 +533,7 @@ __fop_write_42_recover(env, dbtp, lsnp, op, info) void *info; { __fop_write_args *argp; + APPNAME appname; int ret; COMPQUIET(info, NULL); @@ -297,18 +544,194 @@ __fop_write_42_recover(env, dbtp, lsnp, op, info) ret = 0; if (DB_UNDO(op)) DB_ASSERT(env, argp->flag != 0); - else if (DB_REDO(op)) + else if (DB_REDO(op)) { + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); ret = __fop_write(env, - argp->txnp, argp->name.data, NULL, (APPNAME)argp->appname, + argp->txnp, argp->name.data, NULL, appname, NULL, argp->pgsize, argp->pageno, argp->offset, argp->page.data, argp->page.size, argp->flag, 0); + } + + if (ret == 0) + *lsnp = argp->prev_lsn; + REC_NOOP_CLOSE; +} + +static int +__fop_write_file_recover_int( + env, op, appname, flag, dirname, name, new_data, old_data, offset, txn) + ENV *env; + db_recops op; + APPNAME appname; + u_int32_t flag; + DBT *dirname; + DBT *name; + DBT *new_data; + DBT *old_data; + off_t offset; + DB_TXN *txn; +{ + DB_FH *fhp; + int ret; + size_t nbytes; + 
char *path; + + fhp = NULL; + path = NULL; + ret = 0; + + if (DB_UNDO(op)) { + if (flag & DB_FOP_CREATE) { + /* + * File was created in this transaction. Do nothing, + * destroying the file will undo the write. + */ + } else { + if ((ret = __db_appname(env, + appname == DB_APP_DATA ? DB_APP_RECOVER : + appname, name->data, NULL, &path)) != 0) + goto end; + + if (__os_open(env, path, 0, 0, DB_MODE_600, &fhp) != 0) + goto end; + + if (flag & DB_FOP_APPEND) { + /* + * Appended to the end of the file, undo by + * truncating the file. + */ + (void)__os_truncate(env, fhp, 0, 0, offset); + } else { + /* + * Data overwritten in the middle of the file, + * undo by writing back in the old data. + */ + + /* Seek to offset. */ + if ((__os_seek(env, fhp, 0, 0, offset)) != 0) + goto end; + + /* Now do the write. */ + ret = __os_write(env, fhp, + old_data->data, old_data->size, &nbytes); + } + } + } else if (DB_REDO(op)) { + /* + * Not all operations log enough data to be redone. Since + * files are flushed before the transaction commit this is + * not an issue, unless we are on an HA client or initializing + * from a backup. + */ + if (flag & DB_FOP_REDO) { + ret = __fop_write_file(env, txn, name->data, + dirname->size == 0 ? NULL : dirname->data, + appname == DB_APP_DATA ? DB_APP_RECOVER : appname, + NULL, offset, new_data->data, new_data->size, 0); +#ifdef HAVE_REPLICATION + /* + * Blob files of databases that are not replicated are + * also not replicated. So assume any ENOENT errors + * are because the file was not replicated. + */ + if (ret == ENOENT && IS_VIEW_SITE(env)) + ret = 0; +#endif + } else { + /* DB_ASSERT(env, !IS_REP_CLIENT(env)); */ + } + } + +end: if (path != NULL) + __os_free(env, path); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + return (ret); +} +/* + * __fop_write_file_recover -- + * Recovery function for writing to a blob file. 
Files are flushed before + * the transaction is committed, so often the file operations do not need + * to be redone or undone. However, since no lsn is stored in the file, + * we always try to redo or undo the operation, since it will not change + * the final state of the file if the operation is not needed. This also + * means that this function has to be very tolerant of errors, such as + * trying to open a file that was deleted, or truncate a file that is + * already short. + * + * PUBLIC: int __fop_write_file_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_write_file_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_write_file_args *argp; + int ret; + COMPQUIET(info, NULL); + +#ifndef HAVE_64BIT_TYPES + COMPQUIET(dbtp, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, 0); + __db_errx(env, DB_STR("0244", + "Blobs require 64 integer compiler support.")); + return (DB_OPNOTSUP); +#endif + + REC_PRINT(__fop_write_file_print); + REC_NOOP_INTRO(__fop_write_file_read); + + ret = __fop_write_file_recover_int(env, op, + (APPNAME)argp->appname, argp->flag, &argp->dirname, &argp->name, + &argp->new_data, &argp->old_data, (off_t)argp->offset, argp->txnp); if (ret == 0) *lsnp = argp->prev_lsn; REC_NOOP_CLOSE; } /* + * __fop_write_file_60_recover -- + * Recovery function for the __fop_write_file_60 record. Combines the + * logged offset_lo/offset_hi pair into a single offset with GET_LO_HI, + * then runs the shared __fop_write_file_recover_int logic. + * + * PUBLIC: int __fop_write_file_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_write_file_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_write_file_60_args *argp; + off_t offset; + int ret; + COMPQUIET(info, NULL); + + REC_PRINT(__fop_write_file_60_print); + REC_NOOP_INTRO(__fop_write_file_60_read); + + /* The offset is stored as two u_int32_t values.
*/ + GET_LO_HI(env, argp->offset_lo, argp->offset_hi, offset, ret); + if (ret != 0) + goto end; + + ret = __fop_write_file_recover_int(env, op, + (APPNAME)argp->appname, argp->flag, &argp->dirname, &argp->name, + &argp->new_data, &argp->old_data, offset, argp->txnp); + +end: if (ret == 0) + *lsnp = argp->prev_lsn; + REC_NOOP_CLOSE; +} + +/* * __fop_rename_recover -- * Recovery functions for rename. There are two variants that * both use the same utility function. Had we known about this on day @@ -408,7 +831,148 @@ __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo) if (__fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0) goto done; - if (__db_chk_meta(env, NULL, meta, 1) != 0) + if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0) + goto done; + if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) + goto done; + (void)__os_closehandle(env, fhp); + fhp = NULL; + if (DB_REDO(op)) { + /* + * Check to see if the target file exists. If it + * does and it does not have the proper id then + * it is a later version. We just remove the source + * file since the state of the world is beyond this + * point.
+ */ + if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 && + __fop_read_meta(env, src, mbuf, + DBMETASIZE, fhp, 1, NULL) == 0 && + __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 && + memcmp(argp->fileid.data, + meta->uid, DB_FILE_ID_LEN) != 0) { + (void)__memp_nameop(env, + fileid, NULL, real_old, NULL, 0); + goto done; + } + } + } + + if (undo && DB_UNDO(op)) + (void)__memp_nameop(env, fileid, + (const char *)argp->oldname.data, real_new, real_old, 0); + if (DB_REDO(op)) + (void)__memp_nameop(env, fileid, + (const char *)argp->newname.data, real_old, real_new, 0); + +done: *lsnp = argp->prev_lsn; +out: if (real_new != NULL) + __os_free(env, real_new); + if (real_old != NULL) + __os_free(env, real_old); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + + REC_NOOP_CLOSE; +} + +/* + * __fop_rename_60_recover -- + * Recovery entry points that both forward to + * __fop_rename_60_recover_int. The regular variant passes undo=1 and + * the noundo variant passes undo=0. + * + * PUBLIC: int __fop_rename_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + * + * PUBLIC: int __fop_rename_noundo_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ + +int +__fop_rename_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + return (__fop_rename_60_recover_int(env, dbtp, lsnp, op, info, 1)); +} + +int +__fop_rename_noundo_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + return (__fop_rename_60_recover_int(env, dbtp, lsnp, op, info, 0)); +} + +static int +__fop_rename_60_recover_int(env, dbtp, lsnp, op, info, undo) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; + int undo; +{ + __fop_rename_60_args *argp; + APPNAME appname; + DB_FH *fhp; + DBMETA *meta; + u_int8_t *fileid, mbuf[DBMETASIZE]; + int ret; + char *real_new, *real_old, *src; + const char *dirname; + + COMPQUIET(info, NULL); + + fhp = NULL; + meta = (DBMETA *)&mbuf[0]; + ret = 0; + real_new = real_old = NULL; + + REC_PRINT(__fop_rename_60_print); +
REC_NOOP_INTRO(__fop_rename_60_read); + fileid = argp->fileid.data; + + if (argp->dirname.size == 0) + dirname = NULL; + else + dirname = (const char *)argp->dirname.data; + + + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + if (appname == DB_APP_DATA) + appname = DB_APP_RECOVER; + + if ((ret = __db_appname(env, appname, (const char *)argp->newname.data, + &dirname, &real_new)) != 0) + goto out; + if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data, + &dirname, &real_old)) != 0) + goto out; + + /* + * Verify that we are manipulating the correct file. We should always + * be OK on an ABORT or an APPLY, but during recovery, we have to + * check. + */ + if (op != DB_TXN_ABORT && op != DB_TXN_APPLY) { + src = DB_UNDO(op) ? real_new : real_old; + /* + * Interpret any error as meaning that the file either doesn't + * exist, doesn't have a meta-data page, or is in some other + * way, shape or form, incorrect, so that we should not restore + * it. + */ + if (__os_open(env, src, 0, 0, 0, &fhp) != 0) + goto done; + if (__fop_read_meta(env, + src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0) + goto done; + if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0) goto done; if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) goto done; @@ -425,7 +989,7 @@ __fop_rename_recover_int(env, dbtp, lsnp, op, info, undo) if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 && __fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && - __db_chk_meta(env, NULL, meta, 1) == 0 && + __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 && memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) { (void)__memp_nameop(env, @@ -501,6 +1065,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo) DB_FH *fhp; DBMETA *meta; u_int8_t *fileid, mbuf[DBMETASIZE]; + APPNAME appname; int ret; char *real_new, *real_old, *src; @@ -515,10 +1080,11 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo) REC_NOOP_INTRO(__fop_rename_read); fileid = 
argp->fileid.data; - if ((ret = __db_appname(env, (APPNAME)argp->appname, + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + if ((ret = __db_appname(env, appname, (const char *)argp->newname.data, NULL, &real_new)) != 0) goto out; - if ((ret = __db_appname(env, (APPNAME)argp->appname, + if ((ret = __db_appname(env, appname, (const char *)argp->oldname.data, NULL, &real_old)) != 0) goto out; @@ -540,7 +1106,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo) if (__fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) != 0) goto done; - if (__db_chk_meta(env, NULL, meta, 1) != 0) + if (__db_chk_meta(env, NULL, meta, DB_CHK_META) != 0) goto done; if (memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) goto done; @@ -557,7 +1123,7 @@ __fop_rename_42_recover_int(env, dbtp, lsnp, op, info, undo) if (__os_open(env, real_new, 0, 0, 0, &fhp) == 0 && __fop_read_meta(env, src, mbuf, DBMETASIZE, fhp, 1, NULL) == 0 && - __db_chk_meta(env, NULL, meta, 1) == 0 && + __db_chk_meta(env, NULL, meta, DB_CHK_META) == 0 && memcmp(argp->fileid.data, meta->uid, DB_FILE_ID_LEN) != 0) { (void)__memp_nameop(env, @@ -652,7 +1218,115 @@ __fop_file_remove_recover(env, dbtp, lsnp, op, info) * We can ignore errors here since we'll simply fail the * checks below and assume this is the wrong file. */ - (void)__db_chk_meta(env, NULL, meta, 1); + (void)__db_chk_meta(env, NULL, meta, DB_CHK_META); + is_real = + memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0; + is_tmp = + memcmp(argp->tmp_fid.data, meta->uid, DB_FILE_ID_LEN) == 0; + + if (!is_real && !is_tmp) + /* File exists, but isn't what we were removing. */ + cstat = TXN_IGNORE; + else + /* File exists and is the one that we were removing. */ + cstat = TXN_COMMIT; + } + if (fhp != NULL) { + (void)__os_closehandle(env, fhp); + fhp = NULL; + } + + if (DB_UNDO(op)) { + /* On the backward pass, we leave a note for the child txn. 
*/ + if ((ret = __db_txnlist_update(env, + info, argp->child, cstat, NULL, &ret_stat, 1)) != 0) + goto out; + } else if (DB_REDO(op)) { + /* + * On the forward pass, check if someone recreated the + * file while we weren't looking. + */ + if (cstat == TXN_COMMIT) + (void)__memp_nameop(env, + is_real ? argp->real_fid.data : argp->tmp_fid.data, + NULL, real_name, NULL, 0); + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (real_name != NULL) + __os_free(env, real_name); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + REC_NOOP_CLOSE; +} + +/* + * __fop_file_remove_60_recover -- + * + * PUBLIC: int __fop_file_remove_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__fop_file_remove_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __fop_file_remove_60_args *argp; + DBMETA *meta; + DB_FH *fhp; + size_t len; + u_int8_t mbuf[DBMETASIZE]; + u_int32_t cstat, ret_stat; + APPNAME appname; + int is_real, is_tmp, ret; + char *real_name; + + fhp = NULL; + meta = (DBMETA *)&mbuf[0]; + is_real = is_tmp = 0; + real_name = NULL; + REC_PRINT(__fop_file_remove_60_print); + REC_NOOP_INTRO(__fop_file_remove_60_read); + + /* + * This record is only interesting on the backward, forward, and + * apply phases. + */ + if (op != DB_TXN_BACKWARD_ROLL && + op != DB_TXN_FORWARD_ROLL && op != DB_TXN_APPLY) + goto done; + + appname = __fop_convert_appname(env, (APPNAME53)argp->appname); + if ((ret = __db_appname(env, appname, + argp->name.data, NULL, &real_name)) != 0) + goto out; + + /* Verify that we are manipulating the correct file. */ + len = 0; + if (__os_open(env, real_name, 0, 0, 0, &fhp) != 0 || + (ret = __fop_read_meta(env, real_name, + mbuf, DBMETASIZE, fhp, 1, &len)) != 0) { + /* + * If len is non-zero, then the file exists and has something + * in it, but that something isn't a full meta-data page, so + * this is very bad. Bail out! 
+ */ + if (len != 0) + goto out; + + /* File does not exist. */ + cstat = TXN_EXPECTED; + } else { + /* + * We can ignore errors here since we'll simply fail the + * checks below and assume this is the wrong file. + */ + (void)__db_chk_meta(env, NULL, meta, DB_CHK_META); is_real = memcmp(argp->real_fid.data, meta->uid, DB_FILE_ID_LEN) == 0; is_tmp = @@ -695,3 +1369,4 @@ out: if (real_name != NULL) (void)__os_closehandle(env, fhp); REC_NOOP_CLOSE; } + diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c index 1925ffd1..d51aba0f 100644 --- a/src/fileops/fop_util.c +++ b/src/fileops/fop_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -24,9 +24,10 @@ static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t, u_int32_t)); static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *, const char *, const char *, const char *, DB_LOCKER *)); -static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *)); +static int __fop_ondisk_dummy __P(( + DB *, DB_TXN *, const char *, u_int8_t *, APPNAME)); static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *, - const char *, const char *, const char *, DB_LOCKER *)); + const char *, const char *, const char *, DB_LOCKER *, APPNAME)); /* * Acquire the environment meta-data lock. The parameters are the @@ -115,7 +116,7 @@ __fop_lock_handle(env, dbp, locker, mode, elockp, flags) /* * If we are in recovery, the only locking we should be * doing is on the global environment. The one exception - * is if we are opening an exclusive database on a client + * is if we are opening an exclusive database on a client * syncing with the master. 
*/ if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL)) @@ -234,8 +235,8 @@ __fop_file_setup(dbp, ip, txn, name, mode, flags, retidp) real_name = real_tmpname = tmpname = NULL; dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META : - (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA); - LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB); + (LF_ISSET(DB_INTERNAL_BLOB_DB) ? DB_APP_BLOB : + (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA)); ret = 0; retries = 0; @@ -394,14 +395,14 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = goto done; } - /* + /* * Case 4: This is a valid file. Now check the - * checksum and decrypt the file so the file + * checksum and decrypt the file so the file * id can be obtained for the handle lock. Note that * the checksum can fail if the database is being * written (possible because the handle lock has * not been obtained yet). So on checksum fail retry - * until the checksum succeeds or the number of + * until the checksum succeeds or the number of * retries is exhausted, then throw an error. */ if (ret == 0 && (ret = __db_chk_meta(env, dbp, @@ -410,7 +411,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = ret = t_ret; goto err; } - /* + /* * Retry unless the number of retries is * exhausted. */ @@ -423,8 +424,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = ret = EINVAL; goto err; } - if ((ret = __os_closehandle(env, fhp)) != 0) - goto err; + CLOSE_HANDLE(dbp, fhp); goto retry; } /* Get the file id for the handle lock. */ @@ -464,11 +464,8 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = * any application level FCNTL semantics. 
*/ DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING)); - if (!F_ISSET(dbp, DB_AM_INMEM)) { - if ((ret = __os_closehandle(env, fhp)) != 0) - goto err; - fhp = NULL; - } + if (!F_ISSET(dbp, DB_AM_INMEM)) + CLOSE_HANDLE(dbp, fhp); if ((ret = __fop_lock_handle(env, dbp, locker, lockmode, &elock, 0)) != 0) { if (F_ISSET(dbp, DB_AM_INMEM)) @@ -495,7 +492,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = } - /* + /* * If we got here, then we have the handle lock, it is now * safe to check the rest of the meta data, since the file * will not be deleted out from under the handle. @@ -505,7 +502,7 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = dbp, txn, name, flags, DB_SKIP_CHK)) != 0) goto err; } else { - if ((ret = __db_meta_setup(env, dbp, real_name, + if ((ret = __db_meta_setup(env, dbp, real_name, (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0) goto err; } @@ -524,9 +521,8 @@ reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = if (create_ok) { if (F_ISSET(dbp, DB_AM_INMEM)) { RESET_MPF(dbp, DB_MPOOL_DISCARD); - } else if ((ret = - __os_closehandle(env, fhp)) != 0) - goto err; + } else + CLOSE_HANDLE(dbp, fhp); LF_SET(DB_CREATE); goto create; } else { @@ -856,6 +852,7 @@ retry: if ((ret = __db_master_open(dbp, /* Copy the pagesize and set the sub-database flag. */ dbp->pgsize = mdbp->pgsize; F_SET(dbp, DB_AM_SUBDB); + dbp->blob_file_id = mdbp->blob_file_id; if (name != NULL && (ret = __db_master_update(mdbp, dbp, ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) { @@ -881,6 +878,8 @@ retry: if ((ret = __db_master_open(dbp, DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname); + dbp->dirname = mdbp->dirname; + /* * We copy our fileid from our master so that we all open * the same file in mpool. We'll use the meta-pgno to lock @@ -1174,13 +1173,14 @@ err: * remove). 
* * PUBLIC: int __fop_dummy __P((DB *, - * PUBLIC: DB_TXN *, const char *, const char *)); + * PUBLIC: DB_TXN *, const char *, const char *, APPNAME)); */ int -__fop_dummy(dbp, txn, old, new) +__fop_dummy(dbp, txn, old, new, appname) DB *dbp; DB_TXN *txn; const char *old, *new; + APPNAME appname; { DB *tmpdbp; DB_TXN *stxn; @@ -1214,17 +1214,19 @@ __fop_dummy(dbp, txn, old, new) if (F_ISSET(dbp, DB_AM_NOT_DURABLE) && (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0) goto err; + tmpdbp->dirname = dbp->dirname; memset(mbuf, 0, sizeof(mbuf)); ret = F_ISSET(dbp, DB_AM_INMEM) ? __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) : - __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf); + __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf, appname); if (ret != 0) goto err; ret = F_ISSET(dbp, DB_AM_INMEM) ? __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) : - __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker); + __fop_ondisk_swap( + dbp, tmpdbp, stxn, old, new, back, txn->locker, appname); stxn = NULL; if (ret != 0) goto err; @@ -1246,12 +1248,13 @@ err: if (stxn != NULL) * and the subsequent calls in __db_rename do the work for the * transactional case). * - * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *)); + * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *, APPNAME)); */ int -__fop_dbrename(dbp, old, new) +__fop_dbrename(dbp, old, new, appname) DB *dbp; const char *old, *new; + APPNAME appname; { DB_LOCK elock; ENV *env; @@ -1269,11 +1272,11 @@ __fop_dbrename(dbp, old, new) } else { /* Get full names. 
*/ if ((ret = __db_appname(env, - DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0) + appname, old, &dbp->dirname, &real_old)) != 0) goto err; if ((ret = __db_appname(env, - DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0) + appname, new, &dbp->dirname, &real_new)) != 0) goto err; } @@ -1414,9 +1417,11 @@ __fop_inmem_read_meta(dbp, txn, name, flags, chkflags) if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0) memcpy(dbp->fileid, ((DBMETA *)metap)->uid, DB_FILE_ID_LEN); - } else + } else ret = __db_meta_setup( dbp->env, dbp, name, metap, flags, chkflags); + if (ret == DB_CHKSUM_FAIL) + ret = DB_META_CHKSUM_FAIL; if ((t_ret = __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0) @@ -1426,11 +1431,12 @@ __fop_inmem_read_meta(dbp, txn, name, flags, chkflags) } static int -__fop_ondisk_dummy(dbp, txn, name, mbuf) +__fop_ondisk_dummy(dbp, txn, name, mbuf, appname) DB *dbp; DB_TXN *txn; const char *name; u_int8_t *mbuf; + APPNAME appname; { ENV *env; int ret; @@ -1442,11 +1448,11 @@ __fop_ondisk_dummy(dbp, txn, name, mbuf) dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? 
DB_LOG_NOT_DURABLE : 0; if ((ret = __db_appname(env, - DB_APP_DATA, name, &dbp->dirname, &realname)) != 0) + appname, name, &dbp->dirname, &realname)) != 0) goto err; if ((ret = __fop_create(env, - txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0) + txn, NULL, name, &dbp->dirname, appname, 0, dflags)) != 0) goto err; if ((ret = @@ -1455,7 +1461,7 @@ __fop_ondisk_dummy(dbp, txn, name, mbuf) ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC; if ((ret = __fop_write(env, txn, name, dbp->dirname, - DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0) + appname, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0) goto err; memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN); @@ -1511,11 +1517,12 @@ err: return (ret); } static int -__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker) +__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker, appname) DB *dbp, *tmpdbp; DB_TXN *txn; const char *old, *new, *back; DB_LOCKER *locker; + APPNAME appname; { DBT fiddbt, namedbt, tmpdbt; DB_FH *fhp; @@ -1538,7 +1545,7 @@ __fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker) dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; if ((ret = __db_appname(env, - DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0) + appname, new, &dbp->dirname, &realnew)) != 0) goto err; /* Now, lock the name space while we initialize this file. */ @@ -1634,10 +1641,10 @@ retry: GET_ENVLOCK(env, locker, &elock); * swap for the handle lock. 
*/ if ((ret = __fop_rename(env, txn, - old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0) + old, new, &dbp->dirname, dbp->fileid, appname, 1, dflags)) != 0) goto err; if ((ret = __fop_rename(env, txn, back, old, - &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0) + &dbp->dirname, tmpdbp->fileid, appname, 0, dflags)) != 0) goto err; if ((ret = __fop_lock_handle(env, tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0) @@ -1673,12 +1680,12 @@ retry: GET_ENVLOCK(env, locker, &elock); DB_INIT_DBT(namedbt, old, strlen(old) + 1); if ((t_ret = __fop_file_remove_log(env, parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt, - (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0) + (u_int32_t)appname, child_txnid)) != 0 && ret == 0) ret = t_ret; /* This is a delayed delete of the dummy file. */ if ((ret = __db_appname(env, - DB_APP_DATA, old, &dbp->dirname, &realold)) != 0) + appname, old, &dbp->dirname, &realold)) != 0) goto err; if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0) diff --git a/src/hash/hash.c b/src/hash/hash.c index ae5736e7..5bff1dee 100644 --- a/src/hash/hash.c +++ b/src/hash/hash.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -298,6 +298,7 @@ __hamc_count(dbc, recnop) } switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) { + case H_BLOB: case H_KEYDATA: case H_OFFPAGE: recno = 1; @@ -379,7 +380,7 @@ __hamc_del(dbc, flags) hcp = (HASH_CURSOR *)dbc->internal; if (F_ISSET(hcp, H_DELETED)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); if ((ret = __ham_get_meta(dbc)) != 0) goto out; @@ -535,7 +536,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); case DB_CURRENT: /* cgetchk has already determined that the cursor is set. 
*/ if (F_ISSET(hcp, H_DELETED)) { - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } @@ -554,7 +555,8 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); if (ret != 0 && ret != DB_NOTFOUND) goto err; else if (F_ISSET(hcp, H_OK)) { - if (*pgnop == PGNO_INVALID) + if (*pgnop == PGNO_INVALID && HPAGE_PTYPE( + H_PAIRDATA(dbp, hcp->page, hcp->indx)) != H_BLOB) ret = __ham_dup_return(dbc, data, flags); break; } else if (!F_ISSET(hcp, H_NOMORE)) { @@ -576,7 +578,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); dbc->thread_info, hcp->page, dbc->priority); hcp->page = NULL; if (hcp->bucket == 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); hcp->pgno = PGNO_INVALID; goto err; } @@ -598,7 +600,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); F_CLR(hcp, H_ISDUP); hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); if (hcp->bucket > hcp->hdr->max_bucket) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); hcp->pgno = PGNO_INVALID; goto err; } @@ -612,7 +614,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); case DB_SET: case DB_SET_RANGE: /* Key not found. */ - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; case DB_CURRENT: /* @@ -621,7 +623,7 @@ next: ret = __ham_item_next(dbc, lock_type, pgnop); * locking. We return the same error code as we would * if the cursor were deleted. 
*/ - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; default: DB_ASSERT(env, 0); @@ -649,11 +651,14 @@ __ham_bulk(dbc, data, flags) DB *dbp; DB_MPOOLFILE *mpf; HASH_CURSOR *cp; + HBLOB hblob; PAGE *pg; db_indx_t dup_len, dup_off, dup_tlen, indx, *inp; db_lockmode_t lock_mode; db_pgno_t pgno; + off_t blob_size; int32_t *endp, *offp, *saveoff; + db_seq_t blob_id; u_int32_t key_off, key_size, pagesize, size, space; u_int8_t *dbuf, *dp, *hk, *np, *tmp; int is_dup, is_key; @@ -708,6 +713,10 @@ next_pg: space -= key_size; key_off = (u_int32_t)(np - dbuf); np += key_size; + } else if (HPAGE_PTYPE(hk) == H_BLOB) { + __db_errx(dbp->env, DB_STR("1185", + "Blob item key.")); + (void)__env_panic(dbp->env, DB_RUNRECOVERY); } else { if (need_pg) { dp = np; @@ -982,6 +991,38 @@ get_space: np += size; space -= size; break; + case H_BLOB: + space -= (is_key ? 4 : 2) * sizeof(*offp); + if (space > data->ulen) + goto back_up; + + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(dbc->env, hblob, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) { + size = UINT32_MAX; + goto back_up; + } + size = (u_int32_t)blob_size; + if (size > space) + goto back_up; + + if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0) + return (ret); + + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } + + *offp-- = (int32_t)(np - dbuf); + *offp-- = (int32_t)size; + + np += size; + space -= size; + break; default: /* Do nothing. */ break; @@ -1014,7 +1055,7 @@ get_space: * DBC->get(DB_NEXT) will return DB_NOTFOUND. */ cp->bucket--; - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } else { /* * Start on the next bucket. 
@@ -1071,7 +1112,7 @@ __hamc_put(dbc, key, data, flags, pgnop) if (F_ISSET(hcp, H_DELETED) && flags != DB_KEYFIRST && flags != DB_KEYLAST && flags != DB_OVERWRITE_DUP) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); if ((ret = __ham_get_meta(dbc)) != 0) goto err1; @@ -1083,9 +1124,15 @@ __hamc_put(dbc, key, data, flags, pgnop) case DB_NOOVERWRITE: case DB_OVERWRITE_DUP: nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE : - HKEYDATA_PSIZE(key->size)) + - (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE : - HKEYDATA_PSIZE(data->size)); + HKEYDATA_PSIZE(key->size)); + if (dbp->blob_threshold && (data->size >= + dbp->blob_threshold || F_ISSET(data, DB_DBT_BLOB))) + nbytes += HBLOB_PSIZE; + else if (ISBIG(hcp, data->size)) + nbytes += HOFFPAGE_PSIZE; + else + nbytes += HKEYDATA_PSIZE(data->size); + if ((ret = __ham_lookup(dbc, key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) { if (hcp->seek_found_page != PGNO_INVALID && @@ -1124,7 +1171,7 @@ __hamc_put(dbc, key, data, flags, pgnop) } else if (ret == 0 && flags == DB_NOOVERWRITE && !F_ISSET(hcp, H_DELETED)) { if (*pgnop == PGNO_INVALID) - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); else ret = __bam_opd_exists(dbc, *pgnop); if (ret != 0) @@ -1468,6 +1515,7 @@ __ham_dup_return(dbc, val, flags) type = HPAGE_TYPE(dbp, hcp->page, ndx); pp = hcp->page; myval = val; + cmp = 0; /* * There are 4 cases: @@ -1545,9 +1593,13 @@ __ham_dup_return(dbc, val, flags) memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); if ((ret = __db_moff(dbc, val, pgno, tlen, - dbp->dup_compare, &cmp)) != 0) + dbp->dup_compare, &cmp, NULL)) != 0) return (ret); cmp = -cmp; + } else if (((HKEYDATA *)hk)->type == H_BLOB) { + __db_errx(dbp->env, DB_STR("1186", + "Error - found a blob file in a duplicate data set.")); + (void)__env_panic(dbp->env, DB_RUNRECOVERY); } else { /* * We do not zero tmp_val since the comparison @@ -1557,8 +1609,8 @@ __ham_dup_return(dbc, val, flags) tmp_val.size = LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx); 
cmp = dbp->dup_compare == NULL ? - __bam_defcmp(dbp, &tmp_val, val) : - dbp->dup_compare(dbp, &tmp_val, val); + __bam_defcmp(dbp, &tmp_val, val, NULL) : + dbp->dup_compare(dbp, &tmp_val, val, NULL); } if (cmp > 0 && flags == DB_GET_BOTH_RANGE && @@ -1567,7 +1619,7 @@ __ham_dup_return(dbc, val, flags) } if (cmp != 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } /* @@ -1654,17 +1706,21 @@ __ham_overwrite(dbc, nval, flags) u_int32_t flags; { DB *dbp; - DBT *myval, tmp_val, tmp_val2; + DBT *myval, tmp_val, tmp_val2, old_rec, new_rec; ENV *env; HASH_CURSOR *hcp; + HBLOB hblob; void *newrec; u_int8_t *hk, *p; u_int32_t len, nondup_size; + db_seq_t blob_id, new_blob_id; db_indx_t newsize; + off_t blob_size; int ret; dbp = dbc->dbp; env = dbp->env; + ret = 0; hcp = (HASH_CURSOR *)dbc->internal; if (F_ISSET(hcp, H_ISDUP)) { /* @@ -1717,7 +1773,7 @@ __ham_overwrite(dbc, nval, flags) NULL, nval, flags, NULL)); } - if ((ret = __os_malloc(dbp->env, + if ((ret = __os_malloc(env, DUP_SIZE(newsize), &newrec)) != 0) return (ret); memset(&tmp_val2, 0, sizeof(tmp_val2)); @@ -1765,7 +1821,7 @@ __ham_overwrite(dbc, nval, flags) (u_int8_t *)newrec + sizeof(db_indx_t); tmp_val2.size = newsize; if (dbp->dup_compare( - dbp, &tmp_val, &tmp_val2) != 0) { + dbp, &tmp_val, &tmp_val2, NULL) != 0) { __os_free(env, newrec); return (__db_duperr(dbp, flags)); } @@ -1816,7 +1872,7 @@ __ham_overwrite(dbc, nval, flags) sizeof(db_indx_t); tmp_val2.size = hcp->dup_len; if (dbp->dup_compare( - dbp, nval, &tmp_val2) != 0) { + dbp, nval, &tmp_val2, NULL) != 0) { __db_errx(env, DB_STR("1131", "Existing data sorts differently from put data")); return (EINVAL); @@ -1848,16 +1904,84 @@ __ham_overwrite(dbc, nval, flags) hcp->dup_len = (db_indx_t)nval->size; } myval = &tmp_val; + goto end; + } + hk = H_PAIRDATA(dbp, hcp->page, hcp->indx); + if (HPAGE_PTYPE(hk) == H_BLOB) { + memcpy(&hblob, hk, HBLOB_SIZE); + memset(&old_rec, 0, sizeof(DBT)); + memset(&new_rec, 0, sizeof(DBT)); + if 
(DBC_LOGGING(dbc)) { + new_rec.data = HKEYDATA_DATA(&hblob); + if ((ret = __os_malloc( + env, HBLOB_SIZE, &old_rec.data)) != 0) + return (ret); + memcpy(old_rec.data, + HKEYDATA_DATA(&hblob), HBLOB_DSIZE); + new_rec.size = old_rec.size = HBLOB_DSIZE; + } + /* + * Inserting a blob record instead of blob data, only + * used internally by the DB_STREAM api. + */ + if (F_ISSET(nval, DB_DBT_BLOB_REC)) { + DB_ASSERT(env, nval->size == HBLOB_SIZE); + DB_ASSERT(env, HPAGE_PTYPE(nval->data) == H_BLOB); + memcpy(&hblob, nval->data, nval->size); + } else { + /* + * A blob file overwrite is simpler than other + * replace operations. It's simply a matter + * deleting the old blob file, and creating a + * new one. We may need to be careful of + * cursors when we have support for blob + * cursors. + * That means that we can skip the replpair + * call. + */ + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0) + return (ret); + if ((ret = __blob_repl(dbc, + nval, blob_id, &new_blob_id, &blob_size)) == 0) { + SET_BLOB_ID(&hblob, new_blob_id, HBLOB); + SET_BLOB_SIZE(&hblob, blob_size, HBLOB); + } + } + if (ret == 0) { + if (DBC_LOGGING(dbc)) { + if ((ret = __ham_replace_log(dbp, + dbc->txn, &LSN(hcp->page), 0, + PGNO(hcp->page), + (u_int32_t)H_DATAINDEX(hcp->indx), + &LSN(hcp->page), 0, + OP_SET(H_BLOB, hcp->page), &old_rec, + OP_SET(H_BLOB, hcp->page), + &new_rec)) != 0) { + memcpy(HKEYDATA_DATA(&hblob), + old_rec.data, HBLOB_DSIZE); + __os_free(env, old_rec.data); + return (ret); + } + + } else + LSN_NOT_LOGGED(LSN(hcp->page)); + } + /* Copy the updated blob data back to the page. 
*/ + memcpy(hk, &hblob, HBLOB_SIZE); + if (old_rec.data != NULL) + __os_free(env, old_rec.data); + return (ret); } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { /* Put/overwrite */ memcpy(&tmp_val, nval, sizeof(*nval)); F_SET(&tmp_val, DB_DBT_PARTIAL); tmp_val.doff = 0; - hk = H_PAIRDATA(dbp, hcp->page, hcp->indx); - if (HPAGE_PTYPE(hk) == H_OFFPAGE) + if (HPAGE_PTYPE(hk) == H_OFFPAGE) { memcpy(&tmp_val.dlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); - else + } else tmp_val.dlen = LEN_HDATA(dbp, hcp->page, hcp->hdr->dbmeta.pagesize, hcp->indx); myval = &tmp_val; @@ -1865,7 +1989,7 @@ __ham_overwrite(dbc, nval, flags) /* Regular partial put */ myval = nval; - return (__ham_replpair(dbc, myval, +end: return (__ham_replpair(dbc, myval, F_ISSET(hcp, H_ISDUP) ? H_DUPLICATE : H_KEYDATA)); } @@ -1955,7 +2079,7 @@ __ham_lookup(dbc, key, sought, mode, pgnop) return (ret); } F_SET(hcp, H_NOMORE); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } /* diff --git a/src/hash/hash.src b/src/hash/hash.src index e544c6f3..f56a9c5b 100644 --- a/src/hash/hash.src +++ b/src/hash/hash.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/hash/hash_compact.c b/src/hash/hash_compact.c index 83b5ffb1..79fb6004 100644 --- a/src/hash/hash_compact.c +++ b/src/hash/hash_compact.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* $Id$ */ @@ -118,7 +118,8 @@ __ham_compact_int(dbc, start, stop, factor, c_data, donep, flags) break; origpgno = pgno; if ((ret = __db_truncate_root(dbc, hcp->page, - H_DATAINDEX(hcp->indx), &pgno, 0)) != 0) + H_DATAINDEX(hcp->indx), + &pgno, 0, &pgs_done)) != 0) break; if (pgno != origpgno) { memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp, @@ -247,7 +248,7 @@ __ham_compact_bucket(dbc, c_data, pgs_donep) if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID && PGNO(pg) > c_data->compact_truncate && (ret = __db_exchange_page(dbc, &pg, - hcp->page, PGNO_INVALID, DB_EXCH_FREE)) != 0) + hcp->page, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) break; if (pgno != PGNO(pg)) (*pgs_donep)++; @@ -400,8 +401,8 @@ __ham_truncate_overflow(dbc, indx, c_data, pgs_done) if ((ret = __memp_dirty(dbp->mpf, &hcp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) return (ret); - if ((ret = - __db_truncate_root(dbc, hcp->page, indx, &pgno, 0)) != 0) + if ((ret = __db_truncate_root(dbc, + hcp->page, indx, &pgno, 0, pgs_done)) != 0) return (ret); if (pgno != origpgno) { memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), @@ -410,7 +411,8 @@ __ham_truncate_overflow(dbc, indx, c_data, pgs_done) c_data->compact_pages--; } } - if ((ret = __db_truncate_overflow(dbc, pgno, NULL, c_data)) != 0) + if ((ret = + __db_truncate_overflow(dbc, pgno, NULL, c_data, pgs_done)) != 0) return (ret); return (0); } @@ -434,10 +436,11 @@ __ham_compact_hash(dbp, ip, txn, c_data) HMETA *meta; PAGE *oldpage; db_pgno_t free_pgno, last_pgno, pgno, start_pgno; - int flags, local_txn, ret, t_ret; + int flags, local_txn, pgs_done, ret, t_ret; u_int32_t bucket, i, size; local_txn = IS_DB_AUTO_COMMIT(dbp, txn); + pgs_done = 0; oldpage = NULL; dbc = NULL; LOCK_INIT(lock); @@ -506,8 +509,8 @@ __ham_compact_hash(dbp, ip, txn, c_data) flags = 0; else flags = DB_EXCH_FREE; - if ((ret = __db_exchange_page(dbc, - &oldpage, NULL, free_pgno, flags)) != 0) + if ((ret = __db_exchange_page(dbc, &oldpage, + NULL, free_pgno, flags, 
&pgs_done)) != 0) goto err; } else if (pgno >= last_pgno) { if ((ret = __db_free(dbc, oldpage, 0)) != 0) @@ -526,7 +529,8 @@ __ham_compact_hash(dbp, ip, txn, c_data) } if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) && PGNO(hcp->hdr) > c_data->compact_truncate) - ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, c_data); + ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, + c_data, &pgs_done); err: if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf, dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0) diff --git a/src/hash/hash_conv.c b/src/hash/hash_conv.c index fa084f2a..7a53a037 100644 --- a/src/hash/hash_conv.c +++ b/src/hash/hash_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -104,7 +104,12 @@ __ham_mswap(env, pg) SWAP32(p); /* h_charkey */ for (i = 0; i < NCACHED; ++i) SWAP32(p); /* spares */ - p += 59 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* threshold */ + SWAP32(p); /* file id lo */ + SWAP32(p); /* file id hi */ + SWAP32(p); /* sdb id lo */ + SWAP32(p); /* sdb id hi */ + p += 54 * sizeof(u_int32_t); /* unused */ SWAP32(p); /* crypto_magic */ return (0); } diff --git a/src/hash/hash_dup.c b/src/hash/hash_dup.c index 879c33d7..523d7227 100644 --- a/src/hash/hash_dup.c +++ b/src/hash/hash_dup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994 @@ -368,6 +368,7 @@ finish: if (ret == 0) { off += len + 2 * sizeof(db_indx_t); } break; + case H_BLOB: default: ret = __db_pgfmt(env, hcp->pgno); break; @@ -772,7 +773,7 @@ __ham_dsearch(dbc, dbt, offp, cmpp, flags) DBT cur; HASH_CURSOR *hcp; db_indx_t i, len; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); u_int8_t *data; dbp = dbc->dbp; @@ -794,7 +795,7 @@ __ham_dsearch(dbc, dbt, offp, cmpp, flags) * we're done. In the latter case, if permitting partial * matches, it's not a failure. */ - *cmpp = func(dbp, dbt, &cur); + *cmpp = func(dbp, dbt, &cur, NULL); if (*cmpp == 0) break; if (*cmpp < 0 && dbp->dup_compare != NULL) { diff --git a/src/hash/hash_func.c b/src/hash/hash_func.c index baf6061c..1e83b00a 100644 --- a/src/hash/hash_func.c +++ b/src/hash/hash_func.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993 diff --git a/src/hash/hash_meta.c b/src/hash/hash_meta.c index d9a35cb4..aefdffb8 100644 --- a/src/hash/hash_meta.c +++ b/src/hash/hash_meta.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/hash/hash_method.c b/src/hash/hash_method.c index 1da81e70..a05bcea6 100644 --- a/src/hash/hash_method.c +++ b/src/hash/hash_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -20,7 +20,7 @@ static int __ham_set_h_hash static int __ham_set_h_nelem __P((DB *, u_int32_t)); static int __ham_get_h_compare - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); /* * __ham_db_create -- @@ -153,7 +153,7 @@ __ham_set_h_hash(dbp, func) static int __ham_get_h_compare(dbp, funcp) DB *dbp; - int (**funcp) __P((DB *, const DBT *, const DBT *)); + int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *)); { HASH *t; @@ -170,13 +170,13 @@ __ham_get_h_compare(dbp, funcp) * __ham_set_h_compare -- * Set the comparison function. * - * PUBLIC: int __ham_set_h_compare - * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + * PUBLIC: int __ham_set_h_compare __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *))); */ int __ham_set_h_compare(dbp, func) DB *dbp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); { HASH *t; diff --git a/src/hash/hash_open.c b/src/hash/hash_open.c index 3d0bb220..0104a57f 100644 --- a/src/hash/hash_open.c +++ b/src/hash/hash_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -44,6 +44,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/hash.h" @@ -149,6 +150,7 @@ __ham_metachk(dbp, name, hashm) int ret; env = dbp->env; + ret = 0; /* * At this point, all we know is that the magic number is for a Hash. @@ -168,6 +170,7 @@ __ham_metachk(dbp, name, hashm) case 7: case 8: case 9: + case 10: break; default: __db_errx(env, DB_STR_A("1126", @@ -230,6 +233,29 @@ __ham_metachk(dbp, name, hashm) /* Set the page size. 
*/ dbp->pgsize = hashm->dbmeta.pagesize; + dbp->blob_threshold = hashm->blob_threshold; + GET_BLOB_FILE_ID(env, hashm, dbp->blob_file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB_SDB_ID(env, hashm, dbp->blob_sdb_id, ret); + if (ret != 0) + return (ret); + /* Blob databases must be upgraded. */ + if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) { + __db_errx(env, DB_STR_A("1208", +"%s: databases that support blobs must be upgraded.", "%s"), + name); + return (EINVAL); + } +#ifndef HAVE_64BIT_TYPES + if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) { + __db_errx(env, DB_STR_A("1202", + "%s: blobs require 64 integer compiler support.", "%s"), + name); + return (EINVAL); + } +#endif + /* Copy the file's ID. */ memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN); @@ -297,6 +323,9 @@ __ham_init_meta(dbp, meta, pgno, lsnp) meta->nelem = hashp->h_nelem; meta->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY)); memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN); + meta->blob_threshold = dbp->blob_threshold; + SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, HMETA); + SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, HMETA); if (F_ISSET(dbp, DB_AM_DUP)) F_SET(&meta->dbmeta, DB_HASH_DUP); @@ -414,6 +443,12 @@ __ham_new_file(dbp, ip, txn, fhp, name) F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP)); pdbt.data = &pginfo; pdbt.size = sizeof(pginfo); + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids( + dbp, txn, &dbp->blob_file_id)) != 0) + return (ret); + + } if ((ret = __os_calloc(dbp->env, 1, dbp->pgsize, &buf)) != 0) return (ret); meta = (HMETA *)buf; @@ -491,6 +526,13 @@ __ham_new_subdb(mdbp, dbp, ip, txn) LOCK_INIT(metalock); LOCK_INIT(mmlock); + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids( + dbp, txn, &dbp->blob_sdb_id)) != 0) + return (ret); + + } + if ((ret = __db_cursor(mdbp, ip, txn, &dbc, CDB_LOCKING(env) ? 
DB_WRITECURSOR : 0)) != 0) return (ret); diff --git a/src/hash/hash_page.c b/src/hash/hash_page.c index 7576fe61..8e0f897d 100644 --- a/src/hash/hash_page.c +++ b/src/hash/hash_page.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994 @@ -129,7 +129,7 @@ recheck: /* Fetch next page. */ if (NEXT_PGNO(hcp->page) == PGNO_INVALID) { F_SET(hcp, H_NOMORE); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } next_pgno = NEXT_PGNO(hcp->page); hcp->indx = 0; @@ -344,7 +344,7 @@ __ham_item_prev(dbc, mode, pgnop) if (hcp->pgno == PGNO_INVALID) { /* Beginning of bucket. */ F_SET(hcp, H_NOMORE); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } else if ((ret = __ham_next_cpage(dbc, hcp->pgno)) != 0) return (ret); @@ -371,7 +371,7 @@ __ham_item_prev(dbc, mode, pgnop) if (hcp->indx == 0) { /* Bucket was empty. */ F_SET(hcp, H_NOMORE); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } } @@ -497,7 +497,8 @@ __ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type) inp = P_INP(dbp, p); ksize = (key_type == H_OFFPAGE) ? key_dbt->size : HKEYDATA_SIZE(key_dbt->size); - dsize = (data_type == H_OFFPAGE || data_type == H_OFFDUP) ? + dsize = (data_type == H_OFFPAGE || + data_type == H_OFFDUP || data_type == H_BLOB) ? 
data_dbt->size : HKEYDATA_SIZE(data_dbt->size); increase = ksize + dsize; @@ -579,7 +580,8 @@ __ham_insertpair(dbc, p, indxp, key_dbt, data_dbt, key_type, data_type) else PUT_HKEYDATA(P_ENTRY(dbp, p, indx), key_dbt->data, key_dbt->size, key_type); - if (data_type == H_OFFPAGE || data_type == H_OFFDUP) + if (data_type == H_BLOB || + data_type == H_OFFPAGE || data_type == H_OFFDUP) memcpy(P_ENTRY(dbp, p, indx+1), data_dbt->data, data_dbt->size); else @@ -618,6 +620,8 @@ __ham_getindex(dbc, p, key, key_type, match, indx) { /* Since all entries are key/data pairs. */ DB_ASSERT(dbc->env, NUM_ENT(p)%2 == 0 ); + /* Blob files can only be stored as data items. */ + DB_ASSERT(dbc->env, key_type != H_BLOB ); /* Support pre 4.6 unsorted hash pages. */ if (p->type == P_HASH_UNSORTED) @@ -672,7 +676,7 @@ __ham_getindex_unsorted(dbc, p, key, match, indx) memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); if ((ret = __db_moff(dbc, key, pgno, tlen, - t->h_compare, &res)) != 0) + t->h_compare, &res, NULL)) != 0) return (ret); } break; @@ -681,7 +685,7 @@ __ham_getindex_unsorted(dbc, p, key, match, indx) DB_INIT_DBT(pg_dbt, HKEYDATA_DATA(hk), key->size); if (t->h_compare( - dbp, key, &pg_dbt) != 0) + dbp, key, &pg_dbt, NULL) != 0) break; } else if (key->size == LEN_HKEY(dbp, p, dbp->pgsize, i)) @@ -784,7 +788,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, match, indxp) (void)__ua_memcpy(&off_pgno, HOFFPAGE_PGNO(offp), sizeof(db_pgno_t)); if ((ret = __db_moff(dbc, key, off_pgno, - itemlen, t->h_compare, &res)) != 0) + itemlen, t->h_compare, &res, NULL)) != 0) return (ret); } } else { @@ -799,7 +803,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, match, indxp) (void)__ua_memcpy(&off_len, HOFFPAGE_TLEN(offp), sizeof(u_int32_t)); if ((ret = __db_moff(dbc, &tmp_dbt, off_pgno, - off_len, t->h_compare, &res)) != 0) + off_len, t->h_compare, &res, NULL)) != 0) return (ret); /* * Since we switched the key/match parameters @@ -810,7 +814,7 @@ __ham_getindex_sorted(dbc, p, key, key_type, 
match, indxp) } else if (t->h_compare != NULL) { /* Case 4, with a user comparison func */ DB_INIT_DBT(tmp_dbt, data, itemlen); - res = t->h_compare(dbp, key, &tmp_dbt); + res = t->h_compare(dbp, key, &tmp_dbt, NULL); } else { /* Case 4, without a user comparison func */ if ((res = memcmp(key->data, data, @@ -899,8 +903,8 @@ __ham_verify_sorted_page (dbc, p) sizeof(u_int32_t)); memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i-2)), sizeof(db_pgno_t)); - if ((ret = __db_moff(dbc, - &curr_dbt, tpgno, tlen, t->h_compare, &res)) != 0) + if ((ret = __db_moff(dbc, &curr_dbt, + tpgno, tlen, t->h_compare, &res, NULL)) != 0) return (ret); } else if (HPAGE_TYPE(dbp, p, i) == H_OFFPAGE) { memset(&prev_dbt, 0, sizeof(prev_dbt)); @@ -910,8 +914,8 @@ __ham_verify_sorted_page (dbc, p) sizeof(u_int32_t)); memcpy(&tpgno, HOFFPAGE_PGNO(H_PAIRKEY(dbp, p, i)), sizeof(db_pgno_t)); - if ((ret = __db_moff(dbc, - &prev_dbt, tpgno, tlen, t->h_compare, &res)) != 0) + if ((ret = __db_moff(dbc, &prev_dbt, tpgno, tlen, + t->h_compare, &res, NULL)) != 0) return (ret); } else res = memcmp(prev, curr, min(curr_len, prev_len)); @@ -1047,9 +1051,11 @@ __ham_del_pair(dbc, flags, ppg) DBT data_dbt, key_dbt; DB_LSN new_lsn, *n_lsn, tmp_lsn; DB_MPOOLFILE *mpf; + HBLOB hblob; HASH_CURSOR *hcp; PAGE *n_pagep, *nn_pagep, *p, *p_pagep; db_ham_mode op; + db_seq_t blob_id; db_indx_t ndx; db_pgno_t chg_pgno, pgno, tmp_pgno; u_int32_t data_type, key_type, order; @@ -1067,6 +1073,8 @@ __ham_del_pair(dbc, flags, ppg) DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &hcp->page)) != 0) return (ret); p = hcp->page; + key_type = HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx)); + data_type = HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx)); /* * We optimize for the normal case which is when neither the key nor @@ -1075,8 +1083,7 @@ __ham_del_pair(dbc, flags, ppg) * to remove the big item and then update the page to remove the * entry referring to the big item. 
*/ - if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && - HPAGE_PTYPE(H_PAIRKEY(dbp, p, ndx)) == H_OFFPAGE) { + if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && key_type == H_OFFPAGE) { memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_KEYINDEX(ndx))), sizeof(db_pgno_t)); ret = __db_doff(dbc, pgno); @@ -1084,7 +1091,13 @@ __ham_del_pair(dbc, flags, ppg) ret = 0; if (!LF_ISSET(HAM_DEL_IGNORE_OFFPAGE) && ret == 0) - switch (HPAGE_PTYPE(H_PAIRDATA(dbp, p, ndx))) { + switch (data_type) { + case H_BLOB: + memcpy(&hblob, + P_ENTRY(dbp, p, H_DATAINDEX(ndx)), HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + ret = __blob_del(dbc, blob_id); + break; case H_OFFPAGE: memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(dbp, p, H_DATAINDEX(ndx))), @@ -1111,7 +1124,7 @@ __ham_del_pair(dbc, flags, ppg) /* Now log the delete off this page. */ if (DBC_LOGGING(dbc)) { hk = H_PAIRKEY(dbp, hcp->page, ndx); - if ((key_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) { + if (key_type == H_OFFPAGE) { key_dbt.data = hk; key_dbt.size = HOFFPAGE_SIZE; } else { @@ -1120,9 +1133,12 @@ __ham_del_pair(dbc, flags, ppg) LEN_HKEY(dbp, hcp->page, dbp->pgsize, ndx); } hk = H_PAIRDATA(dbp, hcp->page, ndx); - if ((data_type = HPAGE_PTYPE(hk)) == H_OFFPAGE) { + if (data_type == H_OFFPAGE) { data_dbt.data = hk; data_dbt.size = HOFFPAGE_SIZE; + } else if (data_type == H_BLOB) { + data_dbt.data = hk; + data_dbt.size = HBLOB_SIZE; } else if (data_type == H_OFFDUP) { data_dbt.data = hk; data_dbt.size = HOFFDUP_SIZE; @@ -1404,6 +1420,8 @@ __ham_replpair(dbc, dbt, newtype) * unless it is an append, when we extend the offpage item, and * update the HOFFPAGE item on the current page to have the new size * via a delete/add. + * + * Updating a record won't cause it to become a blob file or vice versa. 
*/ dbp = dbc->dbp; env = dbp->env; @@ -2464,15 +2482,18 @@ __ham_add_el(dbc, key, val, type) const DBT *pkey, *pdata; DB *dbp; DBT key_dbt, data_dbt; - DB_LSN new_lsn; + DB_LSN blob_lsn, new_lsn; DB_MPOOLFILE *mpf; HASH_CURSOR *hcp; HOFFPAGE doff, koff; + HBLOB dblob; PAGE *new_pagep; db_pgno_t next_pgno, pgno; + off_t file_size; + db_seq_t blob_id; u_int32_t data_size, data_type, key_size, key_type; u_int32_t pages, pagespace, pairsize; - int do_expand, is_keybig, is_databig, match, ret; + int do_expand, is_keybig, match, ret; dbp = dbc->dbp; mpf = dbp->mpf; @@ -2485,14 +2506,33 @@ __ham_add_el(dbc, key, val, type) dbc->thread_info, dbc->txn, DB_MPOOL_CREATE, &hcp->page)) != 0) return (ret); + /* + * Key is either: + * - On page + * - On overflow page(s) + */ key_size = HKEYDATA_PSIZE(key->size); - data_size = HKEYDATA_PSIZE(val->size); is_keybig = ISBIG(hcp, key->size); - is_databig = ISBIG(hcp, val->size); if (is_keybig) key_size = HOFFPAGE_PSIZE; - if (is_databig) + /* + * Data is either: + * - On page (H_KEYDATA or H_DUPLICATE) + * - On overflow page(s) + * - In a blob file + */ + data_type = + (dbp->blob_threshold && (val->size >= dbp->blob_threshold || + F_ISSET(val, DB_DBT_BLOB))) ? + H_BLOB : (ISBIG(hcp, val->size)) ? H_OFFPAGE : H_KEYDATA; + if (data_type == H_KEYDATA || data_type == H_DUPLICATE) + data_size = HKEYDATA_PSIZE(val->size); + else if (data_type == H_OFFPAGE) data_size = HOFFPAGE_PSIZE; + else { /* H_BLOB */ + DB_ASSERT(dbp->env, data_type == H_BLOB); + data_size = HBLOB_PSIZE; + } pairsize = key_size + data_size; @@ -2536,17 +2576,17 @@ __ham_add_el(dbc, key, val, type) * run out of file space before updating the key or data. 
*/ if (dbc->txn == NULL && - dbp->mpf->mfp->maxpgno != 0 && (is_keybig || is_databig)) { + dbp->mpf->mfp->maxpgno != 0 && + (is_keybig || data_type == H_OFFPAGE)) { pagespace = P_MAXSPACE(dbp, dbp->pgsize); pages = 0; - if (is_databig) + if (data_type == H_OFFPAGE) pages = ((data_size - 1) / pagespace) + 1; - if (is_keybig) { + if (is_keybig) pages += ((key->size - 1) / pagespace) + 1; - if (pages > - (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno)) - return (__db_space_err(dbp)); - } + if (pages > + (dbp->mpf->mfp->maxpgno - dbp->mpf->mfp->last_pgno)) + return (__db_space_err(dbp)); } if ((ret = __memp_dirty(mpf, @@ -2575,7 +2615,7 @@ __ham_add_el(dbc, key, val, type) key_type = H_KEYDATA; } - if (is_databig) { + if (data_type == H_OFFPAGE) { doff.type = H_OFFPAGE; UMRW_SET(doff.unused[0]); UMRW_SET(doff.unused[1]); @@ -2587,6 +2627,22 @@ __ham_add_el(dbc, key, val, type) data_dbt.size = sizeof(doff); pdata = &data_dbt; data_type = H_OFFPAGE; + } else if (data_type == H_BLOB) { + memset(&dblob, 0, HBLOB_SIZE); + dblob.type = H_BLOB; + blob_id = 0; + file_size = 0; + if ((ret = __blob_put( + dbc, (DBT *)val, &blob_id, &file_size, &blob_lsn)) != 0) + return (ret); + SET_BLOB_ID(&dblob, blob_id, HBLOB); + SET_BLOB_SIZE(&dblob, file_size, HBLOB); + SET_BLOB_FILE_ID(&dblob, dbp->blob_file_id, HBLOB); + SET_BLOB_SDB_ID(&dblob, dbp->blob_sdb_id, HBLOB); + data_dbt.data = &dblob; + data_dbt.size = sizeof(dblob); + pdata = &data_dbt; + data_type = H_BLOB; } else { pdata = val; data_type = type; @@ -2673,7 +2729,7 @@ __ham_add_el(dbc, key, val, type) /* * Special insert pair call -- copies a key/data pair from one page to * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA, - * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we + * H_DUPLICATE, H_OFFDUP, H_BLOB). Since we log splits at a high level, we * do not need to log them here. 
* * dest_indx is an optional parameter, it serves several purposes: @@ -2715,7 +2771,7 @@ __ham_copypair(dbc, src_page, src_ndx, dest_page, dest_indx, log) tkey.data = HKEYDATA_DATA(P_ENTRY(dbp, src_page, kindx)); tkey.size = LEN_HKEYDATA(dbp, src_page, dbp->pgsize, kindx); } - if (dtype == H_OFFPAGE || dtype == H_OFFDUP) { + if (dtype == H_OFFPAGE || dtype == H_OFFDUP || dtype == H_BLOB) { tdata.data = P_ENTRY(dbp, src_page, dindx); tdata.size = LEN_HITEM(dbp, src_page, dbp->pgsize, dindx); } else { diff --git a/src/hash/hash_rec.c b/src/hash/hash_rec.c index 58965569..8a39d880 100644 --- a/src/hash/hash_rec.c +++ b/src/hash/hash_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 @@ -232,6 +232,7 @@ __ham_insdel_42_recover(env, dbtp, lsnp, op, info) REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ? H_OFFPAGE : H_KEYDATA; + /* TODO: May need a PAIR_ISDATABLOB here. */ if (PAIR_ISDATADUP(argp->opcode)) dtype = H_DUPLICATE; else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode)) @@ -957,9 +958,8 @@ __ham_metagroup_recover(env, dbtp, lsnp, op, info) if (IS_ZERO_LSN(LSN(pagep))) { REC_DIRTY(mpf, ip, dbc->priority, &pagep); - P_INIT(pagep, file_dbp->pgsize, - PGNO_INVALID, PGNO_INVALID, PGNO_INVALID, - 0, P_HASH); + P_INIT(pagep, file_dbp->pgsize, pgno, + PGNO_INVALID, PGNO_INVALID, 0, P_HASH); } if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) diff --git a/src/hash/hash_reclaim.c b/src/hash/hash_reclaim.c index ce3f6d9e..55980444 100644 --- a/src/hash/hash_reclaim.c +++ b/src/hash/hash_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/hash/hash_stat.c b/src/hash/hash_stat.c index 683ce5a6..7ccf472d 100644 --- a/src/hash/hash_stat.c +++ b/src/hash/hash_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -188,15 +188,19 @@ __ham_stat_print(dbc, flags) sp->hash_bfree, sp->hash_buckets, sp->hash_pagesize), "ff"); __db_dl(env, - "Number of overflow pages", (u_long)sp->hash_bigpages); - __db_dl_pct(env, "Number of bytes free in overflow pages", + "Number of blobs", (u_long)sp->hash_nblobs); + __db_dl(env, + "Number of hash overflow (big item) pages", + (u_long)sp->hash_bigpages); + __db_dl_pct(env, + "Number of bytes free in hash overflow (big item) pages", (u_long)sp->hash_big_bfree, DB_PCT_PG( sp->hash_big_bfree, sp->hash_bigpages, sp->hash_pagesize), "ff"); __db_dl(env, "Number of bucket overflow pages", (u_long)sp->hash_overflows); __db_dl_pct(env, - "Number of bytes free in bucket overflow pages", + "Number of bytes free on bucket overflow pages", (u_long)sp->hash_ovfl_free, DB_PCT_PG( sp->hash_ovfl_free, sp->hash_overflows, sp->hash_pagesize), "ff"); @@ -258,6 +262,9 @@ __ham_stat_callback(dbc, pagep, cookie, putp) switch (*H_PAIRDATA(dbp, pagep, indx)) { case H_OFFDUP: break; + case H_BLOB: + sp->hash_nblobs++; + /* fall through */ case H_OFFPAGE: case H_KEYDATA: sp->hash_ndata++; @@ -480,6 +487,7 @@ __ham_traverse(dbc, mode, callback, cookie, look_past_max) opgno, callback, cookie)) != 0) goto err; break; + case H_BLOB: case H_KEYDATA: case H_DUPLICATE: break; diff --git a/src/hash/hash_stub.c b/src/hash/hash_stub.c index 57337ea9..89307670 100644 --- a/src/hash/hash_stub.c +++ b/src/hash/hash_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -127,6 +127,40 @@ __ham_46_hashmeta(dbp, real_name, flags, fhp, h, dirtyp) } int +__ham_60_hashmeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + COMPQUIET(real_name, NULL); + COMPQUIET(flags, 0); + COMPQUIET(fhp, NULL); + COMPQUIET(h, NULL); + COMPQUIET(dirtyp, NULL); + return (__db_no_hash_am(dbp->env)); +} + +int +__ham_60_hash(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + COMPQUIET(real_name, NULL); + COMPQUIET(flags, 0); + COMPQUIET(fhp, NULL); + COMPQUIET(h, NULL); + COMPQUIET(dirtyp, NULL); + return (__db_no_hash_am(dbp->env)); +} + +int __hamc_cmp(dbc, other_dbc, result) DBC *dbc, *other_dbc; int *result; diff --git a/src/hash/hash_upgrade.c b/src/hash/hash_upgrade.c index f66a7a58..17014a5c 100644 --- a/src/hash/hash_upgrade.c +++ b/src/hash/hash_upgrade.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/hash.h" #include "dbinc/db_upgrade.h" @@ -321,3 +322,93 @@ __ham_46_hash(dbp, real_name, flags, fhp, h, dirtyp) return (ret); } + +/* + * __ham_60_hashmeta-- + * Upgrade the version number. 
+ * + * PUBLIC: int __ham_60_hashmeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__ham_60_hashmeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HMETA33 *hmeta; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + COMPQUIET(dbp, NULL); + hmeta = (HMETA33 *)h; + + hmeta->dbmeta.version = 10; + *dirtyp = 1; + + return (0); +} + +/* + * __ham_60_hash -- + * Upgrade the blob records on the database hash leaf pages. + * + * PUBLIC: int __ham_60_hash + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__ham_60_hash(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HBLOB60 hb60; + HBLOB60P1 hb60p1; + HKEYDATA *hk; + db_seq_t blob_id, blob_size, file_id, sdb_id; + db_indx_t indx; + int ret; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + ret = 0; + + DB_ASSERT(dbp->env, HBLOB60_SIZE == HBLOB_SIZE); + for (indx = 0; indx < NUM_ENT(h); indx += 2) { + hk = (HKEYDATA *)H_PAIRDATA(dbp, h, indx); + if (HPAGE_PTYPE(hk) == H_BLOB) { + memcpy(&hb60, hk, HBLOB60_SIZE); + memset(&hb60p1, 0, HBLOB_SIZE); + hb60p1.type = hb60.type; + hb60p1.encoding = hb60.encoding; + GET_BLOB60_ID(dbp->env, hb60, blob_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SIZE(dbp->env, hb60, blob_size, ret); + if (ret != 0) + return (ret); + GET_BLOB60_FILE_ID(dbp->env, &hb60, file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SDB_ID(dbp->env, &hb60, sdb_id, ret); + if (ret != 0) + return (ret); + SET_BLOB_ID(&hb60p1, blob_id, HBLOB60P1); + SET_BLOB_SIZE(&hb60p1, blob_size, HBLOB60P1); + SET_BLOB_FILE_ID(&hb60p1, file_id, HBLOB60P1); + SET_BLOB_SDB_ID(&hb60p1, sdb_id, HBLOB60P1); + memcpy(hk, &hb60p1, HBLOB_SIZE); + *dirtyp = 1; + } + } + + return (ret); +} diff --git a/src/hash/hash_verify.c 
b/src/hash/hash_verify.c index 662e7ac8..302d42d8 100644 --- a/src/hash/hash_verify.c +++ b/src/hash/hash_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_verify.h" #include "dbinc/btree.h" @@ -47,6 +48,7 @@ __ham_vrfy_meta(dbp, vdp, m, pgno, flags) int i, ret, t_ret, isbad; u_int32_t pwr, mbucket; u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t)); + db_seq_t blob_id; env = dbp->env; isbad = 0; @@ -164,6 +166,55 @@ __ham_vrfy_meta(dbp, vdp, m, pgno, flags) } } +/* + * Where 64-bit integer support is not available, + * return an error if the file has any blobs. + */ + t_ret = 0; +#ifdef HAVE_64BIT_TYPES + GET_BLOB_FILE_ID(env, m, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1178", + "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + t_ret = 0; + GET_BLOB_SDB_ID(env, m, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1179", + "Page %lu: blob subdatabase id overflow.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#else /* HAVE_64BIT_TYPES */ + /* + * db_seq_t is an int on systems that do not have 64 integer types, so + * this will compile and run. 
+ */ + GET_BLOB_FILE_ID(env, m, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1203", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + GET_BLOB_SDB_ID(env, m, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1204", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#endif + err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) ret = t_ret; if (LF_ISSET(DB_SALVAGE) && @@ -272,12 +323,15 @@ __ham_vrfy_item(dbp, vdp, pgno, h, i, flags) PAGE *h; u_int32_t i, flags; { + HBLOB hblob; HOFFDUP hod; HOFFPAGE hop; VRFY_CHILDINFO child; VRFY_PAGEINFO *pip; db_indx_t offset, len, dlen, elen; int ret, t_ret; + off_t blob_size; + db_seq_t blob_id, file_id, sdb_id; u_int8_t *databuf; if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) @@ -287,6 +341,38 @@ __ham_vrfy_item(dbp, vdp, pgno, h, i, flags) case H_KEYDATA: /* Nothing to do here--everything but the type field is data */ break; + case H_BLOB: + /* + * Blob item. Check that the blob file exists and is the same + * file size as is stored in the database record.
+ */ + memcpy(&hblob, P_ENTRY(dbp, h, i), HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(dbp->env, hblob, blob_size, ret); + if (ret != 0 || blob_size < 0) { + EPRINT((dbp->env, DB_STR_A("1181", + "Page %lu: blob file size value has overflowed", + "%lu"), (u_long)pip->pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + file_id = (db_seq_t)hblob.file_id; + sdb_id = (db_seq_t)hblob.sdb_id; + if (file_id == 0 && sdb_id == 0) { + EPRINT((dbp->env, DB_STR_A("1184", + "Page %lu: invalid blob dir ids %llu %llu at item %lu", + "%lu %llu %llu %lu"), + (u_long)pip->pgno, (unsigned long long)file_id, + (unsigned long long)sdb_id, (u_long)i)); + ret = DB_VERIFY_BAD; + goto err; + } + if ((ret = __blob_vrfy(dbp->env, blob_id, + blob_size, file_id, sdb_id, pip->pgno, flags)) != 0) { + ret = DB_VERIFY_BAD; + goto err; + } + break; case H_DUPLICATE: /* Are we a datum or a key? Better be the former. */ if (i % 2 == 0) { @@ -822,15 +908,23 @@ __ham_salvage(dbp, vdp, pgno, h, handle, callback, flags) u_int32_t flags; { DBT dbt, key_dbt, unkdbt; + ENV *env; + HBLOB hblob; + char *prefix; db_pgno_t dpgno; int ret, err_ret, t_ret; - u_int32_t himark, i, ovfl_bufsz; - u_int8_t *hk, *p; + off_t blob_size, blob_offset, remaining; + u_int32_t blob_buf_size, himark, i, ovfl_bufsz; + u_int8_t *blob_buf, *hk, *p; + db_seq_t blob_id, file_id, sdb_id; void *buf, *key_buf; db_indx_t dlen, len, tlen; memset(&dbt, 0, sizeof(DBT)); dbt.flags = DB_DBT_REALLOC; + blob_buf = NULL; + blob_buf_size = 0; + env = dbp->env; DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1); @@ -840,9 +934,9 @@ __ham_salvage(dbp, vdp, pgno, h, handle, callback, flags) * Allocate a buffer for overflow items. Start at one page; * __db_safe_goff will realloc as needed. 
*/ - if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0) + if ((ret = __os_malloc(env, dbp->pgsize, &buf)) != 0) return (ret); - ovfl_bufsz = dbp->pgsize; + ovfl_bufsz = dbp->pgsize; himark = dbp->pgsize; for (i = 0;; i++) { @@ -886,6 +980,70 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len); 0, " ", handle, callback, 0, 0, vdp)) != 0) err_ret = ret; break; + case H_BLOB: + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0 || blob_size < 0) { + err_ret = DB_VERIFY_BAD; + continue; + } + file_id = (db_seq_t)hblob.file_id; + sdb_id = (db_seq_t)hblob.sdb_id; + /* Read the blob, in pieces if too large.*/ + blob_offset = 0; + if (blob_size > MEGABYTE) { + if (blob_buf_size < MEGABYTE) { + if ((ret = __os_realloc( + env, MEGABYTE, + &blob_buf)) != 0) { + err_ret = ret; + continue; + } + blob_buf_size = MEGABYTE; + } + } else if (blob_buf_size < blob_size) { + if ((ret = __os_realloc(env, + (size_t)blob_size, &blob_buf)) != 0) { + err_ret = ret; + continue; + } + blob_buf_size = (u_int32_t)blob_size; + } + dbt.data = blob_buf; + dbt.ulen = blob_buf_size; + remaining = blob_size; + prefix = " "; + do { + if ((ret = __blob_salvage(env, blob_id, + blob_offset, + (remaining < blob_buf_size ?
+ (size_t)remaining : blob_buf_size), + file_id, sdb_id, &dbt)) != 0) { + err_ret = DB_VERIFY_BAD; + break; + } + if (remaining > blob_buf_size) + F_SET( + vdp, SALVAGE_STREAM_BLOB); + else + F_CLR( + vdp, SALVAGE_STREAM_BLOB); + if ((ret = __db_vrfy_prdbt( + &dbt, 0, prefix, handle, + callback, 0, 0, vdp)) != 0) { + err_ret = ret; + break; + } + prefix = NULL; + blob_offset += dbt.size; + if (remaining < blob_buf_size) + remaining = 0; + else + remaining -= blob_buf_size; + } while (remaining > 0); + F_CLR(vdp, SALVAGE_STREAM_BLOB); + break; case H_OFFPAGE: if (len < HOFFPAGE_SIZE) { err_ret = DB_VERIFY_BAD; @@ -960,7 +1118,7 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len); */ memset(&key_dbt, 0, sizeof(key_dbt)); if ((ret = __os_malloc( - dbp->env, dbt.size, &key_buf)) != 0) + env, dbt.size, &key_buf)) != 0) return (ret); memcpy(key_buf, buf, dbt.size); key_dbt.data = key_buf; @@ -1002,7 +1160,7 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len); handle, callback, 0, 0, vdp)) != 0) err_ret = ret; } - __os_free(dbp->env, key_buf); + __os_free(env, key_buf); break; default: if (!LF_ISSET(DB_AGGRESSIVE)) @@ -1013,7 +1171,9 @@ keydata: memcpy(buf, HKEYDATA_DATA(hk), len); } } - __os_free(dbp->env, buf); + if (blob_buf != NULL) + __os_free(env, blob_buf); + __os_free(env, buf); if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) return (t_ret); return ((ret == 0 && err_ret != 0) ? 
err_ret : ret); @@ -1129,7 +1289,7 @@ __ham_dups_unsorted(dbp, buf, len) { DBT a, b; db_indx_t offset, dlen; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); memset(&a, 0, sizeof(DBT)); memset(&b, 0, sizeof(DBT)); @@ -1146,7 +1306,7 @@ __ham_dups_unsorted(dbp, buf, len) b.data = buf + offset + sizeof(db_indx_t); b.size = dlen; - if (a.data != NULL && func(dbp, &a, &b) > 0) + if (a.data != NULL && func(dbp, &a, &b, NULL) > 0) return (1); a.data = b.data; diff --git a/src/heap/heap.c b/src/heap/heap.c index ab404658..7aec416b 100644 --- a/src/heap/heap.c +++ b/src/heap/heap.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -24,6 +24,8 @@ static int __heapc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); static int __heapc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); static int __heapc_reloc __P((DBC *, DBT *, DBT *)); static int __heapc_reloc_partial __P((DBC *, DBT *, DBT *)); +static void __heapc_search __P((DBC *, HEAPPG *, db_indx_t, + int, db_indx_t *, int *)); static int __heapc_split __P((DBC *, DBT *, DBT *, int)); /* @@ -134,12 +136,15 @@ __heap_bulk(dbc, data, flags) DB_HEAP_RID prev_rid, rid; DBT sdata; HEAP_CURSOR *cp; + HEAPBLOBHDR bhdr; HEAPHDR *hdr; HEAPSPLITHDR *shdr; PAGE *pg; db_lockmode_t lock_type; int is_key, ret; int32_t *offp; + off_t blob_size; + db_seq_t blob_id; u_int32_t data_size, key_size, needed, space; u_int8_t *dbuf, *np; @@ -183,6 +188,7 @@ __heap_bulk(dbc, data, flags) next_pg: rid.indx = cp->indx; rid.pgno = cp->pgno; + prev_rid = rid; pg = cp->page; /* @@ -213,6 +219,14 @@ next_pg: if (F_ISSET(hdr, HEAP_RECSPLIT)) { shdr = (HEAPSPLITHDR *)hdr; data_size = DB_ALIGN(shdr->tsize, sizeof(u_int32_t)); + } else if (F_ISSET(hdr, HEAP_RECBLOB)) { + 
memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) + return (DB_BUFFER_SMALL); + data_size = (u_int32_t)blob_size; } else data_size = DB_ALIGN(hdr->size, sizeof(u_int32_t)); needed += 2 * sizeof(*offp) + data_size; @@ -250,13 +264,21 @@ next_pg: if ((ret = __heapc_gsplit( dbc, &sdata, NULL, NULL)) != 0) return (ret); - } else { + } else if (F_ISSET(hdr, HEAP_RECBLOB)) { + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + if ((ret = __blob_bulk( + dbc, data_size, blob_id, np)) != 0) + return (ret); + }else { memcpy(np, (u_int8_t *)hdr + sizeof(HEAPHDR), hdr->size); } *offp-- = (int32_t)(np - dbuf); if (F_ISSET(hdr, HEAP_RECSPLIT)) *offp-- = (int32_t)shdr->tsize; + else if (F_ISSET(hdr, HEAP_RECBLOB)) + *offp-- = (int32_t)data_size; else *offp-- = (int32_t)hdr->size; np += data_size; @@ -296,7 +318,6 @@ __heapc_close(dbc, root_pgno, rmroot) db_pgno_t root_pgno; int *rmroot; { - DB_MPOOLFILE *mpf; HEAP_CURSOR *cp; int ret; @@ -304,7 +325,6 @@ __heapc_close(dbc, root_pgno, rmroot) COMPQUIET(rmroot, 0); cp = (HEAP_CURSOR *)dbc->internal; - mpf = dbc->dbp->mpf; ret = 0; /* Release the page/lock held by the cursor. */ @@ -325,11 +345,14 @@ __heapc_del(dbc, flags) DB_MPOOLFILE *mpf; DBT hdr_dbt, log_dbt; HEAP *h; + HEAPBLOBHDR bhdr; HEAPHDR *hdr; HEAPPG *rpage; HEAP_CURSOR *cp; db_pgno_t region_pgno; - int oldspacebits, ret, spacebits, t_ret; + int ret, t_ret; + db_seq_t blob_id; + u_int32_t oldspacebits, spacebits; u_int16_t data_size, size; dbp = dbc->dbp; @@ -337,6 +360,7 @@ __heapc_del(dbc, flags) h = dbp->heap_internal; cp = (HEAP_CURSOR *)dbc->internal; rpage = NULL; + ret = 0; COMPQUIET(flags, 0); /* @@ -377,6 +401,14 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc, next_rid.indx = 0; } + /* Delete the blob file. 
*/ + if (F_ISSET(hdr, HEAP_RECBLOB)) { + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + if ((ret = __blob_del(dbc, blob_id)) != 0) + return (ret); + } + /* Log the deletion. */ if (DBC_LOGGING(dbc)) { hdr_dbt.data = hdr; @@ -384,8 +416,9 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc, log_dbt.data = (u_int8_t *)hdr + hdr_dbt.size; log_dbt.size = data_size; if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), - 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx, - size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) + 0, OP_SET(DB_REM_HEAP, cp->page), + cp->pgno, (u_int32_t)cp->indx, size, + &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; } else LSN_NOT_LOGGED(LSN(cp->page)); @@ -414,7 +447,7 @@ start: if (STD_LOCKING(dbc) && (ret = __db_lget(dbc, dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0) goto err; HEAP_SETSPACE(dbp, rpage, - cp->pgno - region_pgno - 1, spacebits); + (cp->pgno - region_pgno) - 1, spacebits); } err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND); @@ -443,7 +476,8 @@ err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND); /* * __heap_ditem -- - * Remove an item from a page. + * Remove an item from a page. Note when deleting blob records that the file + * has to be deleted separate from calling this function. 
* * PUBLIC: int __heap_ditem * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t)); @@ -537,19 +571,21 @@ __heapc_get(dbc, key, data, flags, pgnop) DB_MPOOLFILE *mpf; DB_LOCK meta_lock; DBT tmp_val; - HEAP *h; + HEAPBLOBHDR bhdr; HEAPHDR *hdr; HEAPMETA *meta; HEAPPG *dpage; HEAP_CURSOR *cp; db_lockmode_t lock_type; db_pgno_t pgno; - int cmp, f_indx, found, getpage, indx, ret; + int cmp, np_inc, f_indx, found, getpage, indx, ret; + off_t blob_size; + db_seq_t blob_id; dbp = dbc->dbp; mpf = dbp->mpf; - h = dbp->heap_internal; cp = (HEAP_CURSOR *)dbc->internal; + pgno = PGNO_INVALID; LOCK_INIT(meta_lock); COMPQUIET(pgnop, NULL); @@ -564,7 +600,7 @@ __heapc_get(dbc, key, data, flags, pgnop) else lock_type = DB_LOCK_READ; - ret = 0; + np_inc = ret = 0; found = getpage = FALSE; meta = NULL; dpage = NULL; @@ -579,7 +615,7 @@ __heapc_get(dbc, key, data, flags, pgnop) ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret); if (ret != 0) { if (ret == DB_PAGE_NOTFOUND) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -591,7 +627,7 @@ __heapc_get(dbc, key, data, flags, pgnop) hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, cp->indx); if (F_ISSET(hdr, HEAP_RECSPLIT) && !F_ISSET(hdr, HEAP_RECFIRST)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -610,7 +646,7 @@ first: pgno = FIRST_HEAP_DPAGE; ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret); if (ret != 0 ) { if (ret == DB_PAGE_NOTFOUND) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } dpage = (HEAPPG *)cp->page; @@ -620,25 +656,10 @@ first: pgno = FIRST_HEAP_DPAGE; * finding first non-split record or first piece of a * split record, then set up cursor. 
*/ - if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) { - for (indx = 0; - indx <= HEAP_HIGHINDX(dpage); indx++) { - if (HEAP_OFFSETTBL( - dbp, dpage)[indx] == 0) - continue; - hdr = (HEAPHDR *)P_ENTRY( - dbp, dpage, indx); - if (!F_ISSET(hdr, HEAP_RECSPLIT) || - F_ISSET(hdr, HEAP_RECFIRST)) { - found = TRUE; - cp->pgno = pgno; - cp->indx = indx; - break; - } - } - if (!found) - pgno++; - } else + __heapc_search(dbc, dpage, 0, 1, &cp->indx, &found); + if (found) + cp->pgno = pgno; + else pgno++; } break; @@ -668,7 +689,7 @@ last: pgno = PGNO_BASE_MD; while (!found) { /* Don't look earlier than the first data page. */ if (pgno < FIRST_HEAP_DPAGE) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -683,33 +704,33 @@ last: pgno = PGNO_BASE_MD; * non-split record or the first piece of a split record * is found. */ - if (TYPE(dpage) == P_HEAP && NUM_ENT(dpage) != 0) { - for (indx = HEAP_HIGHINDX(dpage); - indx >= 0; indx--) { - if (HEAP_OFFSETTBL( - dbp, dpage)[indx] == 0) - continue; - hdr = (HEAPHDR *)P_ENTRY( - dbp, dpage, indx); - if (!F_ISSET(hdr, HEAP_RECSPLIT) || - F_ISSET(hdr, HEAP_RECFIRST)) { - found = TRUE; - cp->pgno = pgno; - cp->indx = indx; - break; - } - } - if (!found) - pgno--; - } else + __heapc_search(dbc, + dpage, HEAP_HIGHINDX(dpage), 1, &cp->indx, &found); + if (found) + cp->pgno = pgno; + else pgno--; } break; case DB_NEXT_NODUP: case DB_NEXT: - /* If cursor not initialize, behave as DB_FIRST */ - if (dbc->internal->pgno == PGNO_INVALID) - goto first; + case DB_PREV_NODUP: + case DB_PREV: + /* + * np_inc stores whether to increment or decrement when + * iterating through records on a page and pages in the file. 
+ */ + if (flags == DB_NEXT_NODUP || flags == DB_NEXT) + np_inc = 1; + else + np_inc = -1; + /* If cursor not initialized, behave as DB_FIRST/DB_LAST */ + if (dbc->internal->pgno == PGNO_INVALID) { + if (np_inc == 1) + goto first; + else + goto last; + } /* * Acquire the current page with the lock we have already, @@ -720,108 +741,49 @@ last: pgno = PGNO_BASE_MD; goto err; dpage = (HEAPPG *)cp->page; - /* At end of current page, must get next page */ - if (cp->indx >= HEAP_HIGHINDX(dpage)) + if (np_inc == 1 && cp->indx >= HEAP_HIGHINDX(dpage)) + /* At end of current page, must get next page. */ getpage = TRUE; - - while (!found) { - if (getpage) { - pgno = cp->pgno + 1; - - /* Put current page/lock and get next one */ - ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret); - if (ret != 0) { - /* Beyond last page? */ - if (ret == DB_PAGE_NOTFOUND) - ret = DB_NOTFOUND; - goto err; - } - dpage = (HEAPPG *)cp->page; - - /* - * If page is a spam page or its a data - * page without entries, try again. - */ - if (TYPE(dpage) != P_HEAP || - (TYPE(dpage) == P_HEAP && - NUM_ENT(dpage) == 0)) - continue; - - /* When searching, indx gets bumped to 0 */ - cp->indx = -1; - getpage = FALSE; - } - + else if (np_inc == -1) { /* - * Bump index and loop through the offset table finding - * first nonzero entry. If the offset is for a split - * record, make sure it's the first piece of the split - * record. HEAP_HIGHINDX always points to highest filled - * entry on page. + * Loop through indexes and find first used slot. Check + * if already at the first slot. */ - cp->indx++; - for (indx=cp->indx; - indx <= HEAP_HIGHINDX(dpage); indx++) { - if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0) - continue; - hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx); - if (!F_ISSET(hdr, HEAP_RECSPLIT) || - F_ISSET(hdr, HEAP_RECFIRST)) { - found = TRUE; - cp->indx = indx; - break; - } + for (f_indx=0; (f_indx <= HEAP_HIGHINDX(dpage)) && + (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++) + { + /* No-op. 
*/ } - /* Nothing of interest on page, so try next */ - if (!found) + /* At the beginning of current page, get new page */ + if (cp->indx == 0 || cp->indx <= f_indx) { + if (cp->pgno == FIRST_HEAP_DPAGE) { + ret = DBC_ERR(dbc, DB_NOTFOUND); + goto err; + } getpage = TRUE; - } - break; - case DB_PREV_NODUP: - case DB_PREV: - /* If cursor not initialize, behave as DB_LAST */ - if (dbc->internal->pgno == PGNO_INVALID) - goto last; - - /* - * Acquire the current page with the lock we have already, - * unless user has asked for a write lock. - */ - ACQUIRE_CUR(dbc, lock_type, cp->pgno, 0, 0, ret); - if (ret != 0) - goto err; - dpage = (HEAPPG *)cp->page; - - /* - * Loop through indexes and find first used slot. Check if - * already at the first slot. - */ - for (f_indx=0; (f_indx <= HEAP_HIGHINDX(dpage)) && - (HEAP_OFFSETTBL(dbp, dpage)[f_indx] == 0); f_indx++) ; - - /* At the beginning of current page, must get new page */ - if (cp->indx == 0 || cp->indx <= f_indx) { - if (cp->pgno == FIRST_HEAP_DPAGE) { - ret = DB_NOTFOUND; - goto err; } - getpage = TRUE; } while (!found) { if (getpage) { - pgno = cp->pgno - 1; - /* Do not go past first page */ + if (np_inc == -1) + pgno = cp->pgno - 1; + else if (np_inc == 1) + pgno = cp->pgno + 1; if (pgno < FIRST_HEAP_DPAGE) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } - /* Put current page/lock and get prev page. 
*/ + /* Put current page/lock and get next one */ ACQUIRE_CUR(dbc, lock_type, pgno, 0, 0, ret); - if (ret != 0) + if (ret != 0) { + if (np_inc == 1 && + ret == DB_PAGE_NOTFOUND) + /* Beyond last page */ + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; - + } dpage = (HEAPPG *)cp->page; /* @@ -833,31 +795,36 @@ last: pgno = PGNO_BASE_MD; NUM_ENT(dpage) == 0)) continue; - /* When search, this gets bumped to high indx */ - cp->indx = HEAP_HIGHINDX(dpage) + 1; + if (np_inc == 1) + /* + * When searching, indx gets + * bumped to 0 + */ + cp->indx = UINT16_MAX; + else + /* + * When searching, indx gets bumped to + * high indx + */ + cp->indx = HEAP_HIGHINDX(dpage) + 1; getpage = FALSE; } /* - * Decrement index and loop through the offset table - * finding previous nonzero entry. + * Bump index and loop through the offset table finding + * first nonzero entry. If the offset is for a split + * record, make sure it's the first piece of the split + * record. HEAP_HIGHINDX always points to highest filled + * entry on page. */ - cp->indx--; - for (indx=cp->indx; - indx >= 0; indx--) { - if (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0) - continue; - hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx); - if (!F_ISSET(hdr, HEAP_RECSPLIT) || - F_ISSET(hdr, HEAP_RECFIRST)) { - found = TRUE; - cp->indx = indx; - break; - } - } - - /* Nothing of interest on page, so try previous */ + if (np_inc == -1) + cp->indx--; + else if (np_inc == 1) + cp->indx++; + __heapc_search(dbc, + dpage, cp->indx, np_inc, &cp->indx, &found); if (!found) + /* Nothing of interest on page, so try next */ getpage = TRUE; } break; @@ -871,7 +838,7 @@ last: pgno = PGNO_BASE_MD; /* First make sure we're trying to get a data page. 
*/ if (pgno == PGNO_BASE_MD || pgno == HEAP_REGION_PGNO(dbp, pgno)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -880,7 +847,7 @@ last: pgno = PGNO_BASE_MD; if (ret != 0) { if (ret == DB_PAGE_NOTFOUND) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } dpage = (HEAPPG *)cp->page; @@ -889,14 +856,14 @@ last: pgno = PGNO_BASE_MD; if ((indx > HEAP_HIGHINDX(dpage)) || (HEAP_OFFSETTBL(dbp, dpage)[indx] == 0)) { DISCARD(dbc, cp->page, cp->lock, 0, ret); - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx); if (F_ISSET(hdr, HEAP_RECSPLIT) && !F_ISSET(hdr, HEAP_RECFIRST)) { DISCARD(dbc, cp->page, cp->lock, 0, ret); - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -911,16 +878,30 @@ last: pgno = PGNO_BASE_MD; if ((ret = __heapc_gsplit( dbc, &tmp_val, NULL, 0)) != 0) goto err; + } else if (F_ISSET(hdr, HEAP_RECBLOB)) { + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(dbc->env, bhdr, blob_size, ret); + if (ret != 0) + goto err; + if (blob_size > UINT32_MAX) { + ret = DB_BUFFER_SMALL; + goto err; + } + tmp_val.flags = DB_DBT_MALLOC; + if ((ret = __blob_get(dbc, &tmp_val, + blob_id, blob_size, NULL, 0)) != 0) + goto err; } else { tmp_val.data = (void *)((u_int8_t *)hdr + sizeof(HEAPHDR)); tmp_val.size = hdr->size; } - cmp = __bam_defcmp(dbp, &tmp_val, data); + cmp = __bam_defcmp(dbp, &tmp_val, data, NULL); if (F_ISSET(&tmp_val, DB_DBT_MALLOC)) __os_ufree(dbp->env, tmp_val.data); if (cmp != 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } } @@ -928,7 +909,7 @@ last: pgno = PGNO_BASE_MD; break; case DB_NEXT_DUP: case DB_PREV_DUP: - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; default: /* DB_GET_RECNO, DB_JOIN_ITEM, DB_SET_RECNO are invalid */ @@ -959,6 +940,53 @@ err: if (ret == 0 ) { return (ret); } +/* + * __heapc_search -- + * Search a given a heap page, starting at a given 
index, for a viable heap + * record. Return the index of the found record in indxp. + */ +static void +__heapc_search(dbc, dpage, begin, dir, indxp, found) + DBC *dbc; + HEAPPG *dpage; + db_indx_t begin; + int dir; + db_indx_t *indxp; + int *found; +{ + DB *dbp; + HEAPHDR *hdr; + db_indx_t indx; + + dbp = dbc->dbp; + DB_ASSERT(dbp->env, dir == -1 || dir == 1); + + *found = FALSE; + if (TYPE(dpage) != P_HEAP || NUM_ENT(dpage) == 0) + return; + + indx = begin; + for (;;) { + if (HEAP_OFFSETTBL(dbp, dpage)[indx] != 0) { + hdr = (HEAPHDR *)P_ENTRY(dbp, dpage, indx); + if (!F_ISSET(hdr, HEAP_RECSPLIT) || + F_ISSET(hdr, HEAP_RECFIRST)) { + *found = TRUE; + *indxp = indx; + break; + } + } + if ((dir == -1 && indx == 0) || + (dir == 1 && indx == HEAP_HIGHINDX(dpage))) + break; + + if (dir == -1) + indx--; + else + indx++; + } +} + #undef IS_FIRST #define IS_FIRST (last_rid.pgno == PGNO_INVALID) /* @@ -993,6 +1021,7 @@ __heapc_reloc_partial(dbc, key, data) /* We only work on partial puts. */ DB_ASSERT(dbp->env, F_ISSET(data, DB_DBT_PARTIAL)); + DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB)); /* * Start by calculating the data_size, total size of the new record, and @@ -1014,7 +1043,7 @@ __heapc_reloc_partial(dbc, key, data) dlen = old_size - doff; else dlen = data->dlen; - data_size = old_size - dlen + data->size; + data_size = (old_size - dlen) + data->size; } /* @@ -1075,8 +1104,8 @@ __heapc_reloc_partial(dbc, key, data) */ data_size = doff + (add_bytes ? data->size : 0); else - data_size = old_hdr->size - - dlen + (add_bytes ? data->size : 0); + data_size = (old_hdr->size - + dlen) + (add_bytes ? 
data->size : 0); data_size += remaining; if (data_size > buflen) { @@ -1120,7 +1149,7 @@ __heapc_reloc_partial(dbc, key, data) if (doff + dlen < old_hdr->size) { olddata += dlen; memcpy(buf, - olddata, old_hdr->size - doff - dlen); + olddata, (old_hdr->size - doff) - dlen); dlen = 0; } else /* @@ -1145,8 +1174,8 @@ __heapc_reloc_partial(dbc, key, data) log_dbt.size = DB_ALIGN( old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno, - (u_int32_t)cp->indx, old_size, + &LSN(cp->page), 0, OP_SET(DB_REM_HEAP, cp->page), + cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; } else @@ -1185,7 +1214,8 @@ __heapc_reloc_partial(dbc, key, data) log_dbt.size = DB_ALIGN( old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_REM_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1197,7 +1227,8 @@ __heapc_reloc_partial(dbc, key, data) if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1231,7 +1262,7 @@ __heapc_reloc_partial(dbc, key, data) size -= sizeof(db_indx_t); /* Round down to a multiple of 4. 
*/ size = DB_ALIGN( - size - sizeof(u_int32_t) + 1, sizeof(u_int32_t)); + (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t)); DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR)); /* @@ -1261,7 +1292,8 @@ __heapc_reloc_partial(dbc, key, data) if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), 0, - DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx, + OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, + (u_int32_t)cp->indx, size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0) goto err; } else @@ -1343,7 +1375,8 @@ next_pg: last_rid.pgno = cp->pgno; log_dbt.size = DB_ALIGN( old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_REM_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1355,7 +1388,8 @@ next_pg: last_rid.pgno = cp->pgno; if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1397,6 +1431,8 @@ __heapc_reloc(dbc, key, data) memset(&hdr_dbt, 0, sizeof(DBT)); memset(&log_dbt, 0, sizeof(DBT)); COMPQUIET(key, NULL); + /* Blob database records never change size. */ + DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB)); /* * We are updating an existing record, which will grow into a split @@ -1436,7 +1472,8 @@ __heapc_reloc(dbc, key, data) log_dbt.size = DB_ALIGN( old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_REM_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1471,7 +1508,7 @@ __heapc_reloc(dbc, key, data) size -= sizeof(db_indx_t); /* Round down to a multiple of 4. 
*/ size = DB_ALIGN( - size - sizeof(u_int32_t) + 1, sizeof(u_int32_t)); + (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t)); DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR)); new_hdr.std_hdr.size = (u_int16_t)(size - sizeof(HEAPSPLITHDR)); @@ -1495,7 +1532,8 @@ __heapc_reloc(dbc, key, data) if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), 0, - DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx, + OP_SET(DB_ADD_HEAP, cp->page), + cp->pgno, (u_int32_t)cp->indx, size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0) goto err; } else @@ -1565,7 +1603,8 @@ next_pg: if (next_rid.pgno != PGNO_INVALID) { log_dbt.size = DB_ALIGN( old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_REM_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_REM_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1577,7 +1616,8 @@ next_pg: if (next_rid.pgno != PGNO_INVALID) { if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, - &LSN(cp->page), 0, DB_ADD_HEAP, cp->pgno, + &LSN(cp->page), 0, + OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, (u_int32_t)cp->indx,old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; @@ -1608,20 +1648,26 @@ __heapc_put(dbc, key, data, flags, pgnop) DB *dbp; DBT hdr_dbt, log_dbt, new_data; DB_MPOOLFILE *mpf; + HEAPBLOBHDR bhdr; HEAPHDR hdr, *old_hdr; HEAP_CURSOR *cp; PAGE *rpage; db_pgno_t region_pgno; - int oldspace, ret, space, t_ret; - u_int32_t data_size, dlen, new_size, old_flags, old_size, tot_size; - u_int8_t *buf, *olddata, *src, *dest; + int buf_alloc, ret, t_ret; + off_t blob_size; + db_seq_t blob_id, new_blob_id; + u_int32_t data_size, dlen, new_size, old_flags, old_size; + u_int32_t oldspace, space, tot_size; + u_int8_t *buf, *olddata; dbp = dbc->dbp; mpf = dbp->mpf; cp = (HEAP_CURSOR *)dbc->internal; rpage = NULL; - buf = dest = src = NULL; + buf = NULL; + buf_alloc = 0; dlen = 0; + blob_id = new_blob_id = 
0; if (flags != DB_CURRENT) { /* We're going to write following the get, so use RMW. */ @@ -1668,7 +1714,8 @@ __heapc_put(dbc, key, data, flags, pgnop) DB_ALIGN(old_hdr->size + HEAP_HDRSIZE(old_hdr), sizeof(u_int32_t)); if (old_size < sizeof(HEAPSPLITHDR)) old_size = sizeof(HEAPSPLITHDR); - if (F_ISSET(data, DB_DBT_PARTIAL)) { + /* Partial puts on blobs are dealt with in the blob code. */ + if (F_ISSET(data, DB_DBT_PARTIAL) && !F_ISSET(old_hdr, HEAP_RECBLOB)) { if (F_ISSET(old_hdr, HEAP_RECSPLIT)) tot_size = ((HEAPSPLITHDR *)old_hdr)->tsize; else @@ -1682,9 +1729,11 @@ __heapc_put(dbc, key, data, flags, pgnop) dlen = tot_size - data->doff; else dlen = data->dlen; - data_size = tot_size - dlen + data->size; + data_size = (tot_size - dlen) + data->size; } - } else + } else if F_ISSET(old_hdr, HEAP_RECBLOB) + data_size = HEAPBLOBREC_DSIZE; + else data_size = data->size; new_size = DB_ALIGN(data_size + sizeof(HEAPHDR), sizeof(u_int32_t)); if (new_size < sizeof(HEAPSPLITHDR)) @@ -1694,6 +1743,8 @@ __heapc_put(dbc, key, data, flags, pgnop) if (F_ISSET(old_hdr, HEAP_RECSPLIT) || (new_size > old_size && new_size - old_size > HEAP_FREESPACE(dbp, cp->page))) { + /* Blob database records never change size. */ + DB_ASSERT(dbp->env, !F_ISSET(old_hdr, HEAP_RECBLOB)); /* * We've got to split the record, not enough room on the * page. Splitting the record will remove old_size bytes and @@ -1707,13 +1758,14 @@ __heapc_put(dbc, key, data, flags, pgnop) memset(&new_data, 0, sizeof(DBT)); new_data.size = data_size; - if (F_ISSET(data, DB_DBT_PARTIAL)) { + if (F_ISSET(data, DB_DBT_PARTIAL) && !F_ISSET(old_hdr, HEAP_RECBLOB)) { /* * Before replacing the old data, we need to use it to build the * new data. */ if ((ret = __os_malloc(dbp->env, data_size, &buf)) != 0) goto err; + buf_alloc = 1; new_data.data = buf; /* @@ -1736,10 +1788,32 @@ __heapc_put(dbc, key, data, flags, pgnop) buf += data->size; /* Fill in remaining data from the old record, skipping dlen. 
*/ - if (data->doff < old_hdr->size) { + if ((data->doff + data->dlen) < old_hdr->size) { olddata += data->doff + data->dlen; - memcpy(buf, - olddata, old_hdr->size - data->doff - data->dlen); + memcpy(buf, olddata, + (old_hdr->size - data->doff) - data->dlen); + } + } else if (F_ISSET(old_hdr, HEAP_RECBLOB)) { + data_size = HEAPBLOBREC_DSIZE; + new_data.size = HEAPBLOBREC_DSIZE; + if (F_ISSET(data, DB_DBT_BLOB_REC)) { + DB_ASSERT(dbp->env, + F_ISSET(((HEAPHDR *)data->data), HEAP_RECBLOB)); + new_data.data = HEAPBLOBREC_DATA(data->data); + } else { + memcpy(&bhdr, old_hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret); + if (ret != 0) + goto err; + if ((ret = __blob_repl(dbc, + data, blob_id, &new_blob_id, &blob_size)) != 0) + goto err; + bhdr.std_hdr.flags = HEAP_RECBLOB; + bhdr.std_hdr.size = HEAPBLOBREC_DSIZE; + SET_BLOB_SIZE(&bhdr, blob_size, HEAPBLOBHDR); + SET_BLOB_ID(&bhdr, new_blob_id, HEAPBLOBHDR); + new_data.data = HEAPBLOBREC_DATA(&bhdr); } } else { new_data.data = data->data; @@ -1751,19 +1825,23 @@ __heapc_put(dbc, key, data, flags, pgnop) */ memset(&hdr, 0, sizeof(HEAPHDR)); hdr.size = data_size; + if (F_ISSET(old_hdr, HEAP_RECBLOB)) + hdr.flags = HEAP_RECBLOB; if (DBC_LOGGING(dbc)) { hdr_dbt.data = old_hdr; hdr_dbt.size = HEAP_HDRSIZE(old_hdr); log_dbt.data = (u_int8_t *)old_hdr + hdr_dbt.size; log_dbt.size = DB_ALIGN(old_hdr->size, sizeof(u_int32_t)); if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), - 0, DB_REM_HEAP, cp->pgno, (u_int32_t)cp->indx, + 0, OP_SET(DB_REM_HEAP, cp->page), cp->pgno, + (u_int32_t)cp->indx, old_size, &hdr_dbt, &log_dbt, &LSN(cp->page))) != 0) goto err; hdr_dbt.data = &hdr; hdr_dbt.size = HEAP_HDRSIZE(&hdr); if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), - 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)cp->indx, + 0, OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, + (u_int32_t)cp->indx, new_size, &hdr_dbt, &new_data, &LSN(cp->page))) != 0) goto err; } else @@ -1788,14 
+1866,14 @@ __heapc_put(dbc, key, data, flags, pgnop) dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0) goto err; - HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space); + HEAP_SETSPACE(dbp, rpage, (cp->pgno - region_pgno) - 1, space); } err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND); if (rpage != NULL && (t_ret = __memp_fput(mpf, dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0) ret = t_ret; - if (F_ISSET(data, DB_DBT_PARTIAL)) + if (buf_alloc) __os_free(dbp->env, new_data.data); if (ret != 0 && LOCK_ISSET(cp->lock)) @@ -1823,18 +1901,21 @@ __heap_getpage(dbc, size, avail) HEAP *h; HEAPPG *rpage; HEAP_CURSOR *cp; - db_pgno_t data_pgno, *lkd_pgs, meta_pgno, region_pgno, start_region; - int i, lk_mode, max, p, ret, space, start, t_ret; + db_pgno_t data_pgno, i, max, meta_pgno, p, region_pgno, start; + db_pgno_t start_region; + int ret, t_ret; + u_int32_t lk_mode, space; LOCK_INIT(meta_lock); + data_pgno = PGNO_INVALID; dbp = dbc->dbp; mpf = dbp->mpf; cp = (HEAP_CURSOR *)dbc->internal; h = dbp->heap_internal; start_region = region_pgno = h->curregion; max = HEAP_REGION_SIZE(dbp); - i = ret = t_ret = 0; - lkd_pgs = NULL; + i = 0; + ret = t_ret = 0; /* * The algorithm for finding a page: @@ -1897,10 +1978,10 @@ find: while ((ret = __memp_fget(mpf, &region_pgno, max = h->maxpgno - region_pgno; /* * Look in the bitmap for a page with sufficient free space. We use i - * in a slightly strange way. Because the 2-bits in the bitmap are only - * an estimate, there is a chance the data won't fit on the page we - * choose. In that case, we re-start the process and want to be able to - * resume this loop where we left off. + * in a slightly strange way. Because the 2-bits in the bitmap are + * only an estimate, there is a chance the data won't fit on the page + * we choose. In that case, we re-start the process and want to be + * able to resume this loop where we left off. 
*/ for (; i < max; i++) { p = start + i; @@ -1908,7 +1989,7 @@ find: while ((ret = __memp_fget(mpf, &region_pgno, p -= max; if ((*avail = HEAP_SPACE(dbp, rpage, p)) > space) continue; - data_pgno = region_pgno + p + 1; + data_pgno = (region_pgno + p) + 1; ACQUIRE_CUR(dbc, DB_LOCK_WRITE, data_pgno, DB_LOCK_NOWAIT, 0, ret); /* @@ -2071,7 +2152,7 @@ pg_err: if (p != 0) { if (ret == DB_LOCK_NOTGRANTED) ret = 0; else if (ret != 0) { - /* + /* * Free up the metadata lock. If this was an error * other than a missing region page, bail. */ @@ -2165,7 +2246,7 @@ check: if (size + sizeof(db_indx_t) > HEAP_FREESPACE(dbp, cp->page)) { } } - h->curpgindx = data_pgno - region_pgno - 1; + h->curpgindx = (data_pgno - region_pgno) - 1; err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND); if (rpage != NULL && (t_ret = __memp_fput(mpf, dbc->thread_info, rpage, dbc->priority)) != 0 && ret == 0) @@ -2187,26 +2268,40 @@ __heap_append(dbc, key, data) DBT *data, *key; { DB *dbp; - DBT tmp_dbt; + DBT tmp_dbt, data_dbt; DB_HEAP_RID rid; + DB_LSN lsn; DB_MPOOLFILE *mpf; HEAPPG *rpage; + HEAPBLOBHDR bhdr; HEAPHDR hdr; HEAP_CURSOR *cp; db_indx_t indx; db_pgno_t region_pgno; - int ret, space, t_ret; + int is_blob, ret, t_ret; + off_t blob_size; + db_seq_t blob_id; u_int8_t avail; - u_int32_t data_size; + u_int32_t data_size, space; dbp = dbc->dbp; mpf = dbp->mpf; ret = t_ret = 0; rpage = NULL; cp = (HEAP_CURSOR *)dbc->internal; + blob_size = 0; + blob_id = 0; + + if (dbp->blob_threshold && + (data->size >= dbp->blob_threshold || F_ISSET(data, DB_DBT_BLOB))) + is_blob = 1; + else + is_blob = 0; /* Need data.size + header size, 4-byte aligned. 
*/ - if (F_ISSET(data, DB_DBT_PARTIAL)) + if (is_blob) + data_size = HEAPBLOBREC_SIZE; + else if (F_ISSET(data, DB_DBT_PARTIAL)) data_size = DB_ALIGN(data->doff + data->size + sizeof(HEAPHDR), sizeof(u_int32_t)); else @@ -2222,24 +2317,42 @@ __heap_append(dbc, key, data) goto err; indx = HEAP_FREEINDX(cp->page); - memset(&hdr, 0, sizeof(HEAPHDR)); - hdr.size = data->size; - if (F_ISSET(data, DB_DBT_PARTIAL)) - hdr.size += data->doff; - tmp_dbt.data = &hdr; - tmp_dbt.size = sizeof(HEAPHDR); + if (is_blob) { + if ((ret = __blob_put( + dbc, data, &blob_id, &blob_size, &lsn)) != 0) + goto err; + memset(&bhdr, 0, HEAPBLOBREC_SIZE); + bhdr.std_hdr.flags = HEAP_RECBLOB; + bhdr.std_hdr.size = HEAPBLOBREC_DSIZE; + SET_BLOB_SIZE(&bhdr, blob_size, HEAPBLOBHDR); + SET_BLOB_ID(&bhdr, blob_id, HEAPBLOBHDR); + SET_BLOB_FILE_ID(&bhdr, dbp->blob_file_id, HEAPBLOBHDR); + tmp_dbt.data = &bhdr; + tmp_dbt.size = sizeof(HEAPHDR); + memset(&data_dbt, 0, sizeof(DBT)); + data_dbt.data = HEAPBLOBREC_DATA((&bhdr)); + data_dbt.size = HEAPBLOBREC_DSIZE; + } else { + memset(&hdr, 0, sizeof(HEAPHDR)); + hdr.size = data->size; + if (F_ISSET(data, DB_DBT_PARTIAL)) + hdr.size += data->doff; + tmp_dbt.data = &hdr; + tmp_dbt.size = sizeof(HEAPHDR); + memcpy(&data_dbt, data, sizeof(DBT)); + } /* Log the write. 
*/ if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), - 0, DB_ADD_HEAP, cp->pgno, (u_int32_t)indx, - data_size, &tmp_dbt, data, &LSN(cp->page))) != 0) + 0, OP_SET(DB_ADD_HEAP, cp->page), cp->pgno, (u_int32_t)indx, + data_size, &tmp_dbt, &data_dbt, &LSN(cp->page))) != 0) goto err; } else LSN_NOT_LOGGED(LSN(cp->page)); if ((ret = __heap_pitem( - dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, data)) != 0) + dbc, (PAGE *)cp->page, indx, data_size, &tmp_dbt, &data_dbt)) != 0) goto err; rid.pgno = cp->pgno; @@ -2256,7 +2369,7 @@ __heap_append(dbc, key, data) dbc->thread_info, NULL, DB_MPOOL_DIRTY, &rpage)) != 0) goto err; - HEAP_SETSPACE(dbp, rpage, cp->pgno - region_pgno - 1, space); + HEAP_SETSPACE(dbp, rpage, (cp->pgno - region_pgno) - 1, space); } err: DB_ASSERT(dbp->env, ret != DB_PAGE_NOTFOUND); @@ -2292,8 +2405,8 @@ __heapc_split(dbc, key, data, is_first) HEAP_CURSOR *cp; db_indx_t indx; db_pgno_t region_pgno; - int ret, spacebits, t_ret; - u_int32_t buflen, doff, left, size; + int ret, t_ret; + u_int32_t buflen, doff, left, size, spacebits; u_int8_t availbits, *buf; dbp = dbc->dbp; @@ -2308,7 +2421,6 @@ __heapc_split(dbc, key, data, is_first) ret = t_ret = 0; indx = 0; buf = NULL; - buflen = 0; /* * Write the record to multiple pages, in chunks starting from the end. @@ -2322,6 +2434,9 @@ __heapc_split(dbc, key, data, is_first) left += data->doff; } hdrs.tsize = left; + buflen = 1; + if ((ret = __os_malloc(dbp->env, buflen, &buf)) != 0) + return (ret); while (left > 0) { size = DB_ALIGN(left + sizeof(HEAPSPLITHDR), sizeof(u_int32_t)); if (size < sizeof(HEAPSPLITHDR)) @@ -2336,8 +2451,10 @@ __heapc_split(dbc, key, data, is_first) else hdrs.std_hdr.flags |= HEAP_RECFIRST; - if ((ret = __heap_getpage(dbc, size, &availbits)) != 0) + if ((ret = __heap_getpage(dbc, size, &availbits)) != 0) { + __os_free(dbp->env, buf); return (ret); + } /* * size is the total number of bytes being written to the page. 
@@ -2363,7 +2480,7 @@ __heapc_split(dbc, key, data, is_first) size -= sizeof(db_indx_t); /* Round down to a multiple of 4. */ size = DB_ALIGN( - size - sizeof(u_int32_t) + 1, sizeof(u_int32_t)); + (size - sizeof(u_int32_t)) + 1, sizeof(u_int32_t)); DB_ASSERT(dbp->env, size >= sizeof(HEAPSPLITHDR)); hdrs.std_hdr.size = (u_int16_t)(size - sizeof(HEAPSPLITHDR)); @@ -2401,10 +2518,10 @@ __heapc_split(dbc, key, data, is_first) * page minus the bytes we're taking from data. */ t_data.data = buf; - memset(buf, '\0', t_data.size - left + doff); - buf += t_data.size - left + doff; + memset(buf, 0, (t_data.size - left) + doff); + buf += (t_data.size - left) + doff; memcpy(buf, data->data, left - doff); - doff -= t_data.size - left + doff; + doff -= (t_data.size - left) + doff; buf = t_data.data; } hdr_dbt.data = &hdrs; @@ -2415,7 +2532,8 @@ __heapc_split(dbc, key, data, is_first) if (DBC_LOGGING(dbc)) { if ((ret = __heap_addrem_log(dbp, dbc->txn, &LSN(cp->page), 0, - DB_ADD_HEAP, cp->pgno, (u_int32_t)indx, + OP_SET(DB_ADD_HEAP, cp->page), + cp->pgno, (u_int32_t)indx, size, &hdr_dbt, &t_data, &LSN(cp->page))) != 0) goto err; } else @@ -2447,7 +2565,7 @@ __heapc_split(dbc, key, data, is_first) goto err; HEAP_SETSPACE(dbp, - rpage, cp->pgno - region_pgno - 1, spacebits); + rpage, (cp->pgno - region_pgno) - 1, spacebits); ret = __memp_fput(mpf, dbc->thread_info, rpage, dbc->priority); rpage = NULL; diff --git a/src/heap/heap.src b/src/heap/heap.src index 47bd4bb0..a08ad5eb 100644 --- a/src/heap/heap.src +++ b/src/heap/heap.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -29,7 +29,29 @@ INCLUDE * dbt: data that is to be added or deleted. * pagelsn: former lsn of the page. 
*/ -BEGIN addrem 49 151 +BEGIN addrem 61 151 +OP opcode u_int32_t lu +DB fileid int32_t ld +ARG pgno db_pgno_t lu +ARG indx u_int32_t lu +ARG nbytes u_int32_t lu +HDR hdr DBT s +DATA dbt DBT s +POINTER pagelsn DB_LSN * lu +END + +BEGIN_COMPAT addrem 60 151 +OP opcode u_int32_t lu +DB fileid int32_t ld +ARG pgno db_pgno_t lu +ARG indx u_int32_t lu +ARG nbytes u_int32_t lu +HDR hdr DBT s +DBT dbt DBT s +POINTER pagelsn DB_LSN * lu +END + +BEGIN_COMPAT addrem 50 151 ARG opcode u_int32_t lu DB fileid int32_t ld ARG pgno db_pgno_t lu diff --git a/src/heap/heap_auto.c b/src/heap/heap_auto.c index 1cb705f4..9fdcce7a 100644 --- a/src/heap/heap_auto.c +++ b/src/heap/heap_auto.c @@ -9,16 +9,38 @@ #include "dbinc/txn.h" DB_LOG_RECSPEC __heap_addrem_desc[] = { - {LOGREC_ARG, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"}, + {LOGREC_OP, SSZ(__heap_addrem_args, opcode), "opcode", "%lu"}, {LOGREC_DB, SSZ(__heap_addrem_args, fileid), "fileid", ""}, {LOGREC_ARG, SSZ(__heap_addrem_args, pgno), "pgno", "%lu"}, {LOGREC_ARG, SSZ(__heap_addrem_args, indx), "indx", "%lu"}, {LOGREC_ARG, SSZ(__heap_addrem_args, nbytes), "nbytes", "%lu"}, - {LOGREC_DBT, SSZ(__heap_addrem_args, hdr), "hdr", ""}, - {LOGREC_DBT, SSZ(__heap_addrem_args, dbt), "dbt", ""}, + {LOGREC_HDR, SSZ(__heap_addrem_args, hdr), "hdr", ""}, + {LOGREC_DATA, SSZ(__heap_addrem_args, dbt), "dbt", ""}, {LOGREC_POINTER, SSZ(__heap_addrem_args, pagelsn), "pagelsn", ""}, {LOGREC_Done, 0, "", ""} }; +DB_LOG_RECSPEC __heap_addrem_60_desc[] = { + {LOGREC_OP, SSZ(__heap_addrem_60_args, opcode), "opcode", "%lu"}, + {LOGREC_DB, SSZ(__heap_addrem_60_args, fileid), "fileid", ""}, + {LOGREC_ARG, SSZ(__heap_addrem_60_args, pgno), "pgno", "%lu"}, + {LOGREC_ARG, SSZ(__heap_addrem_60_args, indx), "indx", "%lu"}, + {LOGREC_ARG, SSZ(__heap_addrem_60_args, nbytes), "nbytes", "%lu"}, + {LOGREC_HDR, SSZ(__heap_addrem_60_args, hdr), "hdr", ""}, + {LOGREC_DBT, SSZ(__heap_addrem_60_args, dbt), "dbt", ""}, + {LOGREC_POINTER, SSZ(__heap_addrem_60_args, 
pagelsn), "pagelsn", ""}, + {LOGREC_Done, 0, "", ""} +}; +DB_LOG_RECSPEC __heap_addrem_50_desc[] = { + {LOGREC_ARG, SSZ(__heap_addrem_50_args, opcode), "opcode", "%lu"}, + {LOGREC_DB, SSZ(__heap_addrem_50_args, fileid), "fileid", ""}, + {LOGREC_ARG, SSZ(__heap_addrem_50_args, pgno), "pgno", "%lu"}, + {LOGREC_ARG, SSZ(__heap_addrem_50_args, indx), "indx", "%lu"}, + {LOGREC_ARG, SSZ(__heap_addrem_50_args, nbytes), "nbytes", "%lu"}, + {LOGREC_DBT, SSZ(__heap_addrem_50_args, hdr), "hdr", ""}, + {LOGREC_DBT, SSZ(__heap_addrem_50_args, dbt), "dbt", ""}, + {LOGREC_POINTER, SSZ(__heap_addrem_50_args, pagelsn), "pagelsn", ""}, + {LOGREC_Done, 0, "", ""} +}; DB_LOG_RECSPEC __heap_pg_alloc_desc[] = { {LOGREC_DB, SSZ(__heap_pg_alloc_args, fileid), "fileid", ""}, {LOGREC_POINTER, SSZ(__heap_pg_alloc_args, meta_lsn), "meta_lsn", ""}, diff --git a/src/heap/heap_autop.c b/src/heap/heap_autop.c index b767203b..ac08441b 100644 --- a/src/heap/heap_autop.c +++ b/src/heap/heap_autop.c @@ -28,6 +28,40 @@ __heap_addrem_print(env, dbtp, lsnp, notused2, info) } /* + * PUBLIC: int __heap_addrem_60_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__heap_addrem_60_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__heap_addrem_60", __heap_addrem_60_desc, info)); +} + +/* + * PUBLIC: int __heap_addrem_50_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__heap_addrem_50_print(env, dbtp, lsnp, notused2, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *info; +{ + COMPQUIET(notused2, DB_TXN_PRINT); + + return (__log_print_record(env, dbtp, lsnp, "__heap_addrem_50", __heap_addrem_50_desc, info)); +} + +/* * PUBLIC: int __heap_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ diff --git a/src/heap/heap_backup.c 
b/src/heap/heap_backup.c index 4588b0ba..77b0eaaa 100644 --- a/src/heap/heap_backup.c +++ b/src/heap/heap_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/heap/heap_conv.c b/src/heap/heap_conv.c index 9f432d13..dbf059a4 100644 --- a/src/heap/heap_conv.c +++ b/src/heap/heap_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" @@ -86,7 +86,10 @@ __heap_mswap(env, pg) SWAP32(p); /* gbytes */ SWAP32(p); /* bytes */ SWAP32(p); /* region_size */ - p += 92 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* threshold */ + SWAP32(p); /* file id lo */ + SWAP32(p); /* file id hi */ + p += 89 * sizeof(u_int32_t); /* unused */ SWAP32(p); /* crypto_magic */ return (0); diff --git a/src/heap/heap_method.c b/src/heap/heap_method.c index f938b5e7..2667f4fe 100644 --- a/src/heap/heap_method.c +++ b/src/heap/heap_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -49,15 +49,11 @@ __heap_db_close(dbp) DB *dbp; { HEAP *h; - int ret; - - ret = 0; - if ((h = dbp->heap_internal) == NULL) - return (0); - - __os_free(dbp->env, h); - dbp->heap_internal = NULL; + if ((h = dbp->heap_internal) != NULL) { + __os_free(dbp->env, h); + dbp->heap_internal = NULL; + } return (0); } diff --git a/src/heap/heap_open.c b/src/heap/heap_open.c index 6827450d..f5bb72ae 100644 --- a/src/heap/heap_open.c +++ b/src/heap/heap_open.c @@ -1,19 +1,19 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/db_swap.h" #include "dbinc/fop.h" #include "dbinc/heap.h" #include "dbinc/lock.h" -#include "dbinc/log.h" #include "dbinc/mp.h" static void __heap_init_meta __P((DB *, HEAPMETA *, db_pgno_t, DB_LSN*)); @@ -82,6 +82,7 @@ __heap_metachk(dbp, name, hm) env = dbp->env; h = (HEAP *)dbp->heap_internal; + ret = 0; /* * At this point, all we know is that the magic number is for a Heap. @@ -92,6 +93,7 @@ __heap_metachk(dbp, name, hm) M_32_SWAP(vers); switch (vers) { case 1: + case 2: break; default: __db_errx(env, @@ -116,6 +118,26 @@ __heap_metachk(dbp, name, hm) /* Set the page size. */ dbp->pgsize = hm->dbmeta.pagesize; + dbp->blob_threshold = hm->blob_threshold; + GET_BLOB_FILE_ID(env, hm, dbp->blob_file_id, ret); + if (ret != 0) + return (ret); + /* Blob databases must be upgraded. 
*/ + if (vers == 1 && dbp->blob_file_id != 0) { + __db_errx(env, DB_STR_A("1209", +"%s: databases that support blobs must be upgraded.", "%s"), + name); + return (EINVAL); + } +#ifndef HAVE_64BIT_TYPES + if (dbp->blob_file_id != 0) { + __db_errx(env, DB_STR_A("1205", + "%s: blobs require 64 integer compiler support.", "%s"), + name); + return (EINVAL); + } +#endif + /* Copy the file's ID. */ memcpy(dbp->fileid, hm->dbmeta.uid, DB_FILE_ID_LEN); @@ -179,7 +201,8 @@ __heap_read_meta(dbp, ip, txn, meta_pgno, flags) h->region_size = meta->region_size; if (PGNO(meta) == PGNO_BASE_MD && !F_ISSET(dbp, DB_AM_RECOVER)) - __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno); + (void)__memp_set_last_pgno( + mpf, meta->dbmeta.last_pgno); } else { DB_ASSERT(dbp->env, IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER)); @@ -285,6 +308,12 @@ __heap_new_file(dbp, ip, txn, fhp, name) pginfo.type = dbp->type; pdbt.data = &pginfo; pdbt.size = sizeof(pginfo); + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids( + dbp, txn, &dbp->blob_file_id)) != 0) + return (ret); + + } if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0) return (ret); meta = (HEAPMETA *)buf; @@ -394,7 +423,9 @@ done: if (region != NULL && (t_ret = __memp_fput(mpf, dbc->thread_info, region, dbc->priority)) != 0 && ret == 0) ret = t_ret; - ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority); + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0) ret = t_ret; @@ -436,4 +467,6 @@ __heap_init_meta(dbp, meta, pgno, lsnp) meta->region_size = h->region_size; meta->nregions = 1; meta->curregion = 1; + meta->blob_threshold = dbp->blob_threshold; + SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, HEAPMETA); } diff --git a/src/heap/heap_rec.c b/src/heap/heap_rec.c index 578a61c4..01803a70 100644 --- a/src/heap/heap_rec.c +++ b/src/heap/heap_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for 
redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" @@ -9,7 +9,6 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/heap.h" -#include "dbinc/log.h" #include "dbinc/mp.h" /* @@ -34,7 +33,8 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info) DB_THREAD_INFO *ip; PAGE *pagep, *regionp; db_pgno_t region_pgno; - int cmp_n, cmp_p, modified, oldspace, ret, space; + int cmp_n, cmp_p, modified, ret; + u_int32_t oldspace, opcode, space; ip = ((DB_TXNHEAD *)info)->thread_info; pagep = NULL; @@ -44,19 +44,20 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info) REC_FGET(mpf, ip, argp->pgno, &pagep, done); modified = 0; + opcode = OP_MODE_GET(argp->opcode); cmp_n = log_compare(lsnp, &LSN(pagep)); cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); - if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) || - (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) { + if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_HEAP) || + (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_HEAP)) { /* We are either redo-ing an add or undoing a delete. */ REC_DIRTY(mpf, ip, dbc->priority, &pagep); if ((ret = __heap_pitem(dbc, pagep, argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0) goto out; modified = 1; - } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) || - (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) { + } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_HEAP) || + (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_HEAP)) { /* We are either undoing an add or redo-ing a delete. 
*/ REC_DIRTY(mpf, ip, dbc->priority, &pagep); if ((ret = __heap_ditem( @@ -76,11 +77,11 @@ __heap_addrem_recover(env, dbtp, lsnp, op, info) HEAP_CALCSPACEBITS( file_dbp, HEAP_FREESPACE(file_dbp, pagep), space); oldspace = HEAP_SPACE(file_dbp, regionp, - argp->pgno - region_pgno - 1); + (argp->pgno - region_pgno) - 1); if (space != oldspace) { REC_DIRTY(mpf, ip, dbc->priority, &regionp); HEAP_SETSPACE(file_dbp, - regionp, argp->pgno - region_pgno - 1, space); + regionp, (argp->pgno - region_pgno) - 1, space); } if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0) goto out; @@ -384,3 +385,200 @@ out: if (pagep != NULL) (void)__memp_fput(mpf, ip, pagep, dbc->priority); REC_CLOSE; } + +/* + * __heap_addrem_60_recover -- + * Recovery function for addrem. + * + * PUBLIC: int __heap_addrem_60_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__heap_addrem_60_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __heap_addrem_60_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + HEAPBLOBHDR bhdr; + HEAPHDR *hhdr; + PAGE *pagep, *regionp; + db_pgno_t region_pgno; + int cmp_n, cmp_p, modified, ret; + u_int32_t oldspace, opcode, space; + u_int8_t buf[HEAPBLOBREC_SIZE]; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__heap_addrem_60_print); + REC_INTRO(__heap_addrem_60_read, ip, 1); + region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno); + + REC_FGET(mpf, ip, argp->pgno, &pagep, done); + modified = 0; + opcode = OP_MODE_GET(argp->opcode); + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + + if ((cmp_p == 0 && DB_REDO(op) && opcode == DB_ADD_HEAP) || + (cmp_n == 0 && DB_UNDO(op) && opcode == DB_REM_HEAP)) { + hhdr = argp->hdr.data; + /* + * In 6.0 heap blob log records were not correctly byte + * swapped, so do the swapping here if the blob file id of the + * database does not match 
the blob file id stored in the + * record. Technically byte swapping the blob file id could + * produce the same value, but that would only happen in + * practice if the environment contained over 4 billion blob + * databases. 0 is an invalid blob file id. + */ + if (F_ISSET(hhdr, HEAP_RECBLOB)) { + memcpy(buf + sizeof(HEAPHDR), + argp->dbt.data, HEAPBLOBREC_DSIZE); + memcpy(&bhdr, buf, HEAPBLOBREC_SIZE); + if ((db_seq_t)bhdr.file_id != dbc->dbp->blob_file_id) { + M_64_SWAP(bhdr.id); + M_64_SWAP(bhdr.size); + M_64_SWAP(bhdr.file_id); + DB_ASSERT(env, + (db_seq_t)bhdr.file_id + == dbc->dbp->blob_file_id); + memcpy(buf, &bhdr, HEAPBLOBREC_SIZE); + memcpy(argp->dbt.data, + buf + sizeof(HEAPHDR), HEAPBLOBREC_DSIZE); + } + } + /* We are either redo-ing an add or undoing a delete. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __heap_pitem(dbc, pagep, + argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0) + goto out; + modified = 1; + } else if ((cmp_n == 0 && DB_UNDO(op) && opcode == DB_ADD_HEAP) || + (cmp_p == 0 && DB_REDO(op) && opcode == DB_REM_HEAP)) { + /* We are either undoing an add or redo-ing a delete. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __heap_ditem( + dbc, pagep, argp->indx, argp->nbytes)) != 0) + goto out; + modified = 1; + } + + if (modified) { + REC_FGET(mpf, ip, region_pgno, &regionp, done); + if (DB_REDO(op)) + LSN(pagep) = *lsnp; + else + LSN(pagep) = argp->pagelsn; + + /* Update the available space bitmap, if necessary. 
*/ + HEAP_CALCSPACEBITS( + file_dbp, HEAP_FREESPACE(file_dbp, pagep), space); + oldspace = HEAP_SPACE(file_dbp, regionp, + (argp->pgno - region_pgno) - 1); + if (space != oldspace) { + REC_DIRTY(mpf, ip, dbc->priority, &regionp); + HEAP_SETSPACE(file_dbp, + regionp, (argp->pgno - region_pgno) - 1, space); + } + if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0) + goto out; + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, dbc->priority); + REC_CLOSE; + +} + +/* + * __heap_addrem_50_recover -- + * Recovery function for addrem. + * + * PUBLIC: int __heap_addrem_50_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__heap_addrem_50_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __heap_addrem_50_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + PAGE *pagep, *regionp; + db_pgno_t region_pgno; + int cmp_n, cmp_p, modified, ret; + u_int32_t oldspace, space; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__heap_addrem_50_print); + REC_INTRO(__heap_addrem_50_read, ip, 1); + region_pgno = HEAP_REGION_PGNO(file_dbp, argp->pgno); + + REC_FGET(mpf, ip, argp->pgno, &pagep, done); + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + + if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_HEAP) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_HEAP)) { + /* We are either redo-ing an add or undoing a delete. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __heap_pitem(dbc, pagep, + argp->indx, argp->nbytes, &argp->hdr, &argp->dbt)) != 0) + goto out; + modified = 1; + } else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_HEAP) || + (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_HEAP)) { + /* We are either undoing an add or redo-ing a delete. 
*/ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __heap_ditem( + dbc, pagep, argp->indx, argp->nbytes)) != 0) + goto out; + modified = 1; + } + + if (modified) { + REC_FGET(mpf, ip, region_pgno, &regionp, done); + if (DB_REDO(op)) + LSN(pagep) = *lsnp; + else + LSN(pagep) = argp->pagelsn; + + /* Update the available space bitmap, if necessary. */ + HEAP_CALCSPACEBITS( + file_dbp, HEAP_FREESPACE(file_dbp, pagep), space); + oldspace = HEAP_SPACE(file_dbp, + regionp, (argp->pgno - region_pgno) - 1); + if (space != oldspace) { + REC_DIRTY(mpf, ip, dbc->priority, &regionp); + HEAP_SETSPACE(file_dbp, + regionp, (argp->pgno - region_pgno) - 1, space); + } + if ((ret = __memp_fput(mpf, ip, regionp, dbc->priority)) != 0) + goto out; + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, dbc->priority); + REC_CLOSE; +} diff --git a/src/heap/heap_reclaim.c b/src/heap/heap_reclaim.c index 8cedb223..463e40c0 100644 --- a/src/heap/heap_reclaim.c +++ b/src/heap/heap_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -50,7 +50,7 @@ __heap_truncate(dbc, countp) return (ret); if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &meta)) != 0) { - __TLPUT(dbc, lock); + (void)__TLPUT(dbc, lock); goto err; } diff --git a/src/heap/heap_stat.c b/src/heap/heap_stat.c index 9f4361a7..13bd36a2 100644 --- a/src/heap/heap_stat.c +++ b/src/heap/heap_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -29,7 +29,7 @@ __heap_stat(dbc, spp, flags) { DB *dbp; DB_HEAP_STAT *sp; - DB_LOCK lock, metalock; + DB_LOCK metalock; DB_MPOOLFILE *mpf; ENV *env; HEAPMETA *meta; @@ -41,7 +41,6 @@ __heap_stat(dbc, spp, flags) meta = NULL; LOCK_INIT(metalock); - LOCK_INIT(lock); mpf = dbp->mpf; sp = NULL; ret = t_ret = write_meta = 0; @@ -147,6 +146,8 @@ __heap_stat_print(dbc, flags) "Underlying database page size", (u_long)sp->heap_pagesize); __db_dl(env, "Number of records in the database", (u_long)sp->heap_nrecs); + __db_dl(env, + "Number of blobs in the database", (u_long)sp->heap_nblobs); __db_dl(env, "Number of database pages", (u_long)sp->heap_pagecnt); __db_dl(env, "Number of database regions", (u_long)sp->heap_nregions); __db_dl(env, @@ -200,11 +201,13 @@ __heap_stat_callback(dbc, h, cookie, putp) * We can't just use NUM_ENT, otherwise we'd mis-count split * records. */ - for (i = 0; i < NUM_ENT(h); i++) { + for (i = 0; i <= HEAP_HIGHINDX(h); i++) { hdr = (HEAPHDR *)P_ENTRY(dbp, h, i); if (!F_ISSET(hdr, HEAP_RECSPLIT) || F_ISSET(hdr, HEAP_RECFIRST)) sp->heap_nrecs++; + if (F_ISSET(hdr, HEAP_RECBLOB)) + sp->heap_nblobs++; } break; case P_HEAPMETA: /* Fallthrough */ diff --git a/src/heap/heap_stub.c b/src/heap/heap_stub.c index b4feb2f3..3093abc2 100644 --- a/src/heap/heap_stub.c +++ b/src/heap/heap_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id: */ @@ -35,6 +35,40 @@ __db_no_heap_am(env) } int +__heap_60_heapmeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + COMPQUIET(real_name, NULL); + COMPQUIET(flags, 0); + COMPQUIET(fhp, NULL); + COMPQUIET(h, NULL); + COMPQUIET(dirtyp, NULL); + return (__db_no_heap_am(dbp->env)); +} + +int +__heap_60_heap(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + COMPQUIET(real_name, NULL); + COMPQUIET(flags, 0); + COMPQUIET(fhp, NULL); + COMPQUIET(h, NULL); + COMPQUIET(dirtyp, NULL); + return (__db_no_heap_am(dbp->env)); +} + +int __heap_db_create(dbp) DB *dbp; { diff --git a/src/heap/heap_upgrade.c b/src/heap/heap_upgrade.c new file mode 100644 index 00000000..35fa78b9 --- /dev/null +++ b/src/heap/heap_upgrade.c @@ -0,0 +1,106 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/blob.h" +#include "dbinc/db_page.h" +#include "dbinc/heap.h" +#include "dbinc/db_upgrade.h" + +/* + * __heap_60_heapmeta-- + * Upgrade the version number. + * + * PUBLIC: int __heap_60_heapmeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__heap_60_heapmeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HEAPMETA *hmeta; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + COMPQUIET(dbp, NULL); + hmeta = (HEAPMETA *)h; + + hmeta->dbmeta.version = 2; + *dirtyp = 1; + + return (0); +} + +/* + * __heap_60_heap -- + * Upgrade the blob records on the database heap pages. 
+ * + * PUBLIC: int __heap_60_heap + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__heap_60_heap(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HEAPBLOBHDR60 hb60; + HEAPBLOBHDR60P1 hb60p1; + HEAPHDR *hdr; + db_seq_t blob_id, blob_size, file_id; + db_indx_t indx, *offtbl; + int ret; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h); + ret = 0; + + DB_ASSERT(dbp->env, HEAPBLOBREC60_SIZE == HEAPBLOBREC_SIZE); + for (indx = 0; indx <= HEAP_HIGHINDX(h); indx++) { + if (offtbl[indx] == 0) + continue; + hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx); + if (F_ISSET(hdr, HEAP_RECBLOB)) { + memcpy(&hb60, hdr, HEAPBLOBREC60_SIZE); + memset(&hb60p1, 0, HEAPBLOBREC_SIZE); + hb60p1.std_hdr.flags = hb60.flags; + hb60p1.std_hdr.size = hb60.size; + hb60p1.encoding = hb60.encoding; + hb60p1.lsn = hb60.lsn; + GET_BLOB60_ID(dbp->env, hb60, blob_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SIZE(dbp->env, hb60, blob_size, ret); + if (ret != 0) + return (ret); + GET_BLOB60_FILE_ID(dbp->env, &hb60, file_id, ret); + if (ret != 0) + return (ret); + SET_BLOB_ID(&hb60p1, blob_id, HEAPBLOBHDR60P1); + SET_BLOB_SIZE(&hb60p1, blob_size, HEAPBLOBHDR60P1); + SET_BLOB_FILE_ID(&hb60p1, file_id, HEAPBLOBHDR60P1); + memcpy(hdr, &hb60p1, HEAPBLOBREC_SIZE); + *dirtyp = 1; + } + } + + return (ret); +} diff --git a/src/heap/heap_verify.c b/src/heap/heap_verify.c index ea15c28b..7c90caf0 100644 --- a/src/heap/heap_verify.c +++ b/src/heap/heap_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,13 +9,14 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_verify.h" #include "dbinc/heap.h" #include "dbinc/lock.h" #include "dbinc/mp.h" -static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, db_indx_t, +static int __heap_safe_gsplit __P((DB *, VRFY_DBINFO *, PAGE *, unsigned, DBT *)); static int __heap_verify_offset_cmp __P((const void *, const void *)); @@ -37,7 +38,8 @@ __heap_vrfy_meta(dbp, vdp, meta, pgno, flags) HEAP *h; VRFY_PAGEINFO *pip; db_pgno_t last_pgno, max_pgno, npgs; - int isbad, ret; + int isbad, ret, t_ret; + db_seq_t blob_id; if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -97,8 +99,40 @@ __heap_vrfy_meta(dbp, vdp, meta, pgno, flags) "%lu"), (u_long)pgno)); isbad = 1; } + h->gbytes = meta->gbytes; + h->bytes = meta->bytes; } +/* + * Where 64-bit integer support is not available, + * return an error if the file has any blobs. + */ + t_ret = 0; +#ifdef HAVE_64BIT_TYPES + GET_BLOB_FILE_ID(dbp->env, meta, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((dbp->env, DB_STR_A("1173", + "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#else /* HAVE_64BIT_TYPES */ + /* + * db_seq_t is an int on systems that do not have 64-bit integer types, so + * this will compile and run. 
+ */ + GET_BLOB_FILE_ID(env, meta, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1206", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#endif + err: if (LF_ISSET(DB_SALVAGE)) ret = __db_salvage_markdone(vdp, pgno); @@ -120,12 +154,16 @@ __heap_vrfy(dbp, vdp, h, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + HEAPBLOBHDR bhdr; HEAPHDR *hdr; - int cnt, i, j, ret; + int i, j, ret; + off_t blob_size; + db_seq_t blob_id, file_id; db_indx_t *offsets, *offtbl, end; + u_int32_t cnt; if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) - goto err; + return (ret); if (TYPE(h) == P_IHEAP) /* Nothing to verify on a region page. */ @@ -140,7 +178,7 @@ __heap_vrfy(dbp, vdp, h, pgno, flags) /* * Build a sorted list of all the offsets in the table. Entries in the * offset table are not always sorted. While we're here, check that - * flags are sane. + * flags are sane, and that the blob entries are sane. */ cnt = 0; for (i = 0; i <= HEAP_HIGHINDX(h); i++) { @@ -164,6 +202,36 @@ __heap_vrfy(dbp, vdp, h, pgno, flags) ret = DB_VERIFY_BAD; goto err; } + if (F_ISSET(hdr, HEAP_RECBLOB)) { + /* + * Check that the blob file exists and is the same + * file size as is stored in the database record. 
+ */ + memcpy(&bhdr, hdr, sizeof(HEAPBLOBHDR)); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(dbp->env, bhdr, blob_size, ret); + if (ret != 0 || blob_size < 0) { + EPRINT((dbp->env, DB_STR_A("1175", + "Page %lu: blob file size value has overflowed", + "%lu"), (u_long)pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + file_id = (db_seq_t)bhdr.file_id; + if (file_id == 0) { + EPRINT((dbp->env, DB_STR_A("1177", + "Page %lu: invalid blob dir id %llu at item %lu", + "%lu %llu, %lu"), (u_long)pgno, + (unsigned long long)file_id, (u_long)i)); + ret = DB_VERIFY_BAD; + goto err; + } + if ((ret = __blob_vrfy(dbp->env, blob_id, + blob_size, file_id, 0, pgno, flags)) != 0) { + ret = DB_VERIFY_BAD; + goto err; + } + } offsets[cnt] = offtbl[i]; cnt++; @@ -180,7 +248,7 @@ __heap_vrfy(dbp, vdp, h, pgno, flags) * record. We can't use the P_ENTRY macro because we've kept track of * the offsets, not the indexes. */ - for (i = 0; i < cnt - 1; i++) { + for (i = 0; i < (int)cnt - 1; i++) { hdr = (HEAPHDR *)((u_int8_t *)h + offsets[i]); end = offsets[i] + HEAP_HDRSIZE(hdr) + hdr->size; if (end > offsets[i+1]) { @@ -328,12 +396,22 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags) u_int32_t flags; { DBT dbt; + ENV *env; HEAPHDR *hdr; + HEAPBLOBHDR bhdr; db_indx_t i, *offtbl; + char *prefix; int err_ret, ret, t_ret; + off_t blob_size, blob_offset, remaining; + u_int32_t blob_buf_size; + u_int8_t *blob_buf; + db_seq_t blob_id, file_id; COMPQUIET(flags, 0); memset(&dbt, 0, sizeof(DBT)); + blob_buf = NULL; + blob_buf_size = 0; + env = dbp->env; offtbl = (db_indx_t *)HEAP_OFFSETTBL(dbp, h); err_ret = ret = t_ret = 0; @@ -357,9 +435,74 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags) if (dbt.size > dbp->pgsize * 4) dbt.size = dbp->pgsize * 4; if ((ret = - __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0) + __os_malloc(env, dbt.size, &dbt.data)) != 0) goto err; - __heap_safe_gsplit(dbp, vdp, h, i, &dbt); + if ((ret = __heap_safe_gsplit + (dbp, vdp, h, i, &dbt)) != 0) { + 
err_ret = ret; + __os_free(env, dbt.data); + continue; + } + } else if (F_ISSET(hdr, HEAP_RECBLOB)) { + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(env, bhdr, blob_size, ret); + if (ret != 0 || blob_size < 0) + goto err; + file_id = (db_seq_t)bhdr.file_id; + /* Read the blob, in pieces if it is too large.*/ + blob_offset = 0; + if (blob_size > MEGABYTE) { + if (blob_buf_size < MEGABYTE) { + if ((ret = __os_realloc( + env, MEGABYTE, &blob_buf)) != 0) + goto err; + blob_buf_size = MEGABYTE; + } + } else if (blob_buf_size < blob_size) { + blob_buf_size = (u_int32_t)blob_size; + if ((ret = __os_realloc( + env, blob_buf_size, &blob_buf)) != 0) + goto err; + } + dbt.data = blob_buf; + dbt.ulen = blob_buf_size; + remaining = blob_size; + prefix = " "; + do { + if ((ret = __blob_salvage(env, blob_id, + blob_offset, + ((remaining < blob_buf_size) ? + (size_t)remaining : blob_buf_size), + file_id, 0, &dbt)) != 0) { + if (LF_ISSET(DB_AGGRESSIVE)) { + ret = DB_VERIFY_BAD; + break; + } + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + if (remaining > blob_buf_size) + F_SET(vdp, SALVAGE_STREAM_BLOB); + else + F_CLR(vdp, SALVAGE_STREAM_BLOB); + if ((t_ret = __db_vrfy_prdbt( + &dbt, 0, prefix, handle, + callback, 0, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + prefix = NULL; + blob_offset += dbt.size; + if (remaining < blob_buf_size) + remaining = 0; + else + remaining -= blob_buf_size; + } while (remaining > 0); + F_CLR(vdp, SALVAGE_STREAM_BLOB); + continue; } else { dbt.data = (u_int8_t *)hdr + HEAP_HDRSIZE(hdr); dbt.size = hdr->size; @@ -369,11 +512,13 @@ __heap_salvage(dbp, vdp, pgno, h, handle, callback, flags) 0, " ", handle, callback, 0, 0, vdp)) != 0) err_ret = ret; if (F_ISSET(hdr, HEAP_RECSPLIT)) - __os_free(dbp->env, dbt.data); + __os_free(env, dbt.data); } err: if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) return (t_ret); + if (blob_buf != NULL) + __os_free(env, 
blob_buf); return ((ret == 0 && err_ret != 0) ? err_ret : ret); } @@ -386,7 +531,7 @@ __heap_safe_gsplit(dbp, vdp, h, i, dbt) DB *dbp; VRFY_DBINFO *vdp; PAGE *h; - db_indx_t i; + unsigned i; DBT *dbt; { DB_MPOOLFILE *mpf; @@ -433,7 +578,7 @@ __heap_safe_gsplit(dbp, vdp, h, i, dbt) err: if (gotpg && (t_ret = __memp_fput( mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0) - t_ret = ret; + ret = t_ret; return (ret); } diff --git a/src/hmac/hmac.c b/src/hmac/hmac.c index 4febfc60..acaca6bc 100644 --- a/src/hmac/hmac.c +++ b/src/hmac/hmac.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * Some parts of this code originally written by Adam Stubblefield, * -- astubble@rice.edu. diff --git a/src/lock/Design b/src/lock/Design index f82bc7e8..2a1d1145 100644 --- a/src/lock/Design +++ b/src/lock/Design @@ -298,4 +298,4 @@ A: We currently do not support any automatic configuration for FINE_GRAIN locking. When we do, will need to document that atomicity discussion listed above (it is bug-report #553). -Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. diff --git a/src/lock/lock.c b/src/lock/lock.c index e4627734..bcebbe44 100644 --- a/src/lock/lock.c +++ b/src/lock/lock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -31,8 +31,8 @@ static int __lock_trade __P((ENV *, DB_LOCK *, DB_LOCKER *)); static int __lock_vec_api __P((ENV *, u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **)); -static const char __db_lock_invalid[] = "%s: Lock is no longer valid"; -static const char __db_locker_invalid[] = "Locker is not valid"; +#define LOCK_INVALID_ERR DB_STR_A("2056", "%s: Lock is no longer valid", "%s") +#define LOCKER_INVALID_ERR DB_STR("2057", "Locker is not valid") #ifdef DEBUG extern void __db_loadme (void); @@ -111,7 +111,8 @@ __lock_vec(env, sh_locker, flags, list, nlist, elistp) DB_LOCKREQ *list, **elistp; { struct __db_lock *lp, *next_lock; - DB_LOCK lock; DB_LOCKOBJ *sh_obj; + DB_LOCK lock; + DB_LOCKOBJ *sh_obj; DB_LOCKREGION *region; DB_LOCKTAB *lt; DBT *objlist, *np; @@ -200,12 +201,18 @@ __lock_vec(env, sh_locker, flags, list, nlist, elistp) if (writes == 1 || lp->mode == DB_LOCK_READ || lp->mode == DB_LOCK_READ_UNCOMMITTED) { - SH_LIST_REMOVE(lp, - locker_links, __db_lock); + /* + * It is safe to look at lp before + * locking because any threads sharing + * this locker must not be in the API + * at the same time. + */ sh_obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ); ndx = sh_obj->indx; OBJECT_LOCK_NDX(lt, region, ndx); + SH_LIST_REMOVE(lp, + locker_links, __db_lock); /* * We are not letting lock_put_internal * unlink the lock, so we'll have to @@ -423,7 +430,7 @@ __lock_get_api(env, locker, flags, obj, lock_mode, lock) region = env->lk_handle->reginfo.primary; LOCK_LOCKERS(env, region); - ret = __lock_getlocker_int(env->lk_handle, locker, 0, &sh_locker); + ret = __lock_getlocker_int(env->lk_handle, locker, 0, NULL, &sh_locker); UNLOCK_LOCKERS(env, region); LOCK_SYSTEM_LOCK(env->lk_handle, region); if (ret == 0) @@ -979,12 +986,21 @@ in_abort: newl->status = DB_LSTAT_WAITING; goto err; } + /* + * Sleep until someone releases a lock which might let us in. 
+ * Since we want to set the thread state back to ACTIVE, don't + * use the normal MUTEX_LOCK() macro, which would immediately + * return a panic error code. Instead, return the panic after + * restoring the thread state. + */ PERFMON2(env, lock, suspend, (DBT *) obj, lock_mode); - MUTEX_LOCK(env, newl->mtx_lock); + ret = __mutex_lock(env, newl->mtx_lock); PERFMON2(env, lock, resume, (DBT *) obj, lock_mode); if (ip != NULL) ip->dbth_state = THREAD_ACTIVE; + if (ret != 0) + return (ret); LOCK_SYSTEM_LOCK(lt, region); OBJECT_LOCK_NDX(lt, region, ndx); @@ -1165,7 +1181,7 @@ __lock_put_nolock(env, lock, runp, flags) lockp = R_ADDR(<->reginfo, lock->off); DB_ASSERT(env, lock->gen == lockp->gen); if (lock->gen != lockp->gen) { - __db_errx(env, __db_lock_invalid, "DB_LOCK->lock_put"); + __db_errx(env, LOCK_INVALID_ERR, "DB_LOCK->lock_put"); LOCK_INIT(*lock); return (EINVAL); } @@ -1224,7 +1240,7 @@ __lock_downgrade(env, lock, new_mode, flags) lockp = R_ADDR(<->reginfo, lock->off); if (lock->gen != lockp->gen) { - __db_errx(env, __db_lock_invalid, "lock_downgrade"); + __db_errx(env, LOCK_INVALID_ERR, "lock_downgrade"); ret = EINVAL; goto out; } @@ -1662,7 +1678,7 @@ __lock_inherit_locks(lt, sh_locker, flags) * locks, so inheritance is easy! */ if (sh_locker == NULL) { - __db_errx(env, __db_locker_invalid); + __db_errx(env, LOCKER_INVALID_ERR); return (EINVAL); } @@ -1683,11 +1699,15 @@ __lock_inherit_locks(lt, sh_locker, flags) for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock); lp != NULL; lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) { - SH_LIST_REMOVE(lp, locker_links, __db_lock); - - /* See if the parent already has a lock. */ + /* + * See if the parent already has a lock. It is safe to look at + * lp before locking it because any threads sharing this locker + * must not be in the API at the same time. 
+ */ obj = SH_OFF_TO_PTR(lp, lp->obj, DB_LOCKOBJ); OBJECT_LOCK_NDX(lt, region, obj->indx); + SH_LIST_REMOVE(lp, locker_links, __db_lock); + SH_TAILQ_FOREACH(hlp, &obj->holders, links, __db_lock) if (hlp->holder == poff && lp->mode == hlp->mode) break; @@ -1917,7 +1937,7 @@ __lock_trade(env, lock, new_locker) /* If the lock is already released, simply return. */ if (lp->gen != lock->gen) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); if (new_locker == NULL) { __db_errx(env, DB_STR("2040", "Locker does not exist")); diff --git a/src/lock/lock_alloc.incl b/src/lock/lock_alloc.incl index edea07d2..e10cbcbf 100644 --- a/src/lock/lock_alloc.incl +++ b/src/lock/lock_alloc.incl @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/lock/lock_deadlock.c b/src/lock/lock_deadlock.c index 3c00d7f1..79086687 100644 --- a/src/lock/lock_deadlock.c +++ b/src/lock/lock_deadlock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -683,38 +683,45 @@ again: memset(bitmap, 0, count * sizeof(u_int32_t) * nentries); /* * Now for each locker, record its last lock and set abort status. * We need to look at the heldby list carefully. We have the LOCKERS - * locked so they cannot go away. The lock at the head of the - * list can be removed by locking the object it points at. - * Since lock memory is not freed if we get a lock we can look - * at it safely but SH_LIST_FIRST is not atomic, so we check that - * the list has not gone empty during that macro. 
We check abort - * status after building the bit maps so that we will not detect - * a blocked transaction without noting that it is already aborting. + * locked so they cannot go away. The LOCK_SYSTEM_LOCK keeps things + * steady when the lock table is not partitioned. However, if there are + * multiple lock partitions then the head of the heldby list can be + * changed by another thread locking the object it points at. That + * thread will have OBJECT_LOCK()'d that lock's partition. We need to + * look at the lock entry in order to determine which partition to + * mutex_lock. Since lock structs are never really freed, once we get + * the pointer we can look at it safely. However SH_LIST_FIRST is not + * atomic, so we first fetch the pointer and then check that the list + * was not empty during the fetch. This lets us at least mutex_lock the + * partition of the lock. Afterwards, we retry if the lock is no longer + * the first for that locker -- it might have changed to something ELSE + * since then. We check abort status after building the bit maps so that + * we will not pick a blocked transaction without noting that it is + * already aborting. */ for (id = 0; id < count; id++) { if (!id_array[id].valid) continue; - if ((ret = __lock_getlocker_int(lt, - id_array[id].id, 0, &lockerp)) != 0 || lockerp == NULL) + if ((ret = __lock_getlocker_int(lt, id_array[id].id, + 0, NULL, &lockerp)) != 0 || lockerp == NULL) continue; /* - * If this is a master transaction, try to - * find one of its children's locks first, - * as they are probably more recent. + * If this is a master transaction, try to find one of its + * children's locks first, as they are probably more recent. 
*/ child = SH_LIST_FIRST(&lockerp->child_locker, __db_locker); if (child != NULL) { do { -c_retry: lp = SH_LIST_FIRST(&child->heldby, __db_lock); - if (SH_LIST_EMPTY(&child->heldby) || lp == NULL) +c_retry: lp = SH_LIST_FIRSTP(&child->heldby, __db_lock); + if (__SH_LIST_WAS_EMPTY(&child->heldby, lp)) goto c_next; if (F_ISSET(child, DB_LOCKER_INABORT)) id_array[id].in_abort = 1; ndx = lp->indx; OBJECT_LOCK_NDX(lt, region, ndx); - if (lp != SH_LIST_FIRST( + if (lp != SH_LIST_FIRSTP( &child->heldby, __db_lock) || ndx != lp->indx) { OBJECT_UNLOCK(lt, region, ndx); @@ -733,11 +740,11 @@ c_next: child = SH_LIST_NEXT( } while (child != NULL); } -l_retry: lp = SH_LIST_FIRST(&lockerp->heldby, __db_lock); - if (!SH_LIST_EMPTY(&lockerp->heldby) && lp != NULL) { +l_retry: lp = SH_LIST_FIRSTP(&lockerp->heldby, __db_lock); + if (!__SH_LIST_WAS_EMPTY(&lockerp->heldby, lp)) { ndx = lp->indx; OBJECT_LOCK_NDX(lt, region, ndx); - if (lp != SH_LIST_FIRST(&lockerp->heldby, __db_lock) || + if (lp != SH_LIST_FIRSTP(&lockerp->heldby, __db_lock) || lp->indx != ndx) { OBJECT_UNLOCK(lt, region, ndx); goto l_retry; @@ -869,7 +876,7 @@ __dd_abort(env, info, statusp) * detecting, return that. */ if ((ret = __lock_getlocker_int(lt, - info->last_locker_id, 0, &lockerp)) != 0) + info->last_locker_id, 0, NULL, &lockerp)) != 0) goto err; if (lockerp == NULL || F_ISSET(lockerp, DB_LOCKER_INABORT)) { *statusp = DB_ALREADY_ABORTED; diff --git a/src/lock/lock_failchk.c b/src/lock/lock_failchk.c index 59fb010f..84f757bf 100644 --- a/src/lock/lock_failchk.c +++ b/src/lock/lock_failchk.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,7 +15,7 @@ /* * __lock_failchk -- * Check for locks held by dead threads of control and release - * read locks. 
If any write locks were held by dead non-trasnactional + * read locks. If any write locks were held by dead non-transactional * lockers then we must abort and run recovery. Otherwise we release * read locks for lockers owned by dead threads. Write locks for * dead transactional lockers will be freed when we abort the transaction. @@ -98,9 +98,8 @@ retry: LOCK_LOCKERS(env, lrp); /* * This locker is most likely referenced by a cursor * which is owned by a dead thread. Normally the - * cursor would be available for other threads - * but we assume the dead thread will never release - * it. + * cursor would be available for other threads but we + * assume the dead thread will never release it. */ if (lip->id < TXN_MINIMUM && (ret = __lock_freelocker(lt, lip)) != 0) diff --git a/src/lock/lock_id.c b/src/lock/lock_id.c index 24b545d1..e0dbaa01 100644 --- a/src/lock/lock_id.c +++ b/src/lock/lock_id.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -17,7 +17,7 @@ static int __lock_freelocker_int /* * __lock_id_pp -- - * ENV->lock_id pre/post processing. + * DB_ENV->lock_id pre/post processing. * * PUBLIC: int __lock_id_pp __P((DB_ENV *, u_int32_t *)); */ @@ -43,7 +43,11 @@ __lock_id_pp(dbenv, idp) /* * __lock_id -- - * ENV->lock_id. + * Allocate a new lock id as well as a locker struct to hold it. If we wrap + * around then we find the minimum currently in use and make sure we can + * stay below that. This is similar to __txn_begin_int's code to recover + * txn ids. 
+ * * * PUBLIC: int __lock_id __P((ENV *, u_int32_t *, DB_LOCKER **)); */ @@ -59,22 +63,15 @@ __lock_id(env, idp, lkp) u_int32_t id, *ids; int nids, ret; - lk = NULL; lt = env->lk_handle; region = lt->reginfo.primary; id = DB_LOCK_INVALIDID; - ret = 0; - - id = DB_LOCK_INVALIDID; lk = NULL; + ret = 0; LOCK_LOCKERS(env, region); /* - * Allocate a new lock id. If we wrap around then we find the minimum - * currently in use and make sure we can stay below that. This code is - * similar to code in __txn_begin_int for recovering txn ids. - * * Our current valid range can span the maximum valid value, so check * for it and wrap manually. */ @@ -98,7 +95,7 @@ __lock_id(env, idp, lkp) id = ++region->lock_id; /* Allocate a locker for this id. */ - ret = __lock_getlocker_int(lt, id, 1, &lk); + ret = __lock_getlocker_int(lt, id, 1, NULL, &lk); err: UNLOCK_LOCKERS(env, region); @@ -165,7 +162,8 @@ __lock_id_free_pp(dbenv, id) LOCK_LOCKERS(env, region); if ((ret = - __lock_getlocker_int(env->lk_handle, id, 0, &sh_locker)) == 0) { + __lock_getlocker_int(env->lk_handle, + id, 0, NULL, &sh_locker)) == 0) { if (sh_locker != NULL) ret = __lock_freelocker_int(lt, region, sh_locker, 1); else { @@ -194,8 +192,10 @@ __lock_id_free(env, sh_locker) ENV *env; DB_LOCKER *sh_locker; { + DB_LOCKER locker; DB_LOCKREGION *region; DB_LOCKTAB *lt; + DB_MSGBUF mb; int ret; lt = env->lk_handle; @@ -203,9 +203,14 @@ __lock_id_free(env, sh_locker) ret = 0; if (sh_locker->nlocks != 0) { - __db_errx(env, DB_STR("2046", - "Locker still has locks")); - ret = EINVAL; + locker = *sh_locker; + ret = USR_ERR(env, EINVAL); + __db_errx(env, DB_STR_A("2046", + "Locker %d still has %d locks", "%d %d"), + locker.id, locker.nlocks ); + DB_MSGBUF_INIT(&mb); + (void)__lock_dump_locker(env, &mb, lt, sh_locker); + DB_MSGBUF_FLUSH(env, &mb); goto err; } @@ -243,17 +248,19 @@ __lock_id_set(env, cur_id, max_id) } /* - * __lock_getlocker -- - * Get a locker in the locker hash table. 
The create parameter - * indicates if the locker should be created if it doesn't exist in - * the table. + * __lock_getlocker,__lock_getlocker_int -- + * Get a locker in the locker hash table. The create parameter indicates + * whether the locker should be created if it doesn't exist in the table. If + * there's a matching locker cached in the thread info, use that without + * locking. * - * This must be called with the locker mutex lock if create == 1. + * The internal version does not check the thread info cache; it must be called + * with the locker mutex locked. * * PUBLIC: int __lock_getlocker __P((DB_LOCKTAB *, * PUBLIC: u_int32_t, int, DB_LOCKER **)); * PUBLIC: int __lock_getlocker_int __P((DB_LOCKTAB *, - * PUBLIC: u_int32_t, int, DB_LOCKER **)); + * PUBLIC: u_int32_t, int, DB_THREAD_INFO *, DB_LOCKER **)); */ int __lock_getlocker(lt, locker, create, retp) @@ -263,32 +270,47 @@ __lock_getlocker(lt, locker, create, retp) DB_LOCKER **retp; { DB_LOCKREGION *region; + DB_THREAD_INFO *ip; ENV *env; int ret; COMPQUIET(region, NULL); env = lt->env; region = lt->reginfo.primary; - + ENV_GET_THREAD_INFO(env, ip); + + /* Check to see if the locker is already in the thread info */ + if (ip != NULL && ip->dbth_local_locker != INVALID_ROFF) { + *retp = (DB_LOCKER *) + R_ADDR(<->reginfo, ip->dbth_local_locker); + if ((*retp)->id == locker) { + DB_ASSERT(env, !F_ISSET(*retp, DB_LOCKER_FREE)); +#ifdef HAVE_STATISTICS + region->stat.st_nlockers_hit++; +#endif + return (0); + } + } LOCK_LOCKERS(env, region); - ret = __lock_getlocker_int(lt, locker, create, retp); + ret = __lock_getlocker_int(lt, locker, create, ip, retp); UNLOCK_LOCKERS(env, region); - return (ret); } int -__lock_getlocker_int(lt, locker, create, retp) +__lock_getlocker_int(lt, locker, create, ip, retp) DB_LOCKTAB *lt; u_int32_t locker; int create; + DB_THREAD_INFO *ip; DB_LOCKER **retp; { DB_LOCKER *sh_locker; DB_LOCKREGION *region; - DB_THREAD_INFO *ip; +#ifdef DIAGNOSTIC + DB_THREAD_INFO *diag; +#endif 
ENV *env; - db_mutex_t mutex; u_int32_t i, indx, nlockers; int ret; @@ -304,59 +326,85 @@ __lock_getlocker_int(lt, locker, create, retp) SH_TAILQ_FOREACH(sh_locker, <->locker_tab[indx], links, __db_locker) if (sh_locker->id == locker) break; + if (sh_locker == NULL && create) { - nlockers = 0; - /* Create new locker and then insert it into hash table. */ - if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK, - DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK, - &mutex)) != 0) - return (ret); - else - MUTEX_LOCK(env, mutex); - if ((sh_locker = SH_TAILQ_FIRST( - ®ion->free_lockers, __db_locker)) == NULL) { - nlockers = region->stat.st_lockers >> 2; - /* Just in case. */ - if (nlockers == 0) - nlockers = 1; - if (region->stat.st_maxlockers != 0 && - region->stat.st_maxlockers < - region->stat.st_lockers + nlockers) - nlockers = region->stat.st_maxlockers - - region->stat.st_lockers; - /* - * Don't hold lockers when getting the region, - * we could deadlock. When creating a locker - * there is no race since the id allocation - * is synchronized. - */ - UNLOCK_LOCKERS(env, region); - LOCK_REGION_LOCK(env); - /* - * If the max memory is not sized for max objects, - * allocate as much as possible. - */ - F_SET(<->reginfo, REGION_TRACKED); - while (__env_alloc(<->reginfo, nlockers * - sizeof(struct __db_locker), &sh_locker) != 0) - if ((nlockers >> 1) == 0) - break; - F_CLR(<->reginfo, REGION_TRACKED); - LOCK_REGION_UNLOCK(lt->env); - LOCK_LOCKERS(env, region); - for (i = 0; i < nlockers; i++) { + /* Can we reuse a locker struct cached in the thread info? */ + if (ip != NULL && ip->dbth_local_locker != INVALID_ROFF && + (sh_locker = (DB_LOCKER*)R_ADDR(<->reginfo, + ip->dbth_local_locker))->id == DB_LOCK_INVALIDID) { + DB_ASSERT(env, !F_ISSET(sh_locker, DB_LOCKER_FREE)); +#ifdef HAVE_STATISTICS + region->stat.st_nlockers_reused++; +#endif + } else { + /* Create new locker and insert it into hash table. 
*/ + if ((sh_locker = SH_TAILQ_FIRST( + ®ion->free_lockers, __db_locker)) == NULL) { + nlockers = region->stat.st_lockers >> 2; + /* Just in case. */ + if (nlockers == 0) + nlockers = 1; + if (region->stat.st_maxlockers != 0 && + region->stat.st_maxlockers < + region->stat.st_lockers + nlockers) + nlockers = region->stat.st_maxlockers - + region->stat.st_lockers; + /* + * Don't hold lockers when getting the region, + * we could deadlock. When creating a locker + * there is no race since the id allocation + * is synchronized. + */ + UNLOCK_LOCKERS(env, region); + LOCK_REGION_LOCK(env); + /* + * If the max memory is not sized for max + * objects, allocate as much as possible. + */ + F_SET(<->reginfo, REGION_TRACKED); + while (__env_alloc(<->reginfo, nlockers * + sizeof(struct __db_locker), + &sh_locker) != 0) { + nlockers >>= 1; + if (nlockers == 0) + break; + } + F_CLR(<->reginfo, REGION_TRACKED); + LOCK_REGION_UNLOCK(lt->env); + LOCK_LOCKERS(env, region); + for (i = 0; i < nlockers; i++) { + SH_TAILQ_INSERT_HEAD( + ®ion->free_lockers, + sh_locker, links, __db_locker); + sh_locker->mtx_locker = MUTEX_INVALID; +#ifdef DIAGNOSTIC + sh_locker->prev_locker = INVALID_ROFF; +#endif + sh_locker++; + } + if (nlockers == 0) + return (__lock_nomem(env, + "locker entries")); + region->stat.st_lockers += nlockers; + sh_locker = SH_TAILQ_FIRST( + ®ion->free_lockers, __db_locker); + } + SH_TAILQ_REMOVE( + ®ion->free_lockers, + sh_locker, links, __db_locker); + } + F_CLR(sh_locker, DB_LOCKER_FREE); + if (sh_locker->mtx_locker == MUTEX_INVALID) { + if ((ret = __mutex_alloc(env, MTX_LOGICAL_LOCK, + DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_SELF_BLOCK, + &sh_locker->mtx_locker)) != 0) { SH_TAILQ_INSERT_HEAD(®ion->free_lockers, sh_locker, links, __db_locker); - sh_locker++; + return (ret); } - if (nlockers == 0) - return (__lock_nomem(env, "locker entries")); - region->stat.st_lockers += nlockers; - sh_locker = SH_TAILQ_FIRST( - ®ion->free_lockers, __db_locker); + MUTEX_LOCK(env, 
sh_locker->mtx_locker); } - SH_TAILQ_REMOVE( - ®ion->free_lockers, sh_locker, links, __db_locker); + ++region->nlockers; #ifdef HAVE_STATISTICS STAT_PERFMON2(env, lock, nlockers, region->nlockers, locker); @@ -365,10 +413,10 @@ __lock_getlocker_int(lt, locker, create, retp) region->stat.st_maxnlockers, region->nlockers, locker); #endif + sh_locker->id = locker; env->dbenv->thread_id( env->dbenv, &sh_locker->pid, &sh_locker->tid); - sh_locker->mtx_locker = mutex; sh_locker->dd_id = 0; sh_locker->master_locker = INVALID_ROFF; sh_locker->parent_locker = INVALID_ROFF; @@ -386,10 +434,20 @@ __lock_getlocker_int(lt, locker, create, retp) <->locker_tab[indx], sh_locker, links, __db_locker); SH_TAILQ_INSERT_HEAD(®ion->lockers, sh_locker, ulinks, __db_locker); - ENV_GET_THREAD_INFO(env, ip); + + if (ip != NULL && ip->dbth_local_locker == INVALID_ROFF) + ip->dbth_local_locker = + R_OFFSET(<->reginfo, sh_locker); #ifdef DIAGNOSTIC - if (ip != NULL) - ip->dbth_locker = R_OFFSET(<->reginfo, sh_locker); + /* + * __db_has_pagelock checks for proper locking by dbth_locker. + */ + if ((diag = ip) == NULL) + ENV_GET_THREAD_INFO(env, diag); + if (diag != NULL) { + sh_locker->prev_locker = diag->dbth_locker; + diag->dbth_locker = R_OFFSET(<->reginfo, sh_locker); + } #endif } @@ -420,7 +478,7 @@ __lock_addfamilylocker(env, pid, id, is_family) LOCK_LOCKERS(env, region); /* get/create the parent locker info */ - if ((ret = __lock_getlocker_int(lt, pid, 1, &mlockerp)) != 0) + if ((ret = __lock_getlocker_int(lt, pid, 1, NULL, &mlockerp)) != 0) goto err; /* @@ -430,7 +488,7 @@ __lock_addfamilylocker(env, pid, id, is_family) * we manipulate it, nor can another child in the * family be created at the same time. */ - if ((ret = __lock_getlocker_int(lt, id, 1, &lockerp)) != 0) + if ((ret = __lock_getlocker_int(lt, id, 1, NULL, &lockerp)) != 0) goto err; /* Point to our parent. 
*/ @@ -466,9 +524,9 @@ err: UNLOCK_LOCKERS(env, region); } /* - * __lock_freelocker_int + * __lock_freelocker_int -- * Common code for deleting a locker; must be called with the - * locker bucket locked. + * lockers mutex locked. */ static int __lock_freelocker_int(lt, region, sh_locker, reallyfree) @@ -478,15 +536,21 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree) int reallyfree; { ENV *env; + DB_MSGBUF mb; + DB_THREAD_INFO *ip; u_int32_t indx; int ret; env = lt->env; - - if (SH_LIST_FIRST(&sh_locker->heldby, __db_lock) != NULL) { - __db_errx(env, DB_STR("2047", - "Freeing locker with locks")); - return (EINVAL); + if (!SH_LIST_EMPTY(&sh_locker->heldby)) { + ret = USR_ERR(env, EINVAL); + __db_errx(env, + DB_STR("2060", "Freeing locker %x with locks"), + sh_locker->id); + DB_MSGBUF_INIT(&mb); + (void)__lock_dump_locker(env, &mb, lt, sh_locker); + DB_MSGBUF_FLUSH(env, &mb); + return (ret); } /* If this is part of a family, we must fix up its links. */ @@ -499,16 +563,29 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree) LOCKER_HASH(lt, region, sh_locker->id, indx); SH_TAILQ_REMOVE(<->locker_tab[indx], sh_locker, links, __db_locker); - if (sh_locker->mtx_locker != MUTEX_INVALID && - (ret = __mutex_free(env, &sh_locker->mtx_locker)) != 0) - return (ret); - SH_TAILQ_INSERT_HEAD(®ion->free_lockers, sh_locker, - links, __db_locker); SH_TAILQ_REMOVE(®ion->lockers, sh_locker, ulinks, __db_locker); region->nlockers--; STAT_PERFMON2(env, lock, nlockers, region->nlockers, sh_locker->id); + /* + * If this locker is cached in the thread info, zero the id and + * leave it allocated. Otherwise, put it back on the free list. 
+ */ + ENV_GET_THREAD_INFO(env, ip); + if (ip != NULL && ip->dbth_local_locker == + R_OFFSET(&lt->reginfo, sh_locker)) { + DB_ASSERT(env, + MUTEX_IS_BUSY(env, sh_locker->mtx_locker)); + sh_locker->id = DB_LOCK_INVALIDID; + } else { + if (sh_locker->mtx_locker != MUTEX_INVALID && (ret = + __mutex_free(env, &sh_locker->mtx_locker)) != 0) + return (ret); + F_SET(sh_locker, DB_LOCKER_FREE); + SH_TAILQ_INSERT_HEAD(&region->free_lockers, sh_locker, + links, __db_locker); + } } return (0); @@ -518,7 +595,7 @@ __lock_freelocker_int(lt, region, sh_locker, reallyfree) * __lock_freelocker * Remove a locker its family from the hash table. * - * This must be called without the locker bucket locked. + * This must be called without the lockers mutex locked. * * PUBLIC: int __lock_freelocker __P((DB_LOCKTAB *, DB_LOCKER *)); */ @@ -570,3 +647,42 @@ __lock_familyremove(lt, sh_locker) return (ret); } + +/* + * __lock_local_locker_invalidate -- + * Search the thread info table's cached lockers and discard any reference + * to this mutex.
+ * + * PUBLIC: int __lock_local_locker_invalidate __P((ENV *, db_mutex_t)); + */ +int +__lock_local_locker_invalidate(env, mutex) + ENV *env; + db_mutex_t mutex; +{ + DB_HASHTAB *htab; + DB_LOCKER *locker; + DB_THREAD_INFO *ip; + u_int32_t i; + char buf[DB_THREADID_STRLEN]; + + htab = env->thr_hashtab; + for (i = 0; i < env->thr_nbucket; i++) { + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) { + if (ip->dbth_local_locker == INVALID_ROFF) + continue; + locker = (DB_LOCKER *)R_ADDR(&env->lk_handle->reginfo, + ip->dbth_local_locker); + if (locker->mtx_locker == mutex) { + __db_msg(env, +DB_STR_A("2061", "Removing cached locker mutex %lu reference by %s", "%lu %s"), + (u_long)mutex, + env->dbenv->thread_id_string(env->dbenv, + locker->pid, locker->tid, buf)); + locker->mtx_locker = MUTEX_INVALID; + return (0); + } + } + } + return (0); +} diff --git a/src/lock/lock_list.c b/src/lock/lock_list.c index 1e3d2a55..5d55e4a0 100644 --- a/src/lock/lock_list.c +++ b/src/lock/lock_list.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/lock/lock_method.c b/src/lock/lock_method.c index 0cc2e19d..0e6c0428 100644 --- a/src/lock/lock_method.c +++ b/src/lock/lock_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/lock/lock_region.c b/src/lock/lock_region.c index 1aae1815..ecc7ba47 100644 --- a/src/lock/lock_region.c +++ b/src/lock/lock_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -120,13 +120,15 @@ __lock_open(env) } /* - * A process joining the region may have reset the lock and transaction - * timeouts. + * Lock and transaction timeouts will be ignored when joining the + * environment, so print a warning if either was set. */ - if (dbenv->lk_timeout != 0) - region->lk_timeout = dbenv->lk_timeout; - if (dbenv->tx_timeout != 0) - region->tx_timeout = dbenv->tx_timeout; + if (dbenv->lk_timeout != 0 && region->lk_timeout != dbenv->lk_timeout) + __db_msg(env, DB_STR("2058", +"Warning: Ignoring DB_SET_LOCK_TIMEOUT when joining the environment.")); + if (dbenv->tx_timeout != 0 && region->tx_timeout != dbenv->tx_timeout) + __db_msg(env, DB_STR("2059", +"Warning: Ignoring DB_SET_TXN_TIMEOUT when joining the environment.")); LOCK_REGION_UNLOCK(env); region_locked = 0; @@ -396,13 +398,30 @@ __lock_env_refresh(env) R_ADDR(reginfo, lr->locker_mem_off)); } - /* Detach from the region. */ - ret = __env_region_detach(env, reginfo, 0); + ret = __lock_region_detach(env, lt); - /* Discard DB_LOCKTAB. */ - __os_free(env, lt); - env->lk_handle = NULL; + return (ret); +} + +/* + * __lock_region_detach -- + * + * PUBLIC: int __lock_region_detach __P((ENV *, DB_LOCKTAB *)); + */ +int +__lock_region_detach(env, lt) + ENV *env; + DB_LOCKTAB *lt; +{ + int ret; + ret = 0; + if (lt != NULL) { + ret = __env_region_detach(env, &lt->reginfo, 0); + /* Discard DB_LOCKTAB. */ + __os_free(env, lt); + env->lk_handle = NULL; + } return (ret); } diff --git a/src/lock/lock_stat.c b/src/lock/lock_stat.c index 11b934aa..1ce0796a 100644 --- a/src/lock/lock_stat.c +++ b/src/lock/lock_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
* * $Id$ */ @@ -15,8 +15,6 @@ #include "dbinc/db_am.h" #ifdef HAVE_STATISTICS -static int __lock_dump_locker - __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *)); static int __lock_dump_object __P((DB_LOCKTAB *, DB_MSGBUF *, DB_LOCKOBJ *)); static int __lock_print_all __P((ENV *, u_int32_t)); static int __lock_print_stats __P((ENV *, u_int32_t)); @@ -363,6 +361,11 @@ __lock_print_stats(env, flags) __db_dl(env, "Maximum number of lockers at any one time", (u_long)sp->st_maxnlockers); __db_dl(env, + "Number of hits in the thread locker cache", + (u_long)sp->st_nlockers_hit); + __db_dl(env, + "Total number of lockers reused", (u_long)sp->st_nlockers_reused); + __db_dl(env, "Number of current lock objects", (u_long)sp->st_nobjects); __db_dl(env, "Maximum number of lock objects at any one time", (u_long)sp->st_maxnobjects); @@ -463,9 +466,17 @@ __lock_print_all(env, flags) if (timespecisset(&lrp->next_timeout)) { #ifdef HAVE_STRFTIME time_t t = (time_t)lrp->next_timeout.tv_sec; + struct tm *tm_p; char tbuf[64]; +#ifdef HAVE_LOCALTIME_R + struct tm tm; + + tm_p = localtime_r(&t, &tm); +#else + tm_p = localtime(&t); +#endif if (strftime(tbuf, sizeof(tbuf), - "%m-%d-%H:%M:%S", localtime(&t)) != 0) + "%m-%d-%H:%M:%S", tm_p) != 0) __db_msg(env, "next_timeout: %s.%09lu", tbuf, (u_long)lrp->next_timeout.tv_nsec); else @@ -519,80 +530,6 @@ __lock_print_all(env, flags) } static int -__lock_dump_locker(env, mbp, lt, lip) - ENV *env; - DB_MSGBUF *mbp; - DB_LOCKTAB *lt; - DB_LOCKER *lip; -{ - DB_LOCKREGION *lrp; - struct __db_lock *lp; - char buf[DB_THREADID_STRLEN]; - u_int32_t ndx; - - lrp = lt->reginfo.primary; - - __db_msgadd(env, - mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s", - (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites, - env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf)); - __db_msgadd(env, mbp, - " flags %-4x priority %-10u", lip->flags, lip->priority); - - if (timespecisset(&lip->tx_expire)) { -#ifdef HAVE_STRFTIME 
- time_t t = (time_t)lip->tx_expire.tv_sec; - char tbuf[64]; - if (strftime(tbuf, sizeof(tbuf), - "%m-%d-%H:%M:%S", localtime(&t)) != 0) - __db_msgadd(env, mbp, "expires %s.%09lu", - tbuf, (u_long)lip->tx_expire.tv_nsec); - else -#endif - __db_msgadd(env, mbp, "expires %lu.%09lu", - (u_long)lip->tx_expire.tv_sec, - (u_long)lip->tx_expire.tv_nsec); - } - if (F_ISSET(lip, DB_LOCKER_TIMEOUT)) - __db_msgadd( - env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout); - if (timespecisset(&lip->lk_expire)) { -#ifdef HAVE_STRFTIME - time_t t = (time_t)lip->lk_expire.tv_sec; - char tbuf[64]; - if (strftime(tbuf, - sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0) - __db_msgadd(env, mbp, " lk expires %s.%09lu", - tbuf, (u_long)lip->lk_expire.tv_nsec); - else -#endif - __db_msgadd(env, mbp, " lk expires %lu.%09lu", - (u_long)lip->lk_expire.tv_sec, - (u_long)lip->lk_expire.tv_nsec); - } - DB_MSGBUF_FLUSH(env, mbp); - - /* - * We need some care here since the list may change while we - * look. - */ -retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) { - if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) { - ndx = lp->indx; - OBJECT_LOCK_NDX(lt, lrp, ndx); - if (lp->indx == ndx) - __lock_printlock(lt, mbp, lp, 1); - else { - OBJECT_UNLOCK(lt, lrp, ndx); - goto retry; - } - OBJECT_UNLOCK(lt, lrp, ndx); - } - } - return (0); -} - -static int __lock_dump_object(lt, mbp, op) DB_LOCKTAB *lt; DB_MSGBUF *mbp; @@ -619,6 +556,31 @@ __lock_print_header(env) "Count", "Status", "----------------- Object ---------------"); } +#else /* !HAVE_STATISTICS */ + +int +__lock_stat_pp(dbenv, statp, flags) + DB_ENV *dbenv; + DB_LOCK_STAT **statp; + u_int32_t flags; +{ + COMPQUIET(statp, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} + +int +__lock_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} +#endif + /* * __lock_printlock -- * @@ -744,27 +706,81 @@ 
__lock_printlock(lt, mbp, lp, ispgno) DB_MSGBUF_FLUSH(env, mbp); } -#else /* !HAVE_STATISTICS */ - +/* + * __lock_dump_locker -- + * Display the identity and statistics of a locker. This is used during + * diagnostic error paths as well as when printing statistics. + * + * PUBLIC: int __lock_dump_locker + * PUBLIC: __P((ENV *, DB_MSGBUF *, DB_LOCKTAB *, DB_LOCKER *)); + */ int -__lock_stat_pp(dbenv, statp, flags) - DB_ENV *dbenv; - DB_LOCK_STAT **statp; - u_int32_t flags; +__lock_dump_locker(env, mbp, lt, lip) + ENV *env; + DB_MSGBUF *mbp; + DB_LOCKTAB *lt; + DB_LOCKER *lip; { - COMPQUIET(statp, NULL); - COMPQUIET(flags, 0); + DB_LOCKREGION *lrp; + struct __db_lock *lp; + char buf[DB_THREADID_STRLEN]; + u_int32_t ndx; - return (__db_stat_not_built(dbenv->env)); -} + lrp = lt->reginfo.primary; -int -__lock_stat_print_pp(dbenv, flags) - DB_ENV *dbenv; - u_int32_t flags; -{ - COMPQUIET(flags, 0); + __db_msgadd(env, + mbp, "%8lx dd=%2ld locks held %-4d write locks %-4d pid/thread %s", + (u_long)lip->id, (long)lip->dd_id, lip->nlocks, lip->nwrites, + env->dbenv->thread_id_string(env->dbenv, lip->pid, lip->tid, buf)); + __db_msgadd(env, mbp, + " flags %-4x priority %-10u", lip->flags, lip->priority); - return (__db_stat_not_built(dbenv->env)); -} + if (timespecisset(&lip->tx_expire)) { +#ifdef HAVE_STRFTIME + time_t t = (time_t)lip->tx_expire.tv_sec; + char tbuf[64]; + if (strftime(tbuf, sizeof(tbuf), + "%m-%d-%H:%M:%S", localtime(&t)) != 0) + __db_msgadd(env, mbp, "expires %s.%09lu", + tbuf, (u_long)lip->tx_expire.tv_nsec); + else #endif + __db_msgadd(env, mbp, "expires %lu.%09lu", + (u_long)lip->tx_expire.tv_sec, + (u_long)lip->tx_expire.tv_nsec); + } + if (F_ISSET(lip, DB_LOCKER_TIMEOUT)) + __db_msgadd( + env, mbp, " lk timeout %lu", (u_long)lip->lk_timeout); + if (timespecisset(&lip->lk_expire)) { +#ifdef HAVE_STRFTIME + time_t t = (time_t)lip->lk_expire.tv_sec; + char tbuf[64]; + if (strftime(tbuf, + sizeof(tbuf), "%m-%d-%H:%M:%S", localtime(&t)) != 0) + 
__db_msgadd(env, mbp, " lk expires %s.%09lu", + tbuf, (u_long)lip->lk_expire.tv_nsec); + else +#endif + __db_msgadd(env, mbp, " lk expires %lu.%09lu", + (u_long)lip->lk_expire.tv_sec, + (u_long)lip->lk_expire.tv_nsec); + } + DB_MSGBUF_FLUSH(env, mbp); + + /* We need some care here since the list may change while we look. */ +retry: SH_LIST_FOREACH(lp, &lip->heldby, locker_links, __db_lock) { + if (!SH_LIST_EMPTY(&lip->heldby) && lp != NULL) { + ndx = lp->indx; + OBJECT_LOCK_NDX(lt, lrp, ndx); + if (lp->indx == ndx) + __lock_printlock(lt, mbp, lp, 1); + else { + OBJECT_UNLOCK(lt, lrp, ndx); + goto retry; + } + OBJECT_UNLOCK(lt, lrp, ndx); + } + } + return (0); +} diff --git a/src/lock/lock_stub.c b/src/lock/lock_stub.c index 3875af55..a916c6df 100644 --- a/src/lock/lock_stub.c +++ b/src/lock/lock_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -359,6 +359,7 @@ size_t __lock_region_max(env) ENV *env; { + COMPQUIET(env, NULL); return (0); } @@ -367,6 +368,7 @@ __lock_region_size(env, other_alloc) ENV *env; size_t other_alloc; { + COMPQUIET(env, NULL); COMPQUIET(other_alloc, 0); return (0); } @@ -584,6 +586,7 @@ __lock_list_print(env, mbp, list) DBT *list; { COMPQUIET(env, NULL); + COMPQUIET(mbp, NULL); COMPQUIET(list, NULL); } @@ -625,7 +628,7 @@ __lock_change(env, old_lock, new_lock) ENV *env; DB_LOCK *old_lock, *new_lock; { - COMPQUIET(env, NULL); COMPQUIET(old_lock, NULL); COMPQUIET(new_lock, NULL); + return (__db_nolocking(env)); } diff --git a/src/lock/lock_timer.c b/src/lock/lock_timer.c index 943047f0..9744438a 100644 --- a/src/lock/lock_timer.c +++ b/src/lock/lock_timer.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/lock/lock_util.c b/src/lock/lock_util.c index f7029cd7..07fdce72 100644 --- a/src/lock/lock_util.c +++ b/src/lock/lock_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/log/log.c b/src/log/log.c index 5808145f..9bef8d69 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -32,6 +32,7 @@ __log_open(env) DB_ENV *dbenv; DB_LOG *dblp; LOG *lp; + u_int32_t log_flags; u_int8_t *bulk; int region_locked, ret; @@ -130,47 +131,59 @@ __log_open(env) } } else { /* - * A process joining the region may have reset the log file - * size, too. If so, it only affects the next log file we - * create. We need to check that the size is reasonable given - * the buffer size in the region. + * The log file size and DB_LOG_AUTO_REMOVE will be ignored + * when joining the environment, so print a warning if either + * was set. 
*/ - LOG_SYSTEM_LOCK(env); - region_locked = 1; - - if (dbenv->lg_size != 0) { - if ((ret = - __log_check_sizes(env, dbenv->lg_size, 0)) != 0) - goto err; - - lp->log_nsize = dbenv->lg_size; - } - - LOG_SYSTEM_UNLOCK(env); - region_locked = 0; - - if (dbenv->lg_flags != 0 && (ret = - __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0) + if (dbenv->lg_size != 0 && lp->log_nsize != dbenv->lg_size) + __db_msg(env, DB_STR("2585", +"Warning: Ignoring maximum log file size when joining the environment")); + + log_flags = dbenv->lg_flags & ~DB_LOG_AUTO_REMOVE; + if ((dbenv->lg_flags & DB_LOG_AUTO_REMOVE) && + lp->db_log_autoremove == 0) + __db_msg(env, DB_STR("2586", +"Warning: Ignoring DB_LOG_AUTO_REMOVE when joining the environment.")); + if (log_flags != 0 && (ret = + __log_set_config_int(dbenv, log_flags, 1, 0)) != 0) return (ret); } dblp->reginfo.mtx_alloc = lp->mtx_region; return (0); -err: if (dblp->reginfo.addr != NULL) { - if (region_locked) - LOG_SYSTEM_UNLOCK(env); - (void)__env_region_detach(env, &dblp->reginfo, 0); - } - env->lg_handle = NULL; - +err: if (region_locked) + LOG_SYSTEM_UNLOCK(env); (void)__mutex_free(env, &dblp->mtx_dbreg); - __os_free(env, dblp); + (void)__log_region_detach(env, dblp); return (ret); } /* + * __log_region_detach -- + * + * PUBLIC: int __log_region_detach __P((ENV *, DB_LOG *)); + */ +int +__log_region_detach(env, dblp) + ENV *env; + DB_LOG *dblp; +{ + int ret; + + ret = 0; + if (dblp != NULL) { + if (dblp->reginfo.addr != NULL) + ret = __env_region_detach(env, &dblp->reginfo, 0); + /* Discard DB_LOG. */ + __os_free(env, dblp); + env->lg_handle = NULL; + } + return (ret); +} + +/* * __log_init -- * Initialize a log region in shared memory. 
*/ @@ -638,7 +651,6 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp) recsize = sizeof(LOGP); if (CRYPTO_ON(env)) { hdrsize = HDR_CRYPTO_SZ; - recsize = sizeof(LOGP); recsize += db_cipher->adj_size(recsize); is_hmac = 1; } @@ -700,7 +712,7 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp) * we can only detect that by having an unreasonable * data length for our persistent data. */ - if ((hdr->len - hdrsize) != sizeof(LOGP)) { + if ((hdr->len - hdrsize) != recsize) { __db_errx(env, "log record size mismatch"); goto err; } @@ -722,10 +734,10 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp) hdr->len - hdrsize, is_hmac)) != 0) goto bad_checksum; /* - * The checksum verifies without the header. Make note - * of that, because it is only acceptable when the log - * version < DB_LOGCHKSUM. Later, when we determine log - * version, we will confirm this. + * The checksum verifies without the header. Make note + * of that, because it is only acceptable when the log + * version < DB_LOGCHKSUM. Later, when we determine log + * version, we will confirm this. */ chksum_includes_hdr = 0; } @@ -800,7 +812,7 @@ __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp) /* * We might have to declare a checksum failure here, if: * - the checksum verified only by ignoring the header, and - * - the log version indicates that the header should have + * - the log version indicates that the header should have * been included. */ if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM) @@ -899,66 +911,69 @@ __log_env_refresh(env) /* * After we close the files, check for any unlogged closes left in * the shared memory queue. If we find any, try to log it, otherwise - * return the error. We cannot say the environment was closed - * cleanly. + * return the error; we cannot say the environment was closed cleanly. + * This does not use the typical MUTEX_LOCK(), but MUTEX_LOCK_RET(). 
The + * normal function would immediately return DB_RUNRECOVERY if we are + * closing the env down during a panic. By using MUTEX_LOCK_RET(), we + * continue with the rest of the cleanup. */ - MUTEX_LOCK(env, lp->mtx_filelist); - SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) - if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) && - (t_ret = __dbreg_close_id_int( - env, fnp, DBREG_CLOSE, 1)) != 0) - ret = t_ret; - MUTEX_UNLOCK(env, lp->mtx_filelist); - + if (MUTEX_LOCK_RET(env, lp->mtx_filelist) == 0) { + SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname) + if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) && + (t_ret = __dbreg_close_id_int( + env, fnp, DBREG_CLOSE, 1)) != 0) + ret = t_ret; + MUTEX_UNLOCK(env, lp->mtx_filelist); + } /* - * If a private region, return the memory to the heap. Not needed for - * filesystem-backed or system shared memory regions, that memory isn't - * owned by any particular process. + * If a private region, return the memory to the heap. Not + * needed for filesystem-backed or system shared memory regions, + * that memory isn't owned by any particular process. */ if (F_ISSET(env, ENV_PRIVATE)) { - reginfo->mtx_alloc = MUTEX_INVALID; - /* Discard the flush mutex. */ - if ((t_ret = - __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0) - ret = t_ret; - - /* Discard the buffer. */ - __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off)); - - /* Discard stack of free file IDs. */ - if (lp->free_fid_stack != INVALID_ROFF) - __env_alloc_free(reginfo, - R_ADDR(reginfo, lp->free_fid_stack)); - - /* Discard the list of in-memory log file markers. 
*/ - while ((filestart = SH_TAILQ_FIRST(&lp->logfiles, - __db_filestart)) != NULL) { - SH_TAILQ_REMOVE(&lp->logfiles, filestart, links, - __db_filestart); - __env_alloc_free(reginfo, filestart); - } - - while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles, - __db_filestart)) != NULL) { - SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links, - __db_filestart); - __env_alloc_free(reginfo, filestart); - } - - /* Discard commit queue elements. */ - while ((commit = SH_TAILQ_FIRST(&lp->free_commits, - __db_commit)) != NULL) { - SH_TAILQ_REMOVE(&lp->free_commits, commit, links, - __db_commit); - __env_alloc_free(reginfo, commit); - } - - /* Discard replication bulk buffer. */ - if (lp->bulk_buf != INVALID_ROFF) { - __env_alloc_free(reginfo, - R_ADDR(reginfo, lp->bulk_buf)); - lp->bulk_buf = INVALID_ROFF; - } + reginfo->mtx_alloc = MUTEX_INVALID; + /* Discard the flush mutex. */ + if ((t_ret = + __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the log buffer. */ + __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off)); + + /* Discard stack of free file IDs. */ + if (lp->free_fid_stack != INVALID_ROFF) + __env_alloc_free(reginfo, + R_ADDR(reginfo, lp->free_fid_stack)); + + /* Discard the list of in-memory log file markers. */ + while ((filestart = SH_TAILQ_FIRST(&lp->logfiles, + __db_filestart)) != NULL) { + SH_TAILQ_REMOVE(&lp->logfiles, filestart, links, + __db_filestart); + __env_alloc_free(reginfo, filestart); + } + + while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles, + __db_filestart)) != NULL) { + SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links, + __db_filestart); + __env_alloc_free(reginfo, filestart); + } + + /* Discard commit queue elements. */ + while ((commit = SH_TAILQ_FIRST(&lp->free_commits, + __db_commit)) != NULL) { + SH_TAILQ_REMOVE(&lp->free_commits, commit, links, + __db_commit); + __env_alloc_free(reginfo, commit); + } + + /* Discard replication bulk buffer. 
*/ + if (lp->bulk_buf != INVALID_ROFF) { + __env_alloc_free(reginfo, + R_ADDR(reginfo, lp->bulk_buf)); + lp->bulk_buf = INVALID_ROFF; + } } /* Discard the per-thread DBREG mutex. */ @@ -1394,7 +1409,7 @@ __log_inmem_lsnoff(dblp, lsnp, offsetp) return (0); } - return (DB_NOTFOUND); + return (USR_ERR(dblp->env, DB_NOTFOUND)); } /* diff --git a/src/log/log_archive.c b/src/log/log_archive.c index 280a2071..fb98e10b 100644 --- a/src/log/log_archive.c +++ b/src/log/log_archive.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -304,7 +304,7 @@ __log_get_stable_lsn(env, stable_lsn, group_wide) * so that the caller knows it may be done. */ if (IS_ZERO_LSN(*stable_lsn)) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } } else if ((ret = __txn_getckp(env, stable_lsn)) != 0) diff --git a/src/log/log_compare.c b/src/log/log_compare.c index 97b59338..9bd28854 100644 --- a/src/log/log_compare.c +++ b/src/log/log_compare.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/log/log_debug.c b/src/log/log_debug.c index 32fb2542..d8f10798 100644 --- a/src/log/log_debug.c +++ b/src/log/log_debug.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/log/log_get.c b/src/log/log_get.c index db30c969..332dab8e 100644 --- a/src/log/log_get.c +++ b/src/log/log_get.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -472,7 +472,7 @@ nextrec: /* If at start-of-file, move to the previous file. */ if (nlsn.offset == 0) { if (nlsn.file == 1) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } if ((!lp->db_log_inmemory && @@ -480,7 +480,7 @@ nextrec: 0, &status, NULL) != 0 || (status != DB_LV_NORMAL && status != DB_LV_OLD_READABLE)))) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } @@ -607,7 +607,7 @@ nohdr: switch (flags) { if (eof && logc->bp_lsn.file != nlsn.file) __db_errx(env, DB_STR_A("2583", "Log file %d not found, check log directory configuration", "%d"), - nlsn.file); + nlsn.file); else __db_errx(env, DB_STR("2576", "Encountered zero length records while traversing backwards")); @@ -624,7 +624,7 @@ nohdr: switch (flags) { /* FALLTHROUGH */ case DB_SET: default: - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } } @@ -830,7 +830,7 @@ __logc_incursor(logc, lsn, hdr, pp) if (LOG_SWAPPED(env)) __log_hdrswap(hdr, CRYPTO_ON(env)); if (__logc_hdrchk(logc, lsn, hdr, &eof)) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); if (eof || logc->bp_lsn.offset + logc->bp_rlen < lsn->offset + hdr->len) return (0); @@ -914,7 +914,7 @@ __logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump) if (IS_ZERO_LSN(lp->lsn)) return (0); if (LOG_COMPARE(lsn, &lp->lsn) >= 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); else if (lp->db_log_inmemory) { if ((ret = __log_inmem_lsnoff(dblp, lsn, &b_region)) != 0) return (ret); @@ -949,14 +949,14 @@ __logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump) if (LOG_SWAPPED(env)) __log_hdrswap(hdr, CRYPTO_ON(env)); if (__logc_hdrchk(logc, lsn, hdr, &eof) != 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); if (eof) return (0); if (lp->db_log_inmemory) { if 
(RINGBUF_LEN(lp, b_region, lp->b_off) < hdr->len) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); } else if (lsn->offset + hdr->len > lp->w_off + lp->buffer_size) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); if (logc->bp_size <= hdr->len) { len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128); if ((ret = @@ -1535,6 +1535,10 @@ __log_read_record(env, dbpp, td, recbuf, spec, size, argpp) LOGCOPY_32(env, ap + sp->offset, bp); bp += sizeof(uinttmp); break; + case LOGREC_LONGARG: + LOGCOPY_64(env, ap + sp->offset, bp); + bp += sizeof(u_int64_t); + break; case LOGREC_OP: LOGCOPY_32(env, &op, bp); *(u_int32_t *)(ap + sp->offset) = op; diff --git a/src/log/log_method.c b/src/log/log_method.c index d5aec116..09fbe863 100644 --- a/src/log/log_method.c +++ b/src/log/log_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -342,6 +342,10 @@ __log_get_flags(dbenv, flagsp) LF_SET(DB_LOG_IN_MEMORY); else LF_CLR(DB_LOG_IN_MEMORY); + if (lp->nosync) + LF_SET(DB_LOG_NOSYNC); + else + LF_CLR(DB_LOG_NOSYNC); *flagsp = flags; } @@ -369,6 +373,8 @@ __log_set_flags(env, flags, on) lp->db_log_autoremove = on ? 1 : 0; if (LF_ISSET(DB_LOG_IN_MEMORY)) lp->db_log_inmemory = on ? 1 : 0; + if (LF_ISSET(DB_LOG_NOSYNC)) + lp->nosync = on ? 
1 : 0; } /* @@ -377,13 +383,15 @@ __log_set_flags(env, flags, on) */ #undef OK_FLAGS #define OK_FLAGS \ - (DB_LOG_AUTO_REMOVE | DB_LOG_DIRECT | \ - DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_ZERO) + (DB_LOG_AUTO_REMOVE | DB_LOG_BLOB | DB_LOG_DIRECT | \ + DB_LOG_DSYNC | DB_LOG_IN_MEMORY | DB_LOG_NOSYNC | DB_LOG_ZERO) static const FLAG_MAP LogMap[] = { { DB_LOG_AUTO_REMOVE, DBLOG_AUTOREMOVE}, + { DB_LOG_BLOB, DBLOG_BLOB}, { DB_LOG_DIRECT, DBLOG_DIRECT}, { DB_LOG_DSYNC, DBLOG_DSYNC}, { DB_LOG_IN_MEMORY, DBLOG_INMEMORY}, + { DB_LOG_NOSYNC, DBLOG_NOSYNC}, { DB_LOG_ZERO, DBLOG_ZERO} }; /* @@ -406,10 +414,14 @@ __log_get_config(dbenv, which, onp) if (FLD_ISSET(which, ~OK_FLAGS)) return (__db_ferr(env, "DB_ENV->log_get_config", 0)); dblp = env->lg_handle; - ENV_REQUIRES_CONFIG(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG); + ENV_NOT_CONFIGURED(env, dblp, "DB_ENV->log_get_config", DB_INIT_LOG); + + if (LOGGING_ON(env)) { + __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags); + __log_get_flags(dbenv, &flags); + } else + flags = dbenv->lg_flags; - __env_fetch_flags(LogMap, sizeof(LogMap), &dblp->flags, &flags); - __log_get_flags(dbenv, &flags); if (LF_ISSET(which)) *onp = 1; else @@ -459,6 +471,17 @@ __log_set_config_int(dbenv, flags, on, in_open) "DB_ENV->log_set_config: direct I/O either not configured or not supported"); return (EINVAL); } + if (REP_ON(env) && LF_ISSET(DB_LOG_BLOB) && !on) { + __db_errx(env, +"DB_ENV->log_set_config: DB_LOG_BLOB must be enabled with replication."); + return (EINVAL); + } + if (FLD_ISSET(flags, DB_LOG_IN_MEMORY) && on > 0 && + PREFMAS_IS_SET(env)) { + __db_errx(env, DB_STR("2587", "DB_LOG_IN_MEMORY is not " + "supported in Replication Manager preferred master mode")); + return (EINVAL); + } if (LOGGING_ON(env)) { if (!in_open && LF_ISSET(DB_LOG_IN_MEMORY) && diff --git a/src/log/log_print.c b/src/log/log_print.c index d2cda519..e5c920b6 100644 --- a/src/log/log_print.c +++ b/src/log/log_print.c @@ -1,7 +1,7 @@ /*- * See the 
file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -37,6 +37,7 @@ __log_print_record(env, recbuf, lsnp, name, spec, info) LOG *lp; PAGE *hdrstart, *hdrtmp; int32_t inttmp; + u_int64_t ulltmp; u_int32_t hdrsize, op, uinttmp; u_int32_t type, txnid; u_int8_t *bp, *datatmp; @@ -150,6 +151,14 @@ __log_print_record(env, recbuf, lsnp, name, spec, info) __db_msgadd(env, &msgbuf, "\n"); bp += sizeof(uinttmp); break; + case LOGREC_LONGARG: + LOGCOPY_64(env, &ulltmp, bp); + __db_msgadd(env, &msgbuf, "\t%s: ", sp->name); + __db_msgadd(env, + &msgbuf, "%llu", (unsigned long long)ulltmp); + __db_msgadd(env, &msgbuf, "\n"); + bp += sizeof(ulltmp); + break; case LOGREC_TIME: /* time_t is long but we only store 32 bits. */ LOGCOPY_32(env, &uinttmp, bp); diff --git a/src/log/log_put.c b/src/log/log_put.c index 8f7e23d8..4d6c3d2f 100644 --- a/src/log/log_put.c +++ b/src/log/log_put.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -280,8 +280,7 @@ __log_put(env, lsnp, udbt, flags) * If the send fails and we're a commit or checkpoint, * there's nothing we can do; the record's in the log. * Flush it, even if we're running with TXN_NOSYNC, - * on the grounds that it should be in durable - * form somewhere. + * on the grounds that it should be in durable form somewhere. 
*/ if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM)) LF_SET(DB_FLUSH); @@ -473,12 +472,12 @@ __log_put_next(env, lsn, dbt, hdr, old_lsnp) */ if (adv_file || lp->lsn.offset == 0 || lp->lsn.offset + hdr->size + dbt->size > lp->log_size) { - if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) { + if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_nsize) { __db_errx(env, DB_STR_A("2513", "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)", "%lu %lu"), (u_long)hdr->size + sizeof(LOGP) + dbt->size, - (u_long)lp->log_size); + (u_long)lp->log_nsize); return (EINVAL); } @@ -561,7 +560,12 @@ __log_flush_commit(env, lsnp, flags) "Write failed on MASTER commit.")); return (__env_panic(env, ret)); } - + /* + * If this is a panic don't attempt to abort just this transaction; + * it may trip over the panic, and the whole env needs to go anyway. + */ + if (ret == DB_RUNRECOVERY) + return (__env_panic(env, ret)); /* * Else, make sure that the commit record does not get out after we * abort the transaction. Do this by overwriting the commit record @@ -735,7 +739,7 @@ __log_newfile(dblp, lsnp, logfile, version) __log_persistswap(tpersist); if ((ret = - __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0) + __log_encrypt_record(env, &t, &hdr, (u_int32_t)sizeof(LOGP))) != 0) goto err; if ((ret = __log_putr(dblp, &lsn, @@ -1118,12 +1122,15 @@ flush: MUTEX_LOCK(env, lp->mtx_flush); LOG_SYSTEM_UNLOCK(env); /* Sync all writes to disk. 
*/ - if ((ret = __os_fsync(env, dblp->lfhp)) != 0) { - MUTEX_UNLOCK(env, lp->mtx_flush); - if (release) - LOG_SYSTEM_LOCK(env); - lp->in_flush--; - goto done; + if (!lp->nosync) { + if ((ret = __os_fsync(env, dblp->lfhp)) != 0) { + MUTEX_UNLOCK(env, lp->mtx_flush); + if (release) + LOG_SYSTEM_LOCK(env); + lp->in_flush--; + goto done; + } + STAT(++lp->stat.st_scount); } /* @@ -1143,7 +1150,6 @@ flush: MUTEX_LOCK(env, lp->mtx_flush); LOG_SYSTEM_LOCK(env); lp->in_flush--; - STAT(++lp->stat.st_scount); /* * How many flush calls (usually commits) did this call actually sync? @@ -1440,7 +1446,7 @@ __log_newfh(dblp, create) "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file); else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE && status != DB_LV_OLD_READABLE) - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); return (ret); } @@ -1621,6 +1627,37 @@ err: return (ret); } +/* + * __log_rep_write -- + * Way for replication clients to write the log buffer for the + * DB_TXN_WRITE_NOSYNC option. This is just a thin PUBLIC wrapper + * for __log_write that is similar to __log_flush_commit. + * + * Note that the REP->mtx_clientdb should be held when this is called. + * Note that we acquire the log region mutex while holding mtx_clientdb. 
+ * + * PUBLIC: int __log_rep_write __P((ENV *)); + */ +int +__log_rep_write(env) + ENV *env; +{ + DB_LOG *dblp; + LOG *lp; + int ret; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + LOG_SYSTEM_LOCK(env); + if (!lp->db_log_inmemory && lp->b_off != 0) + if ((ret = __log_write(dblp, dblp->bufp, + (u_int32_t)lp->b_off)) == 0) + lp->b_off = 0; + LOG_SYSTEM_UNLOCK(env); + return (ret); +} + static int __log_encrypt_record(env, dbt, hdr, orig) ENV *env; @@ -1773,6 +1810,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp, DB_TXNLOGREC *lr; LOG *lp; PAGE *pghdrstart; + u_int64_t ulltmp; u_int32_t hdrsize, op, zero, uinttmp, txn_num; u_int npad; u_int8_t *bp; @@ -1819,7 +1857,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp, return (ret); /* * We need to assign begin_lsn while holding region mutex. - * That assignment is done inside the DbEnv->log_put call, + * That assignment is done inside the __log_put call, * so pass in the appropriate memory location to be filled * in by the log_put code. */ @@ -1842,8 +1880,7 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp, } if (is_durable || txnp == NULL) { - if ((ret = - __os_malloc(env, logrec.size, &logrec.data)) != 0) + if ((ret = __os_malloc(env, logrec.size, &logrec.data)) != 0) return (ret); } else { if ((ret = __os_malloc(env, @@ -1891,10 +1928,15 @@ __log_put_record_int(env, dbp, txnp, ret_lsnp, LOGCOPY_32(env, bp, &uinttmp); bp += sizeof(uinttmp); break; + case LOGREC_LONGARG: + ulltmp = va_arg(argp, u_int64_t); + LOGCOPY_64(env, bp, &ulltmp); + bp += sizeof(ulltmp); + break; case LOGREC_OP: op = va_arg(argp, u_int32_t); LOGCOPY_32(env, bp, &op); - bp += sizeof(uinttmp); + bp += sizeof(op); break; case LOGREC_DBT: case LOGREC_PGLIST: diff --git a/src/log/log_stat.c b/src/log/log_stat.c index 37b74c74..95fe0e2e 100644 --- a/src/log/log_stat.c +++ b/src/log/log_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/log/log_verify.c b/src/log/log_verify.c index e7f8f688..2ed2f0f2 100644 --- a/src/log/log_verify.c +++ b/src/log/log_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -38,6 +38,12 @@ __log_verify_pp(dbenv, lvconfig) lsnrg = ret = timerg = 0; phome = NULL; + if (lvconfig == NULL) { + __db_errx(dbenv->env, DB_STR("2584", + "Must provide a configuration structure.")); + ret = EINVAL; + goto err; + } if (!IS_ZERO_LSN(lvconfig->start_lsn) || !IS_ZERO_LSN(lvconfig->end_lsn)) lsnrg = 1; @@ -64,7 +70,8 @@ __log_verify_pp(dbenv, lvconfig) } ENV_ENTER(dbenv->env, ip); - ret = __log_verify(dbenv, lvconfig, ip); + REPLICATION_WRAP(dbenv->env, + (__log_verify(dbenv, lvconfig, ip)), 0, ret); ENV_LEAVE(dbenv->env, ip); err: return (ret); } @@ -79,18 +86,16 @@ __log_verify(dbenv, lvconfig, ip) const DB_LOG_VERIFY_CONFIG *lvconfig; DB_THREAD_INFO *ip; { - - u_int32_t logcflag, max_fileno; + DB_LOG_VRFY_INFO *logvrfy_hdl; DB_LOGC *logc; - ENV *env; - DBT data; DB_DISTAB dtab; DB_LSN key, start, start2, stop, stop2, verslsn; - u_int32_t newversion, version; + DBT data; + ENV *env; + u_int32_t logcflag, max_fileno, newversion, version; int cmp, fwdscroll, goprev, ret, tret; time_t starttime, endtime; const char *okmsg; - DB_LOG_VRFY_INFO *logvrfy_hdl; okmsg = NULL; fwdscroll = 1; @@ -98,6 +103,7 @@ __log_verify(dbenv, lvconfig, ip) goprev = 0; env = dbenv->env; logc = NULL; + logvrfy_hdl = NULL; memset(&dtab, 0, sizeof(dtab)); memset(&data, 0, sizeof(data)); version = newversion = 0; @@ -333,11 +339,12 @@ out: err: if (logc != NULL) (void)__logc_close(logc); - if ((tret = 
__destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0) + if (logvrfy_hdl != NULL && + (tret = __destroy_log_vrfy_info(logvrfy_hdl)) != 0 && ret == 0) ret = tret; - if (dtab.int_dispatch) + if (dtab.int_dispatch != NULL) __os_free(dbenv->env, dtab.int_dispatch); - if (dtab.ext_dispatch) + if (dtab.ext_dispatch != NULL) __os_free(dbenv->env, dtab.ext_dispatch); return (ret); diff --git a/src/log/log_verify_auto.c b/src/log/log_verify_auto.c index 08bc5d64..de08998d 100644 --- a/src/log/log_verify_auto.c +++ b/src/log/log_verify_auto.c @@ -174,6 +174,9 @@ __fop_init_verify(env, dtabp) __fop_write_verify, DB___fop_write)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, + __fop_write_file_verify, DB___fop_write_file)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, __fop_rename_verify, DB___fop_rename)) != 0) return (ret); if ((ret = __db_add_recovery_int(env, dtabp, diff --git a/src/log/log_verify_int.c b/src/log/log_verify_int.c index abe564c6..f69f01c0 100644 --- a/src/log/log_verify_int.c +++ b/src/log/log_verify_int.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -593,7 +593,7 @@ __crdel_metasub_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -628,7 +628,7 @@ __crdel_inmem_create_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0) @@ -661,7 +661,7 @@ __crdel_inmem_rename_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0) @@ -694,7 +694,7 @@ __crdel_inmem_remove_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0) @@ -727,7 +727,7 @@ __db_addrem_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -762,7 +762,7 @@ __db_big_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -797,7 +797,7 @@ __db_ovref_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -832,7 +832,7 @@ __db_relink_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -864,7 +864,7 
@@ __db_debug_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0) @@ -897,7 +897,7 @@ __db_noop_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -931,7 +931,7 @@ __db_pg_alloc_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -963,7 +963,7 @@ __db_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -998,7 +998,7 @@ __db_pg_free_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1030,7 +1030,7 @@ __db_pg_free_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1065,7 +1065,7 @@ __db_cksum_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0) @@ -1098,7 +1098,7 @@ __db_pg_freedata_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1130,7 +1130,7 @@ __db_pg_freedata_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = 
DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1165,7 +1165,7 @@ __db_pg_init_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1200,7 +1200,7 @@ __db_pg_sort_44_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1232,7 +1232,7 @@ __db_pg_trunc_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1264,7 +1264,7 @@ __db_realloc_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1299,7 +1299,7 @@ __db_relink_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1334,7 +1334,7 @@ __db_merge_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1369,7 +1369,7 @@ __db_pgno_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1515,7 +1515,7 @@ __dbreg_register_verify(env, dbtp, lsnp, notused2, lvhp) opcode = 0; ret = ret2 = rmv_dblife = 0; puid = NULL; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; fregp = NULL; pflife = NULL; @@ -1749,6 +1749,36 @@ err: } /* + * PUBLIC: int 
__dbreg_register_42_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__dbreg_register_42_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __dbreg_register_42_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __dbreg_register_42_read(env, dbtp->data, &argp)) != 0) + goto err; + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + /* LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); */ + +err: + __os_free(env, argp); + return (ret); +} + +/* * PUBLIC: int __bam_split_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -1764,7 +1794,7 @@ __bam_split_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1804,7 +1834,7 @@ __bam_split_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1836,7 +1866,7 @@ __bam_rsplit_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1873,7 +1903,7 @@ __bam_adj_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1910,7 +1940,7 @@ __bam_irep_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1947,7 +1977,7 @@ __bam_cadjust_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + 
COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -1984,7 +2014,7 @@ __bam_cdel_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2021,7 +2051,7 @@ __bam_repl_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2058,7 +2088,7 @@ __bam_root_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2093,7 +2123,7 @@ __bam_curadj_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2129,7 +2159,7 @@ __bam_rcuradj_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2165,7 +2195,7 @@ __bam_relink_43_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2197,7 +2227,7 @@ __bam_merge_44_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2229,7 +2259,7 @@ __fop_create_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_create_42_read(env, dbtp->data, &argp)) != 0) @@ -2245,6 +2275,37 @@ err: } /* + * PUBLIC: int 
__fop_create_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_create_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_create_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_create_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */ +err: + + __os_free(env, argp); + + return (ret); +} + +/* * PUBLIC: int __fop_create_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2260,7 +2321,7 @@ __fop_create_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_create_read(env, dbtp->data, &argp)) != 0) @@ -2278,6 +2339,38 @@ err: } /* + * PUBLIC: int __fop_remove_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_remove_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_remove_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_remove_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + //LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); + +err: + + __os_free(env, argp); + + return (ret); +} + +/* * PUBLIC: int __fop_remove_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2293,7 +2386,7 @@ __fop_remove_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = 
__fop_remove_read(env, dbtp->data, &argp)) != 0) @@ -2326,7 +2419,7 @@ __fop_write_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_write_42_read(env, dbtp->data, &argp)) != 0) @@ -2341,6 +2434,36 @@ err: } /* + * PUBLIC: int __fop_write_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_write_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_write_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */ +err: + + __os_free(env, argp); + return (ret); +} + +/* * PUBLIC: int __fop_write_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2356,7 +2479,7 @@ __fop_write_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_write_read(env, dbtp->data, &argp)) != 0) @@ -2373,6 +2496,67 @@ err: } /* + * PUBLIC: int __fop_write_file_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_file_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_write_file_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_write_file_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + /*LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID);*/ +err: + 
__os_free(env, argp); + return (ret); +} + +/* + * PUBLIC: int __fop_write_file_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_write_file_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_write_file_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_write_file_read(env, dbtp->data, &argp)) != 0) + return (ret); + + LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); + ON_PAGE_UPDATE4 /* No pages are locked by txns. */ +out: + +err: + + __os_free(env, argp); + return (ret); +} + +/* * PUBLIC: int __fop_rename_42_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2388,7 +2572,7 @@ __fop_rename_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_rename_42_read(env, dbtp->data, &argp)) != 0) @@ -2404,6 +2588,37 @@ err: } /* + * PUBLIC: int __fop_rename_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_rename_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_rename_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_rename_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + /* LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); */ +err: + + __os_free(env, argp); + + return (ret); +} + +/* * PUBLIC: int __fop_rename_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2423,7 +2638,7 @@ __fop_rename_verify(env, dbtp, lsnp, notused2, lvhp) VRFY_FILEREG_INFO freg, *fregp; memset(&freg, 0, sizeof(freg)); - notused2 = 
DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; buf = NULL; @@ -2470,6 +2685,38 @@ err: } /* + * PUBLIC: int __fop_file_remove_60_verify __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__fop_file_remove_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __fop_file_remove_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = __fop_file_remove_60_read(env, dbtp->data, &argp)) != 0) + return (ret); + + ON_NOT_SUPPORTED(env, lvh, *lsnp, argp->type); + //LOG_VRFY_PROC(lvh, *lsnp, argp, INVAL_DBREGID); + +err: + + __os_free(env, argp); + + return (ret); +} + +/* * PUBLIC: int __fop_file_remove_verify __P((ENV *, DBT *, DB_LSN *, * PUBLIC: db_recops, void *)); */ @@ -2485,7 +2732,7 @@ __fop_file_remove_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __fop_file_remove_read(env, dbtp->data, &argp)) != 0) @@ -2519,7 +2766,7 @@ __ham_insdel_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2555,7 +2802,7 @@ __ham_newpage_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2592,7 +2839,7 @@ __ham_splitdata_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2629,7 +2876,7 @@ __ham_replace_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + 
COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2667,7 +2914,7 @@ __ham_copypage_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2703,7 +2950,7 @@ __ham_metagroup_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2735,7 +2982,7 @@ __ham_metagroup_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2771,7 +3018,7 @@ __ham_groupalloc_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2807,7 +3054,7 @@ __ham_groupalloc_verify(env, dbtp, lsnp, notused2, lvhp) ret = 0; pflife = NULL; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2863,7 +3110,7 @@ __ham_changeslot_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2900,7 +3147,7 @@ __ham_contract_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2936,7 +3183,7 @@ __ham_curadj_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -2973,7 +3220,7 @@ __ham_chgpg_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int 
ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3011,7 +3258,7 @@ __heap_addrem_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3030,6 +3277,40 @@ err: } /* + * PUBLIC: int __heap_addrem_60_verify + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__heap_addrem_60_verify(env, dbtp, lsnp, notused2, lvhp) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *lvhp; +{ + __heap_addrem_60_args *argp; + DB_LOG_VRFY_INFO *lvh; + int ret; + + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); + lvh = (DB_LOG_VRFY_INFO *)lvhp; + + if ((ret = + __heap_addrem_60_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + + LOG_VRFY_PROC(lvh, *lsnp, argp, argp->fileid); + ON_PAGE_UPDATE(lvh, *lsnp, argp, argp->pgno); + if ((ret = __lv_on_heap_log(lvh, *lsnp, argp->fileid)) != 0) + goto err; +out: + +err: + __os_free(env, argp); + return (ret); +} + +/* * PUBLIC: int __heap_pg_alloc_verify * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); */ @@ -3045,7 +3326,7 @@ __heap_pg_alloc_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3060,7 +3341,7 @@ out: err: __os_free(env, argp); - return (ret); + return (ret); } /* @@ -3079,7 +3360,7 @@ __heap_trunc_meta_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3095,7 +3376,7 @@ out: err: __os_free(env, argp); - return (ret); + return (ret); } /* @@ -3114,7 +3395,7 @@ __heap_trunc_page_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + 
COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3150,7 +3431,7 @@ __qam_incfirst_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3186,7 +3467,7 @@ __qam_mvptr_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3222,7 +3503,7 @@ __qam_del_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3258,7 +3539,7 @@ __qam_add_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3294,7 +3575,7 @@ __qam_delext_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = @@ -3331,7 +3612,7 @@ __txn_regop_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0) @@ -3365,7 +3646,7 @@ __txn_regop_verify(env, dbtp, lsnp, notused2, lvhp) VRFY_TIMESTAMP_INFO tsinfo; ptvi = pptvi = NULL; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; ret = ret2 = started = 0; @@ -3480,7 +3761,7 @@ __txn_ckp_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 
0) @@ -3517,7 +3798,7 @@ __txn_ckp_verify(env, dbtp, lsnp, notused2, lvhp) time_t ckp_time, lastckp_time; lastckp = NULL; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; memset(&ckpinfo, 0, sizeof(ckpinfo)); memset(&cvp, 0, sizeof(cvp)); @@ -3675,7 +3956,7 @@ __txn_child_verify(env, dbtp, lsnp, notused2, lvhp) * we never know the T0 has an active child txn T1, all child txns * we know are committed. */ - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; ptvi = ptvi2 = NULL; ret = ret2 = started = 0; @@ -3811,7 +4092,7 @@ __txn_xa_regop_42_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __txn_xa_regop_42_read(env, dbtp->data, &argp)) != 0) @@ -3844,7 +4125,7 @@ __txn_prepare_verify(env, dbtp, lsnp, notused2, lvhp) ret = ret2 = started = 0; ptvi = NULL; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; if ((ret = __txn_prepare_read(env, dbtp->data, &argp)) != 0) @@ -3924,7 +4205,7 @@ __txn_recycle_verify(env, dbtp, lsnp, notused2, lvhp) DB_LOG_VRFY_INFO *lvh; int ret; - notused2 = DB_TXN_LOG_VERIFY; + COMPQUIET(notused2, DB_TXN_LOG_VERIFY); lvh = (DB_LOG_VRFY_INFO *)lvhp; ret = 0; diff --git a/src/log/log_verify_stub.c b/src/log/log_verify_stub.c index e6589a50..fdd9a795 100644 --- a/src/log/log_verify_stub.c +++ b/src/log/log_verify_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/log/log_verify_util.c b/src/log/log_verify_util.c index 88682921..b0cfe0cb 100644 --- a/src/log/log_verify_util.c +++ b/src/log/log_verify_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -53,16 +53,16 @@ } \ } while (0) -typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *); -typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *); +typedef int (*btcmp_funct)(DB *, const DBT *, const DBT *, size_t *); +typedef int (*dupcmp_funct)(DB *, const DBT *, const DBT *, size_t *); static int __lv_add_recycle_handler __P(( DB_LOG_VRFY_INFO *, VRFY_TXN_INFO *, void *)); static int __lv_add_recycle_lsn __P((VRFY_TXN_INFO *, const DB_LSN *)); static size_t __lv_dbt_arrsz __P((const DBT *, u_int32_t)); -static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *)); -static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *)); -static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *)); +static int __lv_fidpgno_cmp __P((DB *, const DBT *, const DBT *, size_t *)); +static int __lv_i32_cmp __P((DB *, const DBT *, const DBT *, size_t *)); +static int __lv_lsn_cmp __P((DB *, const DBT *, const DBT *, size_t *)); static void __lv_on_bdbop_err __P((int)); static int __lv_open_db __P((DB_ENV *, DB **, DB_THREAD_INFO *, const char *, int, btcmp_funct, u_int32_t, dupcmp_funct)); @@ -73,8 +73,8 @@ static int __lv_seccbk_fname __P((DB *, const DBT *, const DBT *, DBT *)); static int __lv_seccbk_lsn __P((DB *, const DBT *, const DBT *, DBT *)); static int __lv_seccbk_txnpg __P((DB *, const DBT *, const DBT *, DBT *)); static void __lv_setup_logtype_names __P((DB_LOG_VRFY_INFO *lvinfo)); -static int __lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *)); -static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *)); +static int 
__lv_txnrgns_lsn_cmp __P((DB *, const DBT *, const DBT *, size_t *)); +static int __lv_ui32_cmp __P((DB *, const DBT *, const DBT *, size_t *)); static int __lv_unpack_txn_vrfy_info __P((VRFY_TXN_INFO **, const DBT *)); static int __lv_unpack_filereg __P((const DBT *, VRFY_FILEREG_INFO **)); @@ -383,16 +383,18 @@ err: /* Btree compare function for a [fileid, pgno] key. */ static int -__lv_fidpgno_cmp(db, dbt1, dbt2) +__lv_fidpgno_cmp(db, dbt1, dbt2, locp) DB *db; const DBT *dbt1; const DBT *dbt2; + size_t * locp; { db_pgno_t pgno1, pgno2; int ret; size_t len; COMPQUIET(db, NULL); + COMPQUIET(locp, NULL); len = DB_FILE_ID_LEN; ret = memcmp(dbt1->data, dbt2->data, len); if (ret == 0) { @@ -408,14 +410,16 @@ __lv_fidpgno_cmp(db, dbt1, dbt2) /* Btree compare function for a int32_t type of key. */ static int -__lv_i32_cmp(db, dbt1, dbt2) +__lv_i32_cmp(db, dbt1, dbt2, locp) DB *db; const DBT *dbt1; const DBT *dbt2; + size_t *locp; { int32_t k1, k2; COMPQUIET(db, NULL); + COMPQUIET(locp, NULL); memcpy(&k1, dbt1->data, sizeof(k1)); memcpy(&k2, dbt2->data, sizeof(k2)); @@ -424,14 +428,16 @@ __lv_i32_cmp(db, dbt1, dbt2) /* Btree compare function for a u_int32_t type of key. */ static int -__lv_ui32_cmp(db, dbt1, dbt2) +__lv_ui32_cmp(db, dbt1, dbt2, locp) DB *db; const DBT *dbt1; const DBT *dbt2; + size_t *locp; { u_int32_t k1, k2; COMPQUIET(db, NULL); + COMPQUIET(locp, NULL); memcpy(&k1, dbt1->data, sizeof(k1)); memcpy(&k2, dbt2->data, sizeof(k2)); @@ -440,18 +446,21 @@ __lv_ui32_cmp(db, dbt1, dbt2) /* Btree compare function for a DB_LSN type of key. 
*/ static int -__lv_lsn_cmp(db, dbt1, dbt2) +__lv_lsn_cmp(db, dbt1, dbt2, locp) DB *db; const DBT *dbt1; const DBT *dbt2; + size_t *locp; { DB_LSN lsn1, lsn2; + COMPQUIET(locp, NULL); DB_ASSERT(db->env, dbt1->size == sizeof(DB_LSN)); DB_ASSERT(db->env, dbt2->size == sizeof(DB_LSN)); memcpy(&lsn1, dbt1->data, sizeof(DB_LSN)); memcpy(&lsn2, dbt2->data, sizeof(DB_LSN)); + COMPQUIET(db, NULL); return (LOG_COMPARE(&lsn1, &lsn2)); } @@ -1663,17 +1672,21 @@ int __put_timestamp_info (lvinfo, tsinfo) } static int -__lv_txnrgns_lsn_cmp (db, d1, d2) +__lv_txnrgns_lsn_cmp (db, d1, d2, locp) DB *db; const DBT *d1, *d2; + size_t *locp; { struct __lv_txnrange r1, r2; + COMPQUIET(locp, NULL); + DB_ASSERT(db->env, d1->size == sizeof(r1)); DB_ASSERT(db->env, d2->size == sizeof(r2)); memcpy(&r1, d1->data, d1->size); memcpy(&r2, d2->data, d2->size); + COMPQUIET(db, NULL); return (LOG_COMPARE(&(r1.end), &(r2.end))); } diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index dc331215..011f54c6 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -22,8 +22,112 @@ #endif /* + * __memp_bh_unreachable -- + * + * Determine whether this buffer can not ever be seen again: is the next + * newer version visible to the same transaction which sees this one? + * If both versions are visible to the same transaction, there is no + * reason to keep the older one: it can be purged. + * + * If this buffer has a more recent version, and there is a transaction + * with a read_lsn between this buffer's and that more recent version's, + * the buffer is visible to at least that transaction, so return FALSE. + * Otherwise return TRUE.
+ * + * txns: 3/10 2/10 2/5 2/1 1/10 + * vers: 3/15 2/15 2/14 2/10 2/8 1/150 + * vis vis unreach vis unreach vis + * who new txns 3/10 2/10 2/5, 2/1 + * sees + * + * Note: in the above example, the page was allocated after txn 1/10 + * started. 1/10 would not see any version of the page. + * + * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int)); + */ +int +__memp_bh_unreachable(env, bhp, snapshots, n_snapshots) + ENV *env; + BH *bhp; + DB_LSN *snapshots; + int n_snapshots; +{ + BH *newer_bhp; + DB_LSN b_vlsn, n_vlsn; + int i, ret; +#ifdef DIAGNOSTIC + DB_MPOOL *dbmp; + DB_MSGBUF mb; + MPOOLFILE *bh_mfp; +#endif + + /* + * The buffer can't be purged if it is being used, or is the most recent + * version, or the next newer version isn't a copy yet. + */ + if (BH_REFCOUNT(bhp) != 0 || + (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL || + newer_bhp->td_off == INVALID_ROFF) + return (FALSE); + + /* + * Find the visibility LSNs for this buffer (b_vlsn) and the more recent, + * newer buffer (n_vlsn). If the newer version hasn't committed yet the + * bhp could be needed. + */ + n_vlsn = *VISIBLE_LSN(env, newer_bhp); + if (IS_MAX_LSN(n_vlsn)) + return (FALSE); + if (bhp->td_off == INVALID_ROFF) + INIT_LSN(b_vlsn); + else + b_vlsn = *VISIBLE_LSN(env, bhp); + + ret = TRUE; + /* + * Look for a transaction which is between n_vlsn and b_vlsn - determining + * that bhp is reachable. Stop looking once the transactions get so + * small (old) that they precede the buffer's version; no earlier txn + * could be between n_vlsn and b_vlsn. + */ + for (i = 0; + i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0; + i++) { + if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) { + /* + * This txn can see (started after) bhp, but not + * newer_bhp (which committed after this txn started).
+ */ + ret = FALSE; + break; + } + } + +#ifdef DIAGNOSTIC + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) { + dbmp = env->mp_handle; + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + DB_MSGBUF_INIT(&mb); + __db_msgadd(env, &mb, + "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n", + __memp_fns(dbmp, bh_mfp), bhp->pgno, + ret ? "purgeable" : "needed", + (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags, + (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i); + for (i = 0; i != n_snapshots; i++) + __db_msgadd(env, &mb, " %lu/%lu", + (u_long)snapshots[i].file, + (u_long)snapshots[i].offset); + DB_MSGBUF_FLUSH(env, &mb); + } +#endif + return (ret); +} + +/* * __memp_alloc -- - * Allocate some space from a cache region. + * Allocate some space from a cache region. If the region is full then + * reuse one or more cache buffers. * * PUBLIC: int __memp_alloc __P((DB_MPOOL *, * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); @@ -39,7 +143,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) { BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp; BH_FROZEN_PAGE *frozen_bhp; - DB_LSN oldest_reader, vlsn; + DB_LSN *snapshots, vlsn; DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp; ENV *env; MPOOL *c_mp; @@ -49,7 +153,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) u_int32_t dirty_eviction, high_priority, priority, versions; u_int32_t priority_saved, put_counter, lru_generation, total_buckets; int aggressive, alloc_freeze, b_lock, giveup; - int h_locked, need_free, obsolete, ret, write_error; + int h_locked, need_free, n_snapshots, obsolete, ret, write_error; u_int8_t *endp; void *p; @@ -58,11 +162,10 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) dbht = R_ADDR(infop, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; hp_saved = NULL; - priority_saved = 0; - write_error = 0; - + snapshots = NULL; + priority_saved = write_error = 0; buckets = buffers = put_counter = total_buckets = versions = 0; - aggressive = alloc_freeze = 
giveup = h_locked = 0; + aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0; /* * If we're allocating a buffer, and the one we're discarding is the @@ -138,13 +241,15 @@ found: if (offsetp != NULL) c_mp->stat.st_alloc_pages, buffers, infop->id); } #endif - return (0); + goto done; } else if (giveup || c_mp->pages == 0) { MPOOL_REGION_UNLOCK(env, infop); __db_errx(env, DB_STR("3017", "unable to allocate space from the buffer cache")); - return ((ret == ENOMEM && write_error != 0) ? EIO : ret); + if (ret == ENOMEM && write_error != 0) + ret = EIO; + goto done; } search: @@ -158,7 +263,6 @@ search: lru_generation = c_mp->lru_generation; ret = 0; - MAX_LSN(oldest_reader); /* * We re-attempt the allocation every time we've freed 3 times what @@ -222,6 +326,13 @@ search: goto alloc; MPOOL_REGION_UNLOCK(env, infop); + /* Refresh the list of mvcc reader transactions. */ + if (snapshots != NULL) + __os_free(env, snapshots); + if ((ret = __txn_get_readers( + env, &snapshots, &n_snapshots)) != 0) + goto err; + aggressive++; /* * Once aggressive, we consider all buffers. By setting @@ -266,13 +377,6 @@ search: if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; - /* Set aggressive if we have already searched for too long. */ - if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) { - aggressive = 1; - /* Once aggressive, we consider all buffers. */ - high_priority = MPOOL_LRU_MAX; - } - /* Unlock the region and lock the hash bucket. */ MPOOL_REGION_UNLOCK(env, infop); MUTEX_READLOCK(env, hp->mtx_hash); @@ -280,29 +384,45 @@ search: b_lock = 0; /* + * Set aggressive to consider all buffers if we have already + * searched in too many buckets. + */ + if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) { + aggressive = 1; + /* Once aggressive, we consider all buffers. */ + high_priority = MPOOL_LRU_MAX; + if (snapshots == NULL && (ret = __txn_get_readers( + env, &snapshots, &n_snapshots)) != 0) + goto err; + } + + /* * Find a buffer we can use. 
+ * Skip over refcount > 0 buffers; we can't get rid of them. * - * We use the lowest-LRU singleton buffer if we find one and - * it's better than the result of another hash bucket we've + * Without MVCC we use the lowest-LRU singleton buffer we find + * that's better than the result of another hash bucket we've * reviewed. We do not use a buffer which has a priority * greater than high_priority unless we are being aggressive. * - * With MVCC buffers, the situation is more complicated: we - * don't want to free a buffer out of the middle of an MVCC - * chain, since that requires I/O. So, walk the buffers, - * looking for an obsolete buffer at the end of an MVCC chain. - * Once a buffer becomes obsolete, its LRU priority is - * irrelevant because that version can never be accessed again. + * MVCC requires looking at additional factors: we don't want to + * free a still-relevant buffer out of the middle of an MVCC + * chain, since that requires freezing - lots of I/O. So, + * walk the buffers, looking for an obsolete buffer at the + * end of the MVCC chain. Once a buffer becomes obsolete, its + * LRU priority is irrelevant because that version can never + * be accessed again. * * If we don't find any obsolete MVCC buffers, we will get * aggressive, and in that case consider the lowest priority * buffer within a chain. - * - * Ignore referenced buffers, we can't get rid of them. */ retry_search: bhp = NULL; bucket_priority = high_priority; obsolete = 0; + if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1], + &hp->old_reader) > 0) + hp->old_reader = snapshots[n_snapshots - 1]; SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) { /* * First, do the standard LRU check for singletons.
@@ -340,55 +460,63 @@ retry_search: bhp = NULL; mvcc_bhp != NULL; oldest_bhp = mvcc_bhp, mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { + DB_ASSERT(env, mvcc_bhp != + SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); #ifdef MPOOL_ALLOC_SEARCH_DYN if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) + ++high_priority >= c_mp->lru_priority) { aggressive = 1; + if (snapshots == NULL && (ret = + __txn_readers(env, + &snapshots, &n_snapshots)) != 0) + goto err; + } #endif - DB_ASSERT(env, mvcc_bhp != - SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); - if ((aggressive < 2 && - ++versions < (buffers >> 2)) || - BH_REFCOUNT(mvcc_bhp) != 0) + if (n_snapshots > 0 && + __memp_bh_unreachable(env, + mvcc_bhp, snapshots, n_snapshots)) { + oldest_bhp = mvcc_bhp; + goto is_obsolete; + } + if (bhp != NULL && + mvcc_bhp->priority >= bhp->priority) + continue; + if (BH_REFCOUNT(mvcc_bhp) != 0) + continue; + /* + * Since taking still-relevant versions requires + * freezing, skip over them at low aggression + * levels unless we see that a high proportion + * of buffers (over 1/4) are MVCC copies. + */ + if (aggressive < 2 && + ++versions < (buffers >> 2)) continue; buffers++; - if (!F_ISSET(mvcc_bhp, BH_FROZEN) && - (bhp == NULL || - bhp->priority > mvcc_bhp->priority)) { - if (bhp != NULL) - atomic_dec(env, &bhp->ref); - bhp = mvcc_bhp; - atomic_inc(env, &bhp->ref); - } + if (F_ISSET(mvcc_bhp, BH_FROZEN)) + continue; + /* + * Select mvcc_bhp as current best candidate, + * releasing the current candidate, if any. + */ + if (bhp != NULL) + atomic_dec(env, &bhp->ref); + bhp = mvcc_bhp; + atomic_inc(env, &bhp->ref); } /* * oldest_bhp is the last buffer on the MVCC chain, and * an obsolete buffer at the end of the MVCC chain gets - * used without further search. Before checking for - * obsolescence, update the cached oldest reader LSN in - * the bucket if it is older than call's oldest_reader. + * used without further search. 
*/ if (BH_REFCOUNT(oldest_bhp) != 0) continue; - if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) { - if (IS_MAX_LSN(oldest_reader) && - (ret = __txn_oldest_reader( - env, &oldest_reader)) != 0) { - MUTEX_UNLOCK(env, hp->mtx_hash); - if (bhp != NULL) - atomic_dec(env, &bhp->ref); - return (ret); - } - if (LOG_COMPARE(&oldest_reader, - &hp->old_reader) > 0) - hp->old_reader = oldest_reader; - } - if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) { if (aggressive < 2) buffers++; +is_obsolete: obsolete = 1; if (bhp != NULL) atomic_dec(env, &bhp->ref); @@ -410,10 +538,18 @@ retry_search: bhp = NULL; /* * Compare two hash buckets and select the one with the lower - * priority. Performance testing showed looking at two improves - * the LRU-ness and looking at more only does a little better. + * priority, except mvcc at high aggression levels. Performance + * testing shows looking at two improves the LRU-ness and + * looking at more only does a little better. */ if (hp_saved == NULL) { + /* + * At high aggressive levels when mvcc is active, stop + * looking for candidate once one has been found. + * Freezing takes more time than writing out to a db. + */ + if (aggressive > 1 && n_snapshots > 1) + goto this_buffer; hp_saved = hp; priority_saved = priority; goto next_hb; @@ -487,11 +623,15 @@ this_buffer: /* /* We cannot block as the caller is probably holding locks. */ if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) { - if (ret != DB_LOCK_NOTGRANTED) - return (ret); + if (ret != DB_LOCK_NOTGRANTED) { + goto err; + } + ret = 0; goto next_hb; } F_SET(bhp, BH_EXCLUSIVE); + if (obsolete) + F_SET(bhp, BH_UNREACHABLE); b_lock = 1; /* Someone may have grabbed it while we got the lock. 
*/ @@ -557,7 +697,7 @@ this_buffer: /* F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); DB_ASSERT(env, !h_locked); - return (ret); + goto err; } } @@ -573,16 +713,25 @@ this_buffer: /* if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) || (SH_CHAIN_HASNEXT(bhp, vc) && SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off && - !BH_OBSOLETE(bhp, hp->old_reader, vlsn))) + !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) { + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx", + (u_long)R_OFFSET(infop, bhp), bhp->flags, + BH_REFCOUNT(bhp), + (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)), + (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh))); goto next_hb; + } /* * If the buffer is frozen, thaw it and look for another one - * we can use. (Calling __memp_bh_freeze above will not - * mark bhp BH_FROZEN.) + * we can use. (Calling __memp_bh_freeze above will not mark + * this bhp BH_FROZEN; it creates another frozen one.) 
*/ if (F_ISSET(bhp, BH_FROZEN)) { - DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc)); + DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) || + obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)); DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); if (!F_ISSET(bhp, BH_THAWED)) { /* @@ -592,10 +741,10 @@ this_buffer: /* */ if ((ret = __memp_bh_thaw(dbmp, infop, hp, bhp, NULL)) != 0) - return (ret); + goto done; MUTEX_READLOCK(env, hp->mtx_hash); } else { - need_free = (atomic_dec(env, &bhp->ref) == 0); + need_free = atomic_dec(env, &bhp->ref) == 0; F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); if (need_free) { @@ -626,7 +775,10 @@ this_buffer: /* if (alloc_freeze) { if ((ret = __memp_bhfree(dbmp, infop, bh_mfp, hp, bhp, 0)) != 0) - return (ret); + goto err; + DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID); + if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0) + goto err; b_lock = 0; h_locked = 0; @@ -654,23 +806,21 @@ this_buffer: /* } /* - * Check to see if the buffer is the size we're looking for. - * If so, we can simply reuse it. Otherwise, free the buffer - * and its space and keep looking. + * If the buffer is the size we're looking for, we can simply + * reuse it. Otherwise, free it and keep looking. */ if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) { if ((ret = __memp_bhfree(dbmp, infop, bh_mfp, hp, bhp, 0)) != 0) - return (ret); + goto err; p = bhp; goto found; } freed_space += sizeof(*bhp) + bh_mfp->pagesize; - if ((ret = - __memp_bhfree(dbmp, infop, - bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0) - return (ret); + if ((ret = __memp_bhfree(dbmp, + infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0) + goto err; /* Reset "aggressive" and "write_error" if we free any space. 
*/ if (aggressive > 1) @@ -689,12 +839,14 @@ next_hb: if (bhp != NULL) { if (b_lock) { F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; } } if (h_locked) MUTEX_UNLOCK(env, hp->mtx_hash); h_locked = 0; } + obsolete = 0; MPOOL_REGION_LOCK(env, infop); /* @@ -706,7 +858,15 @@ next_hb: if (bhp != NULL) { if (freed_space >= 3 * len) goto alloc; } - /* NOTREACHED */ +err: + if (h_locked) { + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + } +done: + if (snapshots != NULL) + __os_free(env, snapshots); + return (ret); } /* diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c index f376cda7..f1072292 100644 --- a/src/mp/mp_backup.c +++ b/src/mp/mp_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -145,6 +145,9 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags) if (backup == NULL || (len = backup->size) == 0) len = MEGABYTE; + /* Ensure backup page size is at least as big as db page size */ + if (len < mfp->pagesize) + len = mfp->pagesize; if ((ret = __os_malloc(env, len, &buf)) != 0) return (ret); write_size = (u_int32_t)(len / mfp->pagesize); @@ -188,7 +191,7 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags) if (backup != NULL && backup->write != NULL) { if ((ret = backup->write( - env->dbenv, gigs, off, (u_int32_t)nr, + env->dbenv, gigs, off, (u_int32_t)nr, buf, handle)) != 0) break; } else { diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c index 1df8e206..30293f29 100644 --- a/src/mp/mp_bh.c +++ b/src/mp/mp_bh.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -157,7 +157,7 @@ __memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) opened = 1; if ((ret = __memp_fopen(dbmfp, mfp, NULL, NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) { - dbmfp->ref--; + dbmfp->ref--; (void)__memp_fclose(dbmfp, 0); /* @@ -264,7 +264,7 @@ __memp_pgread(dbmfp, bhp, can_create) * how to handle the error. */ if (!can_create) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } @@ -557,6 +557,9 @@ err: __db_errx(env, DB_STR_A("3016", * __memp_bhfree -- * Free a bucket header and its referenced data. * + * The hash bucket is unlocked before returning except when flags includes + * BH_FREE_UNLOCKED -- or there was no hp passed in to begin with. + * * PUBLIC: int __memp_bhfree __P((DB_MPOOL *, * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t)); */ @@ -600,10 +603,13 @@ __memp_bhfree(dbmp, infop, mfp, hp, bhp, flags) (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off || bhp->td_off == INVALID_ROFF || IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) || + F_ISSET(bhp, BH_UNREACHABLE) || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))); PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp); - + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "bhfree pgno %lu roff %lx", + (u_long)bhp->pgno, (u_long)R_OFFSET(dbmp->reginfo, bhp)); /* * Delete the buffer header from the hash bucket queue or the * version chain. diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 5f9a4bf9..270135bd 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -53,15 +53,19 @@ __memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp) * time, which we don't want to do because one of our big goals in life * is to keep database files small. 
It's sleazy as hell, but we catch * any attempt to actually write the file in memp_fput(). + * + * CREATE, LAST, and NEW are mutually exclusive. DIRTY and EDIT are also + * mutually exclusive - that is checked in __memp_fget() itself. */ +#undef OKMODE #undef OKFLAGS -#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \ - DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW) +#define OKMODE (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) +#define OKFLAGS (OKMODE | DB_MPOOL_DIRTY | DB_MPOOL_EDIT) if (flags != 0) { if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0) return (ret); - switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) { + switch (FLD_ISSET(flags, OKMODE)) { case DB_MPOOL_CREATE: case DB_MPOOL_LAST: case DB_MPOOL_NEW: @@ -131,6 +135,7 @@ __memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp) #ifdef DIAGNOSTIC DB_LOCKTAB *lt; DB_LOCKER *locker; + int pagelock_err; #endif *(void **)addrp = NULL; @@ -274,7 +279,7 @@ retry: MUTEX_LOCK(env, hp->mtx_hash); * the BTREE in a subsequent txn).
*/ if (bhp == NULL) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } } @@ -303,7 +308,10 @@ retry: MUTEX_LOCK(env, hp->mtx_hash); MUTEX_UNLOCK(env, hp->mtx_hash); h_locked = 0; if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) { -xlatch: if (LF_ISSET(DB_MPOOL_TRY)) { +#ifdef HAVE_SHARED_LATCHES +xlatch: +#endif + if (LF_ISSET(DB_MPOOL_TRY)) { if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) goto err; @@ -373,11 +381,11 @@ thawed: need_free = (atomic_dec(env, &bhp->ref) == 0); bhp = NULL; goto retry; } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) { - ret = DB_LOCK_DEADLOCK; + ret = USR_ERR(env, DB_LOCK_DEADLOCK); goto err; } else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE && flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } @@ -508,9 +516,13 @@ revive: if (F_ISSET(bhp, BH_FREED)) /* * With multiversion databases, we might need to * allocate a new buffer into which we can copy the one - * that we found. In that case, check the last buffer + * that we found. In that case, check the old versions * in the chain to see whether we can reuse an obsolete - * buffer. + * or unreachable buffer. First see whether the oldest + * version is truly obsolete. If not, look for somewhat + * more recent versions which are no longer needed + * because the snapshot transactions which once could + * have seen them have now exited. 
* * To provide snapshot isolation, we need to make sure * that we've seen a buffer older than the oldest @@ -523,24 +535,17 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && } if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && SH_CHAIN_HASPREV(bhp, vc)) { - oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh); - while (SH_CHAIN_HASPREV(oldest_bhp, vc)) - oldest_bhp = SH_CHAIN_PREVP( - oldest_bhp, vc, __bh); - - if (BH_REFCOUNT(oldest_bhp) == 0 && - !BH_OBSOLETE( - oldest_bhp, hp->old_reader, vlsn) && - (ret = __txn_oldest_reader(env, - &hp->old_reader)) != 0) + if ((ret = __memp_find_obsolete_version(env, + bhp, hp, &oldest_bhp)) != 0) goto err; - - if (BH_OBSOLETE( - oldest_bhp, hp->old_reader, vlsn) && - BH_REFCOUNT(oldest_bhp) == 0) { + if (oldest_bhp != NULL) { DB_ASSERT(env, !F_ISSET(oldest_bhp, BH_DIRTY)); atomic_inc(env, &oldest_bhp->ref); +#ifdef HAVE_STATISTICS + if (SH_CHAIN_HASPREV(oldest_bhp, vc)) + c_mp->stat.st_mvcc_reused++; +#endif if (F_ISSET(oldest_bhp, BH_FROZEN)) { /* * This call will release the @@ -606,7 +611,7 @@ newpg: /* mfp->last_pgno >= mfp->maxpgno) { __db_errx(env, DB_STR_A("3023", "%s: file limited to %lu pages", "%s %lu"), - __memp_fn(dbmfp), (u_long)mfp->maxpgno); + __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1); ret = ENOSPC; } else *pgnoaddr = mfp->last_pgno + 1; @@ -615,7 +620,7 @@ newpg: /* if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) { __db_errx(env, DB_STR_A("3024", "%s: file limited to %lu pages", "%s %lu"), - __memp_fn(dbmfp), (u_long)mfp->maxpgno); + __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1); ret = ENOSPC; } else if (!extending) extending = *pgnoaddr > mfp->last_pgno; @@ -937,8 +942,17 @@ alloc: /* Allocate a new buffer header and data space. */ * need to make copy, so we now need to allocate another buffer * to hold the new copy. 
*/ - if (alloc_bhp == NULL) + if (alloc_bhp == NULL) { + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "fget makecopy txn %08x %lu/%lu going to reuse pgno %d from %lu/%lu", + txn->txnid, td == NULL ? 0L : + (u_long)td->read_lsn.file, td == NULL ? 0L : + (u_long)td->read_lsn.offset, bhp->pgno, + (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); goto reuse; + } DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp); DB_ASSERT(env, bhp->td_off == INVALID_ROFF || @@ -1019,6 +1033,15 @@ alloc: /* Allocate a new buffer header and data space. */ F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "fget makecopy txn %08x %lx pgno %d from %lu/%lu", + txn->txnid, (u_long)R_OFFSET(infop, bhp), + bhp->pgno, bhp->td_off == INVALID_ROFF ? 0L : + (u_long)VISIBLE_LSN(env, bhp)->file, + bhp->td_off == INVALID_ROFF ? 0L : + (u_long)VISIBLE_LSN(env, bhp)->offset); + bhp = alloc_bhp; DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); b_incr = 1; @@ -1164,8 +1187,15 @@ alloc: /* Allocate a new buffer header and data space. */ lt = env->lk_handle; locker = (DB_LOCKER *) (R_ADDR(&lt->reginfo, ip->dbth_locker)); - DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp, - (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0); + pagelock_err = __db_has_pagelock(env, locker, dbmfp, + (PAGE *)bhp->buf, DB_LOCK_WRITE); + if (pagelock_err != 0) { + if (pagelock_err == DB_RUNRECOVERY) + return (pagelock_err); + __db_syserr(env, pagelock_err, + "Locker %x has no page lock for pgno %d", + locker->id, ((PAGE *)bhp->buf)->pgno); + } } #endif @@ -1228,3 +1258,85 @@ err: /* return (ret); } + +/* + * __memp_find_obsolete_version -- + * + * Search the version chain, from oldest to youngest, looking for buffers + * which are no longer BH_VISIBLE() to any existing transaction. + * + * The hash bucket is locked, no buffer is locked.
+ * + * PUBLIC: int __memp_find_obsolete_version + * PUBLIC: __P((ENV *, BH *, DB_MPOOL_HASH *, BH **)); + */ +int +__memp_find_obsolete_version(env, vis_bhp, hp, foundp) + ENV *env; + BH *vis_bhp; + DB_MPOOL_HASH *hp; + BH **foundp; +{ + BH *bhp; + DB_LSN *readers, vlsn; + int n_readers, ret; + + *foundp = NULL; + readers = NULL; + ret = 0; + bhp = SH_CHAIN_PREVP(vis_bhp, vc, __bh); + while (SH_CHAIN_HASPREV(bhp, vc)) + bhp = SH_CHAIN_PREVP(bhp, vc, __bh); + + /* + * The least-expensive case is finding an obsolete version without + * needing to build the active snapshot transaction list. + */ + if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && BH_REFCOUNT(bhp) == 0) { + *foundp = bhp; + goto out; + } + + if ((ret = __txn_get_readers(env, &readers, &n_readers)) != 0) + goto out; + + if (LOG_COMPARE(&readers[n_readers - 1], &hp->old_reader) > 0) { + hp->old_reader = readers[n_readers - 1]; + if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && + BH_REFCOUNT(bhp) == 0) { + *foundp = bhp; + goto cleanup; + } + } + + while ((bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) != vis_bhp) { + if (BH_REFCOUNT(bhp) == 0 && + __memp_bh_unreachable(env, bhp, readers, n_readers)) { + *foundp = bhp; +#ifdef DIAGNOSTIC + /* + * Usually when the hash bucket is locked, the refcount + * is incremented and the bucket unlocked before the + * buffer is locked; this avoids mtx_buf deadlocks. + * This unreachable version cannot be involved with any + * deadlock-creating locking, though the head of the + * version chain could be locked. No TRYLOCK needed.
+ */ + MUTEX_LOCK(env, bhp->mtx_buf); + F_SET(bhp, BH_UNREACHABLE); + MUTEX_UNLOCK(env, bhp->mtx_buf); +#endif + break; + } + } + +cleanup: + if (readers != NULL) + __os_free(env, readers); +out: + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC) && *foundp != NULL) + __db_msg(env, "fget reusing %p pgno %d @%lu/%lu", bhp, + bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); + return (ret); +} diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c index 41bd638c..4974f57c 100644 --- a/src/mp/mp_fmethod.c +++ b/src/mp/mp_fmethod.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -315,7 +315,7 @@ __memp_set_lsn_offset(dbmfp, lsn_offset) /* * __memp_get_maxsize -- - * Get the file's maximum size. + * Get the file's maximum size, returning zeroes if none is set. */ static int __memp_get_maxsize(dbmfp, gbytesp, bytesp) @@ -334,11 +334,22 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp) ENV_ENTER(env, ip); MUTEX_LOCK(env, mfp->mutex); - *gbytesp = (u_int32_t) - (mfp->maxpgno / (GIGABYTE / mfp->pagesize)); - *bytesp = (u_int32_t) - ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) * - mfp->pagesize); + if (mfp->maxpgno == 0) { + *gbytesp = *bytesp = 0; + } else { + *gbytesp = (u_int32_t) + (mfp->maxpgno / (GIGABYTE / mfp->pagesize)); + *bytesp = (u_int32_t) (mfp->maxpgno % + (GIGABYTE / mfp->pagesize) + 1) * mfp->pagesize; + /* + * After converting from 0-based maxpgno to #pages, we + * might have bumped into the next gigabyte boundary. + */ + if (*bytesp >= GIGABYTE) { + *bytesp -= GIGABYTE; + *gbytesp += 1; + } + } MUTEX_UNLOCK(env, mfp->mutex); ENV_LEAVE(env, ip); @@ -348,8 +359,34 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp) } /* + * __memp_set_maxpgno -- + * Set the file's maxpgno from the configured max size. 
If that size is + * pagesize or less then the filesize limit is disabled. + * + * PUBLIC: void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t)); + */ +void +__memp_set_maxpgno(mfp, gbytes, bytes) + MPOOLFILE *mfp; + u_int32_t gbytes, bytes; +{ + if (gbytes == 0 && bytes <= mfp->pagesize) + mfp->maxpgno = 0; + else { + mfp->maxpgno = (db_pgno_t) + (gbytes * (GIGABYTE / mfp->pagesize)); + /* Round up to account for any fractional page. */ + mfp->maxpgno += (db_pgno_t) + ((bytes + mfp->pagesize - 1) / mfp->pagesize); + /* Convert from #pages to the zero-based max pgno. */ + mfp->maxpgno--; + } +} + +/* * __memp_set_maxsize -- - * Set the file's maximum size. + * Set the file's maximum size; if the size is <= pagesize then + * remove any file size limit. */ static int __memp_set_maxsize(dbmfp, gbytes, bytes) @@ -368,10 +405,7 @@ __memp_set_maxsize(dbmfp, gbytes, bytes) ENV_ENTER(env, ip); MUTEX_LOCK(env, mfp->mutex); - mfp->maxpgno = (db_pgno_t) - (gbytes * (GIGABYTE / mfp->pagesize)); - mfp->maxpgno += (db_pgno_t) - ((bytes + mfp->pagesize - 1) / mfp->pagesize); + __memp_set_maxpgno(mfp, gbytes, bytes); MUTEX_UNLOCK(env, mfp->mutex); ENV_LEAVE(env, ip); diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c index ef7f886a..dbe7b9c8 100644 --- a/src/mp/mp_fopen.c +++ b/src/mp/mp_fopen.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -89,8 +89,9 @@ __memp_fopen_pp(dbmfp, path, flags, mode, pagesize) * Generate the number of user opens. If there is no backing file * there is an extra open count to keep the in memory db around. 
*/ -#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \ +#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \ (u_int32_t)(mfp)->no_backing_file)) +#define MP_IOINFO_RETRIES 5 /* * __memp_fopen -- * DB_MPOOLFILE->open. @@ -118,7 +119,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) size_t maxmap; db_pgno_t last_pgno; u_int32_t bucket, mbytes, bytes, oflags, pagesize; - int refinc, ret, isdir; + int isdir, refinc, ret, tries; char *rpath; /* If this handle is already open, return. */ @@ -249,7 +250,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) if (MFP_OPEN_CNT(mfp) > 0 && atomic_read(&mfp->multiversion) == 0) { mvcc_err: __db_errx(env, DB_STR("3041", -"DB_MULTIVERSION cannot be specified on a database file which is already open")); +"DB_MULTIVERSION cannot be specified on a database file that is already open")); ret = EINVAL; goto err; } @@ -399,11 +400,44 @@ mvcc_err: __db_errx(env, DB_STR("3041", if (LF_ISSET(DB_ODDFILESIZE)) bytes -= (u_int32_t)(bytes % pagesize); else { - __db_errx(env, DB_STR_A("3037", - "%s: file size not a multiple of the pagesize", "%s"), - rpath); - ret = EINVAL; - goto err; + /* + * If the file size is not a multiple of the + * pagesize, it is likely because the ioinfo + * call is racing with a write that is extending + * the file. Many file systems will extend + * in fs block size units, and if the pagesize + * is larger than that, we can briefly see a + * file size that is not a multiple of pagesize. + * + * Yield the processor to allow that to finish + * and try again a few times. 
+ */ + tries = 0; + STAT((mp->stat.st_oddfsize_detect++)); + while (tries < MP_IOINFO_RETRIES) { + if ((ret = __os_ioinfo(env, rpath, + dbmfp->fhp, &mbytes, &bytes, + NULL)) != 0) { + __db_err(env, ret, "%s", rpath); + goto err; + } + if (bytes % pagesize != 0) { + __os_yield(env, 0, 50000); + tries++; + } else { + STAT(( + mp->stat.st_oddfsize_resolve++)); + break; + } + } + if (tries == MP_IOINFO_RETRIES) { + __db_errx(env, DB_STR_A("3043", + "%s: file size (%lu %lu) not a multiple of the pagesize %lu", + "%s %lu %lu %lu"), + rpath, (u_long)mbytes, (u_long)bytes, (u_long)pagesize); + ret = EINVAL; + goto err; + } } } @@ -786,13 +820,7 @@ __memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) mfp->lsn_off = dbmfp->lsn_offset; mfp->clear_len = dbmfp->clear_len; mfp->priority = dbmfp->priority; - if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { - mfp->maxpgno = (db_pgno_t) - (dbmfp->gbytes * (GIGABYTE / mfp->pagesize)); - mfp->maxpgno += (db_pgno_t) - ((dbmfp->bytes + mfp->pagesize - 1) / - mfp->pagesize); - } + __memp_set_maxpgno(mfp, dbmfp->gbytes, dbmfp->bytes); if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) mfp->no_backing_file = 1; if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) @@ -1019,6 +1047,7 @@ __memp_fclose(dbmfp, flags) ret = t_ret; __os_free(env, rpath); } + mfp->unlink_on_close = 0; } if (MFP_OPEN_CNT(mfp) == 0) { F_CLR(mfp, MP_NOT_DURABLE); @@ -1068,6 +1097,7 @@ __memp_mf_discard(dbmp, mfp, hp_locked) DB_MPOOL_STAT *sp; #endif MPOOL *mp; + char *rpath; int need_sync, ret, t_ret; env = dbmp->env; @@ -1095,6 +1125,23 @@ __memp_mf_discard(dbmp, mfp, hp_locked) */ mfp->deadfile = 1; + /* We should unlink the file if necessary. 
*/ + if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0 && mfp->unlink_on_close && + !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file) { + if ((t_ret = __db_appname(env, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), NULL, + &rpath)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0) { + if ((t_ret = __os_unlink( + dbmp->env, rpath, 0)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, rpath); + } + mfp->unlink_on_close = 0; + need_sync = 0; + } + /* Discard the mutex we're holding and return it too the pool. */ MUTEX_UNLOCK(env, mfp->mutex); if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0) diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 7a900fd0..06b30fd4 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -52,7 +52,8 @@ __memp_fput_pp(dbmfp, pgaddr, priority, flags) /* * __memp_fput -- - * DB_MPOOLFILE->put. + * DB_MPOOLFILE->put. Release this reference to the page. If the reference + * count drop to zero adjust the buffer's cache priority. * * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, * PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c index 1129853f..770ec5c8 100644 --- a/src/mp/mp_fset.c +++ b/src/mp/mp_fset.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c index 7afae248..56d6c42b 100644 --- a/src/mp/mp_method.c +++ b/src/mp/mp_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -67,6 +67,7 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep) int *ncachep; { DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; ENV *env; MPOOL *mp; @@ -78,12 +79,16 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep) if (MPOOL_ON(env)) { dbmp = env->mp_handle; mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MUTEX_LOCK(env, mp->mtx_resize); if (gbytesp != NULL) *gbytesp = mp->gbytes; if (bytesp != NULL) *bytesp = mp->bytes; if (ncachep != NULL) *ncachep = (int)mp->nreg; + MUTEX_UNLOCK(env, mp->mtx_resize); + ENV_LEAVE(env, ip); } else { if (gbytesp != NULL) *gbytesp = dbenv->mp_gbytes; @@ -380,7 +385,7 @@ __memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_max_write", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -448,7 +453,7 @@ __memp_set_mp_mmapsize(dbenv, mp_mmapsize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_mmapsize", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -512,7 +517,7 @@ __memp_set_mp_pagesize(dbenv, mp_pagesize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_pagesize", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize"); dbenv->mp_pagesize = mp_pagesize; @@ -561,7 +566,7 @@ __memp_set_mp_tablesize(dbenv, mp_tablesize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_tablesize", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize"); dbenv->mp_tablesize = mp_tablesize; @@ -583,7 +588,7 @@ __memp_get_mp_mtxcount(dbenv, mp_mtxcountp) env = 
dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->get_mp_mtxcount", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -610,7 +615,7 @@ __memp_set_mp_mtxcount(dbenv, mp_mtxcount) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_mtxcount", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount"); dbenv->mp_mtxcount = mp_mtxcount; @@ -870,7 +875,7 @@ __memp_ftruncate(dbmfp, txn, ip, pgno, flags) !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno) #ifdef HAVE_FTRUNCATE ret = __os_truncate(env, - dbmfp->fhp, pgno, mfp->pagesize); + dbmfp->fhp, pgno, mfp->pagesize, 0); #else ret = __db_zero_extend(env, dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize); diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c index 47531528..b51ae135 100644 --- a/src/mp/mp_mvcc.c +++ b/src/mp/mp_mvcc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -151,6 +151,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) real_name = NULL; fhp = NULL; + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "freeze %s %d @%lu/%lu", __memp_fns(dbmp, mfp), + bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); + MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE); MPOOL_REGION_LOCK(env, infop); @@ -161,7 +166,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) } else { *need_frozenp = 1; - /* There might be a small amount of unallocated space. */ + /* There might be enough space for a single-item block. 
*/ if (__env_alloc(infop, sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen_alloc) == 0) { @@ -405,6 +410,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) ret = 0; real_name = NULL; + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "thaw %s %d @%lu/%lu", __memp_fns(dbmp, mfp), + frozen_bhp->pgno, + (u_long)VISIBLE_LSN(env, frozen_bhp)->file, + (u_long)VISIBLE_LSN(env, frozen_bhp)->offset); + MUTEX_REQUIRED(env, hp->mtx_hash); DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL); h_locked = 1; @@ -414,7 +425,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) DB_ASSERT(env, alloc_bhp != NULL || SH_CHAIN_SINGLETON(frozen_bhp, vc) || (SH_CHAIN_HASNEXT(frozen_bhp, vc) && - BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn))); + BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)) || + F_ISSET(frozen_bhp, BH_UNREACHABLE)); DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN)); spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno; @@ -516,7 +528,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) else { maxpgno -= (db_pgno_t)ntrunc; if ((ret = __os_truncate(env, fhp, - maxpgno + 1, pagesize)) != 0) + maxpgno + 1, pagesize, 0)) != 0) goto err; /* Fix up the linked list */ diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c index 07134de7..ba836cf4 100644 --- a/src/mp/mp_region.c +++ b/src/mp/mp_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -11,7 +11,7 @@ #include "db_int.h" #include "dbinc/mp.h" -static int __memp_init_config __P((ENV *, MPOOL *)); +static int __memp_init_config __P((ENV *, MPOOL *, int)); static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *)); #define MPOOL_DEFAULT_PAGESIZE (4 * 1024) @@ -34,7 +34,7 @@ __memp_open(env, create_ok) roff_t cache_size, max_size, reg_size; u_int i, max_nreg; u_int32_t htab_buckets, *regids; - int ret; + int create, ret; dbenv = env->dbenv; cache_size = 0; @@ -77,7 +77,8 @@ __memp_open(env, create_ok) * If we created the region, initialize it. Create or join any * additional regions. */ - if (F_ISSET(&reginfo, REGION_CREATE)) { + create = F_ISSET(&reginfo, REGION_CREATE); + if (create) { /* * We define how many regions there are going to be, allocate * the REGINFO structures and create them. Make sure we don't @@ -167,23 +168,38 @@ __memp_open(env, create_ok) env->mp_handle = dbmp; /* A process joining the region may reset the mpool configuration. */ - if ((ret = __memp_init_config(env, mp)) != 0) + if ((ret = __memp_init_config(env, mp, create)) != 0) return (ret); return (0); -err: env->mp_handle = NULL; - if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { - for (i = 0; i < dbenv->mp_ncache; ++i) +err: (void)__mutex_free(env, &dbmp->mutex); + (void)__memp_region_detach(env, dbmp); + return (ret); +} + +/* __memp_region_detach + * Detach from any attached mempool regions. 
+ * + * PUBLIC: int __memp_region_detach __P((ENV *, DB_MPOOL *)); + */ +int +__memp_region_detach(env, dbmp) + ENV *env; + DB_MPOOL *dbmp; +{ + u_int i; + + if (dbmp != NULL && + dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { + for (i = 0; i < env->dbenv->mp_ncache; ++i) if (dbmp->reginfo[i].id != INVALID_REGION_ID) (void)__env_region_detach( env, &dbmp->reginfo[i], 0); __os_free(env, dbmp->reginfo); } - - (void)__mutex_free(env, &dbmp->mutex); - __os_free(env, dbmp); - return (ret); + env->mp_handle = NULL; + return (0); } /* @@ -207,7 +223,7 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) MPOOL *mp, *main_mp; REGINFO *infop; db_mutex_t mtx_base, mtx_discard, mtx_prev; - u_int32_t i; + u_int32_t i, mp_mtxcount; int ret; void *p; @@ -224,6 +240,23 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0) return (ret); + /* + * Initializing the first mpool region allocates the mpool region id + * array, file table and, if not ENV_PRIVATE, all the cache regions' + * hash bucket mutexes in a single contiguous block of mutex ids, which + * remain allocated when the cache is resized. The block is 'known' to + * start with the first id (mtx_base), and to end #regions * mp_mtxcount + * later. In private environments, mutex ids are not smallish integers, + * but __env_alloc()'d pointers. Since a range of (base, count) doesn't + * work for these likely-scattered mutexes, we allocate private threaded + * mutexes as they are needed. Private non-threaded caches don't need + * any mutexes at all. 
+ */ + if ((mp_mtxcount = dbenv->mp_mtxcount) == 0) + mp_mtxcount = dbenv->mp_mtxcount = htab_buckets; + if (!MUTEX_ON(env) || + F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE) + mp_mtxcount = dbenv->mp_mtxcount = 0; if (reginfo_off == 0) { ZERO_LSN(mp->lsn); @@ -248,15 +281,10 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) atomic_init(&htab[i].hash_page_dirty, 0); } - /* - * Allocate all of the hash bucket mutexes up front. We do - * this so that we don't need to free and reallocate mutexes as - * the cache is resized. - */ mtx_base = mtx_prev = MUTEX_INVALID; - if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE)) + if (F_ISSET(env, ENV_PRIVATE)) goto no_prealloc; - for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) { + for (i = 0; i < mp->max_nreg * mp_mtxcount; i++) { if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, DB_MUTEX_SHARED, &mtx_discard)) != 0) return (ret); @@ -274,13 +302,12 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) } /* - * We preallocated all of the mutexes in a block, so for regions after - * the first, we skip mutexes in use in earlier regions. Each region - * has the same number of buckets + * If we preallocated all the mutexes, then in regions after the first, + * we skip mutexes in use in earlier regions. Each region has the same + * number of buckets. */ no_prealloc: - if (MUTEX_ON(env)) - mtx_base += reginfo_off * dbenv->mp_mtxcount; + mtx_base += reginfo_off * mp_mtxcount; /* Allocate hash table space and initialize it. */ if ((ret = __env_alloc(infop, @@ -289,18 +316,21 @@ no_prealloc: mp->htab = R_OFFSET(infop, htab); for (i = 0; i < htab_buckets; i++) { hp = &htab[i]; - if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0) + /* + * Set mtx_hash to do no locking, or share a mutex with an + * earlier hash bucket in this region, or assign it from the + * block of mutexes allocated above, or (in a private + * environment) allocate a new mutex. 
+ */ + if (mp_mtxcount == 0) hp->mtx_hash = MUTEX_INVALID; - else if (F_ISSET(env, ENV_PRIVATE)) { - if (i >= dbenv->mp_mtxcount) - hp->mtx_hash = - htab[i % dbenv->mp_mtxcount].mtx_hash; - else if - ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, - DB_MUTEX_SHARED, &hp->mtx_hash)) != 0) - return (ret); - } else - hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount); + else if (i >= mp_mtxcount) + hp->mtx_hash = htab[i % mp_mtxcount].mtx_hash; + else if (!F_ISSET(env, ENV_PRIVATE)) + hp->mtx_hash = mtx_base + i; + else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, + DB_MUTEX_SHARED, &hp->mtx_hash)) != 0) + return (ret); SH_TAILQ_INIT(&hp->hash_bucket); atomic_init(&hp->hash_page_dirty, 0); #ifdef HAVE_STATISTICS @@ -311,7 +341,7 @@ no_prealloc: ZERO_LSN(hp->old_reader); } mp->htab_buckets = htab_buckets; - mp->htab_mutexes = dbenv->mp_mtxcount; + mp->htab_mutexes = mp_mtxcount; mp->pagesize = dbenv->mp_pagesize == 0 ? MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize; @@ -443,11 +473,21 @@ __memp_region_mutex_count(env) dbenv = env->dbenv; __memp_region_size(env, &reg_size, &htab_buckets); - if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION)) - pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE); - if ((pgsize = dbenv->mp_pagesize) == 0) - pgsize = MPOOL_DEFAULT_PAGESIZE; + if (dbenv->mp_mtxcount != 0) + htab_buckets = dbenv->mp_mtxcount; max_region = __memp_max_regions(env); + if ((pgsize = dbenv->mp_pagesize) == 0) { + /* + * If MVCC is on during environment creation, provide enough + * mutexes so that half the cache can be frozen buffer headers. + */ + if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION)) + pgsize = (MPOOL_DEFAULT_PAGESIZE + + sizeof(BH_FROZEN_ALLOC) + + sizeof(BH_FROZEN_PAGE)) / 2; + else + pgsize = MPOOL_DEFAULT_PAGESIZE; + } /* * We need a couple of mutexes for the region itself, one for each @@ -456,10 +496,6 @@ __memp_region_mutex_count(env) * hash bucket. 
We then need one mutex per page in the cache, * the worst case is really big if the pages are 512 bytes. */ - if (dbenv->mp_mtxcount != 0) - htab_buckets = dbenv->mp_mtxcount; - else - dbenv->mp_mtxcount = htab_buckets; num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize); return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS); } @@ -469,23 +505,39 @@ __memp_region_mutex_count(env) * Initialize shared configuration information. */ static int -__memp_init_config(env, mp) +__memp_init_config(env, mp, create) ENV *env; MPOOL *mp; + int create; { DB_ENV *dbenv; dbenv = env->dbenv; MPOOL_SYSTEM_LOCK(env); - if (dbenv->mp_mmapsize != 0) + if (create) { mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize; - if (dbenv->mp_maxopenfd != 0) mp->mp_maxopenfd = dbenv->mp_maxopenfd; - if (dbenv->mp_maxwrite != 0) mp->mp_maxwrite = dbenv->mp_maxwrite; - if (dbenv->mp_maxwrite_sleep != 0) mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep; + } else { + if (dbenv->mp_mmapsize != 0 && + mp->mp_mmapsize != (db_size_t)dbenv->mp_mmapsize) + __db_msg(env, DB_STR("3044", +"Warning: Ignoring maximum memory map size when joining environment")); + + if (dbenv->mp_maxopenfd != 0 && + mp->mp_maxopenfd != dbenv->mp_maxopenfd) + __db_msg(env, DB_STR("3045", +"Warning: Ignoring max open file descriptors value when joining environment")); + + if ((dbenv->mp_maxwrite != 0 && + mp->mp_maxwrite != dbenv->mp_maxwrite) || + (dbenv->mp_maxwrite_sleep != 0 && + mp->mp_maxwrite_sleep != dbenv->mp_maxwrite_sleep)) + __db_msg(env, DB_STR("3046", +"Warning: Ignoring maximum sequential writes value when joining environment")); + } MPOOL_SYSTEM_UNLOCK(env); return (0); @@ -501,22 +553,18 @@ int __memp_env_refresh(env) ENV *env; { - BH *bhp; - BH_FROZEN_ALLOC *frozen_alloc; DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; - DB_MPOOL_HASH *hp; DB_MPREG *mpreg; MPOOL *mp, *c_mp; REGINFO *infop; - u_int32_t bucket, i, nreg; + u_int32_t i, nreg; int ret, t_ret; ret = 0; dbmp = env->mp_handle; mp = 
dbmp->reginfo[0].primary; nreg = mp->nreg; - hp = R_ADDR(&dbmp->reginfo[0], mp->htab); /* * If a private region, return the memory to the heap. Not needed for @@ -526,49 +574,20 @@ __memp_env_refresh(env) if (!F_ISSET(env, ENV_PRIVATE)) goto not_priv; - /* Discard buffers. */ for (i = 0; i < nreg; ++i) { infop = &dbmp->reginfo[i]; - c_mp = infop->primary; - for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; - bucket < c_mp->htab_buckets; ++hp, ++bucket) { - while ((bhp = SH_TAILQ_FIRST( - &hp->hash_bucket, __bh)) != NULL) - if (F_ISSET(bhp, BH_FROZEN)) - SH_TAILQ_REMOVE( - &hp->hash_bucket, bhp, - hq, __bh); - else { - if (F_ISSET(bhp, BH_DIRTY)) { - atomic_dec(env, - &hp->hash_page_dirty); - F_CLR(bhp, - BH_DIRTY | BH_DIRTY_CREATE); - } - atomic_inc(env, &bhp->ref); - if ((t_ret = __memp_bhfree(dbmp, infop, - R_ADDR(dbmp->reginfo, - bhp->mf_offset), hp, bhp, - BH_FREE_FREEMEM | - BH_FREE_UNLOCKED)) != 0 && ret == 0) - ret = t_ret; - } - } - MPOOL_REGION_LOCK(env, infop); - while ((frozen_alloc = SH_TAILQ_FIRST( - &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { - SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, - links, __bh_frozen_a); - __env_alloc_free(infop, frozen_alloc); - } - MPOOL_REGION_UNLOCK(env, infop); + if ((t_ret = __memp_region_bhfree(infop)) != 0 && ret == 0) + ret = t_ret; } not_priv: /* Discard DB_MPOOLFILEs. */ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) - if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0) - ret = t_ret; + if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } /* Discard DB_MPREGs. */ if (dbmp->pg_inout != NULL) @@ -618,3 +637,62 @@ not_priv: env->mp_handle = NULL; return (ret); } + +/* + * __memp_region_bhfree -- + * Discard the buffers for a region. 
+ * + * PUBLIC: int __memp_region_bhfree __P((REGINFO *)); + */ +int +__memp_region_bhfree(infop) + REGINFO *infop; +{ + BH *bhp; + BH_FROZEN_ALLOC *frozen_alloc; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + ENV *env; + MPOOL *c_mp; + u_int32_t bucket; + int ret, t_ret; + + env = infop->env; + dbmp = env->mp_handle; + ret = 0; + + /* Discard buffers. */ + c_mp = infop->primary; + for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; + bucket < c_mp->htab_buckets; ++hp, ++bucket) { + while ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + if (F_ISSET(bhp, BH_FROZEN)) + SH_TAILQ_REMOVE(&hp->hash_bucket, + bhp, hq, __bh); + else { + if (F_ISSET(bhp, BH_DIRTY)) { + atomic_dec(env, &hp->hash_page_dirty); + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } + atomic_inc(env, &bhp->ref); + if ((t_ret = __memp_bhfree(dbmp, infop, + R_ADDR(dbmp->reginfo, bhp->mf_offset), + hp, bhp, BH_FREE_FREEMEM | + BH_FREE_UNLOCKED)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + } + } + MPOOL_REGION_LOCK(env, infop); + while ((frozen_alloc = SH_TAILQ_FIRST( + &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { + SH_TAILQ_REMOVE(&c_mp->alloc_frozen, + frozen_alloc, links, __bh_frozen_a); + __env_alloc_free(infop, frozen_alloc); + } + MPOOL_REGION_UNLOCK(env, infop); + + return (ret); +} diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c index dc7015a7..cc59af9c 100644 --- a/src/mp/mp_register.c +++ b/src/mp/mp_register.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c index 97719554..932a1baa 100644 --- a/src/mp/mp_resize.c +++ b/src/mp/mp_resize.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -126,12 +126,13 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) MPOOLFILE *mfp; REGINFO *new_infop, *old_infop; u_int32_t bucket, high_mask, new_region, old_region; - int ret; + int expanding, ret; env = dbmp->env; mp = dbmp->reginfo[0].primary; new_bhp = NULL; ret = 0; + expanding = (mp->nbuckets > new_nbuckets) ? 0 : 1; MP_MASK(new_nbuckets, high_mask); @@ -150,36 +151,42 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) /* * Before merging, we need to check that there are no old buffers left * in the target hash bucket after a previous split. + * Only free the buffers if we are expanding into new buckets. If + * we are contracting, the buffers in the original (old) bucket should + * not be freed. */ free_old: - MUTEX_LOCK(env, new_hp->mtx_hash); - SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { - MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + if (expanding != 0) { + MUTEX_LOCK(env, new_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { + MP_BUCKET( + bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + + if (bucket != new_bucket) { + /* + * There is no way that an old buffer can be + * locked after a split, since everyone will + * look for it in the new hash bucket. + */ + DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) && + atomic_read(&bhp->ref) == 0); + atomic_inc(env, &bhp->ref); + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_bhfree(dbmp, new_infop, + mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { + MUTEX_UNLOCK(env, new_hp->mtx_hash); + return (ret); + } - if (bucket != new_bucket) { - /* - * There is no way that an old buffer can be locked - * after a split, since everyone will look for it in - * the new hash bucket. 
- */ - DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) && - atomic_read(&bhp->ref) == 0); - atomic_inc(env, &bhp->ref); - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - if ((ret = __memp_bhfree(dbmp, new_infop, - mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { - MUTEX_UNLOCK(env, new_hp->mtx_hash); - return (ret); + /* + * The free has modified the list of buffers and + * dropped the mutex. We need to start again. + */ + goto free_old; } - - /* - * The free has modified the list of buffers and - * dropped the mutex. We need to start again. - */ - goto free_old; } + MUTEX_UNLOCK(env, new_hp->mtx_hash); } - MUTEX_UNLOCK(env, new_hp->mtx_hash); /* * Before we begin, make sure that all of the buffers we care about are @@ -305,7 +312,9 @@ err: atomic_dec(env, &bhp->ref); next_bhp, alloc_bhp, vc, __bh); } - DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash); + /* The mutexes must be different, unless they aren't in use. */ + DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash || + new_hp->mtx_hash == MUTEX_INVALID); MUTEX_LOCK(env, new_hp->mtx_hash); SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq); if (F_ISSET(new_bhp, BH_DIRTY)) @@ -362,16 +371,15 @@ __memp_add_region(dbmp) MPOOL *mp; REGINFO *infop; int ret; - roff_t cache_size, reg_size; + roff_t reg_size; u_int i; u_int32_t *regids; env = dbmp->env; mp = dbmp->reginfo[0].primary; - cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes; /* All cache regions are the same size. 
*/ - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; ret = 0; infop = &dbmp->reginfo[mp->nreg]; @@ -384,9 +392,6 @@ __memp_add_region(dbmp) if ((ret = __memp_init(env, dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0) return (ret); - cache_size += reg_size; - mp->gbytes = (u_int32_t)(cache_size / GIGABYTE); - mp->bytes = (u_int32_t)(cache_size % GIGABYTE); regids = R_ADDR(dbmp->reginfo, mp->regids); regids[mp->nreg++] = infop->id; @@ -425,16 +430,13 @@ __memp_remove_region(dbmp) { DB_MPOOL_HASH *hp; ENV *env; - MPOOL *mp; + MPOOL *mp, *c_mp; REGINFO *infop; int ret; - roff_t cache_size, reg_size; u_int i; env = dbmp->env; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; - cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes; ret = 0; if (mp->nreg == 1) { @@ -448,21 +450,36 @@ __memp_remove_region(dbmp) return (ret); /* Detach from the region then destroy it. */ - infop = &dbmp->reginfo[mp->nreg]; + infop = &dbmp->reginfo[mp->nreg - 1]; + c_mp = infop->primary; + hp = R_ADDR(infop, c_mp->htab); + /* + * For private environment, we need to free everything, and + * for non-private environment, we need to refresh the mutexes + * so that they can be in a ready state for later resize. 
+ */ if (F_ISSET(env, ENV_PRIVATE)) { - hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab); - for (i = 0; i < env->dbenv->mp_mtxcount; i++) - if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0) + if ((ret = __memp_region_bhfree(infop)) != 0) + return (ret); + if (MUTEX_ON(env)) { + DB_ASSERT(env, + env->dbenv->mp_mtxcount == mp->htab_mutexes); + for (i = 0; i < mp->htab_mutexes; i++) + if ((ret = __mutex_free(env, + &hp[i].mtx_hash)) != 0) + return (ret); + } + __env_alloc_free(infop, hp); + } else if (MUTEX_ON(env)) { + DB_ASSERT(env, env->dbenv->mp_mtxcount == mp->htab_mutexes); + for (i = 0; i < mp->htab_mutexes; i++) + if ((ret = __mutex_refresh(env, hp[i].mtx_hash)) != 0) return (ret); } ret = __env_region_detach(env, infop, 1); - if (ret == 0) { + if (ret == 0) mp->nreg--; - cache_size -= reg_size; - mp->gbytes = (u_int32_t)(cache_size / GIGABYTE); - mp->bytes = (u_int32_t)(cache_size % GIGABYTE); - } return (ret); } @@ -511,6 +528,9 @@ __memp_map_regions(dbmp) } /* + * __memp_resize -- + * Change the overall cache size by adding or removing cache regions. 
+ * * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t)); */ int @@ -526,7 +546,7 @@ __memp_resize(dbmp, gbytes, bytes) env = dbmp->env; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; total_size = (roff_t)gbytes * GIGABYTE + bytes; ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size); @@ -546,6 +566,9 @@ __memp_resize(dbmp, gbytes, bytes) __memp_add_region(dbmp) : __memp_remove_region(dbmp))) != 0) break; + total_size = reg_size * (roff_t)mp->nreg; + mp->gbytes = (u_int32_t)(total_size / GIGABYTE); + mp->bytes = (u_int32_t)(total_size % GIGABYTE); MUTEX_UNLOCK(env, mp->mtx_resize); return (ret); @@ -567,13 +590,13 @@ __memp_get_cache_max(dbenv, max_gbytesp, max_bytesp) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->get_cache_max", DB_INIT_MPOOL); if (MPOOL_ON(env)) { /* Cannot be set after open, no lock required to read. */ dbmp = env->mp_handle; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; max_size = mp->max_nreg * reg_size; *max_gbytesp = (u_int32_t)(max_size / GIGABYTE); *max_bytesp = (u_int32_t)(max_size % GIGABYTE); diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c index 246b44d7..81ea35c1 100644 --- a/src/mp/mp_stat.c +++ b/src/mp/mp_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -133,7 +133,14 @@ __memp_stat(env, gspp, fspp, flags) sp->st_ro_evict += c_mp->stat.st_ro_evict; sp->st_rw_evict += c_mp->stat.st_rw_evict; sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_mvcc_reused += c_mp->stat.st_mvcc_reused; sp->st_pages += c_mp->pages; + /* Undocumented field used by tests only. 
*/ + sp->st_oddfsize_detect += + c_mp->stat.st_oddfsize_detect; + /* Undocumented field used by tests only. */ + sp->st_oddfsize_resolve += + c_mp->stat.st_oddfsize_resolve; /* * st_page_dirty calculated by __memp_stat_hash * st_page_clean calculated here @@ -195,7 +202,12 @@ __memp_stat(env, gspp, fspp, flags) /* Count the MPOOLFILE structures. */ i = 0; - len = 0; + /* + * Allow space for the first __memp_get_files() to align the + * structure array to uintmax_t, DB_MPOOL_STAT's most + * restrictive field. [#23150] + */ + len = sizeof(uintmax_t); if ((ret = __memp_walk_files(env, mp, __memp_count_files, &len, &i, flags)) != 0) return (ret); @@ -252,6 +264,11 @@ __memp_file_stats(env, mfp, argp, countp, flags) return (0); } +/* + * __memp_count_files -- + * This __memp_walk_files() iterator counts the number of files as well as + * the space needed for their statistics, including file names. + */ static int __memp_count_files(env, mfp, argp, countp, flags) ENV *env; @@ -277,13 +294,25 @@ __memp_count_files(env, mfp, argp, countp, flags) /* * __memp_get_files -- - * get file specific statistics + * get another file's specific statistics * - * Build each individual entry. We assume that an array of pointers are - * aligned correctly to be followed by an array of structures, which should - * be safe (in this particular case, the first element of the structure - * is a pointer, so we're doubly safe). The array is followed by space - * for the text file names. + * Add a file statistics entry to the current list. The chunk of memory + * starts with an array of DB_MPOOL_FSTAT pointers, a null pointer to mark + * the last one, then an aligned array of DB_MPOOL_FSTAT structures, then + * characters space for the file names. 
+ * +-----------------------------------------------+ + * | count * DB_MPOOL_FSTAT pointers | + * +-----------------------------------------------+ + * | null pointer | + * +-----------------------------------------------+ + * | [space for aligning DB_MPOOL_FSTAT array] | + * +-----------------------------------------------+ + * | count * DB_MPOOL_FSTAT structs | + * +-----------------------------------------------+ + * | first file name | second file name | third... | + * +-----------------------------------------------+ + * | file name | ... | + * +-----------------------------------------------+ */ static int __memp_get_files(env, mfp, argp, countp, flags) @@ -305,11 +334,21 @@ __memp_get_files(env, mfp, argp, countp, flags) tfsp = *(DB_MPOOL_FSTAT ***)argp; if (*tfsp == NULL) { - /* Add 1 to count because we need to skip over the NULL. */ - tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1); - tname = (char *)(tstruct + *countp); + /* + * Add 1 to count to skip over the NULL end marker. + * Align it further for DB_MPOOL_STAT's most restrictive field + * because uintmax_t might require stricter alignment than + * pointers; e.g., IP32 LL64 SPARC. [#23150] + */ + tstruct = (DB_MPOOL_FSTAT *)&tfsp[*countp + 1]; + tstruct = ALIGNP_INC(tstruct, sizeof(uintmax_t)); + tname = (char *)&tstruct[*countp]; *tfsp = tstruct; } else { + /* + * This stat struct follows the previous one; the file name + * follows the previous entry's filename. 
+ */ tstruct = *tfsp + 1; tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1; *++tfsp = tstruct; @@ -486,6 +525,8 @@ __memp_print_stats(env, flags) (u_long)gsp->st_mvcc_thawed); __db_dl(env, "The number of frozen buffers freed", (u_long)gsp->st_mvcc_freed); + __db_dl(env, "The number of outdated intermediate versions reused", + (u_long)gsp->st_mvcc_reused); __db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc); __db_dl(env, "The number of hash buckets examined during allocations", @@ -744,11 +785,18 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags) vbhp != NULL; vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) { __memp_print_bh(env, dbmp, - " next:\t", vbhp, fmap); + " prev:\t", vbhp, fmap); } } MUTEX_UNLOCK(env, hp->mtx_hash); } +#ifdef DIAGNOSTIC + SH_TAILQ_FOREACH(bhp, &c_mp->free_frozen, hq, __bh) { + __db_msg(env, "free frozen %lu pgno %lu mtx_buf %lu", + (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)bhp->pgno, (u_long)bhp->mtx_buf); + } +#endif return (0); } @@ -775,6 +823,7 @@ __memp_print_bh(env, dbmp, prefix, bhp, fmap) { BH_FROZEN, "frozen" }, { BH_TRASH, "trash" }, { BH_THAWED, "thawed" }, + { BH_UNREACHABLE, "unreachable" }, { 0, NULL } }; DB_MSGBUF mb; diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c index fa06b1d4..82d5c8de 100644 --- a/src/mp/mp_sync.c +++ b/src/mp/mp_sync.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -95,9 +95,11 @@ __memp_discard_all_mpfs (env, mp) while ((mfp = SH_TAILQ_FIRST( &hp->hash_bucket, __mpoolfile)) != NULL) { MUTEX_LOCK(env, mfp->mutex); - if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 && - ret == 0) - ret = t_ret; + if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } } MUTEX_UNLOCK(env, hp->mtx_hash); } @@ -837,6 +839,7 @@ __memp_mf_sync(dbmp, mfp, locked) MPOOLFILE *mfp; int locked; { + APPNAME appname; DB_FH *fhp; DB_MPOOL_HASH *hp; ENV *env; @@ -846,6 +849,7 @@ __memp_mf_sync(dbmp, mfp, locked) COMPQUIET(hp, NULL); env = dbmp->env; + appname = DB_APP_DATA; /* * We need to be holding the hash lock: we're using the path name @@ -859,13 +863,20 @@ __memp_mf_sync(dbmp, mfp, locked) MUTEX_LOCK(env, hp->mtx_hash); } - if ((ret = __db_appname(env, DB_APP_DATA, +mpsync: if ((ret = __db_appname(env, appname, R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) { if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) { ret = __os_fsync(env, fhp); if ((t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) ret = t_ret; + } else { + /* We may be syncing the blob meta db. */ + if (appname != DB_APP_BLOB) { + __os_free(env, rpath); + appname = DB_APP_BLOB; + goto mpsync; + } } __os_free(env, rpath); } diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c index fba528b3..ff8cb875 100644 --- a/src/mp/mp_trickle.c +++ b/src/mp/mp_trickle.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/mutex/mut_alloc.c b/src/mutex/mut_alloc.c index 5df3de53..06b3541e 100644 --- a/src/mutex/mut_alloc.c +++ b/src/mutex/mut_alloc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,9 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/log.h" + +static char *__mutex_action_print __P((MUTEX_ACTION)); /* * __mutex_alloc -- @@ -35,8 +38,7 @@ __mutex_alloc(env, alloc_id, flags, indxp) if (alloc_id != MTX_APPLICATION && alloc_id != MTX_MUTEX_TEST && (F_ISSET(env->dbenv, DB_ENV_NOLOCKING) || (!F_ISSET(env, ENV_THREAD) && - (LF_ISSET(DB_MUTEX_PROCESS_ONLY) || - F_ISSET(env, ENV_PRIVATE))))) + (LF_ISSET(DB_MUTEX_PROCESS_ONLY) || F_ISSET(env, ENV_PRIVATE))))) return (0); /* Private environments never share mutexes. */ @@ -109,13 +111,17 @@ nomem: __db_errx(env, DB_STR("2034", mtxregion->stat.st_mutex_max) cnt = mtxregion->stat.st_mutex_max - mtxregion->stat.st_mutex_cnt; + + /* Set i to the first newly created db_mutex_t. */ if (F_ISSET(env, ENV_PRIVATE)) { F_SET(&mtxmgr->reginfo, REGION_TRACKED); while (__env_alloc(&mtxmgr->reginfo, (cnt * mtxregion->mutex_size) + - mtxregion->stat.st_mutex_align, &i) != 0) - if ((cnt >> 1) == 0) + mtxregion->stat.st_mutex_align, &i) != 0) { + cnt >>= 1; + if (cnt == 0) break; + } F_CLR(&mtxmgr->reginfo, REGION_TRACKED); i = (db_mutex_t)ALIGNP_INC(i, mtxregion->stat.st_mutex_align); @@ -130,21 +136,16 @@ nomem: __db_errx(env, DB_STR("2034", } if (cnt == 0) goto nomem; - mutexp = MUTEXP_SET(env, i); + mtxregion->stat.st_mutex_free = cnt; mtxregion->mutex_next = i; mtxregion->stat.st_mutex_cnt += cnt; - while (--cnt > 0) { - mutexp->flags = 0; - if (F_ISSET(env, ENV_PRIVATE)) - mutexp->mutex_next_link = - (uintptr_t)(mutexp + 1); - else - mutexp->mutex_next_link = ++i; - mutexp++; - } - mutexp->flags = 0; - mutexp->mutex_next_link = MUTEX_INVALID; + + /* + * Now link the rest of the newly allocated db_mutex_t's into + * the free list. 
+ */ + MUTEX_BULK_INIT(env, mtxregion, i, cnt); } *indxp = mtxregion->mutex_next; @@ -158,14 +159,12 @@ nomem: __db_errx(env, DB_STR("2034", if (mtxregion->stat.st_mutex_inuse > mtxregion->stat.st_mutex_inuse_max) mtxregion->stat.st_mutex_inuse_max = mtxregion->stat.st_mutex_inuse; - if (locksys) - MUTEX_SYSTEM_UNLOCK(env); /* Initialize the mutex. */ memset(mutexp, 0, sizeof(*mutexp)); F_SET(mutexp, DB_MUTEX_ALLOCATED | - LF_ISSET(DB_MUTEX_LOGICAL_LOCK | - DB_MUTEX_PROCESS_ONLY | DB_MUTEX_SHARED)); + LF_ISSET(DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_PROCESS_ONLY | + DB_MUTEX_SELF_BLOCK | DB_MUTEX_SHARED)); /* * If the mutex is associated with a single process, set the process @@ -182,7 +181,9 @@ nomem: __db_errx(env, DB_STR("2034", #endif if ((ret = __mutex_init(env, *indxp, flags)) != 0) - (void)__mutex_free_int(env, locksys, indxp); + (void)__mutex_free_int(env, 0, indxp); + if (locksys) + MUTEX_SYSTEM_UNLOCK(env); return (ret); } @@ -262,6 +263,44 @@ __mutex_free_int(env, locksys, indxp) return (ret); } +#ifdef HAVE_FAILCHK_BROADCAST +/* + * __mutex_died -- + * Announce that a mutex request couldn't been granted because the last + * thread to own it was killed by failchk. Sets ENV_DEAD_MUTEX in the + * possibly shared environment so that mutex unlock calls don't complain. 
+ * + * + * PUBLIC: int __mutex_died __P((ENV *, db_mutex_t)); + */ +int +__mutex_died(env, mutex) + ENV *env; + db_mutex_t mutex; +{ + DB_ENV *dbenv; + DB_EVENT_MUTEX_DIED_INFO info; + DB_MUTEX *mutexp; + char tidstr[DB_THREADID_STRLEN], failmsg[DB_FAILURE_SYMPTOM_SIZE]; + + dbenv = env->dbenv; + + mutexp = MUTEXP_SET(env, mutex); + info.mutex = mutex; + info.pid = mutexp->pid; + info.tid = mutexp->tid; + (void)dbenv->thread_id_string(dbenv, mutexp->pid, mutexp->tid, tidstr); + (void)__mutex_describe(env, mutex, info.desc); + (void)snprintf(failmsg, sizeof(failmsg), DB_STR_A("2073", + "Mutex died: %s owned %s", "%s %s"), tidstr, info.desc); + __db_errx(env, "%s", failmsg); + /* If this is the first crashed process, save its description. */ + (void)__env_failure_remember(env, failmsg); + DB_EVENT(env, DB_EVENT_MUTEX_DIED, &info); + return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY))); +} +#endif + /* * __mutex_refresh -- * Reinitialize a mutex, if we are not sure of its state. @@ -289,3 +328,154 @@ __mutex_refresh(env, mutex) } return (ret); } + +/* + * __mutex_record_lock -- + * Record that this thread is about to lock a latch. + * The last parameter is updated to point to this mutex's entry in the + * per-thread mutex state array, so that it can update it if it gets the + * mutex, or free it if the mutex is not acquired (e.g. it times out). + * Mutexes which can be unlocked by other threads are not placed in this + * list, because it would be too costly for that other thread to to find + * the right slot to clear. The caller has already checked that thread + * tracking is enabled. 
+ * + * PUBLIC: int __mutex_record_lock + * PUBLIC: __P((ENV *, db_mutex_t, MUTEX_ACTION, MUTEX_STATE **)); + */ +int +__mutex_record_lock(env, mutex, action, retp) + ENV *env; + db_mutex_t mutex; + MUTEX_ACTION action; + MUTEX_STATE **retp; +{ + DB_MUTEX *mutexp; + DB_THREAD_INFO *ip; + int i, ret; + + *retp = NULL; + mutexp = MUTEXP_SET(env, mutex); + if (!F_ISSET(mutexp, DB_MUTEX_SHARED)) + return (0); + if ((ret = __env_set_state(env, &ip, THREAD_VERIFY)) != 0) + return (ret); + for (i = 0; i != MUTEX_STATE_MAX; i++) { + if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED) { + ip->dbth_latches[i].mutex = mutex; + ip->dbth_latches[i].action = action; +#ifdef DIAGNOSTIC + __os_gettime(env, &ip->dbth_latches[i].when, 0); +#endif + *retp = &ip->dbth_latches[i]; + return (0); + } + } + __db_errx(env, DB_STR_A("2074", + "No space available in latch table for %lu", "%lu"), (u_long)mutex); + (void)__mutex_record_print(env, ip); + return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY))); +} + +/* + * __mutex_record_unlock -- + * Verify that this thread owns the mutex it is about to unlock. 
+ * + * PUBLIC: int __mutex_record_unlock __P((ENV *, db_mutex_t)); + */ +int +__mutex_record_unlock(env, mutex) + ENV *env; + db_mutex_t mutex; +{ + DB_MUTEX *mutexp; + DB_THREAD_INFO *ip; + int i, ret; + + if (env->thr_hashtab == NULL) + return (0); + mutexp = MUTEXP_SET(env, mutex); + if (!F_ISSET(mutexp, DB_MUTEX_SHARED)) + return (0); + if ((ret = __env_set_state(env, &ip, THREAD_VERIFY)) != 0) + return (ret); + for (i = 0; i != MUTEX_STATE_MAX; i++) { + if (ip->dbth_latches[i].mutex == mutex && + ip->dbth_latches[i].action != MUTEX_ACTION_UNLOCKED) { + ip->dbth_latches[i].action = MUTEX_ACTION_UNLOCKED; + return (0); + } + } + (void)__mutex_record_print(env, ip); + if (ip->dbth_state == THREAD_FAILCHK) { + DB_DEBUG_MSG(env, "mutex_record_unlock %lu by failchk thread", + (u_long)mutex); + return (0); + } + __db_errx(env, DB_STR_A("2075", + "Latch %lu was not held", "%lu"), (u_long)mutex); + return (__env_panic(env, USR_ERR(env, DB_RUNRECOVERY))); +} + +static char * +__mutex_action_print(action) + MUTEX_ACTION action; +{ + switch (action) { + case MUTEX_ACTION_UNLOCKED: + return ("unlocked"); + case MUTEX_ACTION_INTEND_SHARE: + return ("waiting to share"); + case MUTEX_ACTION_SHARED: + return ("sharing"); + default: + return ("unknown"); + } + /* NOTREACHED */ +} + +/* + * __mutex_record_print -- + * Display the thread's mutex state via __db_msg(), including any + * information which would be relevant for db_stat or diagnostic messages. 
+ * + * PUBLIC: int __mutex_record_print __P((ENV *, DB_THREAD_INFO *)); + */ +int +__mutex_record_print(env, ip) + ENV *env; + DB_THREAD_INFO *ip; +{ + DB_MSGBUF mb, *mbp; + db_mutex_t mutex; + int i; + char desc[DB_MUTEX_DESCRIBE_STRLEN]; + char time_buf[CTIME_BUFLEN]; + + DB_MSGBUF_INIT(&mb); + mbp = &mb; + for (i = 0; i != MUTEX_STATE_MAX; i++) { + if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED) + continue; + if ((mutex = ip->dbth_latches[i].mutex) == + MUTEX_INVALID) + continue; + time_buf[4] = '\0'; +#ifdef DIAGNOSTIC + if (timespecisset(&ip->dbth_latches[i].when)) + (void)__db_ctimespec(&ip->dbth_latches[i].when, + time_buf); + else +#endif + time_buf[0] = '\0'; + + __db_msgadd(env, mbp, "%s %s %s ", + __mutex_describe(env, mutex, desc), + __mutex_action_print(ip->dbth_latches[i].action), time_buf); +#ifdef HAVE_STATISTICS + __mutex_print_debug_stats(env, mbp, mutex, 0); +#endif + DB_MSGBUF_FLUSH(env, mbp); + } + return (0); +} diff --git a/src/mutex/mut_failchk.c b/src/mutex/mut_failchk.c index 1425389f..28e5d992 100644 --- a/src/mutex/mut_failchk.c +++ b/src/mutex/mut_failchk.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,68 +9,193 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/lock.h" + +static int __mutex_failchk_single __P((ENV *, db_mutex_t, DB_THREAD_INFO *)); /* - * __mut_failchk -- - * Check for mutexes held by dead processes. + * __mutex_failchk -- + * Clean up after dead processes which left behind allocated per-process or + * locked mutexes. 
* - * PUBLIC: int __mut_failchk __P((ENV *)); + * PUBLIC: int __mutex_failchk __P((ENV *)); */ int -__mut_failchk(env) +__mutex_failchk(env) ENV *env; { - DB_ENV *dbenv; - DB_MUTEX *mutexp; + DB_HASHTAB *htab; DB_MUTEXMGR *mtxmgr; DB_MUTEXREGION *mtxregion; - db_mutex_t i; - int ret; - char buf[DB_THREADID_STRLEN]; - db_threadid_t unused; + DB_THREAD_INFO *ip; + db_mutex_t mutex; + unsigned i; + int count; - if (F_ISSET(env, ENV_PRIVATE)) + if (F_ISSET(env, ENV_PRIVATE) || (htab = env->thr_hashtab) == NULL) return (0); - DB_THREADID_INIT(unused); - - dbenv = env->dbenv; mtxmgr = env->mutex_handle; mtxregion = mtxmgr->reginfo.primary; - ret = 0; + count = 0; + DB_ASSERT(env, F_ISSET(env->dbenv, DB_ENV_FAILCHK)); MUTEX_SYSTEM_LOCK(env); - for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i, ++mutexp) { - mutexp = MUTEXP_SET(env, i); - /* - * We're looking for per-process mutexes where the process - * has died. - */ - if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED) || - !F_ISSET(mutexp, DB_MUTEX_PROCESS_ONLY)) + /* + * The first loop does each thread's read-locked latches; the second + * does all locked mutexes. + */ + for (i = 0; i < env->thr_nbucket; i++) + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) { + if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE) + continue; + count += __mutex_failchk_thread(env, ip); + } + + for (mutex = 1; mutex <= mtxregion->stat.st_mutex_cnt; mutex++) + if (__mutex_failchk_single(env, mutex, NULL) != 0) + count++; + + MUTEX_SYSTEM_UNLOCK(env); + + if (count == 0) + return (count); + else + return (USR_ERR(env, DB_RUNRECOVERY)); +} + +/* + * __mutex_failchk_thread - + * Do the per-latch failchk work on each of this thread's shared latches. 
+ * + * PUBLIC: int __mutex_failchk_thread __P((ENV *, DB_THREAD_INFO *)); + */ +int +__mutex_failchk_thread(env, ip) + ENV *env; + DB_THREAD_INFO *ip; +{ + db_mutex_t mutex; + int count, i; + + count = 0; + for (i = 0; i != MUTEX_STATE_MAX; i++) { + if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED || + (mutex = ip->dbth_latches[i].mutex) == MUTEX_INVALID) continue; + if (__mutex_failchk_single(env, mutex, ip) != 0) + count++; + } + return (count); +} +/* + * __mutex_failchk_single -- + * Determine whether this mutex is locked or shared by a potentially + * dead thread. If so, and the call to is_alive() finds that it is dead, + * clean up if possible (a process-only mutex); else wake up any waiters. + */ +static int +__mutex_failchk_single(env, mutex, ip) + ENV *env; + db_mutex_t mutex; + DB_THREAD_INFO *ip; +{ + DB_ENV *dbenv; + DB_MUTEX *mutexp; + db_threadid_t threadid; + pid_t pid; + int already_dead, ret; + u_int32_t flags; + char id_str[DB_THREADID_STRLEN]; + char mtx_desc[DB_MUTEX_DESCRIBE_STRLEN]; + + dbenv = env->dbenv; + mutexp = MUTEXP_SET(env, mutex); + flags = mutexp->flags; + /* + * Filter out mutexes which couldn't possibly be "interesting", in order + * to reduce the number of possibly costly is_alive() calls. Check that: + * it is allocated + * it is either locked, or a shared latch, or a per-process mutex + * it is neither a logical lock, nor self-block, nor already dead. + * Self-blocking mutexes are skipped because it is expected that they + * can still be locked even though they are really 'idle', as with + * the wait case in __lock_get_internal(), LOG->free_commits, and + * __rep_waiter->mtx_repwait; or they were allocated by the application. 
+ */ + if (!LF_ISSET(DB_MUTEX_ALLOCATED)) + return (0); + if (!LF_ISSET( + DB_MUTEX_SHARED | DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY)) + return (0); + if (LF_ISSET( + DB_MUTEX_SELF_BLOCK | DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_OWNER_DEAD)) + return (0); + + already_dead = ip != NULL && timespecisset(&ip->dbth_failtime); + /* + * The pid in the mutex is valid for locked or per-process mutexes. + * The tid is correct only when exclusively locked. It's okay to look at + * the tid of an unlocked per-process mutex, we won't use it in the + * is_alive() call. + */ + if (LF_ISSET(DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY)) { + pid = mutexp->pid; + threadid = mutexp->tid; + } else { + DB_ASSERT(env, LF_ISSET(DB_MUTEX_SHARED)); /* - * The thread that allocated the mutex may have exited, but - * we cannot reclaim the mutex if the process is still alive. + * If we get here with no thread, then this is a shared latch + * which is neither locked nor shared, we're done with it. */ - if (dbenv->is_alive( - dbenv, mutexp->pid, unused, DB_MUTEX_PROCESS_ONLY)) - continue; + if (ip == NULL) + return (0); + pid = ip->dbth_pid; + threadid = ip->dbth_tid; + } + if (!already_dead && dbenv->is_alive(dbenv, + pid, threadid, LF_ISSET(DB_MUTEX_PROCESS_ONLY))) + return (0); + + /* The thread is dead; the mutex type indicates the kind of cleanup. */ + (void)dbenv->thread_id_string(dbenv, pid, threadid, id_str); + (void)__mutex_describe(env, mutex, mtx_desc); - __db_msg(env, DB_STR_A("2017", - "Freeing mutex for process: %s", "%s"), - dbenv->thread_id_string(dbenv, mutexp->pid, unused, buf)); + if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) { + if (already_dead) + return (0); + + __db_errx(env, DB_STR_A("2065", + "Freeing %s for process: %s", "%s %s"), mtx_desc, id_str); + + /* Clear the mutex id if it is in a cached locker. */ + if ((ret = __lock_local_locker_invalidate(env, mutex)) != 0) + return (ret); /* Unlock and free the mutex. 
*/ - if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) - MUTEX_UNLOCK(env, i); + if (LF_ISSET(DB_MUTEX_LOCKED)) + MUTEX_UNLOCK(env, mutex); - if ((ret = __mutex_free_int(env, 0, &i)) != 0) - break; + return (__mutex_free_int(env, 0, &mutex)); } - MUTEX_SYSTEM_UNLOCK(env); - - return (ret); +#ifdef HAVE_FAILCHK_BROADCAST + else if (LF_ISSET(DB_MUTEX_LOCKED)) { + __db_errx(env, DB_STR_A("2066", + "Marking %s as owned by dead thread %s", "%lu %s"), + mtx_desc, id_str); + F_SET(mutexp, DB_MUTEX_OWNER_DEAD); + } else if (LF_ISSET(DB_MUTEX_SHARED)) { + __db_errx(env, DB_STR_A("2067", + "Marking %s as shared by dead thread %s", "%lu %s"), + mtx_desc, id_str); + F_SET(mutexp, DB_MUTEX_OWNER_DEAD); + } else { + __db_errx(env, DB_STR_A("2068", + "mutex_failchk: unknown state for %s with dead thread %s", "%lu %s"), + mtx_desc, id_str); + } +#endif + return (USR_ERR(env, DB_RUNRECOVERY)); } diff --git a/src/mutex/mut_fcntl.c b/src/mutex/mut_fcntl.c deleted file mode 100644 index 0694aa59..00000000 --- a/src/mutex/mut_fcntl.c +++ /dev/null @@ -1,248 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. - * - * $Id$ - */ - -#include "db_config.h" - -#include "db_int.h" - -static inline int __db_fcntl_mutex_lock_int - __P((ENV *, db_mutex_t, db_timeout_t, int)); - -/* - * __db_fcntl_mutex_init -- - * Initialize a fcntl mutex. 
- * - * PUBLIC: int __db_fcntl_mutex_init __P((ENV *, db_mutex_t, u_int32_t)); - */ -int -__db_fcntl_mutex_init(env, mutex, flags) - ENV *env; - db_mutex_t mutex; - u_int32_t flags; -{ - COMPQUIET(env, NULL); - COMPQUIET(mutex, MUTEX_INVALID); - COMPQUIET(flags, 0); - - return (0); -} - -/* - * __db_fcntl_mutex_lock_int - * Internal function to lock a mutex, blocking only when requested - */ -inline int -__db_fcntl_mutex_lock_int(env, mutex, timeout, wait) - ENV *env; - db_mutex_t mutex; - db_timeout_t timeout; - int wait; -{ - DB_ENV *dbenv; - DB_MUTEX *mutexp; - DB_THREAD_INFO *ip; - struct flock k_lock; - int locked, ms, ret; - db_timespec now, timespec; - db_timeout_t time_left; - - dbenv = env->dbenv; - - if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING)) - return (0); - - mutexp = MUTEXP_SET(env, mutex); - - CHECK_MTX_THREAD(env, mutexp); - -#ifdef HAVE_STATISTICS - if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) - ++mutexp->mutex_set_wait; - else - ++mutexp->mutex_set_nowait; -#endif - - /* Initialize the lock. */ - k_lock.l_whence = SEEK_SET; - k_lock.l_start = mutex; - k_lock.l_len = 1; - - if (timeout != 0) { - timespecclear(&timespec); - __clock_set_expires(env, &timespec, timeout); - } - - /* - * Only check the thread state once, by initializing the thread - * control block pointer to null. If it is not the failchk - * thread, then ip will have a valid value subsequent times - * in the loop. - */ - ip = NULL; - - for (locked = 0;;) { - /* - * Wait for the lock to become available; wait 1ms initially, - * up to 1 second. 
- */ - for (ms = 1; F_ISSET(mutexp, DB_MUTEX_LOCKED);) { - if (F_ISSET(dbenv, DB_ENV_FAILCHK) && - ip == NULL && dbenv->is_alive(dbenv, - mutexp->pid, mutexp->tid, 0) == 0) { - ret = __env_set_state(env, &ip, THREAD_VERIFY); - if (ret != 0 || - ip->dbth_state == THREAD_FAILCHK) - return (DB_RUNRECOVERY); - } - if (!wait) - return (DB_LOCK_NOTGRANTED); - if (timeout != 0) { - timespecclear(&now); - if (__clock_expired(env, &now, &timespec)) - return (DB_TIMEOUT); - DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0); - time_left = timeout - time_left; - if (ms * US_PER_MS > time_left) - ms = time_left / US_PER_MS; - } - __os_yield(NULL, 0, ms * US_PER_MS); - if ((ms <<= 1) > MS_PER_SEC) - ms = MS_PER_SEC; - } - - /* Acquire an exclusive kernel lock on the byte. */ - k_lock.l_type = F_WRLCK; - if (fcntl(env->lockfhp->fd, F_SETLKW, &k_lock)) - goto err; - - /* If the resource is still available, it's ours. */ - if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) { - locked = 1; - - F_SET(mutexp, DB_MUTEX_LOCKED); - dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid); - } - - /* Release the kernel lock. */ - k_lock.l_type = F_UNLCK; - if (fcntl(env->lockfhp->fd, F_SETLK, &k_lock)) - goto err; - - /* - * If we got the resource lock we're done. - * - * !!! - * We can't check to see if the lock is ours, because we may - * be trying to block ourselves in the lock manager, and so - * the holder of the lock that's preventing us from getting - * the lock may be us! (Seriously.) - */ - if (locked) - break; - } - -#ifdef DIAGNOSTIC - /* - * We want to switch threads as often as possible. Yield every time - * we get a mutex to ensure contention. - */ - if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) - __os_yield(env, 0, 0); -#endif - return (0); - -err: ret = __os_get_syserr(); - __db_syserr(env, ret, DB_STR("2019", "fcntl lock failed")); - return (__env_panic(env, __os_posix_err(ret))); -} - -/* - * __db_fcntl_mutex_lock - * Lock a mutex, blocking if necessary. 
- * - * PUBLIC: int __db_fcntl_mutex_lock __P((ENV *, db_mutex_t, db_timeout_t)); - */ -int -__db_fcntl_mutex_lock(env, mutex, timeout) - ENV *env; - db_mutex_t mutex; - db_timeout_t timeout; -{ - return (__db_fcntl_mutex_lock_int(env, mutex, timeout, 1)); -} - -/* - * __db_fcntl_mutex_trylock - * Try to lock a mutex, without blocking when it is busy. - * - * PUBLIC: int __db_fcntl_mutex_trylock __P((ENV *, db_mutex_t)); - */ -int -__db_fcntl_mutex_trylock(env, mutex) - ENV *env; - db_mutex_t mutex; -{ - return (__db_fcntl_mutex_lock_int(env, mutex, 0, 0)); -} - -/* - * __db_fcntl_mutex_unlock -- - * Release a mutex. - * - * PUBLIC: int __db_fcntl_mutex_unlock __P((ENV *, db_mutex_t)); - */ -int -__db_fcntl_mutex_unlock(env, mutex) - ENV *env; - db_mutex_t mutex; -{ - DB_ENV *dbenv; - DB_MUTEX *mutexp; - - dbenv = env->dbenv; - - if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING)) - return (0); - - mutexp = MUTEXP_SET(env, mutex); - -#ifdef DIAGNOSTIC - if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) { - __db_errx(env, DB_STR("2020", - "fcntl unlock failed: lock already unlocked")); - return (__env_panic(env, EACCES)); - } -#endif - - /* - * Release the resource. We don't have to acquire any locks because - * processes trying to acquire the lock are waiting for the flag to - * go to 0. Once that happens the waiters will serialize acquiring - * an exclusive kernel lock before locking the mutex. - */ - F_CLR(mutexp, DB_MUTEX_LOCKED); - - return (0); -} - -/* - * __db_fcntl_mutex_destroy -- - * Destroy a mutex. - * - * PUBLIC: int __db_fcntl_mutex_destroy __P((ENV *, db_mutex_t)); - */ -int -__db_fcntl_mutex_destroy(env, mutex) - ENV *env; - db_mutex_t mutex; -{ - COMPQUIET(env, NULL); - COMPQUIET(mutex, MUTEX_INVALID); - - return (0); -} diff --git a/src/mutex/mut_method.c b/src/mutex/mut_method.c index cb666082..99bafeae 100644 --- a/src/mutex/mut_method.c +++ b/src/mutex/mut_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -371,6 +371,33 @@ __mutex_set_tas_spins(dbenv, tas_spins) return (0); } +#ifdef HAVE_ERROR_HISTORY +/* + * __mutex_diags -- + * + * PUBLIC: #ifdef HAVE_ERROR_HISTORY + * PUBLIC: int __mutex_diags __P((ENV *, db_mutex_t, int)); + * PUBLIC: #endif + */ +int +__mutex_diags(env, mutex, error) + ENV *env; + db_mutex_t mutex; + int error; +{ + DB_MSGBUF *mb; + + if ((mb = __db_deferred_get()) != NULL) { + (void)__db_remember_context(env, mb, error); + __db_msgadd(env, mb, "Mutex %u ", (unsigned int)mutex); +#ifdef HAVE_STATISTICS + __mutex_print_debug_stats(env, mb, mutex, 0); +#endif + } + return (error); +} +#endif + #if !defined(HAVE_ATOMIC_SUPPORT) && defined(HAVE_MUTEX_SUPPORT) /* * Provide atomic operations for platforms which have mutexes yet do not have diff --git a/src/mutex/mut_pthread.c b/src/mutex/mut_pthread.c index 1ec4fb9c..4b2cfb81 100644 --- a/src/mutex/mut_pthread.c +++ b/src/mutex/mut_pthread.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -64,6 +64,19 @@ } while (0) /* + * !!! + * Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME -- out + * of sheer paranoia, check both ETIME and ETIMEDOUT. We believe this happens + * when the application uses SIGALRM for some purpose, e.g., the C library sleep + * call, and Solaris delivers the signal to the wrong LWP. + */ +#ifdef ETIME +#define ETIME_TO_ETIMEDOUT(ret) ((ret) == ETIME ? 
ETIMEDOUT : (ret)) +#else +#define ETIME_TO_ETIMEDOUT(ret) (ret) +#endif + +/* * __db_pthread_mutex_init -- * Initialize a pthread mutex: either a native one or * just the mutex for block/wakeup of a hybrid test-and-set mutex @@ -104,18 +117,18 @@ __db_pthread_mutex_init(env, mutex, flags) pthread_rwlockattr_t rwlockattr, *rwlockattrp = NULL; #ifndef HAVE_MUTEX_THREAD_ONLY if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) { - RET_SET((pthread_rwlockattr_init(&rwlockattr)), ret); + RET_SET(pthread_rwlockattr_init(&rwlockattr), ret); if (ret != 0) goto err; - RET_SET((pthread_rwlockattr_setpshared( - &rwlockattr, PTHREAD_PROCESS_SHARED)), ret); + RET_SET(pthread_rwlockattr_setpshared( + &rwlockattr, PTHREAD_PROCESS_SHARED), ret); rwlockattrp = &rwlockattr; } #endif if (ret == 0) - RET_SET((pthread_rwlock_init(&mutexp->u.rwlock, - rwlockattrp)), ret); + RET_SET(pthread_rwlock_init(&mutexp->u.rwlock, + rwlockattrp), ret); if (rwlockattrp != NULL) (void)pthread_rwlockattr_destroy(rwlockattrp); @@ -127,18 +140,18 @@ __db_pthread_mutex_init(env, mutex, flags) #endif #ifndef HAVE_MUTEX_THREAD_ONLY if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) { - RET_SET((pthread_mutexattr_init(&mutexattr)), ret); + RET_SET(pthread_mutexattr_init(&mutexattr), ret); if (ret != 0) goto err; - RET_SET((pthread_mutexattr_setpshared( - &mutexattr, PTHREAD_PROCESS_SHARED)), ret); + RET_SET(pthread_mutexattr_setpshared( + &mutexattr, PTHREAD_PROCESS_SHARED), ret); mutexattrp = &mutexattr; } #endif if (ret == 0) RET_SET( - (pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp)), ret); + pthread_mutex_init(&mutexp->u.m.mutex, mutexattrp), ret); if (mutexattrp != NULL) (void)pthread_mutexattr_destroy(mutexattrp); @@ -147,19 +160,19 @@ __db_pthread_mutex_init(env, mutex, flags) if (LF_ISSET(DB_MUTEX_SELF_BLOCK)) { #ifndef HAVE_MUTEX_THREAD_ONLY if (!LF_ISSET(DB_MUTEX_PROCESS_ONLY)) { - RET_SET((pthread_condattr_init(&condattr)), ret); + RET_SET(pthread_condattr_init(&condattr), ret); if (ret != 0) goto err; condattrp = 
&condattr; - RET_SET((pthread_condattr_setpshared( - &condattr, PTHREAD_PROCESS_SHARED)), ret); + RET_SET(pthread_condattr_setpshared( + &condattr, PTHREAD_PROCESS_SHARED), ret); } #endif if (ret == 0) - RET_SET((pthread_cond_init( - &mutexp->u.m.cond, condattrp)), ret); + RET_SET(pthread_cond_init( + &mutexp->u.m.cond, condattrp), ret); F_SET(mutexp, DB_MUTEX_SELF_BLOCK); if (condattrp != NULL) @@ -239,6 +252,9 @@ __db_pthread_mutex_prep(env, mutex, mutexp, exclusive) { DB_ENV *dbenv; DB_THREAD_INFO *ip; +#ifdef HAVE_FAILCHK_BROADCAST + db_timespec timespec; +#endif int ret; dbenv = env->dbenv; @@ -266,13 +282,32 @@ __db_pthread_mutex_prep(env, mutex, mutexp, exclusive) * hadn't gone down the 'if * DB_ENV_FAILCHK' path to start with. */ - RET_SET_PTHREAD_LOCK(mutexp, ret); - break; + goto lockit; } + __os_yield(env, 0, 10); } } - } else - RET_SET_PTHREAD_LOCK(mutexp, ret); + } else { +lockit: +#ifdef HAVE_FAILCHK_BROADCAST + if (dbenv->mutex_failchk_timeout != 0) { + timespecclear(&timespec); + __clock_set_expires(env, + &timespec, dbenv->mutex_failchk_timeout); + do { + RET_SET_PTHREAD_TIMEDLOCK(mutexp, + (struct timespec *)&timespec, ret); + ret = ETIME_TO_ETIMEDOUT(ret); + if (ret == ETIMEDOUT && + F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) + ret = USR_ERR(env, + __mutex_died(env, mutex)); + } while (ret == ETIMEDOUT); + } else +#endif + RET_SET_PTHREAD_LOCK(mutexp, ret); + } PERFMON4(env, mutex, resume, mutex, exclusive, mutexp->alloc_id, mutexp); @@ -302,49 +337,75 @@ __db_pthread_mutex_condwait(env, mutex, mutexp, timespec) DB_MUTEX *mutexp; db_timespec *timespec; { + DB_ENV *dbenv; int ret; - -#ifdef MUTEX_DIAG - printf("condwait %ld %x wait busy %x count %d\n", - mutex, pthread_self(), MUTEXP_BUSY_FIELD(mutexp), mutexp->wait); +#ifdef HAVE_FAILCHK_BROADCAST + db_timespec failchk_timespec; #endif + + dbenv = env->dbenv; PERFMON4(env, mutex, suspend, mutex, TRUE, mutexp->alloc_id, mutexp); +#ifdef 
HAVE_FAILCHK_BROADCAST + /* + * If the 
failchk timeout would be soon than the timeout passed in, + * argument, use the failchk timeout. The caller handles "short" waits. + */ + if (dbenv->mutex_failchk_timeout != 0) { + timespecclear(&failchk_timespec); + __clock_set_expires(env, + &failchk_timespec, dbenv->mutex_failchk_timeout); + if (timespec == NULL || + timespeccmp(timespec, &failchk_timespec, >)) + timespec = &failchk_timespec; + } +#endif + if (timespec != NULL) { - RET_SET((pthread_cond_timedwait(&mutexp->u.m.cond, - &mutexp->u.m.mutex, (struct timespec *) timespec)), ret); + RET_SET(pthread_cond_timedwait(&mutexp->u.m.cond, + &mutexp->u.m.mutex, (struct timespec *) timespec), ret); + ret = ETIME_TO_ETIMEDOUT(ret); +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + ret = USR_ERR(env, __mutex_died(env, mutex)); + goto err; + } +#endif if (ret == ETIMEDOUT) { ret = DB_TIMEOUT; - goto ret; + goto err; } } else - RET_SET((pthread_cond_wait(&mutexp->u.m.cond, - &mutexp->u.m.mutex)), ret); -#ifdef MUTEX_DIAG - printf("condwait %ld %x wait returns %d busy %x\n", - mutex, pthread_self(), ret, MUTEXP_BUSY_FIELD(mutexp)); + RET_SET(pthread_cond_wait(&mutexp->u.m.cond, + &mutexp->u.m.mutex), ret); +#ifdef HAVE_FAILCHK_BROADCAST + if (ret == 0 && F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + ret = USR_ERR(env, __mutex_died(env, mutex)); + goto err; + } #endif /* * !!! * Solaris bug workaround: pthread_cond_wait() sometimes returns ETIME - * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We + * -- out of sheer paranoia, check both ETIME and ETIMEDOUT. We * believe this happens when the application uses SIGALRM for some * purpose, e.g., the C library sleep call, and Solaris delivers the - * signal to the wrong LWP. + * signal to the wrong LWP. 
*/ if (ret != 0) { - if (ret == ETIMEDOUT || -#ifdef ETIME - ret == ETIME || -#endif + if ((ret = ETIME_TO_ETIMEDOUT(ret)) == ETIMEDOUT || ret == EINTR) ret = 0; - else + else { /* Failure, caller shouldn't condwait again. */ (void)pthread_mutex_unlock(&mutexp->u.m.mutex); + (void)MUTEX_ERR(env, mutex, ret); + } } -ret: +err: PERFMON4(env, mutex, resume, mutex, TRUE, mutexp->alloc_id, mutexp); COMPQUIET(mutex, 0); @@ -356,7 +417,10 @@ ret: /* * __db_pthread_mutex_lock * Lock on a mutex, blocking if necessary. - * Timeouts are supported only for self-blocking mutexes. + * Timeouts are supported only for self-blocking mutexes. When both a + * given timeout and a dbenv-wide failchk timeout are specified, the + * given timeout takes precedence -- a process failure might not be noticed + * for a little while. * * Self-blocking shared latches are not supported. * @@ -372,6 +436,7 @@ __db_pthread_mutex_lock(env, mutex, timeout) { DB_ENV *dbenv; DB_MUTEX *mutexp; + db_timeout_t checktimeout; db_timespec timespec; int ret, t_ret; @@ -385,7 +450,6 @@ __db_pthread_mutex_lock(env, mutex, timeout) CHECK_MTX_THREAD(env, mutexp); -#if defined(HAVE_STATISTICS) /* * We want to know which mutexes are contentious, but don't want to * do an interlocked test here -- that's slower when the underlying @@ -398,6 +462,11 @@ __db_pthread_mutex_lock(env, mutex, timeout) else STAT_INC(env, mutex, set_nowait, mutexp->mutex_set_nowait, mutex); + + checktimeout = timeout; +#ifdef HAVE_FAILCHK_BROADCAST + if (checktimeout == 0 || checktimeout > dbenv->mutex_failchk_timeout) + checktimeout = dbenv->mutex_failchk_timeout; #endif /* Single-thread the next block, except during the possible condwait. */ @@ -405,14 +474,12 @@ __db_pthread_mutex_lock(env, mutex, timeout) goto err; if (F_ISSET(mutexp, DB_MUTEX_SELF_BLOCK)) { - if (timeout != 0) + if (checktimeout != 0) timespecclear(&timespec); while (MUTEXP_IS_BUSY(mutexp)) { /* Set expiration timer upon first need.
*/ - if (timeout != 0 && !timespecisset(&timespec)) { - timespecclear(&timespec); + if (checktimeout != 0 && !timespecisset(&timespec)) __clock_set_expires(env, &timespec, timeout); - } t_ret = __db_pthread_mutex_condwait(env, mutex, mutexp, timeout == 0 ? NULL : &timespec); if (t_ret != 0) { @@ -428,18 +495,20 @@ __db_pthread_mutex_lock(env, mutex, timeout) out: /* #2471: HP-UX can sporadically return EFAULT. See above */ RETRY_ON_EFAULT(pthread_mutex_unlock(&mutexp->u.m.mutex), ret); - if (ret != 0) + if (ret != 0) { + (void)MUTEX_ERR(env, mutex, ret); goto err; + } } else { #ifdef DIAGNOSTIC if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) { char buf[DB_THREADID_STRLEN]; (void)dbenv->thread_id_string(dbenv, mutexp->pid, mutexp->tid, buf); + ret = MUTEX_ERR(env, mutex, EINVAL); __db_errx(env, DB_STR_A("2022", "pthread lock failed: lock currently in use: pid/tid: %s", "%s"), buf); - ret = EINVAL; goto err; } #endif @@ -455,6 +524,13 @@ out: if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) __os_yield(env, 0, 0); #endif +#ifdef MUTEX_DIAG + if (t_ret == 0) { + __os_gettime(env, &mutexp->mutex_history.when, 0); + __os_stack_text(env, mutexp->mutex_history.stacktext, + sizeof(mutexp->mutex_history.stacktext), 12, 2); + } +#endif return (t_ret); err: @@ -479,6 +555,10 @@ __db_pthread_mutex_readlock(env, mutex) { DB_ENV *dbenv; DB_MUTEX *mutexp; + MUTEX_STATE *state; +#ifdef HAVE_FAILCHK_BROADCAST + db_timespec timespec; +#endif int ret; dbenv = env->dbenv; @@ -491,7 +571,6 @@ __db_pthread_mutex_readlock(env, mutex) CHECK_MTX_THREAD(env, mutexp); -#if defined(HAVE_STATISTICS) /* * We want to know which mutexes are contentious, but don't want to * do an interlocked test here -- that's slower when the underlying @@ -505,15 +584,52 @@ __db_pthread_mutex_readlock(env, mutex) else STAT_INC(env, mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex); -#endif + + state = NULL; + if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env, + mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0) + return (ret); PERFMON4(env,
mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp); - RET_SET((pthread_rwlock_rdlock(&mutexp->u.rwlock)), ret); + +#ifdef HAVE_FAILCHK_BROADCAST + if (dbenv->mutex_failchk_timeout != 0) { + do { + timespecclear(&timespec); + __clock_set_expires(env, + &timespec, dbenv->mutex_failchk_timeout); + RET_SET(pthread_rwlock_timedrdlock(&mutexp->u.rwlock, + (struct timespec *)&timespec), ret); + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + if (ret == 0) + RETRY_ON_EFAULT(pthread_rwlock_unlock( + &mutexp->u.rwlock), ret); + ret = USR_ERR(env, __mutex_died(env, mutex)); + goto err; + } + } while (ret == DB_TIMEOUT); + } else +#endif + RET_SET(pthread_rwlock_rdlock(&mutexp->u.rwlock), ret); + PERFMON4(env, mutex, resume, mutex, FALSE, mutexp->alloc_id, mutexp); DB_ASSERT(env, !F_ISSET(mutexp, DB_MUTEX_LOCKED)); if (ret != 0) goto err; +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + ret = USR_ERR(env, __mutex_died(env, mutex)); + goto err; + } +#endif +#ifdef MUTEX_DIAG + __os_gettime(env, &mutexp->mutex_history.when, 0); + __os_stack_text(env, mutexp->mutex_history.stacktext, + sizeof(mutexp->mutex_history.stacktext), 12, 2); +#endif #ifdef DIAGNOSTIC /* * We want to switch threads as often as possible. Yield every time @@ -524,7 +640,10 @@ __db_pthread_mutex_readlock(env, mutex) #endif return (0); -err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed")); +err: + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; + __db_err(env, ret, DB_STR("2024", "pthread readlock failed")); return (__env_panic(env, ret)); } #endif @@ -532,8 +651,10 @@ err: __db_err(env, ret, DB_STR("2024", "pthread readlock failed")); #ifdef HAVE_MUTEX_HYBRID /* * __db_hybrid_mutex_suspend - * Suspend this thread until the mutex is free enough to give the caller a - * good chance of getting the mutex in the requested exclusivity mode.
+ * Suspend this thread, usually until the mutex is free enough to give the + * caller a good chance of getting the mutex in the requested exclusivity + * mode. Return early if the timeout is reached or a dead mutex is found + * to be dead. * * The major difference between this and the old __db_pthread_mutex_lock() * is the additional 'exclusive' parameter. @@ -551,6 +672,9 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive) int exclusive; { DB_MUTEX *mutexp; +#ifdef HAVE_FAILCHECK_BROADCAST + db_timespec failchk_timespec; +#endif int ret, t_ret; t_ret = 0; @@ -571,7 +695,7 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive) * before checking the wait counter. */ mutexp->wait++; - MUTEX_MEMBAR(mutexp->wait); + (void)MUTEX_MEMBAR(mutexp->wait); while (exclusive ? MUTEXP_IS_BUSY(mutexp) : atomic_read(&mutexp->sharecount) == MUTEX_SHARE_ISEXCLUSIVE) { t_ret = __db_pthread_mutex_condwait(env, @@ -582,7 +706,7 @@ __db_hybrid_mutex_suspend(env, mutex, timespec, exclusive) ret = t_ret; goto err; } - MUTEX_MEMBAR(mutexp->flags); + (void)MUTEX_MEMBAR(mutexp->flags); } mutexp->wait--; @@ -627,8 +751,8 @@ __db_pthread_mutex_unlock(env, mutex) DB_ENV *dbenv; DB_MUTEX *mutexp; int ret; -#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID) - int waiters; +#ifndef HAVE_MUTEX_HYBRID + char description[DB_MUTEX_DESCRIBE_STRLEN]; #endif dbenv = env->dbenv; @@ -637,14 +761,13 @@ __db_pthread_mutex_unlock(env, mutex) return (0); mutexp = MUTEXP_SET(env, mutex); -#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID) - waiters = mutexp->wait; -#endif -#if !defined(HAVE_MUTEX_HYBRID) && defined(DIAGNOSTIC) +#if !defined(HAVE_MUTEX_HYBRID) if (!F_ISSET(mutexp, DB_MUTEX_LOCKED | DB_MUTEX_SHARED)) { - __db_errx(env, DB_STR("2025", - "pthread unlock failed: lock already unlocked")); + if (!PANIC_ISSET(env)) + __db_errx(env, DB_STR("2069", + "pthread unlock %s: already unlocked"), + __mutex_describe(env, mutex, description)); return (__env_panic(env, EACCES)); } #endif @@ 
-662,14 +785,19 @@ __db_pthread_mutex_unlock(env, mutex) if (F_ISSET(mutexp, DB_MUTEX_SHARED)) RET_SET( - (pthread_cond_broadcast(&mutexp->u.m.cond)), ret); + pthread_cond_broadcast(&mutexp->u.m.cond), ret); else - RET_SET((pthread_cond_signal(&mutexp->u.m.cond)), ret); + RET_SET(pthread_cond_signal(&mutexp->u.m.cond), ret); if (ret != 0) goto err; } else { #ifndef HAVE_MUTEX_HYBRID - F_CLR(mutexp, DB_MUTEX_LOCKED); + + if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) + F_CLR(mutexp, DB_MUTEX_LOCKED); + else if (env->thr_hashtab != NULL && + (ret = __mutex_record_unlock(env, mutex)) != 0) + goto err; #endif } @@ -685,12 +813,6 @@ err: if (ret != 0) { __db_err(env, ret, "pthread unlock failed"); return (__env_panic(env, ret)); } -#if defined(MUTEX_DIAG) && defined(HAVE_MUTEX_HYBRID) - if (!MUTEXP_IS_BUSY(mutexp) && mutexp->wait != 0) - printf("unlock %ld %x busy %x waiters %d/%d\n", - mutex, pthread_self(), ret, - MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait); -#endif return (ret); } @@ -739,7 +861,7 @@ __db_pthread_mutex_destroy(env, mutex) if (!failchk_thread) #endif RET_SET( - (pthread_rwlock_destroy(&mutexp->u.rwlock)), ret); + pthread_rwlock_destroy(&mutexp->u.rwlock), ret); /* For rwlocks, we're done - must not destroy rest of union */ return (ret); #endif @@ -754,15 +876,14 @@ __db_pthread_mutex_destroy(env, mutex) #ifdef HAVE_PTHREAD_COND_REINIT_OKAY if (!failchk_thread) #endif - RET_SET((pthread_cond_destroy(&mutexp->u.m.cond)), ret); + RET_SET(pthread_cond_destroy(&mutexp->u.m.cond), ret); if (ret != 0) __db_err(env, ret, DB_STR("2026", "unable to destroy cond")); } - RET_SET((pthread_mutex_destroy(&mutexp->u.m.mutex)), t_ret); + RET_SET(pthread_mutex_destroy(&mutexp->u.m.mutex), t_ret); if (t_ret != 0 && !failchk_thread) { - __db_err(env, t_ret, DB_STR("2027", - "unable to destroy mutex")); + __db_err(env, t_ret, DB_STR("2027", "unable to destroy mutex")); if (ret == 0) ret = t_ret; } diff --git a/src/mutex/mut_region.c b/src/mutex/mut_region.c index 
26ae0a03..976ff231 100644 --- a/src/mutex/mut_region.c +++ b/src/mutex/mut_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -34,7 +34,7 @@ __mutex_open(env, create_ok) DB_MUTEXMGR *mtxmgr; DB_MUTEXREGION *mtxregion; size_t size; - u_int32_t cpu_count; + u_int32_t cpu_count, tas_spins; int ret; #ifndef HAVE_ATOMIC_SUPPORT u_int i; @@ -55,8 +55,14 @@ __mutex_open(env, create_ok) dbenv->mutex_align = MUTEX_ALIGN; if (dbenv->mutex_tas_spins == 0) { cpu_count = __os_cpu_count(); - if ((ret = __mutex_set_tas_spins(dbenv, cpu_count == 1 ? - cpu_count : cpu_count * MUTEX_SPINS_PER_PROCESSOR)) != 0) + if (cpu_count == 1) + tas_spins = 1; + else { + tas_spins = cpu_count * MUTEX_SPINS_PER_PROCESSOR; + if (tas_spins > MUTEX_SPINS_DEFAULT_MAX) + tas_spins = MUTEX_SPINS_DEFAULT_MAX; + } + if ((ret = __mutex_set_tas_spins(dbenv, tas_spins)) != 0) return (ret); } @@ -118,11 +124,29 @@ __mutex_open(env, create_ok) return (0); -err: env->mutex_handle = NULL; - if (mtxmgr->reginfo.addr != NULL) - (void)__env_region_detach(env, &mtxmgr->reginfo, 0); +err: (void)__mutex_region_detach(env, mtxmgr); + return (ret); +} - __os_free(env, mtxmgr); +/* + * __mutex_region_detach -- + * + * PUBLIC: int __mutex_region_detach __P((ENV *, DB_MUTEXMGR *)); + */ +int +__mutex_region_detach(env, mtxmgr) + ENV *env; + DB_MUTEXMGR *mtxmgr; +{ + int ret; + + ret = 0; + if (mtxmgr != NULL) { + if (mtxmgr->reginfo.addr != NULL) + ret = __env_region_detach(env, &mtxmgr->reginfo, 0); + __os_free(env, mtxmgr); + env->mutex_handle = NULL; + } return (ret); } @@ -136,7 +160,6 @@ __mutex_region_init(env, mtxmgr) DB_MUTEXMGR *mtxmgr; { DB_ENV *dbenv; - DB_MUTEX *mutexp; DB_MUTEXREGION *mtxregion; db_mutex_t mutex; int ret; @@ -144,8 +167,6 @@ __mutex_region_init(env, mtxmgr) dbenv = env->dbenv; - 
COMPQUIET(mutexp, NULL); - if ((ret = __env_alloc(&mtxmgr->reginfo, sizeof(DB_MUTEXREGION), &mtxmgr->reginfo.primary)) != 0) { __db_errx(env, DB_STR("2013", @@ -205,26 +226,11 @@ __mutex_region_init(env, mtxmgr) * in each link. */ env->mutex_handle = mtxmgr; - if (F_ISSET(env, ENV_PRIVATE)) { - mutexp = (DB_MUTEX *)mutex_array; - mutexp++; - mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align); - mtxregion->mutex_next = (db_mutex_t)mutexp; - } else { - mtxregion->mutex_next = 1; - mutexp = MUTEXP_SET(env, 1); - } - for (mutex = 1; mutex < mtxregion->stat.st_mutex_cnt; ++mutex) { - mutexp->flags = 0; - if (F_ISSET(env, ENV_PRIVATE)) - mutexp->mutex_next_link = (db_mutex_t)(mutexp + 1); - else - mutexp->mutex_next_link = mutex + 1; - mutexp++; - mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align); - } - mutexp->flags = 0; - mutexp->mutex_next_link = MUTEX_INVALID; + mtxregion->mutex_next = (F_ISSET(env, ENV_PRIVATE) ? + ((uintptr_t)mutex_array + mtxregion->mutex_size) : 1); + MUTEX_BULK_INIT(env, + mtxregion, mtxregion->mutex_next, mtxregion->stat.st_mutex_cnt); + mtxregion->stat.st_mutex_free = mtxregion->stat.st_mutex_cnt; mtxregion->stat.st_mutex_inuse = mtxregion->stat.st_mutex_inuse_max = 0; if ((ret = __mutex_alloc(env, MTX_MUTEX_REGION, 0, &mutex)) != 0) diff --git a/src/mutex/mut_stat.c b/src/mutex/mut_stat.c index b64207fa..af622c7d 100644 --- a/src/mutex/mut_stat.c +++ b/src/mutex/mut_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -19,6 +19,17 @@ static int __mutex_print_stats __P((ENV *, u_int32_t)); static void __mutex_print_summary __P((ENV *)); static int __mutex_stat __P((ENV *, DB_MUTEX_STAT **, u_int32_t)); +static const FN MutexFlagNames[] = { + { DB_MUTEX_ALLOCATED, "alloc" }, + { DB_MUTEX_LOCKED, "locked" }, + { DB_MUTEX_LOGICAL_LOCK, "logical" }, + { DB_MUTEX_OWNER_DEAD, "ower-dead" }, + { DB_MUTEX_PROCESS_ONLY, "process-private" }, + { DB_MUTEX_SELF_BLOCK, "self-block" }, + { DB_MUTEX_SHARED, "shared" }, + { 0, NULL } +}; + /* * __mutex_stat_pp -- * ENV->mutex_stat pre/post processing. @@ -170,11 +181,12 @@ __mutex_print_summary(env) size = 0; if (F_ISSET(env, ENV_PRIVATE)) { - mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1; + mutexp = (DB_MUTEX *)((uintptr_t)mtxmgr->mutex_array + + mtxregion->mutex_size); chunk = NULL; size = __env_elem_size(env, ROFF_TO_P(mtxregion->mutex_off_alloc)); - size -= sizeof(*mutexp); + size -= mtxregion->mutex_size; } else mutexp = MUTEXP_SET(env, 1); for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) { @@ -185,13 +197,15 @@ __mutex_print_summary(env) else counts[mutexp->alloc_id]++; - mutexp++; + mutexp = (DB_MUTEX *)((uintptr_t)mutexp + + mtxregion->mutex_size); if (F_ISSET(env, ENV_PRIVATE) && (size -= sizeof(*mutexp)) < sizeof(*mutexp)) { mutexp = __env_get_chunk(&mtxmgr->reginfo, &chunk, &size); + mutexp = ALIGNP_INC(mutexp, + mtxregion->stat.st_mutex_align); } - mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align); } __db_msg(env, "Mutex counts"); __db_msg(env, "%d\tUnallocated", counts[0]); @@ -252,14 +266,6 @@ __mutex_print_all(env, flags) ENV *env; u_int32_t flags; { - static const FN fn[] = { - { DB_MUTEX_ALLOCATED, "alloc" }, - { DB_MUTEX_LOCKED, "locked" }, - { DB_MUTEX_LOGICAL_LOCK, "logical" }, - { DB_MUTEX_PROCESS_ONLY, "process-private" }, - { DB_MUTEX_SELF_BLOCK, "self-block" }, - { 0, NULL } - }; DB_MSGBUF mb, *mbp; DB_MUTEX *mutexp; DB_MUTEXMGR *mtxmgr; @@ -294,37 +300,32 @@ __mutex_print_all(env, flags) 
__db_msg(env, "mutex\twait/nowait, pct wait, holder, flags"); size = 0; if (F_ISSET(env, ENV_PRIVATE)) { - mutexp = (DB_MUTEX *)mtxmgr->mutex_array + 1; + mutexp = (DB_MUTEX *)((uintptr_t)mtxmgr->mutex_array + + mtxregion->mutex_size); chunk = NULL; size = __env_elem_size(env, ROFF_TO_P(mtxregion->mutex_off_alloc)); - size -= sizeof(*mutexp); + size -= mtxregion->mutex_size; } else mutexp = MUTEXP_SET(env, 1); for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i) { if (F_ISSET(mutexp, DB_MUTEX_ALLOCATED)) { __db_msgadd(env, mbp, "%5lu\t", (u_long)i); - __mutex_print_debug_stats(env, mbp, F_ISSET(env, ENV_PRIVATE) ? (db_mutex_t)mutexp : i, flags); - - if (mutexp->alloc_id != 0) - __db_msgadd(env, mbp, - ", %s", __mutex_print_id(mutexp->alloc_id)); - - __db_prflags(env, mbp, mutexp->flags, fn, " (", ")"); - DB_MSGBUF_FLUSH(env, mbp); } - mutexp++; + mutexp = (DB_MUTEX *)((uintptr_t)mutexp + + mtxregion->mutex_size); if (F_ISSET(env, ENV_PRIVATE) && - (size -= sizeof(*mutexp)) < sizeof(*mutexp)) { + (size -= mtxregion->mutex_size) < mtxregion->mutex_size) { mutexp = __env_get_chunk(&mtxmgr->reginfo, &chunk, &size); + mutexp = ALIGNP_INC(mutexp, + mtxregion->stat.st_mutex_align); } - mutexp = ALIGNP_INC(mutexp, mtxregion->stat.st_mutex_align); } return (0); @@ -332,8 +333,7 @@ __mutex_print_all(env, flags) /* * __mutex_print_debug_single -- - * Print mutex internal debugging statistics for a single mutex on a - * single output line. + * Print mutex internal debugging statistics for a single mutex. * * PUBLIC: void __mutex_print_debug_single * PUBLIC: __P((ENV *, const char *, db_mutex_t, u_int32_t)); @@ -359,8 +359,9 @@ __mutex_print_debug_single(env, tag, mutex, flags) /* * __mutex_print_debug_stats -- - * Print mutex internal debugging statistics, that is, the statistics - * in the [] square brackets. + * Print the mutex internal debugging statistics in square bracket,s on a + * followed by the allocation id and flags, on single line. 
When MUTEX_DIAG + * is on and the mutex is held, append the owner's stack trace. * * PUBLIC: void __mutex_print_debug_stats * PUBLIC: __P((ENV *, DB_MSGBUF *, db_mutex_t, u_int32_t)); @@ -380,6 +381,9 @@ __mutex_print_debug_stats(env, mbp, mutex, flags) !defined(HAVE_MUTEX_PTHREADS)) int sharecount; #endif +#ifdef MUTEX_DIAG + char timestr[CTIME_BUFLEN]; +#endif if (mutex == MUTEX_INVALID) { __db_msgadd(env, mbp, "[!Set]"); @@ -448,6 +452,22 @@ __mutex_print_debug_stats(env, mbp, mutex, flags) mutexp->hybrid_wait, mutexp->hybrid_wakeup); #endif + if (mutexp->alloc_id != 0) + __db_msgadd(env, + mbp, ", %s", __mutex_print_id(mutexp->alloc_id)); + + __db_prflags(env, mbp, mutexp->flags, MutexFlagNames, " (", ")"); +#ifdef MUTEX_DIAG + if (mutexp->alloc_id != MTX_LOGICAL_LOCK && + timespecisset(&mutexp->mutex_history.when)) { + __db_ctimespec(&mutexp->mutex_history.when, timestr); + __db_msgadd(env, mbp, "\nLocked %s", timestr); + if (mutexp->mutex_history.stacktext[0] != '\0') + __db_msgadd(env, mbp, "\n%.*s", + (int)sizeof(mutexp->mutex_history.stacktext) - 1, + mutexp->mutex_history.stacktext); + } +#endif if (LF_ISSET(DB_STAT_CLEAR)) __mutex_clear(env, mutex); } @@ -495,7 +515,8 @@ __mutex_print_id(alloc_id) case MTX_TXN_COMMIT: return ("txn commit"); case MTX_TXN_MVCC: return ("txn mvcc"); case MTX_TXN_REGION: return ("txn region"); - default: return ("unknown mutex type"); + case 0: return ("invalid 0 mutex type"); + default: return ("unknown non-zero mutex type"); /* NOTREACHED */ } } @@ -577,3 +598,39 @@ __mutex_stat_print_pp(dbenv, flags) return (__db_stat_not_built(dbenv->env)); } #endif + +/* + * __mutex_describe + * Fill in a buffer with the mutex #, alloc_id, and any other + * characteristics which are likely to be useful for diagnostics. The + * destination buffer must hold at least DB_MUTEX_DESCRIBE_STRLEN bytes. 
+ * + * PUBLIC: char *__mutex_describe __P((ENV *, db_mutex_t, char *)); + */ +char * +__mutex_describe(env, mutex, dest) + ENV *env; + db_mutex_t mutex; + char *dest; +{ + DB_MUTEX *mutexp; + DB_MSGBUF mb, *mbp; + const char *type; + + DB_MSGBUF_INIT(&mb); + mbp = &mb; + mutexp = MUTEXP_SET(env, mutex); + type = F_ISSET(mutexp, DB_MUTEX_SHARED) ? "latch" : "mutex"; +#ifdef HAVE_STATISTICS + __db_msgadd(env, mbp, "%s %s id %ld ", + __mutex_print_id(mutexp->alloc_id), type, (long)mutex); + __db_prflags(env, mbp, mutexp->flags, MutexFlagNames, " (", ")"); +#else + __db_msgadd(env, mbp, "%s flags %x id %ld ", + type, mutexp->flags, (long)mutex); +#endif + (void)snprintf(dest, DB_MUTEX_DESCRIBE_STRLEN - 1, + "%.*s", (int)(mbp->cur - mbp->buf), mbp->buf); + dest[DB_MUTEX_DESCRIBE_STRLEN - 1] = '\0'; + return (dest); +} diff --git a/src/mutex/mut_stub.c b/src/mutex/mut_stub.c index 61ecc80c..0ece9a9d 100644 --- a/src/mutex/mut_stub.c +++ b/src/mutex/mut_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -157,6 +157,16 @@ __mutex_print_debug_stats(env, mbp, mutex, flags) } int +__mutex_refresh(env, mutex) + ENV *env; + db_mutex_t mutex; +{ + COMPQUIET(env, NULL); + COMPQUIET(mutex, MUTEX_INVALID); + return (0); +} + +int __mutex_set_align(dbenv, align) DB_ENV *dbenv; u_int32_t align; diff --git a/src/mutex/mut_tas.c b/src/mutex/mut_tas.c index 0899d237..c7cc3ea5 100644 --- a/src/mutex/mut_tas.c +++ b/src/mutex/mut_tas.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -52,8 +52,7 @@ __db_tas_mutex_init(env, mutex, flags) #endif if (MUTEX_INIT(&mutexp->tas)) { ret = __os_get_syserr(); - __db_syserr(env, ret, DB_STR("2029", - "TAS: mutex initialize")); + __db_syserr(env, ret, DB_STR("2029", "TAS: mutex initialize")); return (__os_posix_err(ret)); } #ifdef HAVE_MUTEX_HYBRID @@ -66,7 +65,9 @@ __db_tas_mutex_init(env, mutex, flags) /* * __db_tas_mutex_lock_int - * Internal function to lock a mutex, or just try to lock it without waiting + * Internal function to lock a mutex, or just try to lock it without + * waiting. MUTEX_WAIT() passes in a timeout to allow an early exit + * returning DB_TIMEOUT. */ inline static int __db_tas_mutex_lock_int(env, mutex, timeout, nowait) @@ -80,13 +81,15 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait) DB_MUTEXMGR *mtxmgr; DB_MUTEXREGION *mtxregion; DB_THREAD_INFO *ip; - db_timespec now, timespec; + db_timespec now, timeout_timespec; u_int32_t nspins; + u_long micros; int ret; -#ifdef HAVE_MUTEX_HYBRID - const u_long micros = 0; -#else - u_long micros, max_micros; +#ifdef DIAGNOSTIC + char buf[DB_THREADID_STRLEN]; +#endif +#ifndef HAVE_MUTEX_HYBRID + u_long max_micros; db_timeout_t time_left; #endif @@ -95,21 +98,23 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait) if (!MUTEX_ON(env) || F_ISSET(dbenv, DB_ENV_NOLOCKING)) return (0); + PANIC_CHECK(env); + mtxmgr = env->mutex_handle; mtxregion = mtxmgr->reginfo.primary; mutexp = MUTEXP_SET(env, mutex); CHECK_MTX_THREAD(env, mutexp); -#ifdef HAVE_STATISTICS if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) STAT_INC(env, mutex, set_wait, mutexp->mutex_set_wait, mutex); else STAT_INC(env, mutex, set_nowait, mutexp->mutex_set_nowait, mutex); -#endif -#ifndef HAVE_MUTEX_HYBRID +#ifdef HAVE_MUTEX_HYBRID + micros = 0; +#else /* * Wait 1ms initially, up to 10ms for mutexes backing logical database * locks, and up to 25 ms for mutual exclusion data structure mutexes. 
@@ -119,16 +124,15 @@ __db_tas_mutex_lock_int(env, mutex, timeout, nowait) max_micros = F_ISSET(mutexp, DB_MUTEX_LOGICAL_LOCK) ? 10000 : 25000; #endif - /* Clear the ending timespec so it'll be initialed upon first need. */ + /* Clear the ending timespec so it'll be initialized upon first need. */ if (timeout != 0) - timespecclear(&timespec); + timespecclear(&timeout_timespec); /* - * Only check the thread state once, by initializing the thread - * control block pointer to null. If it is not the failchk - * thread, then ip will have a valid value subsequent times - * in the loop. - */ + * Only check the thread state once, by initializing the thread + * control block pointer to null. If it is not the failchk thread, + * then ip will be valid during the subsequent times in the loop. + */ ip = NULL; loop: /* Attempt to acquire the resource for N spins. */ @@ -151,16 +155,45 @@ loop: /* Attempt to acquire the resource for N spins. */ if (F_ISSET(dbenv, DB_ENV_FAILCHK) && ip == NULL && dbenv->is_alive(dbenv, mutexp->pid, mutexp->tid, 0) == 0) { + /* + * The process owing the mutex is "dead" now, but it may + * have already released the mutex. We need to check again + * by going back to the top of the loop if the mutex is + * still held by the "dead" process. We yield 10 us to + * increase the likelyhood of mutexp fields being up-to-date. + * Set spin so we spin one more time because there isno need + * to spin more if the dead process owns the mutex. + */ + if (nspins > 1) { + nspins = 2; + __os_yield(env, 0, 10); + continue; + } ret = __env_set_state(env, &ip, THREAD_VERIFY); if (ret != 0 || - ip->dbth_state == THREAD_FAILCHK) - return (DB_RUNRECOVERY); + ip->dbth_state == THREAD_FAILCHK) { + /* + * Either we could not get the thread + * state or we did and found that this + * is the failchk thread. Return a panic + * code in either case, but if the + * failchk thread don't give more + * notice of the already-existing panic.
+ */ + if (ret == 0) + return (USR_ERR(env, + DB_RUNRECOVERY)); + else + return (__env_panic(env, + USR_ERR(env, ret))); + } } if (nowait) - return (DB_LOCK_NOTGRANTED); + return (USR_ERR(env, DB_LOCK_NOTGRANTED)); /* * Some systems (notably those with newer Intel CPUs) * need a small pause here. [#6975] + * XXX Is there some better post-Pentum 4? */ MUTEX_PAUSE continue; @@ -189,9 +222,14 @@ loop: /* Attempt to acquire the resource for N spins. */ * the DB mutex unlock function. */ #endif +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) { + MUTEX_UNSET(&mutexp->tas); + return (__mutex_died(env, mutex)); + } +#endif #ifdef DIAGNOSTIC if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) { - char buf[DB_THREADID_STRLEN]; __db_errx(env, DB_STR_A("2030", "TAS lock failed: lock %ld currently in use: ID: %s", "%ld %s"), (long)mutex, @@ -202,6 +240,12 @@ loop: /* Attempt to acquire the resource for N spins. */ #endif F_SET(mutexp, DB_MUTEX_LOCKED); dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid); +#if defined(MUTEX_DIAG) + __os_gettime(env, &mutexp->mutex_history.when, 0); + /* Why 3? Skip __os_stack_text, __db_tas_mutex_lock{_int,} */ + __os_stack_text(env, mutexp->mutex_history.stacktext, + sizeof(mutexp->mutex_history.stacktext), 12, 3); +#endif #ifdef DIAGNOSTIC /* @@ -215,20 +259,20 @@ loop: /* Attempt to acquire the resource for N spins. */ } /* - * We need to wait for the lock to become available. - * Possibly setup timeouts if this is the first wait, or - * check expiration times for the second and subsequent waits. + * We need to wait for the lock to become available. Setup timeouts if + * this is the first wait, or the failchk timeout is smaller than the + * wait timeout. Check expiration times for subsequent waits. */ if (timeout != 0) { /* Set the expiration time if this is the first sleep . 
*/ - if (!timespecisset(&timespec)) - __clock_set_expires(env, &timespec, timeout); + if (!timespecisset(&timeout_timespec)) + __clock_set_expires(env, &timeout_timespec, timeout); else { timespecclear(&now); - if (__clock_expired(env, &now, &timespec)) - return (DB_TIMEOUT); + if (__clock_expired(env, &now, &timeout_timespec)) + return (USR_ERR(env, DB_TIMEOUT)); #ifndef HAVE_MUTEX_HYBRID - timespecsub(&now, &timespec); + timespecsub(&now, &timeout_timespec); DB_TIMESPEC_TO_TIMEOUT(time_left, &now, 0); time_left = timeout - time_left; if (micros > time_left) @@ -253,13 +297,21 @@ loop: /* Attempt to acquire the resource for N spins. */ goto loop; /* Wait until the mutex can be obtained exclusively or it times out. */ if ((ret = __db_hybrid_mutex_suspend(env, - mutex, timeout == 0 ? NULL : &timespec, TRUE)) != 0) + mutex, timeout == 0 ? NULL : &timeout_timespec, TRUE)) != 0) { + DB_DEBUG_MSG(env, + "mutex_lock %ld suspend returned %d", (u_long)mutex, ret); return (ret); + } #else if ((micros <<= 1) > max_micros) micros = max_micros; #endif +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + dbenv->mutex_failchk_timeout != 0) + return (__mutex_died(env, mutex)); +#endif /* * We're spinning. The environment might be hung, and somebody else * has already recovered it. The first thing recovery does is panic @@ -291,7 +343,7 @@ __db_tas_mutex_lock(env, mutex, timeout) * Try to exclusively lock a mutex without ever blocking - ever! * * Returns 0 on success, - * DB_LOCK_NOTGRANTED on timeout + * DB_LOCK_NOTGRANTED if it is busy. * Possibly DB_RUNRECOVERY if DB_ENV_FAILCHK or panic.
* * This will work for DB_MUTEX_SHARED, though it always tries @@ -324,9 +376,9 @@ __db_tas_mutex_readlock_int(env, mutex, nowait) DB_MUTEXMGR *mtxmgr; DB_MUTEXREGION *mtxregion; DB_THREAD_INFO *ip; - int lock; + MUTEX_STATE *state; + int lock, ret; u_int32_t nspins; - int ret; #ifndef HAVE_MUTEX_HYBRID u_long micros, max_micros; #endif @@ -342,14 +394,17 @@ __db_tas_mutex_readlock_int(env, mutex, nowait) CHECK_MTX_THREAD(env, mutexp); DB_ASSERT(env, F_ISSET(mutexp, DB_MUTEX_SHARED)); -#ifdef HAVE_STATISTICS if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) STAT_INC(env, mutex, set_rd_wait, mutexp->mutex_set_rd_wait, mutex); else STAT_INC(env, mutex, set_rd_nowait, mutexp->mutex_set_rd_nowait, mutex); -#endif + + state = NULL; + if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env, + mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0) + return (ret); #ifndef HAVE_MUTEX_HYBRID /* @@ -375,25 +430,52 @@ loop: /* Attempt to acquire the resource for N spins. */ MUTEX_PAUSE continue; } +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + (void)atomic_compare_exchange(env, + &mutexp->sharecount, lock, lock - 1); + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; + return (__mutex_died(env, mutex)); + } +#endif MEMBAR_ENTER(); +#ifdef MUTEX_DIAG + __os_gettime(env, &mutexp->mutex_history.when, 0); + __os_stack_text(env, mutexp->mutex_history.stacktext, + sizeof(mutexp->mutex_history.stacktext), 12, 3); +#endif /* For shared latches the threadid is the last requestor's id. */ dbenv->thread_id(dbenv, &mutexp->pid, &mutexp->tid); + if (state != NULL) + state->action = MUTEX_ACTION_SHARED; return (0); } - /* - * Waiting for the latched must be avoided when it could allow a - * 'failchk'ing thread to hang. - */ + /* Waiting for the latch must be avoided if it could hang up failchk. 
*/ if (F_ISSET(dbenv, DB_ENV_FAILCHK) && dbenv->is_alive(dbenv, mutexp->pid, mutexp->tid, 0) == 0) { ret = __env_set_state(env, &ip, THREAD_VERIFY); - if (ret != 0 || ip->dbth_state == THREAD_FAILCHK) - return (DB_RUNRECOVERY); + if (ret != 0 || ip->dbth_state == THREAD_FAILCHK) { + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; + if (ret == 0) + return (USR_ERR(env, DB_RUNRECOVERY)); + else + return (__env_panic(env, USR_ERR(env, ret))); + } } +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) { + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; + return (__mutex_died(env, mutex)); + } +#endif /* * It is possible to spin out when the latch is just shared, due to @@ -403,6 +485,8 @@ loop: /* Attempt to acquire the resource for N spins. */ if (nowait) { if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE) goto loop; + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; return (DB_LOCK_NOTGRANTED); } @@ -419,8 +503,11 @@ loop: /* Attempt to acquire the resource for N spins. */ if (atomic_read(&mutexp->sharecount) != MUTEX_SHARE_ISEXCLUSIVE) goto loop; /* Wait until the mutex is no longer exclusively locked. 
*/ - if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0) + if ((ret = __db_hybrid_mutex_suspend(env, mutex, NULL, FALSE)) != 0) { + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; return (ret); + } #else PERFMON4(env, mutex, suspend, mutex, FALSE, mutexp->alloc_id, mutexp); __os_yield(env, 0, micros); @@ -486,17 +573,13 @@ __db_tas_mutex_tryreadlock(env, mutex) */ int __db_tas_mutex_unlock(env, mutex) - ENV *env; + ENV *env; db_mutex_t mutex; { DB_ENV *dbenv; DB_MUTEX *mutexp; -#ifdef HAVE_MUTEX_HYBRID int ret; -#ifdef MUTEX_DIAG - int waiters; -#endif -#endif + char description[DB_MUTEX_DESCRIBE_STRLEN]; #ifdef HAVE_SHARED_LATCHES int sharecount; #endif @@ -506,14 +589,14 @@ __db_tas_mutex_unlock(env, mutex) return (0); mutexp = MUTEXP_SET(env, mutex); -#if defined(HAVE_MUTEX_HYBRID) && defined(MUTEX_DIAG) - waiters = mutexp->wait; -#endif #if defined(DIAGNOSTIC) #if defined(HAVE_SHARED_LATCHES) if (F_ISSET(mutexp, DB_MUTEX_SHARED)) { if (atomic_read(&mutexp->sharecount) == 0) { + if (PANIC_ISSET(env)) + return (__env_panic(env, + USR_ERR(env, DB_RUNRECOVERY))); __db_errx(env, DB_STR_A("2031", "shared unlock %ld already unlocked", "%ld"), (long)mutex); @@ -522,16 +605,39 @@ __db_tas_mutex_unlock(env, mutex) } else #endif if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) { + if (PANIC_ISSET(env)) + return (__env_panic(env, + USR_ERR(env, DB_RUNRECOVERY))); __db_errx(env, DB_STR_A("2032", "unlock %ld already unlocked", "%ld"), (long)mutex); return (__env_panic(env, EACCES)); } #endif +#ifdef MUTEX_DIAG + timespecclear(&mutexp->mutex_history.when); +#endif #ifdef HAVE_SHARED_LATCHES if (F_ISSET(mutexp, DB_MUTEX_SHARED)) { sharecount = atomic_read(&mutexp->sharecount); - /*MUTEX_MEMBAR(mutexp->sharecount);*/ /* XXX why? */ + /* + * Many code paths contain sequence of the form + * MUTEX_LOCK(); ret = function(); MUTEX_UNLOCK(); + * If function() sees or causes a panic while it had temporarily + * unlocked the mutex it won't be locked anymore. 
Don't confuse + * the error by generating spurious follow-on messages. + */ + if (sharecount == 0) { +was_not_locked: + if (!PANIC_ISSET(env)) { + __db_errx(env, DB_STR_A("2070", + "Shared unlock %s: already unlocked", "%s"), + __mutex_describe(env, mutex, description)); + return (__env_panic(env, + USR_ERR(env, DB_RUNRECOVERY))); + } + return (__env_panic(env, EACCES)); + } if (sharecount == MUTEX_SHARE_ISEXCLUSIVE) { F_CLR(mutexp, DB_MUTEX_LOCKED); /* Flush flag update before zeroing count */ @@ -542,12 +648,17 @@ __db_tas_mutex_unlock(env, mutex) MEMBAR_EXIT(); sharecount = atomic_dec(env, &mutexp->sharecount); DB_ASSERT(env, sharecount >= 0); + if (env->thr_hashtab != NULL && + (ret = __mutex_record_unlock(env, mutex)) != 0) + return (ret); if (sharecount > 0) return (0); } } else #endif { + if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) + goto was_not_locked; F_CLR(mutexp, DB_MUTEX_LOCKED); MUTEX_UNSET(&mutexp->tas); } @@ -559,17 +670,10 @@ __db_tas_mutex_unlock(env, mutex) #endif /* Prevent the load of wait from being hoisted before MUTEX_UNSET */ - MUTEX_MEMBAR(mutexp->flags); + (void)MUTEX_MEMBAR(mutexp->flags); if (mutexp->wait && (ret = __db_pthread_mutex_unlock(env, mutex)) != 0) return (ret); - -#ifdef MUTEX_DIAG - if (mutexp->wait) - printf("tas_unlock %ld %x waiters! busy %x waiters %d/%d\n", - mutex, pthread_self(), - MUTEXP_BUSY_FIELD(mutexp), waiters, mutexp->wait); -#endif #endif return (0); diff --git a/src/mutex/mut_win32.c b/src/mutex/mut_win32.c index 07d5a8dd..270e03fb 100644 --- a/src/mutex/mut_win32.c +++ b/src/mutex/mut_win32.c @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -92,6 +92,9 @@ __db_win32_mutex_lock_int(env, mutex, timeout, wait) db_timespec now, tempspec, timeoutspec; db_timeout_t time_left; int ret; +#ifdef DIAGNOSTIC + char buf[DB_THREADID_STRLEN]; +#endif #ifdef MUTEX_DIAG LARGE_INTEGER now; #endif @@ -143,8 +146,10 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */ mutexp->pid, mutexp->tid, 0) == 0) { ret = __env_set_state(env, &ip, THREAD_VERIFY); if (ret != 0 || - ip->dbth_state == THREAD_FAILCHK) - return (DB_RUNRECOVERY); + ip->dbth_state == THREAD_FAILCHK) { + ret = DB_RUNRECOVERY; + goto failed; + } } if (!wait) return (DB_LOCK_NOTGRANTED); @@ -155,15 +160,20 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */ MUTEX_PAUSE continue; } - +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD)) { + MUTEX_UNSET(&mutexp->tas); + goto died; + } +#endif #ifdef DIAGNOSTIC if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) { - char buf[DB_THREADID_STRLEN]; __db_errx(env, DB_STR_A("2003", "Win32 lock failed: mutex already locked by %s", "%s"), dbenv->thread_id_string(dbenv, mutexp->pid, mutexp->tid, buf)); - return (__env_panic(env, EACCES)); + ret = __env_panic(env, EACCES); + goto failed; } #endif F_SET(mutexp, DB_MUTEX_LOCKED); @@ -179,11 +189,12 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */ CloseHandle(event); InterlockedDecrement(&mutexp->nwaiters); #ifdef MUTEX_DIAG + /* "ret" was set by WaitForSingleObject(). */ if (ret != WAIT_OBJECT_0) { QueryPerformanceCounter(&diag_now); printf(DB_STR_A("2004", - "[%I64d]: Lost signal on mutex %p, " - "id %d, ms %d\n", "%I64d %p %d %d"), + "[%lld]: Lost signal on mutex %p, " + "id %d, ms %d\n", "%lld %p %d %d"), diag_now.QuadPart, mutexp, mutexp->id, ms); } #endif @@ -210,11 +221,8 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. 
*/ if (timeout != 0) { timespecclear(&now); if (__clock_expired(env, &now, &timeoutspec)) { - if (event != NULL) { - CloseHandle(event); - InterlockedDecrement(&mutexp->nwaiters); - } - return (DB_TIMEOUT); + ret = DB_TIMEOUT; + goto failed; } /* Reduce the event wait if the timeout would happen first. */ tempspec = timeoutspec; @@ -228,24 +236,41 @@ loop: /* Attempt to acquire the mutex mutex_tas_spins times, if waiting. */ #ifdef MUTEX_DIAG QueryPerformanceCounter(&diag_now); printf(DB_STR_A("2005", - "[%I64d]: Waiting on mutex %p, id %d\n", - "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id); + "[%lld]: Waiting on mutex %p, id %d\n", + "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id); #endif InterlockedIncrement(&mutexp->nwaiters); - if ((ret = get_handle(env, mutexp, &event)) != 0) - goto err; + if ((ret = get_handle(env, mutexp, &event)) != 0) { + InterlockedDecrement(&mutexp->nwaiters); + goto syserr; + } } if ((ret = WaitForSingleObject(event, ms)) == WAIT_FAILED) { ret = __os_get_syserr(); - goto err; + goto syserr; } if ((ms <<= 1) > MS_PER_SEC) ms = MS_PER_SEC; +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { +died: + ret = __mutex_died(env, mutex); + goto failed; + } +#endif PANIC_CHECK(env); goto loop; -err: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed")); +failed: + if (event != NULL) { + CloseHandle(event); + InterlockedDecrement(&mutexp->nwaiters); + } + return (ret); + +syserr: __db_syserr(env, ret, DB_STR("2006", "Win32 lock failed")); return (__env_panic(env, __os_posix_err(ret))); } @@ -266,6 +291,12 @@ __db_win32_mutex_init(env, mutex, flags) mutexp = MUTEXP_SET(env, mutex); mutexp->id = ((getpid() & 0xffff) << 16) ^ P_TO_UINT32(mutexp); F_SET(mutexp, flags); + /* + * See WINCE_ATOMIC_MAGIC definition for details. + * Use sharecount, because the value just needs to be a db_atomic_t + * memory mapped onto the same page as those being Interlocked*. 
+ */ + WINCE_ATOMIC_MAGIC(&mutexp->sharecount); return (0); } @@ -315,9 +346,11 @@ __db_win32_mutex_readlock_int(env, mutex, nowait) DB_MUTEXMGR *mtxmgr; DB_MUTEXREGION *mtxregion; HANDLE event; + MUTEX_STATE *state; u_int32_t nspins; - int ms, ret; - long exch_ret, mtx_val; + int max_ms, ms, ret; + long mtx_val; + #ifdef MUTEX_DIAG LARGE_INTEGER diag_now; #endif @@ -342,11 +375,23 @@ __db_win32_mutex_readlock_int(env, mutex, nowait) event = NULL; ms = 50; ret = 0; + + state = NULL; + if (env->thr_hashtab != NULL && (ret = __mutex_record_lock(env, + mutex, MUTEX_ACTION_INTEND_SHARE, &state)) != 0) + return (ret); +#ifdef HAVE_FAILCHK_BROADCAST /* - * This needs to be initialized, since if mutexp->tas - * is write locked on the first pass, it needs a value. + * Limit WaitForSingleObject() sleeps to at most the failchk timeout, + * and least 1 millisecond. When failchk broadcasting is not + * supported check at least every second. */ - exch_ret = 0; + if (dbenv->mutex_failchk_timeout != 0 && + (max_ms = (dbenv->mutex_failchk_timeout / US_PER_MS)) == 0) + max_ms = 1; + else +#endif + max_ms = MS_PER_SEC; loop: /* Attempt to acquire the resource for N spins. */ for (nspins = @@ -357,9 +402,10 @@ loop: /* Attempt to acquire the resource for N spins. 
*/ */ retry: mtx_val = atomic_read(&mutexp->sharecount); if (mtx_val == MUTEX_SHARE_ISEXCLUSIVE) { - if (nowait) - return (DB_LOCK_NOTGRANTED); - + if (nowait) { + ret = DB_LOCK_NOTGRANTED; + goto failed; + } continue; } else if (!atomic_compare_exchange(env, &mutexp->sharecount, mtx_val, mtx_val + 1)) { @@ -370,6 +416,15 @@ retry: mtx_val = atomic_read(&mutexp->sharecount); MUTEX_PAUSE goto retry; } +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + InterlockedDecrement( + (interlocked_val)&mutexp->sharecount); + ret = __mutex_died(env, mutex); + goto failed; + } +#endif #ifdef HAVE_STATISTICS if (event == NULL) @@ -384,12 +439,14 @@ retry: mtx_val = atomic_read(&mutexp->sharecount); if (ret != WAIT_OBJECT_0) { QueryPerformanceCounter(&diag_now); printf(DB_STR_A("2007", - "[%I64d]: Lost signal on mutex %p, " - "id %d, ms %d\n", "%I64d %p %d %d"), + "[%lld]: Lost signal on mutex %p, " + "id %d, ms %d\n", "%lld %p %d %d"), diag_now.QuadPart, mutexp, mutexp->id, ms); } #endif } + if (state != NULL) + state->action = MUTEX_ACTION_SHARED; #ifdef DIAGNOSTIC /* @@ -404,17 +461,17 @@ retry: mtx_val = atomic_read(&mutexp->sharecount); } /* - * Yield the processor; wait 50 ms initially, up to 1 second. This - * loop is needed to work around a race where the signal from the - * unlocking thread gets lost. We start at 50 ms because it's unlikely - * to happen often and we want to avoid wasting CPU. + * Yield the processor; wait 50 ms initially, up to 1 second or the + * failchk timeout. This loop works around a race where the signal from + * the unlocking thread gets lost. We start at 50 ms because it's + * unlikely to happen often and we want to avoid wasting CPU. 
*/ if (event == NULL) { #ifdef MUTEX_DIAG QueryPerformanceCounter(&diag_now); printf(DB_STR_A("2008", - "[%I64d]: Waiting on mutex %p, id %d\n", - "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id); + "[%lld]: Waiting on mutex %p, id %d\n", + "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id); #endif InterlockedIncrement(&mutexp->nwaiters); if ((ret = get_handle(env, mutexp, &event)) != 0) @@ -424,12 +481,32 @@ retry: mtx_val = atomic_read(&mutexp->sharecount); ret = __os_get_syserr(); goto err; } - if ((ms <<= 1) > MS_PER_SEC) - ms = MS_PER_SEC; + +#ifdef HAVE_FAILCHK_BROADCAST + if (F_ISSET(mutexp, DB_MUTEX_OWNER_DEAD) && + !F_ISSET(dbenv, DB_ENV_FAILCHK)) { + (void)atomic_compare_exchange(env, + &mutexp->sharecount, mtx_val, mtx_val - 1); + ret = __mutex_died(env, mutex); + goto failed; + } +#endif PANIC_CHECK(env); + + if ((ms <<= 1) > max_ms) + ms = max_ms; goto loop; +failed: + if (event != NULL) { + CloseHandle(event); + InterlockedDecrement(&mutexp->nwaiters); + } + if (state != NULL) + state->action = MUTEX_ACTION_UNLOCKED; + return (ret); + err: __db_syserr(env, ret, DB_STR("2009", "Win32 read lock failed")); return (__env_panic(env, __os_posix_err(ret))); @@ -482,7 +559,8 @@ __db_win32_mutex_unlock(env, mutex) DB_ENV *dbenv; DB_MUTEX *mutexp; HANDLE event; - int ret; + int ret, sharecount; + char description[DB_MUTEX_DESCRIBE_STRLEN]; #ifdef MUTEX_DIAG LARGE_INTEGER diag_now; #endif @@ -510,6 +588,16 @@ __db_win32_mutex_unlock(env, mutex) */ #ifdef HAVE_SHARED_LATCHES if (F_ISSET(mutexp, DB_MUTEX_SHARED)) { + sharecount = atomic_read(&mutexp->sharecount); + if (sharecount == 0) { + if (!PANIC_ISSET(env)) { + __db_errx(env, DB_STR_A("2071", + "Shared unlock %s: already unlocked", "%s"), + __mutex_describe(env, mutex, description)); + return (DB_RUNRECOVERY); + } + return (__env_panic(env, EACCES)); + } if (F_ISSET(mutexp, DB_MUTEX_LOCKED)) { F_CLR(mutexp, DB_MUTEX_LOCKED); if ((ret = InterlockedExchange( @@ -519,12 +607,26 @@ 
__db_win32_mutex_unlock(env, mutex) ret = DB_RUNRECOVERY; goto err; } - } else if (InterlockedDecrement( - (interlocked_val)(&atomic_read(&mutexp->sharecount))) > 0) - return (0); + } else { + if (env->thr_hashtab != NULL && + (ret = __mutex_record_unlock(env, mutex)) != 0) + return (ret); + if (InterlockedDecrement((interlocked_val) + (&atomic_read(&mutexp->sharecount))) > 0) + return (0); + } } else #endif { + if (!F_ISSET(mutexp, DB_MUTEX_LOCKED)) { + if (!PANIC_ISSET(env)) { + __db_errx(env, DB_STR_A("2072", + "Unlock %s: already unlocked", "%s"), + __mutex_describe(env, mutex, description)); + return (DB_RUNRECOVERY); + } + return (__env_panic(env, EACCES)); + } F_CLR(mutexp, DB_MUTEX_LOCKED); MUTEX_UNSET(&mutexp->tas); } @@ -536,8 +638,8 @@ __db_win32_mutex_unlock(env, mutex) #ifdef MUTEX_DIAG QueryPerformanceCounter(&diag_now); printf(DB_STR_A("2011", - "[%I64d]: Signalling mutex %p, id %d\n", - "%I64d %p %d"), diag_now.QuadPart, mutexp, mutexp->id); + "[%lld]: Signalling mutex %p, id %d\n", + "%lld %p %d"), diag_now.QuadPart, mutexp, mutexp->id); #endif if (!PulseEvent(event)) { ret = __os_get_syserr(); diff --git a/src/mutex/test_mutex.c b/src/mutex/test_mutex.c index 24c18016..d6183bdb 100644 --- a/src/mutex/test_mutex.c +++ b/src/mutex/test_mutex.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * Standalone mutex tester for Berkeley DB mutexes. * @@ -13,7 +13,6 @@ #include "db_int.h" #ifdef DB_WIN32 -#define MUTEX_THREAD_TEST 1 extern int getopt(int, char * const *, const char *); @@ -33,29 +32,13 @@ typedef HANDLE os_thread_t; #include <sys/wait.h> typedef pid_t os_pid_t; - -/* - * There's only one mutex implementation that can't support thread-level - * locking: UNIX/fcntl mutexes. 
- * - * The general Berkeley DB library configuration doesn't look for the POSIX - * pthread functions, with one exception -- pthread_yield. - * - * Use these two facts to decide if we're going to build with or without - * threads. - */ -#if !defined(HAVE_MUTEX_FCNTL) && defined(HAVE_PTHREAD_YIELD) -#define MUTEX_THREAD_TEST 1 - -#include <pthread.h> - typedef pthread_t os_thread_t; #define os_thread_create(thrp, attr, func, arg) \ pthread_create((thrp), (attr), (func), (arg)) #define os_thread_join(thr, statusp) pthread_join((thr), (statusp)) #define os_thread_self() pthread_self() -#endif /* HAVE_PTHREAD_YIELD */ + #endif /* !DB_WIN32 */ #define OS_BAD_PID ((os_pid_t)-1) @@ -76,28 +59,25 @@ typedef struct { u_int wakeme; /* Request to awake. */ } TM; -DB_ENV *dbenv; /* Backing environment */ +DB_ENV *dbenv; /* Backing environment. */ ENV *env; size_t len; /* Backing data chunk size. */ +u_int alignment = 0; /* Specify mutex alignment. */ + u_int8_t *gm_addr; /* Global mutex */ u_int8_t *lm_addr; /* Locker mutexes */ u_int8_t *tm_addr; /* Thread mutexes */ -#ifdef MUTEX_THREAD_TEST os_thread_t *kidsp; /* Locker threads */ os_thread_t wakep; /* Wakeup thread */ -#endif #ifndef HAVE_MMAP u_int nprocs = 1; /* -p: Processes. */ u_int nthreads = 20; /* -t: Threads. */ -#elif MUTEX_THREAD_TEST +#else u_int nprocs = 5; /* -p: Processes. */ u_int nthreads = 4; /* -t: Threads. */ -#else -u_int nprocs = 20; /* -p: Processes. */ -u_int nthreads = 1; /* -t: Threads. */ #endif u_int maxlocks = 20; /* -l: Backing locks. 
*/ @@ -147,8 +127,11 @@ main(argc, argv) rtype = PARENT; id = 0; tmpath = argv[0]; - while ((ch = getopt(argc, argv, "l:n:p:T:t:v")) != EOF) + while ((ch = getopt(argc, argv, "a:l:n:p:T:t:v")) != EOF) switch (ch) { + case 'a': + alignment = (u_int)atoi(optarg); + break; case 'l': maxlocks = (u_int)atoi(optarg); break; @@ -161,14 +144,6 @@ main(argc, argv) case 't': if ((nthreads = (u_int)atoi(optarg)) == 0) nthreads = 1; -#if !defined(MUTEX_THREAD_TEST) - if (nthreads != 1) { - fprintf(stderr, - "%s: thread support not available or not compiled for this platform.\n", - progname); - return (EXIT_FAILURE); - } -#endif break; case 'T': if (!memcmp(optarg, "locker", sizeof("locker") - 1)) @@ -242,7 +217,11 @@ main(argc, argv) * * Clean up from any previous runs. */ +#ifdef DB_WIN32 + snprintf(cmd, sizeof(cmd), "rmdir /S /Q %s", TESTDIR); +#else snprintf(cmd, sizeof(cmd), "rm -rf %s", TESTDIR); +#endif (void)system(cmd); snprintf(cmd, sizeof(cmd), "mkdir %s", TESTDIR); (void)system(cmd); @@ -292,8 +271,8 @@ main(argc, argv) /* Wait for all lockers to exit. */ if ((err = os_wait(pids, nprocs)) != 0) { - fprintf(stderr, "%s: locker wait failed with %d\n", - progname, err); + fprintf(stderr, "%s: locker wait failed with %s\n", + progname, db_strerror(err)); goto fail; } @@ -357,7 +336,6 @@ int locker_start(id) u_long id; { -#if defined(MUTEX_THREAD_TEST) u_int i; int err; @@ -378,17 +356,13 @@ locker_start(id) return (1); } return (0); -#else - return (run_lthread((void *)id) == NULL ? 0 : 1); -#endif } int locker_wait() { -#if defined(MUTEX_THREAD_TEST) u_int i; - void *retp; + void *retp = NULL; /* Wait for the threads to exit. 
*/ for (i = 0; i < nthreads; i++) { @@ -400,7 +374,6 @@ locker_wait() } } free(kidsp); -#endif return (0); } @@ -414,11 +387,7 @@ run_lthread(arg) int err, i; id = (u_long)arg; -#if defined(MUTEX_THREAD_TEST) tid = (u_long)os_thread_self(); -#else - tid = 0; -#endif printf("Locker: ID %03lu (PID: %lu; TID: %lx)\n", id, (u_long)getpid(), tid); @@ -534,7 +503,6 @@ int wakeup_start(id) u_long id; { -#if defined(MUTEX_THREAD_TEST) int err; /* @@ -547,16 +515,12 @@ wakeup_start(id) return (1); } return (0); -#else - return (run_wthread((void *)id) == NULL ? 0 : 1); -#endif } int wakeup_wait() { -#if defined(MUTEX_THREAD_TEST) - void *retp; + void *retp = NULL; /* * A file is created when the wakeup thread is no longer needed. @@ -567,7 +531,6 @@ wakeup_wait() "%s: wakeup thread exited with error\n", progname); return (1); } -#endif return (0); } @@ -586,11 +549,7 @@ run_wthread(arg) id = (u_long)arg; quitcheck = 0; -#if defined(MUTEX_THREAD_TEST) tid = (u_long)os_thread_self(); -#else - tid = 0; -#endif printf("Wakeup: ID %03lu (PID: %lu; TID: %lx)\n", id, (u_long)getpid(), tid); @@ -683,6 +642,12 @@ tm_env_init() home = TESTDIR; if (nthreads != 1) flags |= DB_THREAD; + if (alignment != 0 && + (ret = dbenv->mutex_set_align(dbenv, alignment)) != 0) { + dbenv->err(dbenv, ret, "set_align(%d): %s", alignment, home); + return (1); + } + if ((ret = dbenv->open(dbenv, home, flags, 0)) != 0) { dbenv->err(dbenv, ret, "environment open: %s", home); return (1); @@ -748,8 +713,10 @@ tm_mutex_init() if (verbose) printf("\n"); - if (verbose) + if (verbose) { + (void)dbenv->mutex_stat_print(dbenv, DB_STAT_ALL); printf("Allocate %d per-lock mutexes: ", maxlocks); + } for (i = 0; i < maxlocks; ++i) { mp = (TM *)(lm_addr + i * sizeof(TM)); if ((err = dbenv->mutex_alloc(dbenv, 0, &mp->mutex)) != 0) { @@ -930,7 +897,7 @@ int usage() { fprintf(stderr, "usage: %s %s\n\t%s\n", progname, - "[-v] [-l maxlocks]", + "[-a alignment] [-v] [-l maxlocks]", "[-n locks] [-p procs] [-T 
locker=ID|wakeup=ID] [-t threads]"); return (EXIT_FAILURE); } diff --git a/src/mutex/uts4_cc.s b/src/mutex/uts4_cc.s index 4f59e9c8..76eeed6c 100644 --- a/src/mutex/uts4_cc.s +++ b/src/mutex/uts4_cc.s @@ -1,6 +1,6 @@ / See the file LICENSE for redistribution information. / - / Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + / Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. / / $Id$ / diff --git a/src/os/os_abort.c b/src/os/os_abort.c index 68b4bc05..72ac6751 100644 --- a/src/os/os_abort.c +++ b/src/os/os_abort.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -13,11 +13,11 @@ /* * __os_abort -- * - * PUBLIC: void __os_abort __P((ENV *)); + * PUBLIC: void __os_abort __P((const ENV *)); */ void __os_abort(env) - ENV *env; + const ENV *env; { __os_stack(env); /* Try and get a stack trace. */ diff --git a/src/os/os_abs.c b/src/os/os_abs.c index 4a1a5abd..a241c653 100644 --- a/src/os/os_abs.c +++ b/src/os/os_abs.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_addrinfo.c b/src/os/os_addrinfo.c index 205f41ec..aec30386 100644 --- a/src/os/os_addrinfo.c +++ b/src/os/os_addrinfo.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/os/os_alloc.c b/src/os/os_alloc.c index fb7bf109..478924df 100644 --- a/src/os/os_alloc.c +++ b/src/os/os_alloc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -11,7 +11,7 @@ #include "db_int.h" #ifdef DIAGNOSTIC -static void __os_guard __P((ENV *)); +static void __os_guard __P((const ENV *)); typedef union { size_t size; @@ -204,11 +204,11 @@ __os_strdup(env, str, storep) * __os_calloc -- * The calloc(3) function for DB. * - * PUBLIC: int __os_calloc __P((ENV *, size_t, size_t, void *)); + * PUBLIC: int __os_calloc __P((const ENV *, size_t, size_t, void *)); */ int __os_calloc(env, num, size, storep) - ENV *env; + const ENV *env; size_t num, size; void *storep; { @@ -227,11 +227,11 @@ __os_calloc(env, num, size, storep) * __os_malloc -- * The malloc(3) function for DB. * - * PUBLIC: int __os_malloc __P((ENV *, size_t, void *)); + * PUBLIC: int __os_malloc __P((const ENV *, size_t, void *)); */ int __os_malloc(env, size, storep) - ENV *env; + const ENV *env; size_t size; void *storep; { @@ -261,9 +261,11 @@ __os_malloc(env, size, storep) * Windows/NT in an MT environment. */ if ((ret = __os_get_errno_ret_zero()) == 0) { - ret = ENOMEM; + ret = USR_ERR(env, ENOMEM); __os_set_errno(ENOMEM); } + else + (void)USR_ERR(env, ret); __db_err(env, ret, DB_STR_A("0147", "malloc: %lu", "%lu"), (u_long)size); return (ret); @@ -292,11 +294,11 @@ __os_malloc(env, size, storep) * __os_realloc -- * The realloc(3) function for DB. * - * PUBLIC: int __os_realloc __P((ENV *, size_t, void *)); + * PUBLIC: int __os_realloc __P((const ENV *, size_t, void *)); */ int __os_realloc(env, size, storep) - ENV *env; + const ENV *env; size_t size; void *storep; { @@ -345,7 +347,7 @@ __os_realloc(env, size, storep) * Windows/NT in an MT environment. 
*/ if ((ret = __os_get_errno_ret_zero()) == 0) { - ret = ENOMEM; + ret = USR_ERR(env, ENOMEM); __os_set_errno(ENOMEM); } __db_err(env, ret, DB_STR_A("0148", "realloc: %lu", "%lu"), @@ -368,11 +370,11 @@ __os_realloc(env, size, storep) * __os_free -- * The free(3) function for DB. * - * PUBLIC: void __os_free __P((ENV *, void *)); + * PUBLIC: void __os_free __P((const ENV *, void *)); */ void __os_free(env, ptr) - ENV *env; + const ENV *env; void *ptr; { #ifdef DIAGNOSTIC @@ -416,7 +418,7 @@ __os_free(env, ptr) */ static void __os_guard(env) - ENV *env; + const ENV *env; { __db_errx(env, DB_STR("0149", "Guard byte incorrect during free")); diff --git a/src/os/os_clock.c b/src/os/os_clock.c index 25eeb704..78f1c8df 100644 --- a/src/os/os_clock.c +++ b/src/os/os_clock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -14,11 +14,15 @@ * __os_gettime -- * Return the current time-of-day clock in seconds and nanoseconds. * - * PUBLIC: void __os_gettime __P((ENV *, db_timespec *, int)); + * If you want the time of day, pass 0 in the monotonic argument. If you pass + * non-zero, you might get time-of-day or you might get a non-decreasing number + * which is unrelated to the time of day, such as the seconds since system boot. 
+ * + * PUBLIC: void __os_gettime __P((const ENV *, db_timespec *, int)); */ void __os_gettime(env, tp, monotonic) - ENV *env; + const ENV *env; db_timespec *tp; int monotonic; { @@ -35,7 +39,6 @@ __os_gettime(env, tp, monotonic) RETRY_CHK((clock_gettime( CLOCK_REALTIME, (struct timespec *)tp)), ret); - RETRY_CHK((clock_gettime(CLOCK_REALTIME, (struct timespec *)tp)), ret); if (ret != 0) { sc = "clock_gettime"; goto err; @@ -69,5 +72,5 @@ __os_gettime(env, tp, monotonic) return; err: __db_syserr(env, ret, "%s", sc); - (void)__env_panic(env, __os_posix_err(ret)); + (void)__env_panic((ENV *) env, __os_posix_err(ret)); } diff --git a/src/os/os_config.c b/src/os/os_config.c index c455a349..3fe2f045 100644 --- a/src/os/os_config.c +++ b/src/os/os_config.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_cpu.c b/src/os/os_cpu.c index 6b7f9f1e..53cadecb 100644 --- a/src/os/os_cpu.c +++ b/src/os/os_cpu.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_ctime.c b/src/os/os_ctime.c index 3f656c32..82925cc1 100644 --- a/src/os/os_ctime.c +++ b/src/os/os_ctime.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -32,10 +32,7 @@ __os_ctime(tod, time_buf) * int. 
*/ #if defined(HAVE_VXWORKS) - { - size_t buflen = CTIME_BUFLEN; - (void)ctime_r(tod, time_buf, &buflen); - } + (void)ctime_r(tod, time_buf); #elif defined(HAVE_CTIME_R_3ARG) (void)ctime_r(tod, time_buf, CTIME_BUFLEN); #elif defined(HAVE_CTIME_R) diff --git a/src/os/os_dir.c b/src/os/os_dir.c index 42bad194..7bd91bff 100644 --- a/src/os/os_dir.c +++ b/src/os/os_dir.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_errno.c b/src/os/os_errno.c index a8219f90..9bc15513 100644 --- a/src/os/os_errno.c +++ b/src/os/os_errno.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_fid.c b/src/os/os_fid.c index f2d80e25..43c61202 100644 --- a/src/os/os_fid.c +++ b/src/os/os_fid.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_flock.c b/src/os/os_flock.c index 904d5efe..8f58f244 100644 --- a/src/os/os_flock.c +++ b/src/os/os_flock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_fsync.c b/src/os/os_fsync.c index 4b757b2c..377d7ff3 100644 --- a/src/os/os_fsync.c +++ b/src/os/os_fsync.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_getenv.c b/src/os/os_getenv.c index 05972112..b7c4e990 100644 --- a/src/os/os_getenv.c +++ b/src/os/os_getenv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_handle.c b/src/os/os_handle.c index 8ae9dc7f..7dbe31e1 100644 --- a/src/os/os_handle.c +++ b/src/os/os_handle.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -90,7 +90,7 @@ __os_openhandle(env, name, flags, mode, fhpp) * return EEXISTS. */ DB_END_SINGLE_THREAD; - ret = EEXIST; + ret = USR_ERR(env, EEXIST); goto err; } /* @@ -127,7 +127,10 @@ __os_openhandle(env, name, flags, mode, fhpp) break; } - switch (ret = __os_posix_err(__os_get_syserr())) { + ret = __os_posix_err(__os_get_syserr()); + if (ret != ENOENT) + (void)USR_ERR(env, ret); + switch (ret) { case EMFILE: case ENFILE: case ENOSPC: @@ -160,9 +163,8 @@ __os_openhandle(env, name, flags, mode, fhpp) /* Deny file descriptor access to any child process. 
*/ if ((fcntl_flags = fcntl(fhp->fd, F_GETFD)) == -1 || fcntl(fhp->fd, F_SETFD, fcntl_flags | FD_CLOEXEC) == -1) { - ret = __os_get_syserr(); - __db_syserr(env, ret, DB_STR("0162", - "fcntl(F_SETFD)")); + ret = USR_ERR(env, __os_get_syserr()); + __db_syserr(env, ret, DB_STR("0162", "fcntl(F_SETFD)")); ret = __os_posix_err(ret); goto err; } @@ -226,6 +228,7 @@ __os_closehandle(env, fhp) else RETRY_CHK((close(fhp->fd)), ret); if (ret != 0) { + ret = USR_ERR(env, ret); __db_syserr(env, ret, DB_STR("0164", "close")); ret = __os_posix_err(ret); } diff --git a/src/os/os_map.c b/src/os/os_map.c index 0528f473..b17bf107 100644 --- a/src/os/os_map.c +++ b/src/os/os_map.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -213,6 +213,15 @@ __os_attach(env, infop, rp) if (rp->max < rp->size) rp->max = rp->size; if (ret == 0 && F_ISSET(infop, REGION_CREATE)) { +#ifdef HAVE_MLOCK + /* + * When locking the region in memory extend it fully so that it + * can all be mlock()'d now, and not later when paging could + * interfere with the application. [#21379] + */ + if (F_ISSET(env, ENV_LOCKDOWN)) + rp->size = rp->max; +#endif if (F_ISSET(dbenv, DB_ENV_REGION_INIT)) ret = __db_file_write(env, infop->fhp, rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00); @@ -255,7 +264,7 @@ __os_detach(env, infop, destroy) { DB_ENV *dbenv; REGION *rp; - int ret; + int ret, t_ret; /* * We pass a DB_ENV handle to the user's replacement unmap function, @@ -263,8 +272,16 @@ __os_detach(env, infop, destroy) */ DB_ASSERT(env, env != NULL && env->dbenv != NULL); dbenv = env->dbenv; + ret = 0; + /* + * Don't use a region which is no longer valid, e.g., after the + * env has been removed. 
+ */ rp = infop->rp; + if ((rp->id != 0 && rp->id != infop->id) || + rp->type <= INVALID_REGION_TYPE || rp->type > REGION_TYPE_MAX) + return (EINVAL); /* If the user replaced the unmap call, call through their interface. */ if (DB_GLOBAL(j_region_unmap) != NULL) @@ -314,16 +331,26 @@ __os_detach(env, infop, destroy) return (ret); } + if (F_ISSET(env, ENV_FORCESYNCENV)) + if (msync(infop->addr, rp->max, MS_INVALIDATE | MS_SYNC) != 0) { + t_ret = __os_get_syserr(); + __db_syserr(env, t_ret, DB_STR("0248", + "msync failed on closing environment")); + if (ret == 0) + ret = t_ret; + } + if (munmap(infop->addr, rp->max) != 0) { - ret = __os_get_syserr(); - __db_syserr(env, ret, DB_STR("0123", "munmap")); - return (__os_posix_err(ret)); + t_ret = __os_get_syserr(); + __db_syserr(env, t_ret, DB_STR("0123", "munmap")); + if (ret == 0) + ret = t_ret; } - if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0) - return (ret); + if (destroy && (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0) + ret = t_ret; - return (0); + return (ret); #else COMPQUIET(destroy, 0); COMPQUIET(ret, 0); diff --git a/src/os/os_mkdir.c b/src/os/os_mkdir.c index 800d445c..b3034e30 100644 --- a/src/os/os_mkdir.c +++ b/src/os/os_mkdir.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_open.c b/src/os/os_open.c index 5090c8e1..0c58848e 100644 --- a/src/os/os_open.c +++ b/src/os/os_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/os/os_path.c b/src/os/os_path.c index 478fdf45..b712b31a 100644 --- a/src/os/os_path.c +++ b/src/os/os_path.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_pid.c b/src/os/os_pid.c index b1b94d60..9efe4633 100644 --- a/src/os/os_pid.c +++ b/src/os/os_pid.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -40,7 +40,7 @@ __os_id(dbenv, pidp, tidp) *pidp = dbenv->env->pid_cache; } -/* +/* * When building on MinGW, we define both HAVE_PTHREAD_SELF and DB_WIN32, * and we are using pthreads instead of Windows threads implementation. * So here, we need to check the thread implementations before checking diff --git a/src/os/os_rename.c b/src/os/os_rename.c index 63aac7bb..1a3d7cbd 100644 --- a/src/os/os_rename.c +++ b/src/os/os_rename.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_rmdir.c b/src/os/os_rmdir.c new file mode 100644 index 00000000..ab3a1556 --- /dev/null +++ b/src/os/os_rmdir.c @@ -0,0 +1,38 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __os_rmdir -- + * Remove a directory. 
+ * + * PUBLIC: int __os_rmdir __P((ENV *, const char *)); + */ +int +__os_rmdir(env, name) + ENV *env; + const char *name; +{ + DB_ENV *dbenv; + int ret; + + dbenv = env == NULL ? NULL : env->dbenv; + if (dbenv != NULL && + FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL)) + __db_msg(env, DB_STR_A("0239", "fileops: rmdir %s", + "%s"), name); + + RETRY_CHK((rmdir(name)), ret); + if (ret != 0) + return (__os_posix_err(ret)); + + return (ret); +} diff --git a/src/os/os_root.c b/src/os/os_root.c index 77e7a72c..6634a4a2 100644 --- a/src/os/os_root.c +++ b/src/os/os_root.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_rpath.c b/src/os/os_rpath.c index 16f3e54c..48c59b3d 100644 --- a/src/os/os_rpath.c +++ b/src/os/os_rpath.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_rw.c b/src/os/os_rw.c index c0967514..cc665ee4 100644 --- a/src/os/os_rw.c +++ b/src/os/os_rw.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_seek.c b/src/os/os_seek.c index 4676d33a..95408f3d 100644 --- a/src/os/os_seek.c +++ b/src/os/os_seek.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/os/os_stack.c b/src/os/os_stack.c index 037080f3..9844930f 100644 --- a/src/os/os_stack.c +++ b/src/os/os_stack.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -14,32 +14,143 @@ #include <execinfo.h> #endif +#undef __DB_STACK_MAXFRAMES +#define __DB_STACK_MAXFRAMES 25 + /* * __os_stack -- - * Output a stack trace to the message file handle. + * Output a stack trace in a single write to the error file handle. * - * PUBLIC: void __os_stack __P((ENV *)); + * PUBLIC: void __os_stack __P((const ENV *)); */ void __os_stack(env) - ENV *env; + const ENV *env; +{ + /* Adjust by 2 to exclude __os_stack() and __os_stack_top(). */ + __os_stack_top(env, __DB_STACK_MAXFRAMES - 2, 2); +} + +/* + * __os_stack_top -- + * Output just a certain range of stack frames to the error file handle. + * + * PUBLIC: void __os_stack_top __P((const ENV *, unsigned, unsigned)); + */ +void +__os_stack_top(env, nframes, skipframes) + const ENV *env; + unsigned nframes; + unsigned skipframes; { #if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS) - void *array[200]; - size_t i, size; - char **strings; + char buf[__DB_STACK_MAXFRAMES * 80]; /* Allow for 80 chars/line. */ + __os_stack_text(env, buf, sizeof(buf), nframes, skipframes + 1); + __db_errx(env, "Top of stack:\n%s", buf); +#else + COMPQUIET(env, NULL); + COMPQUIET(nframes, 0); + COMPQUIET(skipframes, 0); +#endif +} + +/* + * __os_stack_text -- + * 'Print' the current stack into a char text buffer. 
+ * + * PUBLIC: void __os_stack_text + * PUBLIC: __P((const ENV *, char *, size_t, unsigned, unsigned)); + */ +void +__os_stack_text(env, result, bufsize, nframes, skip) + const ENV *env; + char *result; + size_t bufsize; + unsigned nframes; + unsigned skip; +{ + DB_MSGBUF mb; + + DB_MSGBUF_INIT(&mb); + mb.buf = mb.cur = result; + mb.len = bufsize; + F_SET(&mb, DB_MSGBUF_PREALLOCATED); + __os_stack_msgadd(env, &mb, nframes, skip, NULL); +} + +/* + * __os_stack_save -- + * Save a certain range of stack frames into the frames argument. + * + * PUBLIC: int __os_stack_save __P((const ENV *, unsigned, void **)); + */ +int +__os_stack_save(env, nframes, frames) + const ENV *env; + unsigned nframes; + void **frames; +{ + COMPQUIET(env, NULL); +#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS) /* * Solaris and the GNU C library support this interface. Solaris * has additional interfaces (printstack and walkcontext), I don't * know if they offer any additional value or not. */ - size = backtrace(array, sizeof(array) / sizeof(array[0])); - strings = backtrace_symbols(array, size); + return ((int) backtrace(frames, nframes)); +#else + COMPQUIET(nframes, 0); + COMPQUIET(frames, NULL); + return (0); +#endif +} + +/* + * __os_stack_msgadd -- + * Decode a stack and add it to a DB_MSGBUF. The stack is either a + * previously obtained stack, e.g., from __os_stack_save(), or if it is + * null, the current stack is fetched here.
+ * + * PUBLIC: void __os_stack_msgadd + * PUBLIC: __P((const ENV *, DB_MSGBUF *, unsigned, unsigned, void **)); + */ +void +__os_stack_msgadd(env, mb, totalframes, skipframes, stack) + const ENV *env; + DB_MSGBUF *mb; + unsigned totalframes; + unsigned skipframes; + void **stack; +{ +#if defined(HAVE_BACKTRACE) && defined(HAVE_BACKTRACE_SYMBOLS) + char **strings; + void *local_frames[__DB_STACK_MAXFRAMES]; + unsigned i; + + if (stack == NULL) { + stack = local_frames; + if (totalframes > __DB_STACK_MAXFRAMES) + totalframes = __DB_STACK_MAXFRAMES; + totalframes = backtrace(local_frames, totalframes); + skipframes++; + } + + /* + * Solaris and the GNU C library support this interface. Solaris + * has additional interfaces (printstack and walkcontext); it is not + * known whether they offer any additional value. + */ + strings = backtrace_symbols(stack, totalframes); - for (i = 0; i < size; ++i) - __db_errx(env, "%s", strings[i]); + for (i = skipframes; i < totalframes; ++i) + __db_msgadd((ENV *)env, mb, "\t%s\n", strings[i]); free(strings); -#endif +#else COMPQUIET(env, NULL); + COMPQUIET(mb, NULL); + COMPQUIET(totalframes, 0); + COMPQUIET(skipframes, 0); + COMPQUIET(stack, NULL); +#endif } diff --git a/src/os/os_stat.c b/src/os/os_stat.c index 43c66075..493531b7 100644 --- a/src/os/os_stat.c +++ b/src/os/os_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os/os_tmpdir.c b/src/os/os_tmpdir.c index 06d35ba9..f41383d7 100644 --- a/src/os/os_tmpdir.c +++ b/src/os/os_tmpdir.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
* * $Id$ */ diff --git a/src/os/os_truncate.c b/src/os/os_truncate.c index f559e9cb..473db9cc 100644 --- a/src/os/os_truncate.c +++ b/src/os/os_truncate.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -14,14 +14,16 @@ * __os_truncate -- * Truncate the file. * - * PUBLIC: int __os_truncate __P((ENV *, DB_FH *, db_pgno_t, u_int32_t)); + * PUBLIC: int __os_truncate + * PUBLIC: __P((ENV *, DB_FH *, db_pgno_t, u_int32_t, off_t)); */ int -__os_truncate(env, fhp, pgno, pgsize) +__os_truncate(env, fhp, pgno, pgsize, relative) ENV *env; DB_FH *fhp; db_pgno_t pgno; u_int32_t pgsize; + off_t relative; { DB_ENV *dbenv; off_t offset; @@ -33,7 +35,7 @@ __os_truncate(env, fhp, pgno, pgsize) * Truncate a file so that "pgno" is discarded from the end of the * file. */ - offset = (off_t)pgsize * pgno; + offset = (off_t)pgsize * pgno + relative; if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL)) diff --git a/src/os/os_uid.c b/src/os/os_uid.c index 2e5c9f87..c3bccb3d 100644 --- a/src/os/os_uid.c +++ b/src/os/os_uid.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -26,8 +26,6 @@ __os_unique_id(env, idp) pid_t pid; u_int32_t id; - *idp = 0; - dbenv = env == NULL ? NULL : env->dbenv; /* @@ -35,21 +33,60 @@ __os_unique_id(env, idp) * time of day and a stack address, all XOR'd together. 
*/ __os_id(dbenv, &pid, NULL); - __os_gettime(env, &v, 1); + __os_gettime(env, &v, 0); id = (u_int32_t)pid ^ (u_int32_t)v.tv_sec ^ (u_int32_t)v.tv_nsec ^ P_TO_UINT32(&pid); - /* - * We could try and find a reasonable random-number generator, but - * that's not all that easy to do. Seed and use srand()/rand(), if - * we can find them. - */ - if (DB_GLOBAL(uid_init) == 0) { - DB_GLOBAL(uid_init) = 1; - srand((u_int)id); - } - id ^= (u_int)rand(); + if (DB_GLOBAL(random_seeded) == 0) + __os_srandom(id); + id ^= __os_random(); *idp = id; } + +/* + * __os_srandom -- + * Set the random number generator seed for BDB. + * + * PUBLIC: void __os_srandom __P((u_int)); + */ +void +__os_srandom(seed) + u_int seed; +{ + DB_GLOBAL(random_seeded) = 1; +#ifdef HAVE_RANDOM_R + (void)initstate_r(seed, &DB_GLOBAL(random_state), + sizeof(DB_GLOBAL(random_state)), &DB_GLOBAL(random_data)); + (void)srandom_r(seed, &DB_GLOBAL(random_data)); +#elif defined(HAVE_RANDOM) + srandom(seed); +#else + srand(seed); +#endif +} + +/* + * __os_random -- + * Return the next value from the random number generator for BDB. + * + * PUBLIC: u_int __os_random __P((void)); + */ +u_int +__os_random() +{ +#ifdef HAVE_RANDOM_R + int32_t result; +#endif + if (DB_GLOBAL(random_seeded) == 0) + __os_srandom((u_int)time(NULL)); +#ifdef HAVE_RANDOM_R + random_r(&DB_GLOBAL(random_data), &result); + return ((u_int)result); +#elif defined(HAVE_RANDOM) + return ((u_int)random()); +#else + return ((u_int)rand()); +#endif +} diff --git a/src/os/os_unlink.c b/src/os/os_unlink.c index f9a0b688..9b6d26fa 100644 --- a/src/os/os_unlink.c +++ b/src/os/os_unlink.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
* * $Id$ */ diff --git a/src/os/os_yield.c b/src/os/os_yield.c index f0e170f0..ff54921e 100644 --- a/src/os/os_yield.c +++ b/src/os/os_yield.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_qnx/os_qnx_fsync.c b/src/os_qnx/os_qnx_fsync.c index 827fa446..6ea04b00 100644 --- a/src/os_qnx/os_qnx_fsync.c +++ b/src/os_qnx/os_qnx_fsync.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_qnx/os_qnx_open.c b/src/os_qnx/os_qnx_open.c index d0214a0d..cf2f781e 100644 --- a/src/os_qnx/os_qnx_open.c +++ b/src/os_qnx/os_qnx_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_vxworks/os_vx_abs.c b/src/os_vxworks/os_vx_abs.c index 69413ee5..78342fce 100644 --- a/src/os_vxworks/os_vx_abs.c +++ b/src/os_vxworks/os_vx_abs.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_vxworks/os_vx_config.c b/src/os_vxworks/os_vx_config.c index 649a3b4a..7c7fa4c8 100644 --- a/src/os_vxworks/os_vx_config.c +++ b/src/os_vxworks/os_vx_config.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_vxworks/os_vx_map.c b/src/os_vxworks/os_vx_map.c index 517cadae..859bde6c 100644 --- a/src/os_vxworks/os_vx_map.c +++ b/src/os_vxworks/os_vx_map.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * This code is derived from software contributed to Sleepycat Software by * Frederick G.M. Roeber of Netscape Communications Corp. diff --git a/src/os_vxworks/os_vx_rpath.c b/src/os_vxworks/os_vx_rpath.c index 1ffd3549..d7202c78 100644 --- a/src/os_vxworks/os_vx_rpath.c +++ b/src/os_vxworks/os_vx_rpath.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_vxworks/os_vx_yield.c b/src/os_vxworks/os_vx_yield.c index c7c54cf2..e3741c3f 100644 --- a/src/os_vxworks/os_vx_yield.c +++ b/src/os_vxworks/os_vx_yield.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/ce_ctime.c b/src/os_windows/ce_ctime.c index e8ae76aa..d4e6a4fc 100644 --- a/src/os_windows/ce_ctime.c +++ b/src/os_windows/ce_ctime.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -25,8 +25,8 @@ __os_ctime(tod, time_buf) __int64 i64_tod; struct _FILETIME file_tod, file_loc; struct _SYSTEMTIME sys_loc; -static const __int64 SECS_BETWEEN_EPOCHS = 11644473600; -static const __int64 SECS_TO_100NS = 10000000; /* 10^7 */ + static const __int64 SECS_BETWEEN_EPOCHS = 11644473600; + static const __int64 SECS_TO_100NS = 10000000; /* 10^7 */ strcpy(time_buf, "Thu Jan 01 00:00:00 1970"); time_buf[CTIME_BUFLEN - 1] = '\0'; diff --git a/src/os_windows/ce_freopen.c b/src/os_windows/ce_freopen.c new file mode 100644 index 00000000..331450d0 --- /dev/null +++ b/src/os_windows/ce_freopen.c @@ -0,0 +1,52 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __ce_freopen -- + * Reopen a stream on WinCE. + * + * PUBLIC: #ifdef DB_WINCE + * PUBLIC: FILE * __ce_freopen + * PUBLIC: __P((const char *, const char *, FILE *)); + * PUBLIC: #endif + */ +FILE * +__ce_freopen(path, mode, stream) + const char *path, *mode; + FILE *stream; +{ + size_t lenm, lenp; + wchar_t *wpath, *wmode; + FILE *handle; + + wpath = NULL; + wmode = NULL; + handle = NULL; + lenp = strlen(path) + 1; + lenm = strlen(mode) + 1; + + if (__os_malloc(NULL, lenp * sizeof(wchar_t), &wpath) != 0 || + __os_malloc(NULL, lenm * sizeof(wchar_t), &wmode) != 0) + goto err; + + if (mbstowcs(wpath, path, lenp) != lenp || + mbstowcs(wmode, mode, lenm) != lenm) + goto err; + + handle = _wfreopen(wpath, wmode, stream); +err: + if (wpath != NULL) + __os_free(NULL, wpath); + if (wmode != NULL) + __os_free(NULL, wmode); + return handle; +} diff --git a/src/os_windows/ce_gmtime.c b/src/os_windows/ce_gmtime.c new file mode 100644 index 00000000..55605c89 --- /dev/null +++ b/src/os_windows/ce_gmtime.c @@ -0,0 +1,58 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __ce_gmtime -- + * gmtime implementation on WinCE. + * + * PUBLIC: #ifdef DB_WINCE + * PUBLIC: struct tm * __ce_gmtime __P((const time_t *)); + * PUBLIC: #endif + */ + +struct tm * +__ce_gmtime(timer) + const time_t *timer; +{ + static struct tm br_time; + struct tm *timep; + time_t ti; + unsigned long dayclock, dayno; + int year; + + timep = &br_time; + ti = *timer; + dayclock = (unsigned long)ti % SECSPERDAY; + dayno = (unsigned long)ti / SECSPERDAY; + year = TM_YEAR_EPOCH; + + timep->tm_sec = dayclock % 60; + timep->tm_min = (dayclock % 3600) / 60; + timep->tm_hour = dayclock / 3600; + /* day 0 was a thursday */ + timep->tm_wday = (dayno + 4) % 7; + while (dayno >= year_lengths[isleap(year)]) { + dayno -= year_lengths[isleap(year)]; + year++; + } + timep->tm_year = year - TM_YEAR_BASE; + timep->tm_yday = dayno; + timep->tm_mon = 0; + while (dayno >= mon_lengths[isleap(year)][timep->tm_mon]) { + dayno -= mon_lengths[isleap(year)][timep->tm_mon]; + timep->tm_mon++; + } + timep->tm_mday = dayno + 1; + timep->tm_isdst = 0; + + return timep; +} diff --git a/src/os_windows/ce_localtime.c b/src/os_windows/ce_localtime.c new file mode 100644 index 00000000..23c53bed --- /dev/null +++ b/src/os_windows/ce_localtime.c @@ -0,0 +1,44 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __ce_localtime -- + * localtime implementation on WinCE. 
+ * + * PUBLIC: #ifdef DB_WINCE + * PUBLIC: struct tm * localtime __P((const time_t *)); + * PUBLIC: #endif + */ +struct tm * +localtime(t) + const time_t *t; +{ + static struct tm y; + FILETIME uTm, lTm; + SYSTEMTIME pTm; + int64_t t64; + + t64 = *t; + t64 = (t64 + 11644473600)*10000000; + uTm.dwLowDateTime = (DWORD)(t64 & 0xFFFFFFFF); + uTm.dwHighDateTime= (DWORD)(t64 >> 32); + FileTimeToLocalFileTime(&uTm,&lTm); + FileTimeToSystemTime(&lTm,&pTm); + y.tm_year = pTm.wYear - 1900; + y.tm_mon = pTm.wMonth - 1; + y.tm_wday = pTm.wDayOfWeek; + y.tm_mday = pTm.wDay; + y.tm_hour = pTm.wHour; + y.tm_min = pTm.wMinute; + y.tm_sec = pTm.wSecond; + return &y; +} diff --git a/src/os_windows/ce_mktime.c b/src/os_windows/ce_mktime.c new file mode 100644 index 00000000..0d3a0906 --- /dev/null +++ b/src/os_windows/ce_mktime.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 1987, 1989 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Arthur David Olson of the National Cancer Institute. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ + +/*static char *sccsid = "from: @(#)ctime.c 5.26 (Berkeley) 2/23/91";*/ + +/* + * This implementation of mktime is lifted straight from the NetBSD (BSD 4.4) + * version. I modified it slightly to divorce it from the internals of the + * ctime library. Thus this version can't use details of the internal + * timezone state file to figure out strange unnormalized struct tm values, + * as might result from someone doing date math on the tm struct then passing + * it to mktime. + * + * It just does as well as it can at normalizing the tm input, then does a + * binary search of the time space using the system's localtime() function. + * + * The original binary search was defective in that it didn't consider the + * setting of tm_isdst when comparing tm values, causing the search to be + * flubbed for times near the dst/standard time changeover. The original + * code seems to make up for this by grubbing through the timezone info + * whenever the binary search barfed. 
Since I don't have that luxury in + * portable code, I have to take care of tm_isdst in the comparison routine. + * This requires knowing how many minutes offset dst is from standard time. + * + * So, if you live somewhere in the world where dst is not 60 minutes offset, + * and your vendor doesn't supply mktime(), you'll have to edit this variable + * by hand. Sorry about that. + */ + +#include "db_config.h" + +#include "db_int.h" + +#undef DSTMINUTES +#define DSTMINUTES 60 + +#undef FALSE +#undef TRUE +#define FALSE 0 +#define TRUE 1 + +/* +** Adapted from code provided by Robert Elz, who writes: +** The "best" way to do mktime I think is based on an idea of Bob +** Kridle's (so its said...) from a long time ago. (mtxinu!kridle now). +** It does a binary search of the time_t space. Since time_t's are +** just 32 bits, its a max of 32 iterations (even at 64 bits it +** would still be very reasonable). +*/ + +#undef WRONG +#define WRONG (-1) + +const unsigned int mon_lengths[2][MONSPERYEAR] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; +const unsigned int year_lengths[2] = { + DAYSPERNYEAR, DAYSPERLYEAR +}; + +static void +normalize(tensptr, unitsptr, base) + int base, *tensptr, *unitsptr; +{ + if (*unitsptr >= base) { + *tensptr += *unitsptr / base; + *unitsptr %= base; + } else if (*unitsptr < 0) { + --*tensptr; + *unitsptr += base; + if (*unitsptr < 0) { + *tensptr -= 1 + (-*unitsptr) / base; + *unitsptr = base - (-*unitsptr) % base; + } + } +} + +static struct tm * +mkdst(tmp) + struct tm * tmp; +{ + /* jds */ + static struct tm tmbuf; + + tmbuf = *tmp; + tmbuf.tm_isdst = 1; + tmbuf.tm_min += DSTMINUTES; + normalize(&tmbuf.tm_hour, &tmbuf.tm_min, MINSPERHOUR); + return &tmbuf; +} + +static int +tmcomp(atmp, btmp) + register struct tm *atmp, *btmp; +{ + register int result; + + /* compare down to the same day */ + if ((result = (atmp->tm_year - btmp->tm_year)) == 0 && + (result = (atmp->tm_mon 
- btmp->tm_mon)) == 0) + result = (atmp->tm_mday - btmp->tm_mday); + + if (result != 0) + return result; + + /* get rid of one-sided dst bias */ + if (atmp->tm_isdst == 1 && !btmp->tm_isdst) + btmp = mkdst(btmp); + else if (btmp->tm_isdst == 1 && !atmp->tm_isdst) + atmp = mkdst(atmp); + + /* compare the rest of the way */ + if ((result = (atmp->tm_hour - btmp->tm_hour)) == 0 && + (result = (atmp->tm_min - btmp->tm_min)) == 0) + result = atmp->tm_sec - btmp->tm_sec; + + return result; +} + +static time_t +time2(tmp, okayp, usezn) + struct tm *tmp; + int *okayp, usezn; +{ + register int bits, dir, i, saved_seconds; + time_t t; + struct tm yourtm, mytm; + + *okayp = FALSE; + yourtm = *tmp; + if (yourtm.tm_sec >= SECSPERMIN + 2 || yourtm.tm_sec < 0) + normalize(&yourtm.tm_min, &yourtm.tm_sec, SECSPERMIN); + normalize(&yourtm.tm_hour, &yourtm.tm_min, MINSPERHOUR); + normalize(&yourtm.tm_mday, &yourtm.tm_hour, HOURSPERDAY); + normalize(&yourtm.tm_year, &yourtm.tm_mon, MONSPERYEAR); + while (yourtm.tm_mday <= 0) { + --yourtm.tm_year; + yourtm.tm_mday += + year_lengths[isleap(yourtm.tm_year + TM_YEAR_BASE)]; + } + for ( ; ; ) { + i = mon_lengths[isleap(yourtm.tm_year + + TM_YEAR_BASE)][yourtm.tm_mon]; + if (yourtm.tm_mday <= i) + break; + yourtm.tm_mday -= i; + if (++yourtm.tm_mon >= MONSPERYEAR) { + yourtm.tm_mon = 0; + ++yourtm.tm_year; + } + } + saved_seconds = yourtm.tm_sec; + yourtm.tm_sec = 0; + /* + ** Calculate the number of magnitude bits in a time_t + ** (this works regardless of whether time_t is + ** signed or unsigned, though lint complains if unsigned). + */ + for (bits = 0, t = 1; t > 0; ++bits, t <<= 1) + ; + /* + ** If time_t is signed, then 0 is the median value, + ** if time_t is unsigned, then 1 << bits is median. + */ + t = (t < 0) ? 
0 : ((time_t) 1 << bits); + for ( ; ; ) { + if (usezn) + mytm = *localtime(&t); + else + mytm = *gmtime(&t); + dir = tmcomp(&mytm, &yourtm); + if (dir != 0) { + if (bits-- < 0) + return WRONG; + if (bits < 0) + --t; + else if (dir > 0) + t -= (time_t) 1 << bits; + else t += (time_t) 1 << bits; + continue; + } + if (yourtm.tm_isdst < 0 || mytm.tm_isdst == yourtm.tm_isdst) + break; + + return WRONG; + } + t += saved_seconds; + if (usezn) + *tmp = *localtime(&t); + else + *tmp = *gmtime(&t); + *okayp = TRUE; + return t; +} + +static time_t +time1(tmp) + struct tm * tmp; +{ + register time_t t; + int okay; + + if (tmp->tm_isdst > 1) + tmp->tm_isdst = 1; + t = time2(tmp, &okay, 1); + if (okay || tmp->tm_isdst < 0) + return t; + + return WRONG; +} + +/* + * mktime -- + * + * PUBLIC: #ifdef DB_WINCE + * PUBLIC: time_t __ce_mktime __P((struct tm *)); + * PUBLIC: #endif + */ +time_t +__ce_mktime(tmp) + struct tm * tmp; +{ + return time1(tmp); +} diff --git a/src/os_windows/ce_remove.c b/src/os_windows/ce_remove.c new file mode 100644 index 00000000..f955f3b4 --- /dev/null +++ b/src/os_windows/ce_remove.c @@ -0,0 +1,26 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * remove implementation on WinCE. + * + * PUBLIC: #ifdef DB_WINCE + * PUBLIC: int __ce_remove __P((const char *path)); + * PUBLIC: #endif + */ + +int +__ce_remove(path) + const char *path; +{ + return __os_unlink(NULL, path, 0); +} diff --git a/src/os_windows/ce_util_sig.c b/src/os_windows/ce_util_sig.c new file mode 100644 index 00000000..11fb4ad7 --- /dev/null +++ b/src/os_windows/ce_util_sig.c @@ -0,0 +1,35 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * The stub functions for signal handling. + * WinCE does not support signal handling, so we just define stub functions to + * avoid linkage errors for utilities build. + */ + +void +__db_util_siginit() +{ + return; +} + +int +__db_util_interrupted() +{ + return (0); +} + +void +__db_util_sigresend() +{ + return; +} diff --git a/src/os_windows/os_abs.c b/src/os_windows/os_abs.c index e769ab2c..f9be934e 100644 --- a/src/os_windows/os_abs.c +++ b/src/os_windows/os_abs.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_clock.c b/src/os_windows/os_clock.c index e548729b..80a96785 100644 --- a/src/os_windows/os_clock.c +++ b/src/os_windows/os_clock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -16,7 +16,7 @@ */ void __os_gettime(env, tp, monotonic) - ENV *env; + const ENV *env; db_timespec *tp; int monotonic; { diff --git a/src/os_windows/os_config.c b/src/os_windows/os_config.c index 4250dbd4..c4b61700 100644 --- a/src/os_windows/os_config.c +++ b/src/os_windows/os_config.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_cpu.c b/src/os_windows/os_cpu.c index 0922071f..41004753 100644 --- a/src/os_windows/os_cpu.c +++ b/src/os_windows/os_cpu.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_dir.c b/src/os_windows/os_dir.c index 31d364d7..4065d182 100644 --- a/src/os_windows/os_dir.c +++ b/src/os_windows/os_dir.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_errno.c b/src/os_windows/os_errno.c index ba8ec359..a8c35480 100644 --- a/src/os_windows/os_errno.c +++ b/src/os_windows/os_errno.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_fid.c b/src/os_windows/os_fid.c index f2d190b1..bfd4182c 100644 --- a/src/os_windows/os_fid.c +++ b/src/os_windows/os_fid.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -118,12 +118,12 @@ __os_fileid(env, fname, unique_okay, fidp) DB_GLOBAL(fid_serial) = (u_int32_t)pid; } else DB_GLOBAL(fid_serial) += 100000; - + tmp = (u_int32_t)DB_GLOBAL(fid_serial); } else { tmp = (u_int32_t)fi.dwVolumeSerialNumber; - for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i) - *fidp++ = *p++; } + for (p = (u_int8_t *)&tmp, i = sizeof(u_int32_t); i > 0; --i) + *fidp++ = *p++; return (0); } diff --git a/src/os_windows/os_flock.c b/src/os_windows/os_flock.c index cb3e4986..9dcd1e81 100644 --- a/src/os_windows/os_flock.c +++ b/src/os_windows/os_flock.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_fsync.c b/src/os_windows/os_fsync.c index 8824aac1..5194c00b 100644 --- a/src/os_windows/os_fsync.c +++ b/src/os_windows/os_fsync.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_getenv.c b/src/os_windows/os_getenv.c index aad59d01..0ac1db0a 100644 --- a/src/os_windows/os_getenv.c +++ b/src/os_windows/os_getenv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_handle.c b/src/os_windows/os_handle.c index e6edc3ef..65809017 100644 --- a/src/os_windows/os_handle.c +++ b/src/os_windows/os_handle.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_map.c b/src/os_windows/os_map.c index 8f646d68..eefa3e8b 100644 --- a/src/os_windows/os_map.c +++ b/src/os_windows/os_map.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -105,9 +105,12 @@ __os_detach(env, infop, destroy) int destroy; { DB_ENV *dbenv; + REGION *rp; int ret, t_ret; dbenv = env->dbenv; + rp = infop->rp; + ret = 0; if (infop->wnt_handle != NULL) { (void)CloseHandle(infop->wnt_handle); @@ -120,10 +123,19 @@ __os_detach(env, infop, destroy) return (ret); } - ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0; - if (ret != 0) { - __db_syserr(env, ret, DB_STR("0007", "UnmapViewOfFile")); - ret = __os_posix_err(ret); + if (F_ISSET(env, ENV_FORCESYNCENV)) + if (!FlushViewOfFile(infop->addr, rp->max)) { + ret = __os_get_syserr(); + __db_syserr(env, ret, DB_STR("0249", + "FlushViewOfFile failed on closing environment")); + ret = __os_posix_err(ret); + } + + t_ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0; + if (t_ret != 0) { + __db_syserr(env, t_ret, DB_STR("0007", "UnmapViewOfFile")); + if (ret == 0) + ret = __os_posix_err(t_ret); } if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy && diff --git a/src/os_windows/os_mkdir.c b/src/os_windows/os_mkdir.c index b87f3f9d..7ad7eed2 100644 --- a/src/os_windows/os_mkdir.c +++ b/src/os_windows/os_mkdir.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/os_windows/os_open.c b/src/os_windows/os_open.c index 44f2faf3..bc715a96 100644 --- a/src/os_windows/os_open.c +++ b/src/os_windows/os_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_rename.c b/src/os_windows/os_rename.c index 791f53a5..d70f20ca 100644 --- a/src/os_windows/os_rename.c +++ b/src/os_windows/os_rename.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_rmdir.c b/src/os_windows/os_rmdir.c new file mode 100644 index 00000000..18090f09 --- /dev/null +++ b/src/os_windows/os_rmdir.c @@ -0,0 +1,42 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __os_rmdir -- + * Remove a directory. + */ +int +__os_rmdir(env, name) + ENV *env; + const char *name; +{ + DB_ENV *dbenv; + _TCHAR *tname; + int ret; + + dbenv = env == NULL ? 
NULL : env->dbenv; + + if (dbenv != NULL && + FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL)) + __db_msg(env, DB_STR_A("0240", "fileops: rmdir %s", + "%s"), name); + + TO_TSTRING(env, name, tname, ret); + if (ret != 0) + return (ret); + RETRY_CHK(!RemoveDirectory(tname), ret); + FREE_STRING(env, tname); + if (ret != 0) + return (__os_posix_err(ret)); + + return (ret); +} diff --git a/src/os_windows/os_rw.c b/src/os_windows/os_rw.c index e64a7d08..20644e6e 100644 --- a/src/os_windows/os_rw.c +++ b/src/os_windows/os_rw.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_seek.c b/src/os_windows/os_seek.c index 7632c15d..613e4a7c 100644 --- a/src/os_windows/os_seek.c +++ b/src/os_windows/os_seek.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_stat.c b/src/os_windows/os_stat.c index 11248886..5c3a0fcc 100644 --- a/src/os_windows/os_stat.c +++ b/src/os_windows/os_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_truncate.c b/src/os_windows/os_truncate.c index fcbb37b2..d1150c85 100644 --- a/src/os_windows/os_truncate.c +++ b/src/os_windows/os_truncate.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. 
All rights reserved. * * $Id$ */ @@ -15,11 +15,12 @@ * Truncate the file. */ int -__os_truncate(env, fhp, pgno, pgsize) +__os_truncate(env, fhp, pgno, pgsize, relative) ENV *env; DB_FH *fhp; db_pgno_t pgno; u_int32_t pgsize; + off_t relative; { /* Yes, this really is how Microsoft have designed their API */ union { @@ -34,7 +35,7 @@ __os_truncate(env, fhp, pgno, pgsize) int ret; dbenv = env == NULL ? NULL : env->dbenv; - offset = (off_t)pgsize * pgno; + offset = (off_t)pgsize * pgno + relative; ret = 0; if (dbenv != NULL && @@ -84,7 +85,7 @@ __os_truncate(env, fhp, pgno, pgsize) * We can't switch to SetFilePointerEx, which knows about 64-bit * offsets, because it isn't supported on Win9x/ME. */ - RETRY_CHK((off.bigint = (__int64)pgsize * pgno, + RETRY_CHK((off.bigint = (__int64)pgsize * pgno + relative, (SetFilePointer(fhp->trunc_handle, off.low, &off.high, FILE_BEGIN) == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) || !SetEndOfFile(fhp->trunc_handle)), ret); diff --git a/src/os_windows/os_unlink.c b/src/os_windows/os_unlink.c index 6a0a6572..5c63a5e6 100644 --- a/src/os_windows/os_unlink.c +++ b/src/os_windows/os_unlink.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/os_windows/os_yield.c b/src/os_windows/os_yield.c index 0d32ef69..bf326ee2 100644 --- a/src/os_windows/os_yield.c +++ b/src/os_windows/os_yield.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/qam/qam.c b/src/qam/qam.c index e81d4795..0c71fd0d 100644 --- a/src/qam/qam.c +++ b/src/qam/qam.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -260,7 +260,7 @@ __qamc_put(dbc, key, data, flags, pgnop) } if (exact != 0 && flags == DB_NOOVERWRITE) - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); else /* Put the item on the page. */ ret = __qam_pitem(dbc, @@ -526,7 +526,7 @@ __qamc_del(dbc, flags) return (ret); if (QAM_NOT_VALID(meta, cp->recno)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } first = meta->first_recno; @@ -549,7 +549,7 @@ __qamc_del(dbc, flags) goto err; if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -636,9 +636,9 @@ __qamc_get(dbc, key, data, flags, pgnop) QUEUE_CURSOR *cp; db_lockmode_t lock_mode; db_pgno_t metapno; - db_recno_t first; + db_recno_t first, old_first; int exact, inorder, is_first, ret, t_ret, wait, with_delete; - int retrying; + int retrying, stay; u_int32_t skip, meta_mode; dbp = dbc->dbp; @@ -652,7 +652,9 @@ __qamc_get(dbc, key, data, flags, pgnop) meta = NULL; *pgnop = 0; pg = NULL; - retrying = t_ret = wait = with_delete = 0; + retrying = t_ret = wait = with_delete = 0; + stay = 1; + old_first = RECNO_OOB; if (flags == DB_CONSUME_WAIT) { wait = 1; @@ -676,25 +678,25 @@ __qamc_get(dbc, key, data, flags, pgnop) t = (QUEUE *)dbp->q_internal; metapno = t->q_meta; - /* - * Get the meta page first - */ - if ((ret = __memp_fget(mpf, &metapno, - dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0) - return (ret); - /* Release any previous lock if not in a transaction. */ if ((ret = __TLPUT(dbc, cp->lock)) != 0) goto err; skip = 0; -retry: /* Update the record number. 
*/ +retry: + /* + * Get the meta page first + */ + if (meta == NULL && (ret = __memp_fget(mpf, &metapno, + dbc->thread_info, dbc->txn, meta_mode, &meta)) != 0) + return (ret); /* Update the record number. */ + switch (flags) { case DB_CURRENT: break; case DB_NEXT_DUP: case DB_PREV_DUP: - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; /* NOTREACHED */ case DB_NEXT: @@ -711,7 +713,7 @@ retry: /* Update the record number. */ if (QAM_AFTER_CURRENT(meta, cp->recno)) { pg = NULL; if (!wait) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } /* @@ -774,6 +776,7 @@ retry: /* Update the record number. */ DB_LOCK_UPGRADE, &metalock)) != 0) { if (ret == DB_LOCK_DEADLOCK) ret = DB_LOCK_NOTGRANTED; + (void)DBC_ERR(dbc, ret); goto err; } @@ -792,6 +795,8 @@ retry: /* Update the record number. */ /* get the first record number */ cp->recno = first = meta->first_recno; + if (old_first == RECNO_OOB) + old_first = first; break; case DB_PREV: @@ -799,7 +804,7 @@ retry: /* Update the record number. */ if (cp->recno != RECNO_OOB) { if (cp->recno == meta->first_recno || QAM_BEFORE_FIRST(meta, cp->recno)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } QAM_DEC_RECNO(cp->recno); @@ -808,7 +813,7 @@ retry: /* Update the record number. */ /* FALLTHROUGH */ case DB_LAST: if (meta->first_recno == meta->cur_recno) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } cp->recno = meta->cur_recno; @@ -892,11 +897,11 @@ dolock: if (!with_delete || inorder || retrying) { LOCK_INIT(lock); goto release_retry; } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto lerr; } if (QAM_AFTER_CURRENT(meta, cp->recno)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto lerr; } } @@ -951,9 +956,37 @@ release_retry: /* Release locks and retry, if possible. */ case DB_NEXT_NODUP: if (!with_delete) is_first = 0; - else if (first == cp->recno) + else if (first == cp->recno) { /* we have verified that this record is gone. 
*/ QAM_INC_RECNO(first); + /* + * If we are reading in order and the first + * record was not there, we need to reflect + * this in the meta page, so that we can + * avoid checking this record again and again. + */ + if (inorder && cp->recno == meta->first_recno) { + if (DBC_LOGGING(dbc)) { +#ifdef QDEBUG + (void)__log_printf( + dbp->env, dbc->txn, + "Queue O: %x %u %u %u", + dbc->locker ? + dbc->locker->id : 0, + cp->recno, first, + meta->cur_recno); +#endif + if ((ret = __qam_incfirst_log( + dbp, dbc->txn, + &meta->dbmeta.lsn, 0, + cp->recno, + PGNO_BASE_MD)) != 0) + goto err; + } else + LSN_NOT_LOGGED(meta->dbmeta.lsn); + meta->first_recno = first; + } + } if (QAM_BEFORE_FIRST(meta, cp->recno) && DONT_NEED_LOCKS(dbc)) flags = DB_FIRST; @@ -979,7 +1012,7 @@ release_retry: /* Release locks and retry, if possible. */ default: /* this is for the SET and GET_BOTH cases */ - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err1; } retrying = 0; @@ -1031,10 +1064,10 @@ release_retry: /* Release locks and retry, if possible. */ */ tmp.data = qp->data; tmp.size = t->re_len; - if ((ret = __bam_defcmp(dbp, data, &tmp)) != 0) { + if ((ret = __bam_defcmp(dbp, data, &tmp, NULL)) != 0) { if (flags == DB_GET_BOTH_RANGE) goto release_retry; - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err1; } } @@ -1139,14 +1172,17 @@ release_retry: /* Release locks and retry, if possible. */ * If we deleted the first record we checked then we moved * the first pointer properly. 
*/ - - if (first == cp->recno && (skip = (first % t->rec_page)) != 0) + if (((QUEUE *)dbp->q_internal)->page_ext != 0) + stay = (QAM_RECNO_EXTENT(dbp, old_first) == + QAM_RECNO_EXTENT(dbp, first)); + if (stay && first == cp->recno && + (skip = (first % t->rec_page)) != 0) goto done; if (meta == NULL && (ret = __memp_fget(mpf, &metapno, dbc->thread_info, dbc->txn, 0, &meta)) != 0) goto err; - if (skip && !QAM_BEFORE_FIRST(meta, first)) + if (stay && skip && !QAM_BEFORE_FIRST(meta, first)) goto done; #ifdef QDEBUG @@ -1156,7 +1192,11 @@ release_retry: /* Release locks and retry, if possible. */ dbc->locker ? dbc->locker->id : 0, cp->recno, first, meta->first_recno); #endif - ret = __qam_consume(dbc, meta, first); + if (stay) { + ret = __qam_consume(dbc, meta, first); + } else { + ret = __qam_consume(dbc, meta, old_first); + } } err1: if (cp->page != NULL) { @@ -1272,8 +1312,8 @@ __qam_consume(dbc, meta, first) */ if (rec_extent != 0 && ((exact = (first % rec_extent == 0)) || - (first % meta->rec_page == 0) || - first == UINT32_MAX)) { + (exact = (first == UINT32_MAX)) || + (first % meta->rec_page == 0))) { #ifdef QDEBUG if (DBC_LOGGING(dbc)) (void)__log_printf(dbp->env, dbc->txn, diff --git a/src/qam/qam.src b/src/qam/qam.src index a8e2e4e0..eca6c07c 100644 --- a/src/qam/qam.src +++ b/src/qam/qam.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/qam/qam_conv.c b/src/qam/qam_conv.c index beb7c973..34ce321a 100644 --- a/src/qam/qam_conv.c +++ b/src/qam/qam_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/qam/qam_files.c b/src/qam/qam_files.c index e9a9ff07..f5c7d2ec 100644 --- a/src/qam/qam_files.c +++ b/src/qam/qam_files.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -445,6 +445,8 @@ __qam_fremove(dbp, pgnoaddr) * sizeof(array->mpfarray[0])); array->mpfarray[ array->hi_extent - array->low_extent].mpf = NULL; + array->mpfarray[ + array->hi_extent - array->low_extent].pinref = 0; if (array->low_extent != array->hi_extent) array->low_extent++; } else { @@ -570,8 +572,11 @@ again: for (i = first; i >= first && i <= stop; i += rec_extent) { if ((ret = __qam_fprobe(dbc, QAM_RECNO_PAGE(dbp, i), &fp->mpf, QAM_PROBE_MPF, dbp->priority, 0)) != 0) { - if (ret == ENOENT) + if (ret == ENOENT) { + /* Missing extents are acceptable; skip them. */ + ret = 0; continue; + } goto err; } fp->id = QAM_RECNO_EXTENT(dbp, i); diff --git a/src/qam/qam_method.c b/src/qam/qam_method.c index 0867e5dd..5d796cdb 100644 --- a/src/qam/qam_method.c +++ b/src/qam/qam_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/qam/qam_open.c b/src/qam/qam_open.c index 69f6cb75..5be78f68 100644 --- a/src/qam/qam_open.c +++ b/src/qam/qam_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/qam/qam_rec.c b/src/qam/qam_rec.c index c9ff6c83..c5f6b3f4 100644 --- a/src/qam/qam_rec.c +++ b/src/qam/qam_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -152,6 +152,10 @@ __qam_incfirst_recover(env, dbtp, lsnp, op, info) REC_DIRTY(mpf, ip, dbc->priority, &meta); LSN(meta) = *lsnp; } + if (QAM_BEFORE_FIRST(meta, argp->recno)) { + REC_DIRTY(mpf, ip, dbc->priority, &meta); + meta->first_recno = argp->recno; + } if ((ret = __qam_adjust_first(file_dbp, dbc, meta, argp->recno + 1)) != 0) goto err; diff --git a/src/qam/qam_stat.c b/src/qam/qam_stat.c index 15c41bb5..19e09383 100644 --- a/src/qam/qam_stat.c +++ b/src/qam/qam_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/qam/qam_stub.c b/src/qam/qam_stub.c index f5140079..6df0536c 100644 --- a/src/qam/qam_stub.c +++ b/src/qam/qam_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/qam/qam_upgrade.c b/src/qam/qam_upgrade.c index ac96c889..4b9e9453 100644 --- a/src/qam/qam_upgrade.c +++ b/src/qam/qam_upgrade.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/qam/qam_verify.c b/src/qam/qam_verify.c index af5ab5db..d2f8ab79 100644 --- a/src/qam/qam_verify.c +++ b/src/qam/qam_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -115,14 +115,14 @@ __qam_vrfy_meta(dbp, vdp, meta, pgno, flags) * this assumption fails. (We need the qp info to be reasonable * before we do per-page verification of queue extents.) */ - if (F_ISSET(vdp, VRFY_QMETA_SET)) { + if (F_ISSET(vdp, SALVAGE_QMETA_SET)) { isbad = 1; EPRINT((env, DB_STR_A("1148", "Page %lu: database contains multiple Queue metadata pages", "%lu"), (u_long)pgno)); goto err; } - F_SET(vdp, VRFY_QMETA_SET); + F_SET(vdp, SALVAGE_QMETA_SET); qp->page_ext = meta->page_ext; dbp->pgsize = meta->dbmeta.pagesize; qp->q_meta = pgno; diff --git a/src/rep/mlease.html b/src/rep/mlease.html index 7d44b465..4e82f63c 100644 --- a/src/rep/mlease.html +++ b/src/rep/mlease.html @@ -1,5 +1,5 @@ <!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en"> -<!--Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved.--> +<!--Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved.--> <html> <head> <meta http-equiv="Content-Type" diff --git a/src/rep/rep.msg b/src/rep/rep.msg index b751a64d..d5c56d93 100644 --- a/src/rep/rep.msg +++ b/src/rep/rep.msg @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -57,7 +57,22 @@ ARG pgsize u_int32_t ARG pgno db_pgno_t ARG max_pgno db_pgno_t ARG filenum u_int32_t -ARG finfo_flags u_int32_t +ARG finfo_flags u_int32_t +ARG type u_int32_t +ARG db_flags u_int32_t +ARG uid DBT +ARG info DBT +ARG dir DBT +ARG blob_fid_lo u_int32_t +ARG blob_fid_hi u_int32_t +END + +BEGIN_MSG fileinfo_v7 alloc check_length version +ARG pgsize u_int32_t +ARG pgno db_pgno_t +ARG max_pgno db_pgno_t +ARG filenum u_int32_t +ARG finfo_flags u_int32_t ARG type u_int32_t ARG db_flags u_int32_t ARG uid DBT @@ -158,3 +173,54 @@ ARG lsn DB_LSN ARG hist_sec u_int32_t ARG hist_nsec u_int32_t END + +/* + * Request for blob files. + */ +BEGIN_MSG blob_update_req +ARG blob_fid u_int64_t +ARG blob_sid u_int64_t +ARG blob_id u_int64_t +ARG highest_id u_int64_t +END + +/* + * A list of blob file for a database. + */ +BEGIN_MSG blob_update +ARG blob_fid u_int64_t +ARG highest_id u_int64_t +ARG flags u_int32_t +ARG num_blobs u_int32_t +END + +/* + * Blob file description, part of blob_update. + */ +BEGIN_MSG blob_file +ARG blob_sid u_int64_t +ARG blob_id u_int64_t +ARG blob_size u_int64_t +END + +/* + * A piece of data from a blob file. + */ +BEGIN_MSG blob_chunk +ARG flags u_int32_t +ARG blob_fid u_int64_t +ARG blob_sid u_int64_t +ARG blob_id u_int64_t +ARG offset u_int64_t +ARG data DBT +END + +/* + * Request for data from a blob file at the given offset. 
+ */ +BEGIN_MSG blob_chunk_req +ARG blob_fid u_int64_t +ARG blob_sid u_int64_t +ARG blob_id u_int64_t +ARG offset u_int64_t +END diff --git a/src/rep/rep_automsg.c b/src/rep/rep_automsg.c index 5d8155fb..cab68b3e 100644 --- a/src/rep/rep_automsg.c +++ b/src/rep/rep_automsg.c @@ -280,6 +280,16 @@ __rep_fileinfo_marshal(env, version, argp, bp, max, lenp) memcpy(bp, argp->dir.data, argp->dir.size); bp += argp->dir.size; } + if (copy_only) { + memcpy(bp, &argp->blob_fid_lo, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->blob_fid_lo); + if (copy_only) { + memcpy(bp, &argp->blob_fid_hi, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->blob_fid_hi); *lenp = (size_t)(bp - start); return (0); @@ -386,6 +396,16 @@ __rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp) if (max < needed) goto too_few; bp += argp->dir.size; + if (copy_only) { + memcpy(&argp->blob_fid_lo, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->blob_fid_lo, bp); + if (copy_only) { + memcpy(&argp->blob_fid_hi, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->blob_fid_hi, bp); if (nextp != NULL) *nextp = bp; @@ -399,6 +419,211 @@ too_few: } /* + * PUBLIC: int __rep_fileinfo_v7_marshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_fileinfo_v7_args *, u_int8_t *, size_t, size_t *)); + */ +int +__rep_fileinfo_v7_marshal(env, version, argp, bp, max, lenp) + ENV *env; + u_int32_t version; + __rep_fileinfo_v7_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + int copy_only; + u_int8_t *start; + + if (max < __REP_FILEINFO_V7_SIZE + + (size_t)argp->uid.size + + (size_t)argp->info.size + + (size_t)argp->dir.size) + return (ENOMEM); + start = bp; + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(bp, &argp->pgsize, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, 
argp->pgsize); + if (copy_only) { + memcpy(bp, &argp->pgno, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->pgno); + if (copy_only) { + memcpy(bp, &argp->max_pgno, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->max_pgno); + if (copy_only) { + memcpy(bp, &argp->filenum, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->filenum); + if (copy_only) { + memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->finfo_flags); + if (copy_only) { + memcpy(bp, &argp->type, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->type); + if (copy_only) { + memcpy(bp, &argp->db_flags, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->db_flags); + if (copy_only) { + memcpy(bp, &argp->uid.size, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->uid.size); + if (argp->uid.size > 0) { + memcpy(bp, argp->uid.data, argp->uid.size); + bp += argp->uid.size; + } + if (copy_only) { + memcpy(bp, &argp->info.size, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->info.size); + if (argp->info.size > 0) { + memcpy(bp, argp->info.data, argp->info.size); + bp += argp->info.size; + } + if (copy_only) { + memcpy(bp, &argp->dir.size, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->dir.size); + if (argp->dir.size > 0) { + memcpy(bp, argp->dir.data, argp->dir.size); + bp += argp->dir.size; + } + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_fileinfo_v7_unmarshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_fileinfo_v7_args **, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_fileinfo_v7_unmarshal(env, version, argpp, bp, max, nextp) + ENV *env; + u_int32_t version; + __rep_fileinfo_v7_args 
**argpp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + size_t needed; + __rep_fileinfo_v7_args *argp; + int ret; + int copy_only; + + needed = __REP_FILEINFO_V7_SIZE; + if (max < needed) + goto too_few; + if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0) + return (ret); + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(&argp->pgsize, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->pgsize, bp); + if (copy_only) { + memcpy(&argp->pgno, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->pgno, bp); + if (copy_only) { + memcpy(&argp->max_pgno, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->max_pgno, bp); + if (copy_only) { + memcpy(&argp->filenum, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->filenum, bp); + if (copy_only) { + memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->finfo_flags, bp); + if (copy_only) { + memcpy(&argp->type, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->type, bp); + if (copy_only) { + memcpy(&argp->db_flags, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->db_flags, bp); + if (copy_only) { + memcpy(&argp->uid.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->uid.size, bp); + if (argp->uid.size == 0) + argp->uid.data = NULL; + else + argp->uid.data = bp; + needed += (size_t)argp->uid.size; + if (max < needed) + goto too_few; + bp += argp->uid.size; + if (copy_only) { + memcpy(&argp->info.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->info.size, bp); + if (argp->info.size == 0) + argp->info.data = NULL; + else + argp->info.data = bp; + needed += (size_t)argp->info.size; + if (max < 
needed) + goto too_few; + bp += argp->info.size; + if (copy_only) { + memcpy(&argp->dir.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->dir.size, bp); + if (argp->dir.size == 0) + argp->dir.data = NULL; + else + argp->dir.data = bp; + needed += (size_t)argp->dir.size; + if (max < needed) + goto too_few; + bp += argp->dir.size; + + if (nextp != NULL) + *nextp = bp; + *argpp = argp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __rep_fileinfo_v7 message")); + return (EINVAL); +} + +/* * PUBLIC: int __rep_fileinfo_v6_marshal __P((ENV *, u_int32_t, * PUBLIC: __rep_fileinfo_v6_args *, u_int8_t *, size_t, size_t *)); */ @@ -1039,3 +1264,245 @@ too_few: return (EINVAL); } +/* + * PUBLIC: void __rep_blob_update_req_marshal __P((ENV *, + * PUBLIC: __rep_blob_update_req_args *, u_int8_t *)); + */ +void +__rep_blob_update_req_marshal(env, argp, bp) + ENV *env; + __rep_blob_update_req_args *argp; + u_int8_t *bp; +{ + DB_HTONLL_COPYOUT(env, bp, argp->blob_fid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_sid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_id); + DB_HTONLL_COPYOUT(env, bp, argp->highest_id); +} + +/* + * PUBLIC: int __rep_blob_update_req_unmarshal __P((ENV *, + * PUBLIC: __rep_blob_update_req_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_blob_update_req_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_blob_update_req_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_BLOB_UPDATE_REQ_SIZE) + goto too_few; + DB_NTOHLL_COPYIN(env, argp->blob_fid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_sid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_id, bp); + DB_NTOHLL_COPYIN(env, argp->highest_id, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __rep_blob_update_req message")); + return (EINVAL); +} + +/* + * PUBLIC: void __rep_blob_update_marshal __P((ENV *, 
+ * PUBLIC: __rep_blob_update_args *, u_int8_t *)); + */ +void +__rep_blob_update_marshal(env, argp, bp) + ENV *env; + __rep_blob_update_args *argp; + u_int8_t *bp; +{ + DB_HTONLL_COPYOUT(env, bp, argp->blob_fid); + DB_HTONLL_COPYOUT(env, bp, argp->highest_id); + DB_HTONL_COPYOUT(env, bp, argp->flags); + DB_HTONL_COPYOUT(env, bp, argp->num_blobs); +} + +/* + * PUBLIC: int __rep_blob_update_unmarshal __P((ENV *, + * PUBLIC: __rep_blob_update_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_blob_update_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_blob_update_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_BLOB_UPDATE_SIZE) + goto too_few; + DB_NTOHLL_COPYIN(env, argp->blob_fid, bp); + DB_NTOHLL_COPYIN(env, argp->highest_id, bp); + DB_NTOHL_COPYIN(env, argp->flags, bp); + DB_NTOHL_COPYIN(env, argp->num_blobs, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __rep_blob_update message")); + return (EINVAL); +} + +/* + * PUBLIC: void __rep_blob_file_marshal __P((ENV *, + * PUBLIC: __rep_blob_file_args *, u_int8_t *)); + */ +void +__rep_blob_file_marshal(env, argp, bp) + ENV *env; + __rep_blob_file_args *argp; + u_int8_t *bp; +{ + DB_HTONLL_COPYOUT(env, bp, argp->blob_sid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_id); + DB_HTONLL_COPYOUT(env, bp, argp->blob_size); +} + +/* + * PUBLIC: int __rep_blob_file_unmarshal __P((ENV *, + * PUBLIC: __rep_blob_file_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_blob_file_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_blob_file_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_BLOB_FILE_SIZE) + goto too_few; + DB_NTOHLL_COPYIN(env, argp->blob_sid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_id, bp); + DB_NTOHLL_COPYIN(env, argp->blob_size, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + 
"Not enough input bytes to fill a __rep_blob_file message")); + return (EINVAL); +} + +/* + * PUBLIC: void __rep_blob_chunk_marshal __P((ENV *, + * PUBLIC: __rep_blob_chunk_args *, u_int8_t *)); + */ +void +__rep_blob_chunk_marshal(env, argp, bp) + ENV *env; + __rep_blob_chunk_args *argp; + u_int8_t *bp; +{ + DB_HTONL_COPYOUT(env, bp, argp->flags); + DB_HTONLL_COPYOUT(env, bp, argp->blob_fid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_sid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_id); + DB_HTONLL_COPYOUT(env, bp, argp->offset); + DB_HTONL_COPYOUT(env, bp, argp->data.size); + if (argp->data.size > 0) { + memcpy(bp, argp->data.data, argp->data.size); + bp += argp->data.size; + } +} + +/* + * PUBLIC: int __rep_blob_chunk_unmarshal __P((ENV *, + * PUBLIC: __rep_blob_chunk_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_blob_chunk_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_blob_chunk_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + size_t needed; + + needed = __REP_BLOB_CHUNK_SIZE; + if (max < needed) + goto too_few; + DB_NTOHL_COPYIN(env, argp->flags, bp); + DB_NTOHLL_COPYIN(env, argp->blob_fid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_sid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_id, bp); + DB_NTOHLL_COPYIN(env, argp->offset, bp); + DB_NTOHL_COPYIN(env, argp->data.size, bp); + if (argp->data.size == 0) + argp->data.data = NULL; + else + argp->data.data = bp; + needed += (size_t)argp->data.size; + if (max < needed) + goto too_few; + bp += argp->data.size; + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __rep_blob_chunk message")); + return (EINVAL); +} + +/* + * PUBLIC: void __rep_blob_chunk_req_marshal __P((ENV *, + * PUBLIC: __rep_blob_chunk_req_args *, u_int8_t *)); + */ +void +__rep_blob_chunk_req_marshal(env, argp, bp) + ENV *env; + __rep_blob_chunk_req_args *argp; + u_int8_t *bp; +{ + DB_HTONLL_COPYOUT(env, bp, argp->blob_fid); + 
DB_HTONLL_COPYOUT(env, bp, argp->blob_sid); + DB_HTONLL_COPYOUT(env, bp, argp->blob_id); + DB_HTONLL_COPYOUT(env, bp, argp->offset); +} + +/* + * PUBLIC: int __rep_blob_chunk_req_unmarshal __P((ENV *, + * PUBLIC: __rep_blob_chunk_req_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_blob_chunk_req_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_blob_chunk_req_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_BLOB_CHUNK_REQ_SIZE) + goto too_few; + DB_NTOHLL_COPYIN(env, argp->blob_fid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_sid, bp); + DB_NTOHLL_COPYIN(env, argp->blob_id, bp); + DB_NTOHLL_COPYIN(env, argp->offset, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __rep_blob_chunk_req message")); + return (EINVAL); +} + diff --git a/src/rep/rep_backup.c b/src/rep/rep_backup.c index cfde7622..14bc63bb 100644 --- a/src/rep/rep_backup.c +++ b/src/rep/rep_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_am.h" #include "dbinc/fop.h" @@ -26,21 +27,45 @@ * Note that the fileinfo for the first file in the list always appears at * (constant) offset __REP_UPDATE_SIZE in the buffer. */ +#define FILE_CTX_INMEM_ONLY 0x01 typedef struct { u_int8_t *buf; /* Buffer base address. */ u_int32_t size; /* Total allocated buffer size. */ u_int8_t *fillptr; /* Pointer to first unused space. */ u_int32_t count; /* Number of entries currently in list. */ u_int32_t version; /* Rep version of marshaled format. */ + u_int32_t flags; /* Context flags. 
*/ } FILE_LIST_CTX; #define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE) /* + * Flags used to show the state of blob files on the master in messages + * sent to the client. + */ +#define BLOB_DONE 0x01 +#define BLOB_DELETE 0x02 +#define BLOB_CHUNK_FAIL 0x04 + +#define BLOB_ID_SIZE sizeof(db_seq_t) +#define BLOB_KEY_SIZE (2 * BLOB_ID_SIZE) + +/* * Function that performs any desired processing on a single file, as part of * the traversal of a list of database files, such as with internal init. */ typedef int (FILE_WALK_FN) __P((ENV *, __rep_fileinfo_args *, void *)); +static int __rep_add_files_to_list __P(( + ENV *, const char *, const char *, FILE_LIST_CTX *, const char **, int)); +static int __rep_blob_chunk_gap + __P((ENV *, int, DB_THREAD_INFO *, REP *, int *, db_seq_t, int)); +static int __rep_blob_cleanup __P((ENV *, REP *)); +static int __rep_blobdone + __P((ENV *, int, DB_THREAD_INFO *, REP *, db_seq_t, int)); +static int __rep_blob_find_files __P((ENV *, DB_THREAD_INFO *, const char *, + db_seq_t *, db_seq_t, db_seq_t, db_seq_t *, DBT *, size_t *, u_int32_t *)); +static int __rep_blob_sort_dirs __P((ENV *, + int (*)(const char *), char **, int, char ***, int *)); static FILE_WALK_FN __rep_check_uid; static int __rep_clean_interrupted __P((ENV *)); static FILE_WALK_FN __rep_cleanup_nimdbs; @@ -52,6 +77,8 @@ static int __rep_get_fileinfo __P((ENV *, const char *, const char *, __rep_fileinfo_args *, u_int8_t *)); static int __rep_get_file_list __P((ENV *, DB_FH *, u_int32_t, u_int32_t *, DBT *)); +static int __rep_init_file_list_context __P((ENV *, + u_int32_t, u_int32_t, int, FILE_LIST_CTX *)); static int __rep_is_replicated_db __P((const char *, const char *)); static int __rep_log_setup __P((ENV *, REP *, u_int32_t, u_int32_t, DB_LSN *)); @@ -72,9 +99,12 @@ static FILE_WALK_FN __rep_remove_file; static int __rep_remove_logs __P((ENV *)); static int __rep_remove_nimdbs __P((ENV *)); static int __rep_rollback __P((ENV *, DB_LSN *)); +static int 
__rep_select_blob_file __P((const char *)); +static int __rep_select_blob_sdb __P((const char *)); static int __rep_unlink_by_list __P((ENV *, u_int32_t, u_int8_t *, u_int32_t, u_int32_t)); static FILE_WALK_FN __rep_unlink_file; +static int __rep_walk_blob_dir __P((ENV *, FILE_LIST_CTX*)); static int __rep_walk_filelist __P((ENV *, u_int32_t, u_int8_t *, u_int32_t, u_int32_t, FILE_WALK_FN *, void *)); static int __rep_walk_dir __P((ENV *, const char *, const char *, @@ -129,14 +159,12 @@ __rep_update_req(env, rp) dblp = env->lg_handle; logc = NULL; - if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) - goto err_noalloc; - context.size = MEGABYTE; - context.count = 0; - context.version = rp->rep_version; /* Reserve space for the update_args, and fill in file info. */ - context.fillptr = FIRST_FILE_PTR(context.buf); + if ((ret = __rep_init_file_list_context(env, rp->rep_version, + F_ISSET(rp, REPCTL_INMEM_ONLY) ? FILE_CTX_INMEM_ONLY : 0, + 1, &context)) != 0) + goto err_noalloc; if ((ret = __rep_find_dbs(env, &context)) != 0) goto err; @@ -214,6 +242,472 @@ err_noalloc: } /* + * Passed to the __rep_blob_sort_dirs function. + * Select blob files, of the form __db.bl### + */ +static int +__rep_select_blob_file(file) + const char *file; +{ + if (strncmp(BLOB_FILE_PREFIX, file, strlen(BLOB_FILE_PREFIX)) == 0) + return (1); + else + return (0); +} + +/* + * Passed to the __rep_blob_sort_dirs function. + * Select blob subdatabase directories, of the form __db### + */ +static int +__rep_select_blob_sdb(file) + const char *file; +{ + if (strncmp(BLOB_DIR_PREFIX, file, strlen(BLOB_DIR_PREFIX)) == 0 && + strncmp(BLOB_FILE_PREFIX, file, strlen(BLOB_FILE_PREFIX)) != 0 && + strcmp(BLOB_META_FILE_NAME, file) != 0) + return (1); + else + return (0); +} + +/* + * __rep_blob_sort_dirs + * Create a sorted list of directory names that all share a type that + * is selected using the given function. 
+ */ +static int +__rep_blob_sort_dirs(env, select_fn, dirs, dirs_cnt, sorted, sorted_cnt) + ENV *env; + int (*select_fn) __P((const char *)); + char **dirs; + int dirs_cnt; + char ***sorted; + int *sorted_cnt; +{ + char **sort, *tmp; + int i, ret, size, sort_cnt, swapped; + + *sorted = NULL; + *sorted_cnt = 0; + sort_cnt = 0; + + if ((ret = __os_malloc(env, + (sizeof(char *) * (unsigned int)dirs_cnt), &sort)) != 0) + return (ret); + + for (i = 0; i < dirs_cnt; i++) { + if (select_fn(dirs[i])) { + sort[sort_cnt] = dirs[i]; + sort_cnt++; + } + } + + /* + * Directories are usually returned in order, or close to it, so use + * Bubble Sort to sort the list. + */ + size = sort_cnt; + swapped = 1; + while (swapped == 1 && size > 1) { + swapped = 0; + for (i = 0; (i + 1) < size; i++) { + if (strcmp(sort[i], sort[i+1]) > 0) { + tmp = sort[i]; + sort[i] = sort[i+1]; + sort[i+1] = tmp; + swapped = 1; + } + } + size--; + } + + *sorted = sort; + *sorted_cnt = sort_cnt; + + return (0); +} + +#define BLOB_THROTTLE_DEFAULT (10 * MEGABYTE) + +/* + * __rep_blob_update_req + * Send a list of blob files, starting after the blob id and sub-database + * id sent in the BLOB_UPDATE_REQ message. 
+ * + * PUBLIC: int __rep_blob_update_req __P((ENV *, DB_THREAD_INFO *, DBT *)); + */ +int +__rep_blob_update_req(env, ip, rec) + ENV *env; + DB_THREAD_INFO *ip; + DBT *rec; +{ + DBT rbudbt; + REP *rep; + __rep_blob_update_args rbu; + __rep_blob_update_req_args rbur; + db_seq_t blob_fid, blob_id, blob_sdb, tmp; + int cur, dirs_cnt, ret, sdb_cnt; + size_t sent; + char *blob_sub_dir, *dir, **dirs, **sdb; + u_int32_t num_blobs, throttle; + u_int8_t *ptr; + + memset(&rbu, 0, sizeof(__rep_blob_update_args)); + memset(&rbudbt, 0, sizeof(DBT)); + blob_sub_dir = dir = NULL; + dirs = sdb = NULL; + sent = 0; + num_blobs = 0; + cur = dirs_cnt = sdb_cnt = 0; + rep = env->rep_handle->region; + throttle = rep->gbytes * GIGABYTE + rep->bytes; + if (throttle == 0) + throttle = BLOB_THROTTLE_DEFAULT; + + if ((ret = __rep_blob_update_req_unmarshal( + env, &rbur, rec->data, rec->size, &ptr)) != 0) + goto err; + + RPRINT(env, (env, DB_VERB_REP_SYNC, +"blob_update_req: file_id %llu sdb_id %llu blob_id %llu highest %llu", + (long long)rbur.blob_fid, (long long)rbur.blob_sid, + (long long)rbur.blob_id, (long long)rbur.highest_id)); + + rbu.blob_fid = rbur.blob_fid; + + if ((ret = __os_malloc(env, MEGABYTE, &rbudbt.data)) != 0) + goto err; + rbudbt.ulen = MEGABYTE; + rbudbt.size = __REP_BLOB_UPDATE_SIZE; + + blob_fid = (db_seq_t)rbur.blob_fid; + blob_sdb = (db_seq_t)rbur.blob_sid; + blob_id = (db_seq_t)rbur.blob_id; + + /* Find the first blob file if it is unknown. */ + if (blob_id == 0 && blob_sdb == 0) { +find_sdb: if (dirs == NULL) { + if ((ret = __blob_make_sub_dir( + env, &blob_sub_dir, blob_fid, 0)) != 0) + goto err; + if ((ret = __db_appname( + env, DB_APP_BLOB, blob_sub_dir, NULL, &dir)) != 0) + goto err; + /* If no directory, there are no blobs to send. 
*/ + if (__os_exists(env, dir, NULL) != 0) + goto filedone; + + if ((ret = __os_dirlist( + env, dir, 1, &dirs, &dirs_cnt)) != 0) + goto err; + + if (dirs_cnt == 0) + goto filedone; + + if ((ret = __rep_blob_sort_dirs( + env, __rep_select_blob_sdb, + dirs, dirs_cnt, &sdb, &sdb_cnt)) != 0) + goto err; + } + /* + * Iterate through the list of subdirectories, until we find + * one that has an id larger than the current subdirectory id. + */ + while (cur < sdb_cnt) { + if ((ret = __blob_path_to_dir_ids( + env, sdb[cur], &tmp, NULL)) != 0) + goto err; + if (blob_sdb < tmp) { + blob_sdb = tmp; + break; + } + cur++; + } + /* Check if no more subdirectories to search */ + if (sdb_cnt != 0 && cur == sdb_cnt) + goto filedone; + if (dir != NULL) + __os_free(env, dir); + dir = NULL; + if (blob_sub_dir != NULL) + __os_free(env, blob_sub_dir); + blob_sub_dir = NULL; + } + + if (blob_sub_dir == NULL && (ret = + __blob_make_sub_dir(env, &blob_sub_dir, blob_fid, blob_sdb)) != 0) + goto err; + + if (dir == NULL && (ret = __db_appname( + env, DB_APP_BLOB, blob_sub_dir, NULL, &dir)) != 0) + goto err; + /* Search the current directory for blob files with id > blob_id. */ + if ((ret = __rep_blob_find_files( + env, ip, dir, &blob_id, blob_sdb, blob_fid, + (db_seq_t *)&rbur.highest_id, &rbudbt, &sent, &num_blobs)) != 0) + goto err; + + /* + * If we have not reached the send limit, and there are still + * directories to search, then search the next directory. + */ + if (sent < throttle) { + if (blob_sdb != 0) { + rbur.highest_id = 0; + blob_id = 0; + __os_free(env, blob_sub_dir); + blob_sub_dir = NULL; + __os_free(env, dir); + dir = NULL; + goto find_sdb; + } else { + /* Mark as the end of the files. 
*/ +filedone: F_SET(&rbu, BLOB_DONE); + rbur.highest_id = 0; + } + } else + STAT(rep->stat.st_nthrottles++); + + rbu.num_blobs = num_blobs; + rbu.highest_id = rbur.highest_id; + __rep_blob_update_marshal(env, &rbu, rbudbt.data); + RPRINT(env, (env, DB_VERB_REP_SYNC, + "Sending blob_update: file_id %llu, num_blobs %lu, flags %lu", + (long long)rbu.blob_fid, + (long)num_blobs, (unsigned long)rbu.flags)); + (void)__rep_send_message( + env, DB_EID_BROADCAST, REP_BLOB_UPDATE, NULL, &rbudbt, 0, 0); + +err: if (sdb != NULL) + __os_free(env, sdb); + if (dirs != NULL) + __os_dirfree(env, dirs, dirs_cnt); + if (dir != NULL) + __os_free(env, dir); + if (blob_sub_dir != NULL) + __os_free(env, blob_sub_dir); + if (rbudbt.data != NULL) + __os_free(env, rbudbt.data); + return (ret); +} + +/* + * __rep_blob_find_files + * + * Search a directory for blob files, starting with the given blob id and + * sub-database id. Add information for each blob to the message buffer until + * there are no more files, or it has reached the maximum send amount in terms + * of combined blob files size. + * + * This search is complicated because the blobs have to be sent in order by id, + * but there can be huge holes between a blob file and the one with the next + * highest id, so iterating through the ids looking to see if the file exists + * for each id will take too long. The solution is to walk the directory + * hierarchy in order, reading every file in that directory, sorting them by + * id, and adding them to the update list. 
+ */ +static int +__rep_blob_find_files( + env, ip, dir, blob_id, blob_sid, blob_fid, highest, buf, sent, num) + ENV *env; + DB_THREAD_INFO *ip; + const char *dir; + db_seq_t *blob_id; + db_seq_t blob_sid; + db_seq_t blob_fid; + db_seq_t *highest; + DBT *buf; + size_t *sent; + u_int32_t *num; +{ + DB *bmd; + DB_FH *fhp; + DB_TXN *txn; + REP *rep; + __rep_blob_file_args rbf; + char blob_path[MAX_BLOB_PATH_SZ], **dirs, **files, *path, *ptr; + db_seq_t tmp; + int blob_path_len, cur, depth, dirs_cnt, files_cnt, ret; + off_t blob_size; + size_t len; + u_int32_t bytes, mbytes, throttle; + + bmd = NULL; + txn = NULL; + fhp = NULL; + path = NULL; + dirs = files = NULL; + dirs_cnt = files_cnt = 0; + rbf.blob_sid = (u_int64_t)blob_sid; + rep = env->rep_handle->region; + throttle = rep->gbytes * GIGABYTE + rep->bytes; + if (throttle == 0) + throttle = BLOB_THROTTLE_DEFAULT; + + if ((ret = __os_malloc( + env, strlen(dir) + MAX_BLOB_PATH_SZ, &path)) != 0) + goto err; + + /* + * Read the highest possible blob id from the blob meta database, so + * we know when to stop looking for files for this database. The + * highest value is reset everytime we switch to a new subdatabase. + */ + if (*highest == 0) { + if ((ret = __db_create_internal(&bmd, env, 0)) != 0) + goto err; + + if ((ret = __txn_begin( + env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0) + goto err; + + bmd->blob_file_id = blob_fid; + bmd->blob_sdb_id = blob_sid; + if ((ret = __blob_highest_id(bmd, txn, highest) ) != 0) + goto err; + + if ((ret = __txn_abort(txn)) != 0) + goto err; + txn = NULL; + if ((ret = __db_close(bmd, NULL, 0)) != 0) + goto err; + bmd = NULL; + (*highest)++; + } + + (*blob_id)++; + while (*sent < throttle && *blob_id < *highest) { + memset(blob_path, 0, MAX_BLOB_PATH_SZ); + blob_path_len = depth = 0; + + /* Calucate the subdirectory from the blob id. 
*/ + __blob_calculate_dirs( + *blob_id, blob_path, &blob_path_len, &depth); + if (blob_path_len != 0) { + (void)sprintf(path, "%s%c%s%c", + dir, PATH_SEPARATOR[0], blob_path, PATH_SEPARATOR[0]); + } else + (void)sprintf(path, "%s", dir); + len = strlen(path); + + /* If the sub-directory does not exist, look for the next. */ + if (__os_exists(env, path, NULL) != 0) { + (*blob_id) += + BLOB_DIR_ELEMS - (*blob_id % BLOB_DIR_ELEMS); + continue; + } + + /* Get a list of all the blob files, sorted by id. */ + if ((ret = __os_dirlist(env, path, 0, &dirs, &dirs_cnt)) != 0) + goto err; + + if ((ret = __rep_blob_sort_dirs(env, __rep_select_blob_file, + dirs, dirs_cnt, &files, &files_cnt)) != 0) + goto err; + + /* + * Find the first blob file with an id greater than or equal to + * the last id. + */ + for (cur = 0; cur < files_cnt; cur++) { + ptr = files[cur]; + ptr += strlen(BLOB_FILE_PREFIX); + if ((ret = __blob_str_to_id( + env, (const char **)&ptr, &tmp)) != 0) + goto err; + DB_ASSERT(env, tmp != 0); + if (tmp >= *blob_id) + break; + } + + /* Add each remaining blob file to the message buffer. */ + while (cur < files_cnt) { + /* Get the blob id from the current file name. */ + (void)sprintf(path + len, "%s", files[cur]); + ptr = path + len + strlen(BLOB_FILE_PREFIX); + if ((ret = __blob_str_to_id( + env, (const char **)&ptr, blob_id)) != 0) + goto err; + rbf.blob_id = (u_int64_t)*blob_id; + /* Open the file and get its size. 
*/ + if ((ret = __os_open( + env, path, 0, DB_OSO_RDONLY, 0, &fhp)) != 0) { + if (ret == ENOENT) { + ret = 0; + RPRINT(env, (env, DB_VERB_REP_SYNC, + "blob_update blob file: %llu deleted, skipping.", + (long long)rbf.blob_id)); + cur++; + continue; + } + goto err; + } + if ((ret = __os_ioinfo( + env, path, fhp, &mbytes, &bytes, NULL)) != 0) + goto err; + if ((ret =__os_closehandle(env, fhp)) != 0) + goto err; + fhp = NULL; + blob_size = ((off_t)mbytes * (off_t)MEGABYTE) + bytes; + rbf.blob_size = (u_int64_t)blob_size; + if (blob_size > UINT32_MAX) + (*sent) = throttle + 1; + else { + if (((*sent) + (size_t)blob_size) < (*sent)) + (*sent) = throttle + 1; + else + (*sent) += (size_t)blob_size; + } + __rep_blob_file_marshal( + env, &rbf, (u_int8_t *)buf->data + buf->size); + (*num)++; + buf->size += __REP_BLOB_FILE_SIZE; + RPRINT(env, (env, DB_VERB_REP_SYNC, + "blob_update adding: blob_sid %llu, blob_id %llu blob_size %llu", + (long long)rbf.blob_sid, + (long long)rbf.blob_id, (long long)rbf.blob_size)); + if ((*sent) > throttle) + goto err; + + /* Resize if there is not enough space to grow. */ + if (buf->size > (buf->ulen - __REP_BLOB_FILE_SIZE)) { + if ((ret = __os_realloc( + env, buf->ulen * 2, &buf->data)) != 0) + goto err; + buf->ulen *= 2; + } + cur++; + } + /* + * Move to the next directory of blob files by setting the blob + * id to the next largest possible value. 
+ */ + (*blob_id) += BLOB_DIR_ELEMS - (*blob_id % BLOB_DIR_ELEMS); + __os_free(env, files); + files = NULL; + __os_dirfree(env, dirs, dirs_cnt); + dirs = NULL; + } +err: + if (path != NULL) + __os_free(env, path); + if (files != NULL) + __os_free(env, files); + if (dirs != NULL) + __os_dirfree(env, dirs, dirs_cnt); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (txn != NULL) + (void)__txn_abort(txn); + if (bmd != NULL) + (void)__db_close(bmd, NULL, 0); + + return (ret); +} + +/* * __rep_find_dbs - * Walk through all the named files/databases including those in the * environment or data_dirs and those that in named and in-memory. We @@ -240,7 +734,8 @@ __rep_find_dbs(env, context) * replicated user databases. If the application has a metadata_dir, * this will also find any persistent internal system databases. */ - if (dbenv->db_data_dir != NULL) { + if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) && + dbenv->db_data_dir != NULL) { for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) { if ((ret = __db_appname(env, DB_APP_NONE, *ddir, NULL, &real_dir)) != 0) @@ -252,16 +747,24 @@ __rep_find_dbs(env, context) real_dir = NULL; } } + /* * Walk the environment directory. If the application doesn't * have a metadata_dir, this will return persistent internal system * databases. If the application doesn't have a separate data * directory, this will also return all user databases. */ - if (ret == 0) + if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) && ret == 0) ret = __rep_walk_dir(env, env->db_home, NULL, context); - /* Now, collect any in-memory named databases. */ + /* Gather the databases in the blob directory. */ + if (!F_ISSET(context, FILE_CTX_INMEM_ONLY) && ret == 0) + ret = __rep_walk_blob_dir(env, context); + + /* + * Now, collect any in-memory named databases. We do this no + * matter if the INMEM_ONLY flag is set or not. 
+ */ if (ret == 0) ret = __rep_walk_dir(env, NULL, NULL, context); @@ -271,6 +774,148 @@ __rep_find_dbs(env, context) } /* + * __rep_walk_blob_dir -- + * + * The blob directory hierarchy consists of a top layer that contains the + * blob meta database (BMD) and a set of blob directories (BLDIR). + * Each BLDIR corresponds to a database file. If the database file doesn't + * contain subdatabases, the BLDIR contains a BMD and blob files. If the + * database file contains subdatabases, the BLDIR contains a BLSDIR + * subdirectory for each subdatabase. Each BLSDIR contains a BMD and blob + * files. + * + * This function walks the blob directory hierarchy and records any BMD. + * It first checks if the top level BMD exists, and if it does searches + * the first and second layers of the hierarchy for BMDs. + */ +static int +__rep_walk_blob_dir(env, context) + ENV *env; + FILE_LIST_CTX *context; +{ + int cnt, cnt2, i, j, ret; + size_t len; + char *blob_dir, *blob_sub, **dirs, *name, *name2, **subdirs; + char blob_sub_buf[MAX_BLOB_PATH_SZ]; + const char *bmd, *dirp; + + cnt = cnt2 = 0; + blob_dir = name = name2 = NULL; + dirs = subdirs = NULL; + bmd = BLOB_META_FILE_NAME; + blob_sub = blob_sub_buf; + + if ((ret = __db_appname( + env, DB_APP_BLOB, BLOB_META_FILE_NAME, &dirp, &name)) != 0) + goto err; + + /* + * If the main blob meta database does not exist, then no databases in + * the environment supports blobs. + */ + if ((ret = __os_exists(env, name, NULL)) != 0) { + ret = 0; + goto err; + } + + /* Get the blob directory. 
*/ + if ((ret = __db_appname( + env, DB_APP_BLOB, NULL, &dirp, &blob_dir)) != 0) + goto err; + + if ((ret = __rep_add_files_to_list( + env, blob_dir, NULL, context, &bmd, 1)) != 0) + goto err; + + if ((ret = __os_dirlist(env, blob_dir, 1, &dirs, &cnt)) != 0) + goto err; + + __os_free(env, name); + name = NULL; + if ((ret = __os_malloc( + env, MAX_BLOB_PATH_SZ + strlen(blob_dir), &name)) != 0) + goto err; + + for (i = 0; i < cnt; i++) { + /* + * Skip blob files and the top level BMD + * (which was handled above). + */ + if (IS_BLOB_META(dirs[i]) || IS_BLOB_FILE(dirs[i])) + continue; + len = strlen(blob_dir) + + strlen(dirs[i]) + strlen(BLOB_META_FILE_NAME) + 3; + (void)snprintf(name, len, "%s%c%s%c%s", blob_dir, + PATH_SEPARATOR[0], dirs[i], PATH_SEPARATOR[0], + BLOB_META_FILE_NAME); + /* + * If a blob meta database exists, add it to the list, and move + * on to the next directory, otherwise get a directory list and + * check the second layer for BMD. If a directory contains a + * BMD, then it cannot contain subdirectories with BMD. 
+ */ + if (__os_exists(env, name, NULL) == 0) { + (void)snprintf(blob_sub, + strlen(dirs[i]) + strlen(bmd) + 2, + "%s%c%s", dirs[i], PATH_SEPARATOR[0], bmd); + if ((ret = __rep_add_files_to_list(env, blob_dir, + NULL, context, (const char **)&blob_sub, 1)) != 0) + goto err; + } else { + len = strlen(blob_dir) + strlen(dirs[i]) + 2; + (void)snprintf(name, len, "%s%c%s", + blob_dir, PATH_SEPARATOR[0], dirs[i]); + if ((ret = __os_dirlist( + env, name, 1, &subdirs, &cnt2)) != 0) + goto err; + if (name2 == NULL) { + if ((ret = __os_malloc(env, + MAX_BLOB_PATH_SZ + strlen(name), + &name2)) != 0) + goto err; + } + for (j = 0; j < cnt2; j++) { + if (IS_BLOB_FILE(subdirs[j])) + continue; + len = strlen(name) + strlen(subdirs[j]) + + strlen(BLOB_META_FILE_NAME) + 3; + (void)snprintf(name2, len, "%s%c%s%c%s", + name, PATH_SEPARATOR[0], subdirs[j], + PATH_SEPARATOR[0], BLOB_META_FILE_NAME); + if ((ret = __os_exists( + env, name2, NULL)) == 0) { + len = strlen(dirs[i]) + + strlen(subdirs[j]) + + strlen(bmd) + 3; + (void)snprintf(blob_sub, + len, "%s%c%s%c%s", dirs[i], + PATH_SEPARATOR[0], subdirs[j], + PATH_SEPARATOR[0], bmd); + if ((ret = __rep_add_files_to_list( + env, blob_dir, NULL, context, + (const char **)&blob_sub, 1)) != 0) + goto err; + } + } + __os_dirfree(env, subdirs, cnt2); + subdirs = NULL; + } + } + +err: if (name != NULL) + __os_free(env, name); + if (name2 != NULL) + __os_free(env, name2); + if (blob_dir != NULL) + __os_free(env, blob_dir); + if (dirs != NULL) + __os_dirfree(env, dirs, cnt); + if (subdirs != NULL) + __os_dirfree(env, subdirs, cnt2); + return (ret); +} + +/* * __rep_walk_dir -- * * This is the routine that walks a directory and fills in the structures @@ -284,11 +929,8 @@ __rep_walk_dir(env, dir, datadir, context) const char *dir, *datadir; FILE_LIST_CTX *context; { - __rep_fileinfo_args tmpfp; - size_t avail, len; - int cnt, first_file, i, ret; - u_int8_t uid[DB_FILE_ID_LEN]; - char *file, **names, *subdb; + int cnt, ret; + char **names; if 
(dir == NULL) { VPRINT(env, (env, DB_VERB_REP_SYNC, @@ -304,7 +946,34 @@ __rep_walk_dir(env, dir, datadir, context) } VPRINT(env, (env, DB_VERB_REP_SYNC, "Walk_dir: Dir %s has %d files", (dir == NULL) ? "INMEM" : dir, cnt)); + ret = __rep_add_files_to_list( + env, dir, datadir, context, (const char **)names, cnt); + + __os_dirfree(env, names, cnt); + return (ret); +} + +/* + * __rep_add_files_to_list -- + * + * Add the given files to the file list. + */ +static int +__rep_add_files_to_list(env, dir, datadir, context, names, cnt) + ENV *env; + const char *dir, *datadir; + FILE_LIST_CTX *context; + const char **names; + int cnt; +{ + __rep_fileinfo_args tmpfp; + size_t avail, len; + int first_file, i, ret; + u_int8_t uid[DB_FILE_ID_LEN]; + const char *file, *subdb; + first_file = 1; + ret = 0; for (i = 0; i < cnt; i++) { VPRINT(env, (env, DB_VERB_REP_SYNC, "Walk_dir: File %d name: %s", i, names[i])); @@ -372,15 +1041,19 @@ __rep_walk_dir(env, dir, datadir, context) DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN); retry: avail = (size_t)(&context->buf[context->size] - context->fillptr); + /* + * It is safe to cast to the old structs + * because the first part of the current + * struct matches the old structs. + */ if (context->version < DB_REPVERSION_53) - /* - * It is safe to cast to the old struct - * because the first part of the current - * struct matches the old struct. 
- */ ret = __rep_fileinfo_v6_marshal(env, context->version, (__rep_fileinfo_v6_args *)&tmpfp, context->fillptr, avail, &len); + else if (context->version < DB_REPVERSION_61) + ret = __rep_fileinfo_v7_marshal(env, context->version, + (__rep_fileinfo_v7_args *)&tmpfp, + context->fillptr, avail, &len); else ret = __rep_fileinfo_marshal(env, context->version, &tmpfp, context->fillptr, avail, &len); @@ -409,9 +1082,7 @@ retry: avail = (size_t)(&context->buf[context->size] - */ context->fillptr += len; } -err: - __os_dirfree(env, names, cnt); - return (ret); +err: return (ret); } /* @@ -430,7 +1101,7 @@ __rep_is_replicated_db(name, dir) /* * Remaining things that don't have a "__db" prefix are eligible. */ - if (!IS_DB_FILE(name)) + if (!IS_DB_FILE(name) || IS_BLOB_META(name)) return (1); /* Here, we know we have a "__db" name. */ @@ -470,7 +1141,7 @@ __rep_check_uid(env, rfp, uid) if (memcmp(rfp->uid.data, uid, DB_FILE_ID_LEN) == 0) { VPRINT(env, (env, DB_VERB_REP_SYNC, "Check_uid: Found matching file.")); - ret = DB_KEYEXIST; + ret = USR_ERR(env, DB_KEYEXIST); } return (ret); @@ -489,6 +1160,7 @@ __rep_get_fileinfo(env, file, subdb, rfp, uid) DB_THREAD_INFO *ip; PAGE *pagep; int lorder, ret, t_ret; + u_int32_t flags; dbp = NULL; dbc = NULL; @@ -503,11 +1175,15 @@ __rep_get_fileinfo(env, file, subdb, rfp, uid) * database handles would block the master from handling UPDATE_REQ. */ F_SET(dbp, DB_AM_RECOVER); - if ((ret = __db_open(dbp, ip, NULL, file, subdb, DB_UNKNOWN, - DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0), - 0, PGNO_BASE_MD)) != 0) + flags = DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? 
DB_THREAD : 0); + if (file != NULL && IS_BLOB_META(file)) + LF_SET(DB_INTERNAL_BLOB_DB); + if ((ret = __db_open(dbp, ip, NULL, + file, subdb, DB_UNKNOWN, flags, 0, PGNO_BASE_MD)) != 0) goto err; + SET_LO_HI_VAR(dbp->blob_file_id, rfp->blob_fid_lo, rfp->blob_fid_hi); + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) goto err; if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn, @@ -574,6 +1250,7 @@ __rep_page_req(env, ip, eid, rp, rec) { __rep_fileinfo_args *msgfp, msgf; __rep_fileinfo_v6_args *msgfpv6; + __rep_fileinfo_v7_args *msgfpv7; DB_MPOOLFILE *mpf; DB_REP *db_rep; REP *rep; @@ -584,21 +1261,30 @@ __rep_page_req(env, ip, eid, rp, rec) db_rep = env->rep_handle; rep = db_rep->region; + /* + * Build a current struct by copying in the older + * version struct and then setting up the new fields. + * This is safe because all old fields are in the + * same location in the current struct. + */ if (rp->rep_version < DB_REPVERSION_53) { - /* - * Build a current struct by copying in the older - * version struct and then setting up the data_dir. - * This is safe because all old fields are in the - * same location in the current struct. 
- */ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version, &msgfpv6, rec->data, rec->size, &next)) != 0) return (ret); memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args)); msgf.dir.data = NULL; msgf.dir.size = 0; + msgf.blob_fid_lo = msgf.blob_fid_hi = 0; msgfp = &msgf; msgfree = msgfpv6; + } else if (rp->rep_version < DB_REPVERSION_61) { + if ((ret = __rep_fileinfo_v7_unmarshal(env, rp->rep_version, + &msgfpv7, rec->data, rec->size, &next)) != 0) + return (ret); + memcpy(&msgf, msgfpv7, sizeof(__rep_fileinfo_v7_args)); + msgf.blob_fid_lo = msgf.blob_fid_hi = 0; + msgfp = &msgf; + msgfree = msgfpv7; } else { if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version, &msgfp, rec->data, rec->size, &next)) != 0) @@ -624,7 +1310,7 @@ __rep_page_req(env, ip, eid, rp, rec) (void)__rep_send_message(env, eid, REP_FILE_FAIL, NULL, rec, 0, 0); else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } @@ -738,7 +1424,7 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp) #ifdef HAVE_QUEUE if ((ret = __qam_fget(qdbc, &p, 0, &pagep)) == ENOENT) #endif - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); } else ret = __memp_fget(mpf, &p, ip, NULL, 0, &pagep); msgfp->pgno = p; @@ -748,16 +1434,21 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp) RPRINT(env, (env, DB_VERB_REP_SYNC, "sendpages: PAGE_FAIL on page %lu", (u_long)p)); + /* + * It is safe to cast to the old structs + * because the first part of the current + * struct matches the old structs. + */ if (rp->rep_version < DB_REPVERSION_53) - /* - * It is safe to cast to the old struct - * because the first part of the current - * struct matches the old struct. 
- */ ret = __rep_fileinfo_v6_marshal(env, rp->rep_version, (__rep_fileinfo_v6_args *)msgfp, buf, msgsz, &len); + else if (rp->rep_version < DB_REPVERSION_61) + ret = __rep_fileinfo_v7_marshal(env, + rp->rep_version, + (__rep_fileinfo_v7_args *)msgfp, + buf, msgsz, &len); else ret = __rep_fileinfo_marshal(env, rp->rep_version, msgfp, buf, @@ -772,7 +1463,7 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp) REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0); continue; } else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } else if (ret != 0) goto err; @@ -796,16 +1487,21 @@ __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp) RPRINT(env, (env, DB_VERB_REP_SYNC, "sendpages: %lu, page lsn [%lu][%lu]", (u_long)p, (u_long)pagep->lsn.file, (u_long)pagep->lsn.offset)); + /* + * It is safe to cast to the old structs + * because the first part of the current + * structs matches the old struct. + */ if (rp->rep_version < DB_REPVERSION_53) - /* - * It is safe to cast to the old struct - * because the first part of the current - * struct matches the old struct. - */ ret = __rep_fileinfo_v6_marshal(env, rp->rep_version, (__rep_fileinfo_v6_args *)msgfp, buf, msgsz, &len); + else if (rp->rep_version < DB_REPVERSION_61) + ret = __rep_fileinfo_v7_marshal(env, + rp->rep_version, + (__rep_fileinfo_v7_args *)msgfp, + buf, msgsz, &len); else ret = __rep_fileinfo_marshal(env, rp->rep_version, msgfp, buf, msgsz, &len); @@ -1010,7 +1706,8 @@ __rep_update_setup(env, eid, rp, rec, savetime, lsn) ZERO_LSN(lp->waiting_lsn); ZERO_LSN(lp->max_wait_lsn); ZERO_LSN(lp->max_perm_lsn); - if (db_rep->rep_db == NULL) + ret = __rep_blob_cleanup(env, rep); + if (ret == 0 && db_rep->rep_db == NULL) ret = __rep_client_dbinit(env, 0, REP_DB); MUTEX_UNLOCK(env, rep->mtx_clientdb); if (ret != 0) @@ -1148,6 +1845,337 @@ err: /* return (ret); } +/* + * __rep_blob_update + * Prepare to receive blob file data by setting up the blob gap database, + * then requesting the blob file data. 
+ * + * PUBLIC: int __rep_blob_update __P((ENV *, int, DB_THREAD_INFO *, DBT *)); + */ +int +__rep_blob_update(env, eid, ip, rec) + ENV *env; + int eid; + DB_THREAD_INFO *ip; + DBT *rec; +{ + DBC *dbc; + DB_REP *db_rep; + DBT data, key; + REP *rep; + REGINFO *infop; + __rep_blob_file_args rbf; + __rep_blob_update_args rbu; + __rep_fileinfo_args *rfp; + db_seq_t blob_fid; + int ret; + off_t offset; + size_t len; + u_int32_t num_blobs; + u_int8_t keybuf[BLOB_KEY_SIZE], *ptr; + + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + rfp = NULL; + dbc = NULL; + memset(&rbu, 0, sizeof(__rep_blob_update_args)); + memset(&rbf, 0, sizeof(__rep_blob_file_args)); + + if ((ret = __rep_blob_update_unmarshal( + env, &rbu, rec->data, rec->size, &ptr)) != 0) + return (ret); + len = rec->size - __REP_BLOB_UPDATE_SIZE; + + RPRINT(env, (env, DB_VERB_REP_SYNC, +"blob_update: file_id %llu, num_blobs %lu, flags %lu, highest %llu", + (long long)rbu.blob_fid, (long)rbu.num_blobs, + (unsigned long)rbu.flags, (long long)rbu.highest_id)); + + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + + /* + * Check if the world changed. + */ + if (rep->sync_state != SYNC_PAGE) + goto unlock; + + /* Make sure this is for the current database. */ + GET_CURINFO(rep, infop, rfp); + GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret); + if (ret != 0) + goto unlock; + + if (blob_fid != (db_seq_t)rbu.blob_fid) + goto unlock; + + rep->highest_id = (db_seq_t)rbu.highest_id; + /* + * For each blob file, add an entry to the database for each 1 MB + * section of that file. The entries will be deleted as the + * corresponding blob chunks arrive and are written to disk. 
+ */ + if (db_rep->blob_dbp == NULL && + (ret = __rep_client_dbinit(env, 0, REP_BLOB)) != 0) + goto unlock; + + if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0) + goto unlock; + + /* + * Make sure no one else has populated the database, this could happen + * if the update message is sent twice. + */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != DB_NOTFOUND) + goto unlock; + + /* It is possible for a blob database to have no blobs. */ + if (rbu.num_blobs == 0) { + (void)__dbc_close(dbc); + dbc = NULL; + rep->blob_more_files = 0; + rep->gap_bl_hi_id = rep->gap_bl_hi_sid = 0; + rep->last_blob_id = rep->last_blob_sid = 0; + rep->prev_blob_id = rep->prev_blob_sid = 0; + rep->gap_bl_hi_off = 0; + rep->blob_sync = 0; + rep->highest_id = 0; + rep->blob_rereq = 0; + ret = __rep_blobdone(env, eid, ip, rep, blob_fid, 0); + goto unlock; + } + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + data.flags = key.flags = DB_DBT_USERMEM; + key.data = keybuf; + key.ulen = key.size = BLOB_KEY_SIZE; + data.data = (void *)&offset; + data.ulen = data.size = sizeof(offset); + num_blobs = 0; + while (num_blobs < rbu.num_blobs) { + if ((ret = + __rep_blob_file_unmarshal(env, &rbf, ptr, len, &ptr)) != 0) + goto unlock; + len -= __REP_BLOB_FILE_SIZE; + + RPRINT(env, (env, DB_VERB_REP_SYNC, + "blob_update adding file: blob_id %llu, sdb_id %llu, blob_size %llu", + (long long)rbf.blob_id, (long long)rbf.blob_sid, + (long long)rbf.blob_size)); + + memcpy(keybuf, &rbf.blob_sid, BLOB_ID_SIZE); + memcpy(&(keybuf[BLOB_ID_SIZE]), &rbf.blob_id, BLOB_ID_SIZE); + offset = 0; + /* + * Add an entry for each megabyte of the blob file. Zero + * length blob files should have at least one entry. 
+ */ + do { + if ((ret = __dbc_put(dbc, &key, &data, 0)) != 0) + goto unlock; + offset += MEGABYTE; + /* + * Check for overflow, this can happen when the master + * supports 64-bit file offsets, but the client does not. + */ + if (offset < 0) { + __db_errx(env, + DB_STR("3704", + "Blob file offset overflow")); + ret = EINVAL; + goto unlock; + } + /* + * Compare at full 64-bit width, as __rep_blob_allreq does; + * truncating offset to 32 bits wraps at 4GB, so the loop + * would not stop for blobs that large. + */ + } while ((u_int64_t)offset < rbf.blob_size); + num_blobs++; + } + /* Set whether there are more files after the ones on the list. */ + if (F_ISSET(&rbu, BLOB_DONE)) + rep->blob_more_files = 0; + else + rep->blob_more_files = 1; + rep->prev_blob_id = rep->last_blob_id; + rep->prev_blob_sid = rep->last_blob_sid; + rep->last_blob_sid = (db_seq_t)rbf.blob_sid; + rep->last_blob_id = (db_seq_t)rbf.blob_id; + + /* + * Send the same message payload in a REP_BLOB_ALL_REQ message to get + * the blob data. Peer-to-peer initialization is not supported for + * blobs, so we can only send this back to the master despite the fact + * that building the list of blob files is expensive. + */ + (void)__rep_send_message( + env, rep->master_id, REP_BLOB_ALL_REQ, NULL, rec, 0, 0); + +unlock: REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (dbc != NULL) + (void)__dbc_close(dbc); + + return (ret); +} + +/* + * __rep_blob_allreq + * Request blob file data. 
+ * + * PUBLIC: int __rep_blob_allreq __P((ENV *, int, DBT *)); + */ +int +__rep_blob_allreq(env, eid, rec) + ENV *env; + int eid; + DBT *rec; +{ + DB *dbp; + DB_FH *fhp; + DBT msg; + __rep_blob_chunk_args rbc; + __rep_blob_file_args rbf; + __rep_blob_update_args rbu; + db_seq_t old_sdb_id; + int done, ret; + off_t offset; + size_t len; + u_int32_t num_blobs; + u_int8_t *chunk_buf, *msg_buf, *ptr; + + dbp = NULL; + fhp = NULL; + chunk_buf = msg_buf = NULL; + memset(&rbu, 0, sizeof(__rep_blob_update_args)); + memset(&rbc, 0, sizeof(__rep_blob_chunk_args)); + memset(&msg, 0, sizeof(DBT)); + + if ((ret = + __os_malloc(env, MEGABYTE + __REP_BLOB_CHUNK_SIZE, &msg_buf)) != 0) + goto err; + msg.data = msg_buf; + msg.ulen = MEGABYTE + __REP_BLOB_CHUNK_SIZE; + if ((ret = __os_malloc(env, MEGABYTE, &chunk_buf)) != 0) + goto err; + rbc.data.data = chunk_buf; + rbc.data.ulen = MEGABYTE; + rbc.data.flags = DB_DBT_USERMEM; + + /* + * The REP_BLOB_ALL_REQ message sends the REP_BLOB_UPDATE message + * payload back to the master to request the actual blobs after the + * client has prepared itself to receive them. + */ + len = rec->size; + if ((ret = __rep_blob_update_unmarshal( + env, &rbu, rec->data, rec->size, &ptr)) != 0) + goto err; + len -= __REP_BLOB_UPDATE_SIZE; + + RPRINT(env, (env, DB_VERB_REP_SYNC, + "blob_all_req: file_id %llu, num_blobs %lu, flags %lu", + (long long)rbu.blob_fid, (long)rbu.num_blobs, + (unsigned long)rbu.flags)); + + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto err; + dbp->blob_file_id = (db_seq_t)rbu.blob_fid; + rbc.blob_fid = rbu.blob_fid; + num_blobs = 0; + /* + * The list of files to send is included in the message, go + * through the list and send each file in pieces. 
+ */ + while (num_blobs < rbu.num_blobs) { + num_blobs++; + if ((ret = __rep_blob_file_unmarshal( + env, &rbf, ptr, len, &ptr)) != 0) + goto err; + len -= __REP_BLOB_FILE_SIZE; + old_sdb_id = dbp->blob_sdb_id; + dbp->blob_sdb_id = (db_seq_t)rbf.blob_sid; + rbc.flags = 0; + rbc.blob_sid = rbf.blob_sid; + rbc.blob_id = rbf.blob_id; + /* Free the sub-directory information if it has changed. */ + if (old_sdb_id != dbp->blob_sdb_id && + dbp->blob_sub_dir != NULL) { + __os_free(env, dbp->blob_sub_dir); + dbp->blob_sub_dir = NULL; + } + if (dbp->blob_sub_dir == NULL) { + if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir, + dbp->blob_file_id, dbp->blob_sdb_id)) != 0) + goto err; + } + if ((ret = __blob_file_open(dbp, + &fhp, (db_seq_t)rbf.blob_id, DB_FOP_READONLY, 0)) != 0) { + /* + * The file may have been deleted between creating the + * list and sending the data. Send a message saying + * the file has been deleted. + */ + if (ret == ENOENT) { + F_SET(&rbc, BLOB_DELETE); + rbc.data.size = 0; + __rep_blob_chunk_marshal(env, &rbc, msg.data); + msg.size = __REP_BLOB_CHUNK_SIZE; + (void)__rep_send_message(env, + eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0); + ret = 0; + fhp = NULL; + continue; + } + goto err; + } + offset = 0; + do { + done = 0; + rbc.flags = 0; + if ((ret = __blob_file_read( + env, fhp, &rbc.data, offset, MEGABYTE)) != 0) + goto err; + DB_ASSERT(env, rbc.data.size <= MEGABYTE); + + /* + * In rare cases the blob file may have gotten shorter + * since the list was created. + */ + if (rbc.data.size < (u_int32_t)MEGABYTE && (u_int64_t) + (offset + rbc.data.size) < rbf.blob_size) { + F_SET(&rbc, BLOB_CHUNK_FAIL); + done = 1; + } + /* File may have grown since the list was made. 
*/ + if ((u_int64_t) + (offset + rbc.data.size) > rbf.blob_size) { + rbc.data.size = + (u_int32_t)((off_t)rbf.blob_size - offset); + } + rbc.offset = (u_int64_t)offset; + __rep_blob_chunk_marshal(env, &rbc, msg.data); + msg.size = __REP_BLOB_CHUNK_SIZE + rbc.data.size; + (void)__rep_send_message( + env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0); + offset += MEGABYTE; + } while ((u_int64_t)offset < rbf.blob_size && !done); + + /* + * Forget the handle before testing the result so that a + * failed close is not repeated on the error path. + */ + if (fhp != NULL) { + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0) + goto err; + } + } +err: if (chunk_buf != NULL) + __os_free(env, chunk_buf); + if (msg_buf != NULL) + __os_free(env, msg_buf); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (dbp != NULL) + (void)__db_close(dbp, NULL, 0); + return (ret); +} + static int __rep_find_inmem(env, rfp, unused) ENV *env; @@ -1157,6 +2185,11 @@ __rep_find_inmem(env, rfp, unused) COMPQUIET(env, NULL); COMPQUIET(unused, NULL); + /* + * Cannot assume all databases are in-memory because abbreviated + * internal inits from 5.3 and earlier are not limited to in-memory + * databases. + */ return (FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? DB_KEYEXIST : 0); } @@ -1172,12 +2205,9 @@ __rep_remove_nimdbs(env) FILE_LIST_CTX context; int ret; - if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) + if ((ret = __rep_init_file_list_context(env, + DB_REPVERSION, 0, 0, &context)) != 0) return (ret); - context.size = MEGABYTE; - context.count = 0; - context.fillptr = context.buf; - context.version = DB_REPVERSION; /* NB: "NULL" asks walk_dir to consider only in-memory DBs */ if ((ret = __rep_walk_dir(env, NULL, NULL, &context)) != 0) @@ -1240,14 +2270,11 @@ __rep_remove_all(env, msg_version, rec) * 1. Get list of databases currently present at this client, which we * intend to remove. */ - if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) - return (ret); - context.size = MEGABYTE; - context.count = 0; - context.version = DB_REPVERSION; /* Reserve space for the marshaled update_args. 
*/ - context.fillptr = FIRST_FILE_PTR(context.buf); + if ((ret = __rep_init_file_list_context(env, + DB_REPVERSION, 0, 1, &context)) != 0) + return (ret); if ((ret = __rep_find_dbs(env, &context)) != 0) goto out; @@ -1333,6 +2360,9 @@ __rep_remove_all(env, msg_version, rec) FIRST_FILE_PTR(context.buf), context.size, context.count, __rep_remove_file, NULL)) != 0) goto out; + /* Remove the blob directory. */ + if ((ret = __blob_del_hierarchy(env)) != 0) + goto out; /* * 4. Safe-store the (new) list of database files we intend to copy from @@ -1445,6 +2475,8 @@ __rep_remove_file(env, rfp, unused) #ifdef HAVE_QUEUE DB_THREAD_INFO *ip; #endif + APPNAME appname; + db_seq_t blob_fid, blob_sid; char *name; int ret, t_ret; @@ -1496,29 +2528,53 @@ __rep_remove_file(env, rfp, unused) * That will only have removed extent files. Now * we need to deal with the actual file itself. */ + appname = __rep_is_internal_rep_file(rfp->info.data) ? + DB_APP_META : (IS_BLOB_META(rfp->info.data) ? + DB_APP_BLOB : DB_APP_DATA); if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) { if ((ret = __db_create_internal(&dbp, env, 0)) != 0) return (ret); MAKE_INMEM(dbp); F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */ ret = __db_inmem_remove(dbp, NULL, name); - } else if ((ret = __fop_remove(env, - NULL, rfp->uid.data, name, (const char **)&rfp->dir.data, - __rep_is_internal_rep_file(rfp->info.data) ? - DB_APP_META : DB_APP_DATA, 0)) != 0) + } else if ((ret = __fop_remove(env, NULL, rfp->uid.data, name, + (const char **)&rfp->dir.data, appname, 0)) != 0) { /* * If fop_remove fails, it could be because * the client has a different data_dir * structure than the master. Retry with the - * local, default settings. + * local, default settings. */ ret = __fop_remove(env, - NULL, rfp->uid.data, name, NULL, - __rep_is_internal_rep_file(rfp->info.data) ? 
- DB_APP_META : DB_APP_DATA, 0); -#ifdef HAVE_QUEUE -out: + NULL, rfp->uid.data, name, NULL, appname, 0); +#ifdef DB_WIN32 + /* + * Deleting a blob meta database can result in a + * ERROR_PATH_NOT_FOUND error on windows, so treat + * that as an ENOENT. + */ + if (__os_posix_err(ret) == ENOENT) + ret = ENOENT; #endif + } + /* Clean any blob directories. */ + if (ret == 0 && appname == DB_APP_BLOB) { + /* dbp has not been set, since queues do not support blobs. */ + DB_ASSERT(env, dbp == NULL); + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto out; + if ((ret = __blob_path_to_dir_ids( + env, name, &blob_fid, &blob_sid)) != 0) + goto out; + /* blob_fid == 0 if it is the top level blob meta db. */ + if (blob_fid != 0) { + dbp->blob_file_id = blob_fid; + dbp->blob_sdb_id = blob_sid; + if ((ret = __blob_del_all(dbp, NULL, 0)) != 0) + goto out; + } + } +out: if (dbp != NULL && (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) ret = t_ret; @@ -1610,10 +2666,11 @@ __rep_page(env, ip, eid, rp, rec) { DB_REP *db_rep; - DBT key, data; + DBT data, key; REP *rep; __rep_fileinfo_args *msgfp, msgf; __rep_fileinfo_v6_args *msgfpv6; + __rep_fileinfo_v7_args *msgfpv7; db_recno_t recno; int ret; char *msg; @@ -1647,21 +2704,30 @@ __rep_page(env, ip, eid, rp, rec) (u_long)rep->first_lsn.offset)); return (DB_REP_PAGEDONE); } + /* + * Build a current struct by copying in the older + * version struct and then setting up the new fields. + * This is safe because all old fields are in the + * same location in the current struct. + */ if (rp->rep_version < DB_REPVERSION_53) { - /* - * Build a current struct by copying in the older - * version struct and then setting up the data_dir. - * This is safe because all old fields are in the - * same location in the current struct. 
- */ if ((ret = __rep_fileinfo_v6_unmarshal(env, rp->rep_version, &msgfpv6, rec->data, rec->size, NULL)) != 0) return (ret); memcpy(&msgf, msgfpv6, sizeof(__rep_fileinfo_v6_args)); msgf.dir.data = NULL; msgf.dir.size = 0; + msgf.blob_fid_lo = msgf.blob_fid_hi = 0; msgfp = &msgf; msgfree = msgfpv6; + } else if (rp->rep_version < DB_REPVERSION_61) { + if ((ret = __rep_fileinfo_v7_unmarshal(env, rp->rep_version, + &msgfpv7, rec->data, rec->size, NULL)) != 0) + return (ret); + memcpy(&msgf, msgfpv7, sizeof(__rep_fileinfo_v7_args)); + msgf.blob_fid_lo = msgf.blob_fid_hi = 0; + msgfp = &msgf; + msgfree = msgfpv7; } else { if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version, &msgfp, rec->data, rec->size, NULL)) != 0) @@ -1671,9 +2737,9 @@ __rep_page(env, ip, eid, rp, rec) MUTEX_LOCK(env, rep->mtx_clientdb); REP_SYSTEM_LOCK(env); /* - * Check if the world changed. + * Check if the world changed or if we are in the blob sync phase. */ - if (rep->sync_state != SYNC_PAGE) { + if (rep->sync_state != SYNC_PAGE || rep->blob_sync != 0) { ret = DB_REP_PAGEDONE; goto err; } @@ -1785,6 +2851,218 @@ err: REP_SYSTEM_UNLOCK(env); } /* + * __rep_blob_chunk + * Process a blob chunk message. When a blob chunk arrives, delete its + * entry in the blob chunk gap database to show that it has arrived, and + * write the data to the blob file. 
+ * + * PUBLIC: int __rep_blob_chunk __P((ENV *, int, DB_THREAD_INFO *, DBT *)); + */ +int +__rep_blob_chunk(env, eid, ip, rec) + ENV *env; + int eid; + DB_THREAD_INFO *ip; + DBT *rec; +{ + DB_REP *db_rep; + DBC *dbc; + DB_FH *fhp; + DBT data, key; + REP *rep; + REGINFO *infop; + __rep_blob_chunk_args rbc; + __rep_fileinfo_args *rfp; + db_seq_t blob_fid; + char *blob_sub_dir, *last, *mkpath, *name, *path; + int ret; + off_t offset; + u_int8_t keybuf[BLOB_KEY_SIZE], *ptr; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + dbc = NULL; + blob_sub_dir = name = NULL; + path = NULL; + fhp = NULL; + + if (rep->sync_state != SYNC_PAGE) + return (DB_REP_PAGEDONE); + + if ((ret = __rep_blob_chunk_unmarshal( + env, &rbc, rec->data, rec->size, &ptr)) != 0) + return (ret); + + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + /* + * Check if the world changed. + */ + if (rep->sync_state != SYNC_PAGE) { + ret = DB_REP_PAGEDONE; + goto err; + } + /* + * We should not ever be in internal init with a lease granted. + */ + DB_ASSERT(env, + !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); + + /* Make sure this is for the current file. */ + GET_CURINFO(rep, infop, rfp); + GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret); + if (ret != 0) + goto err; + + if (blob_fid != (db_seq_t)rbc.blob_fid) { + ret = DB_REP_PAGEDONE; + goto err; + } + + RPRINT(env, (env, DB_VERB_REP_SYNC, +"REP_BLOB_CHUNK: blob_fid %llu, blob_sid %llu, blob_id %llu, offset %llu", + (unsigned long long)rbc.blob_fid, + (unsigned long long)rbc.blob_sid, + (unsigned long long)rbc.blob_id, (long long)rbc.offset)); + + if (db_rep->blob_dbp == NULL && + (ret = __rep_client_dbinit(env, 0, REP_BLOB)) != 0) { + RPRINT(env, (env, DB_VERB_REP_SYNC, + "REP_BLOB_CHUNK: Client_dbinit %s", + db_strerror(ret))); + goto err; + } + + /* Set the highest blob chunk received. 
*/ + if (rbc.blob_sid > (u_int64_t)rep->gap_bl_hi_sid || + (rbc.blob_sid == (u_int64_t)rep->gap_bl_hi_sid && + rbc.blob_id > (u_int64_t)rep->gap_bl_hi_id) || + (rbc.blob_sid == (u_int64_t)rep->gap_bl_hi_sid && + rbc.blob_id == (u_int64_t)rep->gap_bl_hi_id && + rbc.offset > (u_int64_t)rep->gap_bl_hi_off)) { + rep->gap_bl_hi_id = (db_seq_t)rbc.blob_id; + rep->gap_bl_hi_sid = (db_seq_t)rbc.blob_sid; + rep->gap_bl_hi_off = (off_t)rbc.offset; + } + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + data.flags = key.flags = DB_DBT_USERMEM; + key.data = keybuf; + key.ulen = key.size = BLOB_KEY_SIZE; + data.data = (void *)&offset; + data.ulen = data.size = sizeof(offset); + /* BLOB_DELETE is set if the blob file was deleted. */ + if (F_ISSET(&rbc, BLOB_DELETE)) { + memcpy(keybuf, &rbc.blob_sid, BLOB_ID_SIZE); + memcpy(&(keybuf[BLOB_ID_SIZE]), &rbc.blob_id, BLOB_ID_SIZE); + if ((ret = __db_del( + db_rep->blob_dbp, ip, NULL, &key, 0)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + goto err; + } + goto done; + } + + if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0) + goto err; + offset = (off_t)rbc.offset; + memcpy(keybuf, &rbc.blob_sid, BLOB_ID_SIZE); + memcpy(&(keybuf[BLOB_ID_SIZE]), &rbc.blob_id, BLOB_ID_SIZE); + /* If not found we have already dealt with this chunk. */ + if ((ret = __dbc_get(dbc, &key, &data, DB_GET_BOTH)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + goto done; + } + goto err; + } + /* + * BLOB_CHUNK_FAIL is set if the blob file was truncated to shorter + * than the BLOB_CHUNK offset. 
+ */ + if (F_ISSET(&rbc, BLOB_CHUNK_FAIL)) { + while (ret == 0) { + if ((ret = __dbc_del(dbc, 0)) != 0) + goto err; + ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP); + } + /* + * The loop ends on the first non-zero return; anything + * other than running out of duplicates is a real error + * and must not be overwritten by the close below. + */ + if (ret != DB_NOTFOUND) + goto err; + /* Clear dbc first so a failed close is not repeated at err. */ + ret = __dbc_close(dbc); + dbc = NULL; + if (ret != 0) + goto err; + goto done; + } + if ((ret = __dbc_del(dbc, 0)) != 0) + goto err; + /* Clear dbc first so a failed close is not repeated at err. */ + ret = __dbc_close(dbc); + dbc = NULL; + if (ret != 0) + goto err; + + if ((ret = __blob_make_sub_dir(env, &blob_sub_dir, + (db_seq_t)rbc.blob_fid, (db_seq_t)rbc.blob_sid)) != 0) + goto err; + + if ((ret = __blob_id_to_path( + env, blob_sub_dir, (db_seq_t)rbc.blob_id, &name)) != 0) + goto err; + + if ((ret = __db_appname(env, DB_APP_BLOB, name, NULL, &path)) != 0 ) + goto err; + + last = __db_rpath(path); + DB_ASSERT(env, last != NULL); + *last = '\0'; + if (__os_exists(env, path, NULL) != 0) { + *last = PATH_SEPARATOR[0]; + mkpath = path; +#ifdef DB_WIN32 + /* + * Absolute paths on Windows can result in it creating a "C" + * or "D" directory in the working directory. + */ + if (__os_abspath(mkpath)) + mkpath += 2; +#endif + if ((ret = __db_mkpath(env, mkpath)) != 0) + goto err; + } + *last = PATH_SEPARATOR[0]; + if ((ret = __os_open( + env, path, 0, DB_OSO_CREATE, env->db_mode, &fhp)) != 0) + goto err; + + /* Write the data into the blob file. */ + if ((ret = __fop_write_file(env, NULL, name, NULL, DB_APP_BLOB, + fhp, (off_t)rbc.offset, rbc.data.data, rbc.data.size, 0)) != 0) + goto err; + /* Clear fhp first so a failed close is not repeated at err. */ + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0) + goto err; + +done: ret = __rep_blobdone(env, eid, ip, rep, blob_fid, 0); + +err: REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (path != NULL) + __os_free(env, path); + if (blob_sub_dir != NULL) + __os_free(env, blob_sub_dir); + if (name != NULL) + __os_free(env, name); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (dbc != NULL) + (void)__dbc_close(dbc); + + return (ret); +} + +/* * __rep_write_page - * Write this page into a database. 
*/ @@ -1801,13 +3079,16 @@ __rep_write_page(env, ip, rep, msgfp) DB_PGINFO *pginfo; DB_REP *db_rep; REGINFO *infop; + APPNAME appname; __rep_fileinfo_args *rfp; + char *blob_path; int ret; void *dst; db_rep = env->rep_handle; infop = env->reginfo; rfp = NULL; + blob_path = NULL; /* * If this is the first page we're putting in this database, we need @@ -1830,15 +3111,39 @@ __rep_write_page(env, ip, rep, msgfp) RPRINT(env, (env, DB_VERB_REP_SYNC, "rep_write_page: Calling fop_create for %s", (char *)rfp->info.data)); + appname = (__rep_is_internal_rep_file(rfp->info.data) ? + DB_APP_META : (IS_BLOB_META((char *)rfp->info.data) + ? DB_APP_BLOB : DB_APP_DATA)); + /* + * May have to create the directory structure for blob + * metadata databases. + */ + if (appname == DB_APP_BLOB) { + if ((ret = __db_appname(env, + appname, rfp->info.data, + (const char **)&rfp->dir.data, + &blob_path)) != 0) + goto err; +#ifdef DB_WIN32 + /* + * Absolute paths on Windows can result in + * it creating a "C" or "D" + * directory in the working directory. + * Skip the drive prefix in the argument + * only: blob_path is freed at err, so it + * must keep the address it was given. + */ + if (__os_abspath(blob_path)) + ret = __db_mkpath(env, blob_path + 2); + else +#endif + ret = __db_mkpath(env, blob_path); + if (ret != 0) + goto err; + } if ((ret = __fop_create(env, NULL, NULL, rfp->info.data, (const char **)&rfp->dir.data, - __rep_is_internal_rep_file(rfp->info.data) ? - DB_APP_META : DB_APP_DATA, env->db_mode, 0)) != 0) { + appname, env->db_mode, 0)) != 0) { /* * If fop_create fails, it could be because * the client has a different data_dir * structure than the master. Retry with the - * local, default settings. + * local, default settings. */ RPRINT(env, (env, DB_VERB_REP_SYNC, "rep_write_page: fop_create ret %d. 
Retry for %s, master datadir %s", @@ -1929,7 +3234,10 @@ __rep_write_page(env, ip, rep, msgfp) ret = __memp_fput(db_rep->file_mpf, ip, dst, db_rep->file_dbp->priority); -err: return (ret); +err: if (blob_path != NULL) + __os_free(env, blob_path); + + return (ret); } /* @@ -1976,7 +3284,7 @@ __rep_page_gap(env, rep, msgfp, type) * Make sure we're still talking about the same file. * If not, we're done here. */ - if (rfp->filenum != msgfp->filenum) { + if (rfp->filenum != msgfp->filenum || rep->blob_sync != 0) { ret = DB_REP_PAGEDONE; goto err; } @@ -2135,6 +3443,53 @@ err: } /* + * __rep_blob_cleanup - + * Clean up blob internal init information. + * + * Truncates and closes the blob chunk database if it is open, then + * resets the blob control values in rep whether or not that + * succeeded. Returns 0, or the first error from the truncate or + * the close. + * + * Caller must hold client database mutex (mtx_clientdb) and + * REP_SYSTEM_LOCK. + */ +static int +__rep_blob_cleanup(env, rep) + ENV *env; + REP *rep; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + int ret, t_ret; + u_int32_t count; /* Filled in by __db_truncate; not otherwise used. */ + + ret = 0; + db_rep = env->rep_handle; + + /* + * Delete any remaining records in the blob chunk database. The blob + * chunk database contains descriptions of the blob chunks that have + * yet to arrive. If not deleted, the remaining records could + * interfere with how the next REP_BLOB_UPDATE message is handled. + */ + if (db_rep->blob_dbp != NULL) { + ENV_GET_THREAD_INFO(env, ip); + ret = __db_truncate(db_rep->blob_dbp, ip, NULL, &count); + t_ret = __db_close(db_rep->blob_dbp, NULL, DB_NOSYNC); + if (ret == 0) + ret = t_ret; + db_rep->blob_dbp = NULL; + } + /* Reset blob internal init control values. */ + rep->gap_bl_hi_id = rep->gap_bl_hi_sid = 0; + rep->last_blob_id = rep->last_blob_sid = 0; + rep->prev_blob_id = rep->prev_blob_sid = 0; + rep->gap_bl_hi_off = 0; + rep->blob_more_files = 0; + rep->blob_sync = 0; + rep->highest_id = 0; + rep->blob_rereq = 0; + + return (ret); +} + +/* * __rep_init_cleanup - * Clean up internal initialization pieces. * @@ -2162,9 +3517,10 @@ __rep_init_cleanup(env, rep, force) /* * 1. Close up the file data pointer we used. * 2. 
Close/reset the page database. - * 3. Close/reset the queue database if we're forcing a cleanup. - * 4. Free current file info. - * 5. If we have all files or need to force, free original file info. + * 3. Close/truncate the blob chunk gap database. + * 4. Close/reset the queue database if we're forcing a cleanup. + * 5. Free current file info. + * 6. If we have all files or need to force, free original file info. */ if (db_rep->file_mpf != NULL) { ret = __memp_fclose(db_rep->file_mpf, 0); @@ -2176,6 +3532,15 @@ __rep_init_cleanup(env, rep, force) if (ret == 0) ret = t_ret; } + /* + * Truncate the blob chunk gap database, since entries in the database + * are for blob chunks we are expecting to arrive. Also reset blob + * internal init control values. + */ + t_ret = __rep_blob_cleanup(env, rep); + if (ret == 0) + ret = t_ret; + if (force && db_rep->queue_dbc != NULL) { queue_dbp = db_rep->queue_dbc->dbp; if ((t_ret = __dbc_close(db_rep->queue_dbc)) != 0 && ret == 0) @@ -2324,8 +3689,8 @@ __rep_clean_interrupted(env) * __rep_filedone - * We need to check if we're done with the current file after * processing the current page. Stat the database to see if - * we have all the pages. If so, we need to clean up/close - * this one, set up for the next one, and ask for its pages, + * we have all the pages and blobs. If so, we need to clean up/close + * this one, set up for the next one, and ask for its pages and blobs, * or if this is the last file, request the log records and * move to the REP_RECOVER_LOG state. 
*/ @@ -2338,9 +3703,14 @@ __rep_filedone(env, ip, eid, rep, msgfp, type) __rep_fileinfo_args *msgfp; u_int32_t type; { + DBT msg; REGINFO *infop; __rep_fileinfo_args *rfp; + __rep_blob_update_req_args rbur; int ret; + u_int8_t buf[__REP_BLOB_UPDATE_REQ_SIZE]; + + memset(&msg, 0, sizeof(DBT)); /* * We've put our page, now we need to do any gap processing @@ -2375,8 +3745,96 @@ __rep_filedone(env, ip, eid, rep, msgfp, type) ((ret = __rep_queue_filedone(env, ip, rep, rfp)) != DB_REP_PAGEDONE)) return (ret); + + /* Request blob files. */ + if (rfp->blob_fid_lo != 0 || rfp->blob_fid_hi != 0) { + ret = 0; + rep->blob_sync = 1; + memset(&rbur, 0, sizeof(__rep_blob_update_req_args)); + GET_LO_HI(env, + rfp->blob_fid_lo, rfp->blob_fid_hi, rbur.blob_fid, ret); + msg.size = __REP_BLOB_UPDATE_REQ_SIZE; + msg.data = buf; + __rep_blob_update_req_marshal(env, &rbur, msg.data); + (void)__rep_send_message(env, + rep->master_id, REP_BLOB_UPDATE_REQ, NULL, &msg, 0, 0); + return (ret); + } + + /* + * We have all the data for this file. Clean up. + */ + if ((ret = __rep_init_cleanup(env, rep, 0)) != 0) + return (ret); + + rep->curfile++; + ret = __rep_nextfile(env, eid, rep); + + return (ret); +} + +/* + * __rep_blobdone - + * We need to check if we're done with the current file after + * processing the current blob chunk. + * + * Caller must hold client database mutex (mtx_clientdb) and + * REP_SYSTEM_LOCK. + */ +static int +__rep_blobdone(env, eid, ip, rep, blob_fid, force) + ENV *env; + int eid; + DB_THREAD_INFO *ip; + REP *rep; + db_seq_t blob_fid; + int force; +{ + DBT msg; + __rep_blob_update_req_args rbur; + int done, ret; + u_int8_t buf[__REP_BLOB_UPDATE_REQ_SIZE]; + /* - * We have all the pages for this file. Clean up. + * We've written our blob chunk, now we need to do any gap processing + * that might be needed to re-request chunks. 
+ */ + done = 0; + ret = __rep_blob_chunk_gap(env, eid, ip, rep, &done, blob_fid, force); + /* + * The world changed while we were doing gap processing. + * We're done here. + */ + if (ret == DB_REP_PAGEDONE) + return (0); + else if (ret != 0) + goto err; + + /* + * If the blob database is empty then all files in the current list + * have been processed. However, there may be more files on the + * master, so request the next list if that is the case. + */ + if (done && rep->blob_more_files) { + memset(&rbur, 0, sizeof(__rep_blob_update_req_args)); + rbur.blob_fid = (u_int64_t)blob_fid; + rbur.blob_sid = (u_int64_t)rep->last_blob_sid; + rbur.blob_id = (u_int64_t)rep->last_blob_id; + rbur.highest_id = (u_int64_t)rep->highest_id; + rep->gap_bl_hi_id = rep->gap_bl_hi_sid = 0; + rep->gap_bl_hi_off = 0; + rep->blob_rereq = 0; + msg.size = __REP_BLOB_UPDATE_REQ_SIZE; + msg.data = buf; + __rep_blob_update_req_marshal(env, &rbur, msg.data); + (void)__rep_send_message(env, + rep->master_id, REP_BLOB_UPDATE_REQ, NULL, &msg, 0, 0); + return (0); + } else if (!done) + return (0); + + /* + * We have all the data for this file. Clean up. */ if ((ret = __rep_init_cleanup(env, rep, 0)) != 0) goto err; @@ -2388,6 +3846,255 @@ err: } /* + * __rep_blob_chunk_gap - + * We have written a blob chunk. Now check if there are any that need + * to be re-requested. The blob chunk gap database contains + * descriptions of all the blob chunks that have yet to arrive. + * + * Caller must hold client database mutex (mtx_clientdb) and + * REP_SYSTEM_LOCK. 
+ */ +static int +__rep_blob_chunk_gap(env, eid, ip, rep, done, blob_fid, force) + ENV *env; + int eid; + DB_THREAD_INFO *ip; + REP *rep; + int *done; + db_seq_t blob_fid; + int force; +{ + DBC *dbc; + DBT data, high, key, msg; + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REGINFO *infop; + __rep_blob_chunk_req_args rbcr; + __rep_fileinfo_args *rfp; + db_seq_t cur_blob_fid; + off_t offset; + int ret; + u_int8_t buf[BLOB_KEY_SIZE], msgbuf[__REP_BLOB_CHUNK_REQ_SIZE]; + + db_rep = env->rep_handle; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + infop = env->reginfo; + ret = 0; + dbc = NULL; + *done = 0; + + /* eid will be used when peer-to-peer is re-enabled for blobs. */ + COMPQUIET(eid, 0); + + /* + * Make sure we're still talking about the same file. + * If not, we're done here. + */ + GET_CURINFO(rep, infop, rfp); + GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, cur_blob_fid, ret); + /* GET_LO_HI reports failure through ret; check it as other callers do. */ + if (ret != 0) + goto err; + if (cur_blob_fid != blob_fid) { + ret = DB_REP_PAGEDONE; + goto err; + } + + /* Get the first missing blob chunk. */ + if ((ret = __db_cursor(db_rep->blob_dbp, ip, NULL, &dbc, 0)) != 0) + goto err; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Only data and size of msg are set below; clear the rest. */ + memset(&msg, 0, sizeof(DBT)); + ret = __dbc_get(dbc, &key, &data, DB_FIRST); + if (ret == DB_NOTFOUND) { + /* All blobs received. */ + ret = 0; + *done = 1; + goto err; + } else if (ret != 0) + goto err; + + DB_ASSERT(env, key.size == BLOB_KEY_SIZE); + DB_ASSERT(env, data.size == sizeof(off_t)); + offset = *(off_t *)data.data; + /* + * Format the sdbid and id of the high chunk as a blob gap + * database key, so it can be compared with the entries in that + * database. + */ + memset(&high, 0, sizeof(DBT)); + memcpy(buf, &rep->gap_bl_hi_sid, BLOB_ID_SIZE); + memcpy(buf + BLOB_ID_SIZE, &rep->gap_bl_hi_id, BLOB_ID_SIZE); + high.data = buf; + high.size = BLOB_KEY_SIZE; + + /* + * If the first chunk in the database is larger than the highest chunk + * received, then there is no gap. 
+ * + * If a gap does exist, check if it is time to do a re-request. If so, + * re-request every chunk that exists before the highest received. + */ + if (!force && (__rep_blob_cmp(NULL, &key, &high, NULL) > 0 || + (__rep_blob_cmp(NULL, &key, &high, NULL) == 0 && + offset > rep->gap_bl_hi_off))) { + lp->wait_ts = db_rep->request_gap; + __os_gettime(env, &lp->rcvd_ts, 1); + } else if (force || __rep_check_doreq(env, rep)) { + /* + * Re-request every chunk less than the highest one, plus the + * next blob chunk that we are expecting. The next expected + * blob chunk is requested in case the last blob chunk is lost + * in transit. + */ + do { + memset(&rbcr, 0, sizeof(__rep_blob_chunk_req_args)); + memcpy(&(rbcr.blob_sid), key.data, BLOB_ID_SIZE); + memcpy(&(rbcr.blob_id), + (u_int8_t *)key.data + BLOB_ID_SIZE, BLOB_ID_SIZE); + /* + * Use the off_t copy of the record rather than reading + * the record through a u_int64_t pointer; the record + * is sizeof(off_t) bytes, as asserted above. + */ + rbcr.offset = (u_int64_t)offset; + rbcr.blob_fid = (u_int64_t)blob_fid; + msg.size = __REP_BLOB_CHUNK_REQ_SIZE; + msg.data = msgbuf; + RPRINT(env, (env, DB_VERB_REP_SYNC, +"blob_chunk_gap: Req file_id %llu, sdb_id %llu, blob_id %llu, offset %llu", + (long long)rbcr.blob_fid, (long long)rbcr.blob_sid, + (long long)rbcr.blob_id, (long long)rbcr.offset)); + __rep_blob_chunk_req_marshal(env, &rbcr, msg.data); + /* + * Note that peer-to-peer initialization is not + * supported for blobs. + */ + (void)__rep_send_message( + env, rep->master_id, + REP_BLOB_CHUNK_REQ, NULL, &msg, 0, 0); + /* + * Break after requesting the chunk after the highest + * one. + */ + if (__rep_blob_cmp(NULL, &key, &high, NULL) > 0 || + (__rep_blob_cmp(NULL, &key, &high, NULL) == 0 && + offset > rep->gap_bl_hi_off)) + break; + if ((ret = __dbc_get( + dbc, &key, &data, DB_NEXT)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } + /* + * Track the offset of the record just fetched so the + * break test above compares the current chunk, not + * the first one. + */ + offset = *(off_t *)data.data; + } while (1); + } + +err: if (dbc != NULL) + (void)__dbc_close(dbc); + + return (ret); +} + +/* + * __rep_blob_chunk_req + * Answer a request for a specific blob chunk. 
+ * + * PUBLIC: int __rep_blob_chunk_req __P((ENV *, int, DBT *)); + */ +int +__rep_blob_chunk_req(env, eid, rec) + ENV *env; + int eid; + DBT *rec; +{ + DB *dbp; + DBT msg; + DB_FH *fhp; + __rep_blob_chunk_args rbc; + __rep_blob_chunk_req_args rbcr; + int ret; + u_int8_t *chunk_buf, *msg_buf, *ptr; + + dbp = NULL; + fhp = NULL; + chunk_buf = msg_buf = NULL; + + if ((ret = + __os_malloc(env, MEGABYTE + __REP_BLOB_CHUNK_SIZE, &msg_buf)) != 0) + goto err; + memset(&msg, 0, sizeof(DBT)); + msg.data = msg_buf; + msg.ulen = MEGABYTE + __REP_BLOB_CHUNK_SIZE; + if ((ret = __os_malloc(env, MEGABYTE, &chunk_buf)) != 0) + goto err; + memset(&rbc, 0, sizeof(__rep_blob_chunk_args)); + rbc.data.data = chunk_buf; + rbc.data.ulen = MEGABYTE; + rbc.data.flags = DB_DBT_USERMEM; + + if ((ret = __rep_blob_chunk_req_unmarshal( + env, &rbcr, rec->data, rec->size, &ptr)) != 0) + goto err; + + RPRINT(env, (env, DB_VERB_REP_SYNC, + "blob_chunk_req: file_id %llu, sdbid %llu, id %llu, offset %llu", + (long long)rbcr.blob_fid, (long long)rbcr.blob_sid, + (long long)rbcr.blob_id, (long long)rbcr.offset)); + + rbc.blob_fid = rbcr.blob_fid; + rbc.blob_id = rbcr.blob_id; + rbc.blob_sid = rbcr.blob_sid; + rbc.offset = rbcr.offset; + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto err; + dbp->blob_file_id = (db_seq_t)rbcr.blob_fid; + dbp->blob_sdb_id = (db_seq_t)rbcr.blob_sid; + if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir, + (db_seq_t)rbcr.blob_fid, (db_seq_t)rbcr.blob_sid)) != 0) + goto err; + if ((ret = __blob_file_open( + dbp, &fhp, (db_seq_t)rbcr.blob_id, DB_FOP_READONLY, 0)) != 0) { + /* + * The file may have been deleted between creating the + * list and sending the request. Send a message saying + * the file has been deleted. 
+ */ + if (ret == ENOENT) { + ret = 0; + F_SET(&rbc, BLOB_DELETE); + rbc.data.size = 0; + __rep_blob_chunk_marshal(env, &rbc, msg.data); + msg.size = __REP_BLOB_CHUNK_SIZE; + (void)__rep_send_message( + env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0); + goto err; + } + goto err; + } + if ((ret = __blob_file_read( + env, fhp, &rbc.data, (off_t)rbcr.offset, MEGABYTE)) != 0) + goto err; + DB_ASSERT(env, rbc.data.size <= MEGABYTE); + + /* + * In rare cases the blob file may have gotten shorter + * since the list was created. + */ + if (rbc.data.size == 0) + F_SET(&rbc, BLOB_CHUNK_FAIL); + __rep_blob_chunk_marshal(env, &rbc, msg.data); + msg.size = __REP_BLOB_CHUNK_SIZE + rbc.data.size; + (void)__rep_send_message(env, eid, REP_BLOB_CHUNK, NULL, &msg, 0, 0); + +err: if (chunk_buf != NULL) + __os_free(env, chunk_buf); + if (msg_buf != NULL) + __os_free(env, msg_buf); + if (fhp != NULL) + (void)__os_closehandle(env, fhp); + if (dbp != 0) + (void)__db_close(dbp, NULL, 0); + return (ret); +} + +/* * Starts requesting pages for the next file in the list (if any), or if not, * proceeds to the next stage: requesting logs. * @@ -2404,19 +4111,25 @@ __rep_nextfile(env, eid, rep) DBT dbt; __rep_logreq_args lr_args; DB_LOG *dblp; + DB_REP *db_rep; + DELAYED_BLOB_LIST *dbl; LOG *lp; REGENV *renv; REGINFO *infop; __rep_fileinfo_args *curinfo, *rfp, rf; __rep_fileinfo_v6_args *rfpv6; - int *curbuf, ret; + __rep_fileinfo_v7_args *rfpv7; + int *curbuf, ret, view_partial; u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE], *nextinfo; size_t len, msgsz; + char *name; void *rffree; infop = env->reginfo; renv = infop->primary; + db_rep = env->rep_handle; rfp = NULL; + dbl = NULL; /* * Always direct the next request to the master (at least nominally), @@ -2430,13 +4143,13 @@ __rep_nextfile(env, eid, rep) /* Set curinfo to next file and examine it. 
*/ info_ptr = R_ADDR(infop, rep->originfo_off + (rep->originfolen - rep->infolen)); + /* + * Build a current struct by copying in the older + * version struct and then setting up the new fields. + * This is safe because all old fields are in the + * same location in the current struct. + */ if (rep->infoversion < DB_REPVERSION_53) { - /* - * Build a current struct by copying in the older - * version struct and then setting up the data_dir. - * This is safe because all old fields are in the - * same location in the current struct. - */ if ((ret = __rep_fileinfo_v6_unmarshal(env, rep->infoversion, &rfpv6, info_ptr, rep->infolen, &nextinfo)) != 0) @@ -2444,8 +4157,18 @@ __rep_nextfile(env, eid, rep) memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args)); rf.dir.data = NULL; rf.dir.size = 0; + rf.blob_fid_lo = rf.blob_fid_hi = 0; rfp = &rf; rffree = rfpv6; + } else if (rep->infoversion < DB_REPVERSION_61) { + if ((ret = __rep_fileinfo_v7_unmarshal(env, + rep->infoversion, &rfpv7, + info_ptr, rep->infolen, &nextinfo)) != 0) + return (ret); + memcpy(&rf, rfpv7, sizeof(__rep_fileinfo_v7_args)); + rf.blob_fid_lo = rf.blob_fid_hi = 0; + rfp = &rf; + rffree = rfpv7; } else { if ((ret = __rep_fileinfo_unmarshal(env, rep->infoversion, &rfp, info_ptr, @@ -2457,6 +4180,14 @@ __rep_nextfile(env, eid, rep) } rffree = rfp; } +#ifndef HAVE_64BIT_TYPES + if (rfp->blob_fid_lo != 0 || rfp->blob_fid_hi != 0) { + __db_errx(env, DB_STR("3705", + "Blobs require 64 integer compiler support.")); + __os_free(env, rffree); + return (DB_OPNOTSUP); + } +#endif rep->infolen -= (u_int32_t)(nextinfo - info_ptr); MUTEX_LOCK(env, renv->mtx_regenv); ret = __env_alloc(infop, sizeof(__rep_fileinfo_args) + @@ -2484,19 +4215,55 @@ __rep_nextfile(env, eid, rep) rfp->dir.data, rfp->dir.size); __os_free(env, rffree); - /* Skip over regular DB's in "abbreviated" internal inits. 
*/ - if (F_ISSET(rep, REP_F_ABBREVIATED) && + /* + * If a partial callback is set, invoke the callback to see if + * this file should be replicated. + */ + if (IS_VIEW_SITE(env) && curinfo->info.size > 0 && !FLD_ISSET(curinfo->db_flags, DB_AM_INMEM)) { + name = (char *)curinfo->info.data; + DB_ASSERT(env, db_rep->partial != NULL); + /* + * Always replicate system owned databases. + */ + if (IS_DB_FILE(name) && !IS_BLOB_META(name)) + view_partial = 1; + else if ((ret = __rep_call_partial(env, + name, &view_partial, 0, &dbl)) != 0) { + VPRINT(env, (env, DB_VERB_REP_SYNC, + "rep_nextfile: partial cb err %d for %s", + ret, name)); + return (ret); + } + /* + * dbl != NULL when we could not find the name of the + * database that owns a blob meta database. If that + * happens then it was never opened, which means it + * was not replicated, and as such neither should its + * bmd be replicated. + */ + if (dbl != NULL) { + view_partial = 0; + __os_free(env, dbl); + dbl = NULL; + } VPRINT(env, (env, DB_VERB_REP_SYNC, - "Skipping file %d in abbreviated internal init", - curinfo->filenum)); - MUTEX_LOCK(env, renv->mtx_regenv); - __env_alloc_free(infop, - R_ADDR(infop, rep->curinfo_off)); - MUTEX_UNLOCK(env, renv->mtx_regenv); - rep->curinfo_off = INVALID_ROFF; - rep->curfile++; - continue; + "rep_nextfile: %s file %s %d on view site.", + view_partial == 0 ? + "Skipping" : "Replicating", + name, curinfo->filenum)); + /* + * If we're skipping the file, move to the next one. + */ + if (view_partial == 0) { + MUTEX_LOCK(env, renv->mtx_regenv); + __env_alloc_free(infop, + R_ADDR(infop, rep->curinfo_off)); + MUTEX_UNLOCK(env, renv->mtx_regenv); + rep->curinfo_off = INVALID_ROFF; + rep->curfile++; + continue; + } } /* Request this file's pages. 
*/ @@ -2519,15 +4286,19 @@ __rep_nextfile(env, eid, rep) curinfo->uid.size + curinfo->info.size; if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0) return (ret); + /* + * It is safe to cast to the old structs + * because the first part of the current + * struct matches the old structs. + */ if (rep->infoversion < DB_REPVERSION_53) - /* - * It is safe to cast to the old struct - * because the first part of the current - * struct matches the old struct. - */ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion, (__rep_fileinfo_v6_args *)curinfo, buf, msgsz, &len); + else if (rep->infoversion < DB_REPVERSION_61) + ret = __rep_fileinfo_v7_marshal(env, rep->infoversion, + (__rep_fileinfo_v7_args *)curinfo, buf, + msgsz, &len); else ret = __rep_fileinfo_marshal(env, rep->infoversion, curinfo, buf, msgsz, &len); @@ -2834,16 +4605,19 @@ __rep_pggap_req(env, rep, reqfp, gapflags) * new info into rep->finfo. Assert that the sizes never * change. The only thing this should do is change * the pgno field. Everything else remains the same. + * + * It is safe to cast to the old structs + * because the first part of the current + * struct matches the old structs. */ if (rep->infoversion < DB_REPVERSION_53) - /* - * It is safe to cast to the old struct - * because the first part of the current - * struct matches the old struct. - */ ret = __rep_fileinfo_v6_marshal(env, rep->infoversion, (__rep_fileinfo_v6_args *)tmpfp, buf, msgsz, &len); + else if (rep->infoversion < DB_REPVERSION_61) + ret = __rep_fileinfo_v7_marshal(env, rep->infoversion, + (__rep_fileinfo_v7_args *)tmpfp, buf, + msgsz, &len); else ret = __rep_fileinfo_marshal(env, rep->infoversion, tmpfp, buf, msgsz, &len); @@ -2865,6 +4639,94 @@ err: } /* + * __rep_blob_rereq - + * + * Re-request lost blob messages, such as REP_BLOB_CHUNK_REQ, REP_BLOB_ALL_REQ, + * or REP_BLOB_UPDATE_REQ. Note that the blob chunk gap database contains + * descriptions of the blob chunks that we are expecting to arrive. 
+ * + * Assumes the caller holds mtx_clientdb and rep_mutex. + * + * PUBLIC: int __rep_blob_rereq __P((ENV *, REP *)); + */ +int +__rep_blob_rereq(env, rep) + ENV *env; + REP *rep; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + REGINFO *infop; + __rep_fileinfo_args *rfp; + db_seq_t blob_fid; + int master, ret; + u_int32_t count; + + db_rep = env->rep_handle; + infop = env->reginfo; + rfp = NULL; + ret = 0; + + /* First check if the master is around to answer the re-request. */ + master = rep->master_id; + if (master == DB_EID_INVALID) { + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0); + goto err; + } + + if (db_rep->blob_dbp == NULL && + (ret = __rep_client_dbinit(env, 0, REP_BLOB)) != 0) { + RPRINT(env, (env, DB_VERB_REP_SYNC, + "REP_BLOB_CHUNK: Client_dbinit %s", + db_strerror(ret))); + goto err; + } + + /* + * If the gap blob id is 0 then we either lost a REP_BLOB_ALL_REQ or + * a REP_BLOB_UPDATE_REQ message. Since we do not have the information + * to reconstruct a REP_BLOB_ALL_REQ message, reset the blob gap + * database and start over at the REP_BLOB_UPDATE_REQ stage. + * + * If the blob gap id is not 0, we lost a REP_BLOB_CHUNK_REQ message, + * so perform blob gap processing. + */ + ENV_GET_THREAD_INFO(env, ip); + if (rep->gap_bl_hi_id == 0) { + /* + * It takes a while to create the blob update message, so skip + * the first time it asks. + */ + if (rep->blob_rereq == 0) { + rep->blob_rereq = 1; + goto err; + } + rep->blob_rereq = 0; + if ((ret = __db_truncate( + db_rep->blob_dbp, ip, NULL, &count)) != 0) + goto err; + rep->blob_more_files = 1; + rep->last_blob_id = rep->prev_blob_id; + rep->last_blob_sid = rep->prev_blob_sid; + } + + GET_CURINFO(rep, infop, rfp); + GET_LO_HI(env, rfp->blob_fid_lo, rfp->blob_fid_hi, blob_fid, ret); + if (ret != 0) + goto err; + /* + * If there are entries in the blob gap database, __rep_blobdone + * will perform gap processing, otherwise it will send + * a REP_BLOB_UPDATE_REQ. 
+ */ + ret = __rep_blobdone(env, master, ip, rep, blob_fid, 1); + +err: + return (ret); +} + +/* * __rep_finfo_alloc - * Allocate and initialize a fileinfo structure. * @@ -3521,6 +5383,7 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg) { __rep_fileinfo_args *rfp, rf; __rep_fileinfo_v6_args *rfpv6; + __rep_fileinfo_v7_args *rfpv7; u_int8_t *next; int ret; void *rffree; @@ -3530,21 +5393,30 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg) rfpv6 = NULL; rffree = NULL; while (count-- > 0) { + /* + * Build a current struct by copying in the older + * version struct and then setting up the new fields. + * This is safe because all old fields are in the + * same location in the current struct. + */ if (version < DB_REPVERSION_53) { - /* - * Build a current struct by copying in the older - * version struct and then setting up the data_dir. - * This is safe because all old fields are in the - * same location in the current struct. - */ if ((ret = __rep_fileinfo_v6_unmarshal(env, version, &rfpv6, files, size, &next)) != 0) break; memcpy(&rf, rfpv6, sizeof(__rep_fileinfo_v6_args)); rf.dir.data = NULL; rf.dir.size = 0; + rf.blob_fid_lo = rf.blob_fid_hi = 0; rfp = &rf; rffree = rfpv6; + } else if (version < DB_REPVERSION_61) { + if ((ret = __rep_fileinfo_v7_unmarshal(env, version, + &rfpv7, files, size, &next)) != 0) + break; + memcpy(&rf, rfpv7, sizeof(__rep_fileinfo_v7_args)); + rf.blob_fid_lo = rf.blob_fid_hi = 0; + rfp = &rf; + rffree = rfpv7; } else { if ((ret = __rep_fileinfo_unmarshal(env, version, &rfp, files, size, &next)) != 0) @@ -3566,3 +5438,33 @@ __rep_walk_filelist(env, version, files, size, count, fn, arg) __os_free(env, rffree); return (ret); } + +/* + * Initializes a FILE_LIST_CTX structure. + * + * Pass in a non-zero value for update_space to reserve space for + * update_args in the context's buffer. 
+ */ +static int +__rep_init_file_list_context(env, version, flags, update_space, context) + ENV *env; + u_int32_t version; + u_int32_t flags; + int update_space; + FILE_LIST_CTX *context; +{ + int ret; + + if ((ret = __os_calloc(env, 1, MEGABYTE, &context->buf)) != 0) + return (ret); + context->size = MEGABYTE; + context->count = 0; + context->version = version; + context->flags = flags; + /* Reserve space for update_args. */ + if (update_space) + context->fillptr = FIRST_FILE_PTR(context->buf); + else + context->fillptr = context->buf; + return (ret); +} diff --git a/src/rep/rep_elect.c b/src/rep/rep_elect.c index 9e8c5249..234daf31 100644 --- a/src/rep/rep_elect.c +++ b/src/rep/rep_elect.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -53,8 +53,9 @@ __rep_elect_pp(dbenv, given_nsites, nvotes, flags) u_int32_t given_nsites, nvotes; u_int32_t flags; { - DB_REP *db_rep; ENV *env; + DB_REP *db_rep; + DB_THREAD_INFO *ip; int ret; env = dbenv->env; @@ -89,7 +90,9 @@ __rep_elect_pp(dbenv, given_nsites, nvotes, flags) return (EINVAL); } + ENV_ENTER(env, ip); ret = __rep_elect_int(env, given_nsites, nvotes, flags); + ENV_LEAVE(env, ip); /* * The DB_REP_IGNORE return code can be of use to repmgr (which of @@ -120,7 +123,6 @@ __rep_elect_int(env, given_nsites, nvotes, flags) DB_LOGC *logc; DB_LSN lsn; DB_REP *db_rep; - DB_THREAD_INFO *ip; LOG *lp; REP *rep; int done, elected, in_progress; @@ -140,6 +142,15 @@ __rep_elect_int(env, given_nsites, nvotes, flags) ret = 0; /* + * View sites never participate in elections. + */ + if (IS_VIEW_SITE(env)) { + __db_errx(env, DB_STR("3687", + "View sites may not participate in elections")); + return (EINVAL); + } + + /* * Specifying 0 for nsites signals us to use the value configured * previously via rep_set_nsites. 
Similarly, if the given nvotes is 0, * it asks us to compute the value representing a simple majority. @@ -185,7 +196,6 @@ __rep_elect_int(env, given_nsites, nvotes, flags) * real, configured priority, as retrieved from REP region. */ ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0; - ENV_ENTER(env, ip); orig_tally = 0; /* If we are already master, simply broadcast that fact and return. */ @@ -597,8 +607,7 @@ out: DB_ASSERT(env, rep->elect_th > 0); rep->elect_th--; if (rep->elect_th == 0) { - need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) && - !I_HAVE_WON(rep, rep->winner); + need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) && !elected; FLD_CLR(rep->lockout_flags, REP_LOCKOUT_APPLY); F_CLR(rep, REP_F_SKIPPED_APPLY); } @@ -641,7 +650,6 @@ out: unlck_lv: REP_SYSTEM_UNLOCK(env); } envleave: - ENV_LEAVE(env, ip); return (ret); } @@ -1106,7 +1114,7 @@ __rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags) u_int32_t priority; u_int32_t data_gen, flags, gen, tiebreaker; { - int cmp, like_pri; + int cmp, genlog_cmp, like_pri; cmp = LOG_COMPARE(lsnp, &rep->w_lsn); /* @@ -1140,9 +1148,18 @@ __rep_cmp_vote(env, rep, eid, lsnp, priority, gen, data_gen, tiebreaker, flags) like_pri = (priority == 0 && rep->w_priority == 0) || (priority != 0 && rep->w_priority != 0); - if ((priority != 0 && rep->w_priority == 0) || - (like_pri && data_gen > rep->w_datagen) || - (like_pri && data_gen == rep->w_datagen && cmp > 0) || + /* + * The undocumented ELECT_LOGLENGTH option requires that the + * election should be won based on log length without regard + * for datagen. Do not include datagen in the comparison if + * this option is enabled. 
+ */ + if (FLD_ISSET(rep->config, REP_C_ELECT_LOGLENGTH)) + genlog_cmp = like_pri && cmp > 0; + else + genlog_cmp = (like_pri && data_gen > rep->w_datagen) || + (like_pri && data_gen == rep->w_datagen && cmp > 0); + if ((priority != 0 && rep->w_priority == 0) || genlog_cmp || (cmp == 0 && (priority > rep->w_priority || (priority == rep->w_priority && (tiebreaker > rep->w_tiebreaker))))) { @@ -1306,8 +1323,9 @@ __rep_wait(env, timeoutp, full_elect, egen, flags) { DB_REP *db_rep; REP *rep; - int done; - u_int32_t sleeptime, sleeptotal, timeout; + db_timespec exptime, mytime; + int diff_timeout, done; + u_int32_t sleeptime, timeout; db_rep = env->rep_handle; rep = db_rep->region; @@ -1315,10 +1333,20 @@ __rep_wait(env, timeoutp, full_elect, egen, flags) timeout = *timeoutp; sleeptime = SLEEPTIME(timeout); - sleeptotal = 0; - while (sleeptotal < timeout) { + __os_gettime(env, &exptime, 0); + TIMESPEC_ADD_DB_TIMEOUT(&exptime, timeout); + while (!done) { + __os_gettime(env, &mytime, 0); + /* + * Check if the timeout has expired. __os_yield might sleep + * a slightly shorter time than requested, so check the exact + * amount of time that has passed. If we do not sleep the + * full PHASE0 time, old unexpired lease grants could + * incorrectly prevent the election from happening. 
+ */ + if (timespeccmp(&mytime, &exptime, >)) + break; __os_yield(env, 0, sleeptime); - sleeptotal += sleeptime; REP_SYSTEM_LOCK(env); /* * Check if group membership changed while we were @@ -1331,19 +1359,19 @@ __rep_wait(env, timeoutp, full_elect, egen, flags) if (!LF_ISSET(REP_E_PHASE0) && full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) { *timeoutp = rep->elect_timeout; + if ((diff_timeout = (int)(*timeoutp - timeout)) > 0) + TIMESPEC_ADD_DB_TIMEOUT(&exptime, diff_timeout); + else { + diff_timeout = -diff_timeout; + TIMESPEC_SUB_DB_TIMEOUT(&exptime, diff_timeout); + } timeout = *timeoutp; - if (sleeptotal >= timeout) - done = 1; - else - sleeptime = SLEEPTIME(timeout); + sleeptime = SLEEPTIME(timeout); } if (egen != rep->egen || !FLD_ISSET(rep->elect_flags, flags)) done = 1; REP_SYSTEM_UNLOCK(env); - - if (done) - return (0); } return (0); } diff --git a/src/rep/rep_lease.c b/src/rep/rep_lease.c index 047c39a7..b6010046 100644 --- a/src/rep/rep_lease.c +++ b/src/rep/rep_lease.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2007, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -45,10 +45,20 @@ __rep_update_grant(env, ts) timespecclear(&mytime); /* + * If we are a view, we never grant a lease. + */ + if (IS_VIEW_SITE(env)) + return (0); + + /* * Get current time, and add in the (skewed) lease duration - * time to send the grant to the master. + * time to send the grant to the master. We need to use '0' + * for a non-monotonic (i.e. realtime) timestamp. Some systems + * use "time since boot" for monotonic time, which would not + * work between machines here. We already document that for leases, + * the time cannot go backward. 
*/ - __os_gettime(env, &mytime, 1); + __os_gettime(env, &mytime, 0); timespecadd(&mytime, &rep->lease_duration); REP_SYSTEM_LOCK(env); /* @@ -108,7 +118,7 @@ __rep_islease_granted(env) * Get current time and compare against our granted lease. */ timespecclear(&mytime); - __os_gettime(env, &mytime, 1); + __os_gettime(env, &mytime, 0); return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0); } @@ -319,9 +329,15 @@ __rep_lease_check(env, refresh) max_tries = LEASE_REFRESH_MIN; retry: REP_SYSTEM_LOCK(env); - min_leases = rep->config_nsites / 2; + /* + * We need enough leases so that we're guaranteed any successful + * election will include at least one site with the lease-guaranteed + * data. Note this is based on total number of sites so leases + * cannot be used with half or more unelectable sites. + */ + min_leases = (rep->config_nsites - 1) / 2; ret = 0; - __os_gettime(env, &curtime, 1); + __os_gettime(env, &curtime, 0); VPRINT(env, (env, DB_VERB_REP_LEASE, "%s %d of %d refresh %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]", "lease_check: try ", tries, max_tries, refresh, @@ -526,7 +542,7 @@ __rep_lease_waittime(env) if (!F_ISSET(rep, REP_F_LEASE_EXPIRED)) to = rep->lease_timeout; } else { - __os_gettime(env, &mytime, 1); + __os_gettime(env, &mytime, 0); RPRINT(env, (env, DB_VERB_REP_LEASE, "wait_time: mytime %lu %lu, grant_expire %lu %lu", (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec, diff --git a/src/rep/rep_log.c b/src/rep/rep_log.c index 42300685..bf72db9e 100644 --- a/src/rep/rep_log.c +++ b/src/rep/rep_log.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -110,7 +110,7 @@ __rep_allreq(env, rp, eid) */ if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) { if (F_ISSET(rep, REP_F_CLIENT)) - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); else (void)__rep_send_message(env, eid, REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0); @@ -466,8 +466,8 @@ __rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp) if (p >= ep && save_flags) F_SET(&tmprp, save_flags); /* - * A previous call to __rep_apply indicated an earlier - * record is a dup and the next_new_lsn we are waiting for. + * A previous call to __rep_apply indicated an earlier record + * is a past dup and the next_new_lsn for which we are waiting. * Skip log records until we catch up with next_new_lsn. */ if (is_dup && LOG_COMPARE(&tmprp.lsn, &next_new_lsn) < 0) { @@ -482,7 +482,20 @@ __rep_log_split(env, ip, rp, rec, ret_lsnp, last_lsnp) VPRINT(env, (env, DB_VERB_REP_MISC, "log_split: rep_apply ret %d, dup %d, tmp_lsn [%lu][%lu]", ret, is_dup, (u_long)tmp_lsn.file, (u_long)tmp_lsn.offset)); - if (is_dup) + /* + * We can skip log records between a past dup and tmp_lsn + * returned by rep_apply() because we know we have all + * those log records. For a past dup, this log record is + * less than or equal to tmp_lsn (which is either ready_lsn + * or max_perm_lsn) and we only have records to skip when + * it is less than tmp_lsn. + * + * We cannot skip log records for a future dup because we + * may not have all of them. In this case, this log record + * is greater than or equal to tmp_lsn (which is either + * ready_lsn or this log record). 
+ */ + if (is_dup && LOG_COMPARE(&tmprp.lsn, &tmp_lsn) < 0) next_new_lsn = tmp_lsn; switch (ret) { /* @@ -637,7 +650,7 @@ __rep_logreq(env, rp, rec, eid) if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) { /* Case 3 */ if (F_ISSET(rep, REP_F_CLIENT)) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } (void)__rep_send_message(env, eid, @@ -662,7 +675,7 @@ __rep_logreq(env, rp, rec, eid) ret = 0; goto err; } else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); } } @@ -812,6 +825,14 @@ __rep_loggap_req(env, rep, lsnp, gapflags) ret = 0; /* + * If we are in SYNC_LOG and have all the log we need (i.e. + * rep->last_lsn is ZERO_LSN), just return, as there is nothing + * to do while recovery is running. + */ + if (rep->sync_state == SYNC_LOG && IS_ZERO_LSN(rep->last_lsn)) + return (0); + + /* * Check if we need to ask for the gap. * We ask for the gap if: * We are forced to with gapflags. @@ -1030,7 +1051,7 @@ __rep_chk_newfile(env, logc, rep, rp, eid) REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0); } else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); } else { endlsn.offset += logc->len; if ((ret = __logc_version(logc, @@ -1054,7 +1075,7 @@ __rep_chk_newfile(env, logc, rep, rp, eid) } } } else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); return (ret); } diff --git a/src/rep/rep_method.c b/src/rep/rep_method.c index f9f1924c..e0e7dd19 100644 --- a/src/rep/rep_method.c +++ b/src/rep/rep_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -10,6 +10,7 @@ #include "db_int.h" #include "dbinc/db_page.h" +#include "dbinc/blob.h" #include "dbinc/btree.h" #include "dbinc/mp.h" #include "dbinc/txn.h" @@ -17,14 +18,12 @@ static int __rep_abort_prepared __P((ENV *)); static int __rep_await_condition __P((ENV *, struct rep_waitgoal *, db_timeout_t)); -static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *)); +static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *, size_t *)); static int __rep_check_applied __P((ENV *, DB_THREAD_INFO *, DB_COMMIT_INFO *, struct rep_waitgoal *)); static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *)); static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t)); -static int __rep_read_lsn_history __P((ENV *, - DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t, - __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t)); +static int __rep_defview __P((DB_ENV *, const char *, int *, u_int32_t)); static int __rep_restore_prepared __P((ENV *)); static int __rep_save_lsn_hist __P((ENV *, DB_THREAD_INFO *, DB_LSN *)); /* @@ -123,9 +122,11 @@ __rep_get_config(dbenv, which, onp) #undef OK_FLAGS #define OK_FLAGS \ (DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \ - DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \ + DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \ + DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \ - DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS) + DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \ + DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER) if (FLD_ISSET(which, ~OK_FLAGS)) return (__db_ferr(env, "DB_ENV->rep_get_config", 0)); @@ -171,19 +172,30 @@ __rep_set_config(dbenv, which, on) REP *rep; REP_BULK bulk; u_int32_t mapped, orig; - int ret, t_ret; + int inmemlog, pm_ret, ret, t_ret; env = dbenv->env; db_rep = env->rep_handle; ret = 0; + pm_ret = 0; + inmemlog = 0; #undef OK_FLAGS #define OK_FLAGS \ (DB_REP_CONF_AUTOINIT | 
DB_REP_CONF_AUTOROLLBACK | \ - DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \ + DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \ + DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \ DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \ - DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS) -#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS) + DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \ + DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER) +#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS | \ + REP_C_PREFMAS_CLIENT | REP_C_PREFMAS_MASTER) + +#define TURNING_ON_PREFMAS(orig, curr) \ + ((FLD_ISSET(curr, REP_C_PREFMAS_MASTER) && \ + !FLD_ISSET(orig, REP_C_PREFMAS_MASTER)) || \ + (FLD_ISSET(curr, REP_C_PREFMAS_CLIENT) && \ + !FLD_ISSET(orig, REP_C_PREFMAS_CLIENT))) ENV_NOT_CONFIGURED( env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP); @@ -224,6 +236,62 @@ __rep_set_config(dbenv, which, on) return (EINVAL); } /* + * The undocumented ELECT_LOGLENGTH option and the preferred + * master options cannot be changed after calling repmgr_start. + */ + if (FLD_ISSET(mapped, (REP_C_ELECT_LOGLENGTH | + REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) && + F_ISSET(rep, REP_F_START_CALLED)) { + __db_errx(env, DB_STR("3706", + "DB_ENV->rep_set_config: %s " + "must be configured before DB_ENV->repmgr_start"), + FLD_ISSET(mapped, REP_C_ELECT_LOGLENGTH) ? + "ELECT_LOGLENGTH" : "preferred master"); + ENV_LEAVE(env, ip); + return (EINVAL); + } + /* + * Do not allow users to turn on preferred master if + * leases or in-memory replication files are in effect, + * or with a private environment or in-memory log files. 
+ */ + if (FLD_ISSET(mapped, + (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) && + (REP_CONFIG_IS_SET(env, (REP_C_LEASE | REP_C_INMEM)) || + (__log_get_config(dbenv, + DB_LOG_IN_MEMORY, &inmemlog) == 0 && + (inmemlog > 0 || F_ISSET(env, ENV_PRIVATE))))) { + __db_errx(env, DB_STR("3707", + "DB_ENV->rep_set_config: preferred master mode " + "cannot be used with %s"), + REP_CONFIG_IS_SET(env, REP_C_LEASE) ? + "master leases" : + REP_CONFIG_IS_SET(env, REP_C_INMEM) ? + "in-memory replication files" : + inmemlog > 0 ? "in-memory log files" : + "a private environment"); + ENV_LEAVE(env, ip); + return (EINVAL); + } + /* + * If we are already in preferred master mode, we can't + * turn off elections or 2site_strict and we can't turn on + * leases. + */ + if (PREFMAS_IS_SET(env) && ((FLD_ISSET(mapped, + (REP_C_ELECTIONS | REP_C_2SITE_STRICT)) && on == 0) || + (FLD_ISSET(mapped, REP_C_LEASE) && on > 0))) { + __db_errx(env, DB_STR("3708", + "DB_ENV->rep_set_config: cannot %s %s " + "in preferred master mode"), + on == 0 ? "disable" : "enable", + FLD_ISSET(mapped, REP_C_ELECTIONS) ? "elections" : + FLD_ISSET(mapped, REP_C_LEASE) ? "leases" : + "2SITE_STRICT"); + ENV_LEAVE(env, ip); + return (EINVAL); + } + /* * Leases must be turned on before calling rep_start. * Leases can never be turned off once they're turned on. */ @@ -252,6 +320,17 @@ __rep_set_config(dbenv, which, on) else FLD_CLR(rep->config, mapped); +#ifdef HAVE_REPLICATION_THREADS + /* Do automatic preferred master configuration. */ + if (TURNING_ON_PREFMAS(orig, rep->config) && + (pm_ret = __repmgr_prefmas_auto_config(dbenv, + &rep->config)) != 0) { + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + ENV_LEAVE(env, ip); + goto prefmas_err; + } +#endif /* * Bulk transfer requires special processing if it is getting * toggled. 
@@ -297,10 +376,25 @@ __rep_set_config(dbenv, which, on) ret = t_ret; #endif } else { + orig = db_rep->config; if (on) FLD_SET(db_rep->config, mapped); else FLD_CLR(db_rep->config, mapped); +#ifdef HAVE_REPLICATION_THREADS + /* Do automatic preferred master configuration. */ + if (TURNING_ON_PREFMAS(orig, db_rep->config)) + pm_ret = + __repmgr_prefmas_auto_config(dbenv, + &db_rep->config); +#endif + } +prefmas_err: + if (pm_ret != 0) { + __db_errx(env, DB_STR("3709", + "DB_ENV->rep_set_config: could not complete automatic " + "preferred master configuration")); + ret = EINVAL; } /* Configuring 2SITE_STRICT, etc. makes this a repmgr application */ if (ret == 0 && FLD_ISSET(mapped, REPMGR_FLAGS)) @@ -331,6 +425,10 @@ __rep_config_map(env, inflagsp, outflagsp) FLD_SET(*outflagsp, REP_C_DELAYCLIENT); FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT); } + if (FLD_ISSET(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH)) { + FLD_SET(*outflagsp, REP_C_ELECT_LOGLENGTH); + FLD_CLR(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH); + } if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) { FLD_SET(*outflagsp, REP_C_INMEM); FLD_CLR(*inflagsp, DB_REP_CONF_INMEM); @@ -351,6 +449,14 @@ __rep_config_map(env, inflagsp, outflagsp) FLD_SET(*outflagsp, REP_C_ELECTIONS); FLD_CLR(*inflagsp, DB_REPMGR_CONF_ELECTIONS); } + if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT)) { + FLD_SET(*outflagsp, REP_C_PREFMAS_CLIENT); + FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT); + } + if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER)) { + FLD_SET(*outflagsp, REP_C_PREFMAS_MASTER); + FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER); + } DB_ASSERT(env, *inflagsp == 0); } @@ -368,8 +474,10 @@ __rep_start_pp(dbenv, dbt, flags) DBT *dbt; u_int32_t flags; { - DB_REP *db_rep; ENV *env; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + int ret; env = dbenv->env; db_rep = env->rep_handle; @@ -400,7 +508,11 @@ __rep_start_pp(dbenv, dbt, flags) return (EINVAL); } - return (__rep_start_int(env, dbt, flags)); + ENV_ENTER(env, ip); + ret 
= __rep_start_int(env, dbt, flags, 0); + ENV_LEAVE(env, ip); + + return (ret); } /* @@ -432,13 +544,14 @@ __rep_start_pp(dbenv, dbt, flags) * clients that reference non-existent files whose creation was backed out * during a synchronizing recovery. * - * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t)); + * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t, u_int32_t)); */ int -__rep_start_int(env, dbt, flags) +__rep_start_int(env, dbt, flags, startopts) ENV *env; DBT *dbt; u_int32_t flags; + u_int32_t startopts; { DB *dbp; DB_LOG *dblp; @@ -474,9 +587,31 @@ __rep_start_int(env, dbt, flags) return (EINVAL); } - ENV_ENTER(env, ip); + /* + * If we are a view, we can never become master. + */ + if (IS_VIEW_SITE(env) && role == DB_REP_MASTER) { + __db_errx(env, DB_STR("3685", + "View site cannot become master")); + return (EINVAL); + } + + /* + * Check for consistent view usage. We need to check here rather + * than in __rep_open because non-rep-aware processes such as + * db_stat may open/join the environment. Rep-aware handles must + * consistently set the view. + */ + if ((ret = __rep_check_view(env)) != 0) { + RPRINT(env, (env, DB_VERB_REP_MISC, + "Application env/view mismatch.")); + __db_errx(env, DB_STR("3686", + "Application environment and view callback mismatch")); + return (ret); + } /* Serialize rep_start() calls. */ + ENV_GET_THREAD_INFO(env, ip); MUTEX_LOCK(env, rep->mtx_repstart); start_th = 1; @@ -492,8 +627,14 @@ __rep_start_int(env, dbt, flags) goto out; REP_SYSTEM_LOCK(env); + /* + * The FORCE_ROLECHG option is used when a side-effect of the role + * change such as incrementing the master gen is needed regardless + * of the previous role. 
+ */ role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) || - (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT); + (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT) || + FLD_ISSET(startopts, REP_START_FORCE_ROLECHG); /* * There is no need for lockout if all we're doing is sending a message. @@ -511,9 +652,11 @@ __rep_start_int(env, dbt, flags) goto out; } - if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) { + if (!FLD_ISSET(startopts, REP_START_WAIT_LOCKMSG) && + FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) { /* - * There is already someone in msg lockout. Return. + * There is already someone in msg lockout and we are not + * waiting. Return. */ RPRINT(env, (env, DB_VERB_REP_MISC, "Thread already in msg lockout")); @@ -702,10 +845,15 @@ __rep_start_int(env, dbt, flags) * now defunct on master. * NEWFILE: Used to delay client apply during newfile * operation, not applicable to master. + * READONLY_MASTER: Used to coordinate preferred master + * takeover, should not remain in effect after restart. + * HOLD_GEN: Freeze gen for preferred master, should not + * remain in effect after restart. */ F_CLR(rep, REP_F_CLIENT | REP_F_ABBREVIATED | REP_F_MASTERELECT | REP_F_SKIPPED_APPLY | REP_F_DELAY | - REP_F_LEASE_EXPIRED | REP_F_NEWFILE); + REP_F_LEASE_EXPIRED | REP_F_NEWFILE | + REP_F_READONLY_MASTER | REP_F_HOLD_GEN); /* * When becoming a master, set the following flags: * MASTER: Indicate that this site is master. @@ -842,11 +990,16 @@ __rep_start_int(env, dbt, flags) } /* * When becoming a client, clear the following flags: + * HOLD_GEN: Freeze gen for preferred master, should not + * remain in effect after restart. * MASTER: Site is no longer a master. * MASTERELECT: Indicates that a master is elected * rather than appointed, not applicable on client. + * READONLY_MASTER: Used to coordinate preferred master + * takeover, should not remain in effect after restart. 
*/ - F_CLR(rep, REP_F_MASTER | REP_F_MASTERELECT); + F_CLR(rep, REP_F_HOLD_GEN | REP_F_MASTER | REP_F_MASTERELECT | + REP_F_READONLY_MASTER); F_SET(rep, REP_F_CLIENT); /* @@ -928,6 +1081,15 @@ __rep_start_int(env, dbt, flags) * sync with the master. */ SET_GEN(0); + /* + * If we are changing role to client, reset our min log file + * until we hear from a master or another client. In + * particular, in a dupmaster situation, if this site loses + * an election a stale min_log_file would prevent archiving. + */ +#ifdef HAVE_REPLICATION_THREADS + rep->min_log_file = 0; +#endif REP_SYSTEM_UNLOCK(env); /* @@ -935,6 +1097,15 @@ __rep_start_int(env, dbt, flags) */ if ((ret = __dbt_usercopy(env, dbt)) != 0) goto out; + /* + * The HOLD_CLIGEN option does not allow this client's + * gen to change until the REP_F_HOLD_GEN flag is cleared. + * It prevents this site from responding to NEWMASTER messages + * and disables updating the gen from other incoming messages. + */ + if (FLD_ISSET(startopts, REP_START_HOLD_CLIGEN)) + F_SET(rep, REP_F_HOLD_GEN); + (void)__rep_send_message(env, DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0); } @@ -967,7 +1138,6 @@ out: if (start_th) MUTEX_UNLOCK(env, rep->mtx_repstart); __dbt_userfree(env, dbt, NULL, NULL); - ENV_LEAVE(env, ip); return (ret); } @@ -1170,6 +1340,9 @@ __rep_client_dbinit(env, startup, which) if (which == REP_DB) { name = REPDBNAME; rdbpp = &db_rep->rep_db; + } else if (which == REP_BLOB) { + name = REPBLOBNAME; + rdbpp = &db_rep->blob_dbp; } else { name = REPPAGENAME; rdbpp = &db_rep->file_dbp; @@ -1209,16 +1382,28 @@ __rep_client_dbinit(env, startup, which) if (which == REP_DB && (ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0) goto err; + if (which == REP_BLOB && + (ret = __bam_set_bt_compare(dbp, __rep_blob_cmp)) != 0 && + (ret = __db_set_dup_compare(dbp, __rep_offset_cmp)) != 0) + goto err; /* Don't write log records on the client. 
*/ if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0) goto err; + /* Blob gap processing requires sorted duplicates. */ + if (which == REP_BLOB) { + if ((ret = __db_set_blob_threshold(dbp, 0, 0)) != 0) + goto err; + if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0) + goto err; + } + flags = DB_NO_AUTO_COMMIT | DB_CREATE | DB_INTERNAL_TEMPORARY_DB | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0); if ((ret = __db_open(dbp, ip, NULL, fname, subdb, - (which == REP_DB ? DB_BTREE : DB_RECNO), + (which == REP_PG ? DB_RECNO : DB_BTREE), flags, 0, PGNO_BASE_MD)) != 0) goto err; @@ -1243,14 +1428,16 @@ err: if (dbp != NULL && * care about the LSNs. */ static int -__rep_bt_cmp(dbp, dbt1, dbt2) +__rep_bt_cmp(dbp, dbt1, dbt2, locp) DB *dbp; const DBT *dbt1, *dbt2; + size_t *locp; { DB_LSN lsn1, lsn2; __rep_control_args *rp1, *rp2; COMPQUIET(dbp, NULL); + COMPQUIET(locp, NULL); rp1 = dbt1->data; rp2 = dbt2->data; @@ -1274,6 +1461,82 @@ __rep_bt_cmp(dbp, dbt1, dbt2) } /* + * __rep_blob_cmp -- + * + * Comparison function for the blob gap database. The key is the blob_sid + * appended with the blob_id. + * + * PUBLIC: int __rep_blob_cmp __P((DB *, const DBT *, const DBT *, size_t *)); + */ +int +__rep_blob_cmp(dbp, dbt1, dbt2, locp) + DB *dbp; + const DBT *dbt1, *dbt2; + size_t *locp; +{ + db_seq_t blob_id1, blob_id2, blob_sid1, blob_sid2; + u_int8_t *p; + + COMPQUIET(dbp, NULL); + COMPQUIET(locp, NULL); + + /* Use memcpy here to prevent alignment issues. 
*/ + p = dbt1->data; + memcpy(&blob_sid1, p, sizeof(db_seq_t)); + p += sizeof(db_seq_t); + memcpy(&blob_id1, p, sizeof(db_seq_t)); + p = dbt2->data; + memcpy(&blob_sid2, p, sizeof(db_seq_t)); + p += sizeof(db_seq_t); + memcpy(&blob_id2, p, sizeof(db_seq_t)); + + if (blob_sid1 > blob_sid2) + return (1); + + if (blob_sid1 < blob_sid2) + return (-1); + + if (blob_id1 > blob_id2) + return (1); + + if (blob_id1 < blob_id2) + return (-1); + + return (0); +} + +/* + * __rep_offset_cmp -- + * + * Comparison function for duplicates in the blob gap database. + * + * PUBLIC: int __rep_offset_cmp + * PUBLIC: __P((DB *, const DBT *, const DBT *, size_t *)); + */ +int +__rep_offset_cmp(dbp, dbt1, dbt2, locp) + DB *dbp; + const DBT *dbt1, *dbt2; + size_t *locp; +{ + off_t offset1, offset2; + + COMPQUIET(dbp, NULL); + COMPQUIET(locp, NULL); + + /* Use memcpy here to prevent alignment issues. */ + memcpy(&offset1, dbt1->data, sizeof(off_t)); + memcpy(&offset2, dbt2->data, sizeof(off_t)); + + if (offset1 == offset2) + return (0); + else if (offset1 > offset2) + return (1); + + return (-1); +} + +/* * __rep_abort_prepared -- * Abort any prepared transactions that recovery restored.
* @@ -1684,7 +1947,10 @@ __rep_set_nsites_pp(dbenv, n) "DB_ENV->rep_set_nsites: cannot call from Replication Manager application")); return (EINVAL); } - if ((ret = __rep_set_nsites_int(env, n)) == 0) + ENV_ENTER(env, ip); + ret = __rep_set_nsites_int(env, n); + ENV_LEAVE(env, ip); + if (ret == 0) APP_SET_BASEAPI(env); return (ret); } @@ -1748,18 +2014,15 @@ __rep_get_nsites(dbenv, n) } /* - * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t)); + * PUBLIC: int __rep_set_priority_pp __P((DB_ENV *, u_int32_t)); */ int -__rep_set_priority(dbenv, priority) +__rep_set_priority_pp(dbenv, priority) DB_ENV *dbenv; u_int32_t priority; { DB_REP *db_rep; ENV *env; - REP *rep; - u_int32_t prev; - int ret; env = dbenv->env; db_rep = env->rep_handle; @@ -1767,6 +2030,30 @@ __rep_set_priority(dbenv, priority) ENV_NOT_CONFIGURED( env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP); + if (PREFMAS_IS_SET(env)) { + __db_errx(env, DB_STR_A("3710", +"%s: cannot change priority in preferred master mode.", + "%s"), "DB_ENV->rep_set_priority"); + return (EINVAL); + } + + return (__rep_set_priority_int(env, priority)); +} + +/* + * PUBLIC: int __rep_set_priority_int __P((ENV *, u_int32_t)); + */ +int +__rep_set_priority_int(env, priority) + ENV *env; + u_int32_t priority; +{ + DB_REP *db_rep; + REP *rep; + u_int32_t prev; + int ret; + + db_rep = env->rep_handle; ret = 0; if (REP_ON(env)) { rep = db_rep->region; @@ -1807,10 +2094,10 @@ __rep_get_priority(dbenv, priority) } /* - * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t)); + * PUBLIC: int __rep_set_timeout_pp __P((DB_ENV *, int, db_timeout_t)); */ int -__rep_set_timeout(dbenv, which, timeout) +__rep_set_timeout_pp(dbenv, which, timeout) DB_ENV *dbenv; int which; db_timeout_t timeout; @@ -1818,13 +2105,10 @@ __rep_set_timeout(dbenv, which, timeout) DB_REP *db_rep; DB_THREAD_INFO *ip; ENV *env; - REP *rep; int repmgr_timeout, ret; env = dbenv->env; db_rep = env->rep_handle; - rep = db_rep->region; - ret 
= 0; repmgr_timeout = 0; if (timeout == 0 && (which == DB_REP_CONNECTION_RETRY || @@ -1850,12 +2134,46 @@ __rep_set_timeout(dbenv, which, timeout) return (EINVAL); } if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) { - ret = EINVAL; __db_errx(env, DB_STR_A("3568", "%s: lease timeout must be set before DB_ENV->rep_start.", "%s"), "DB_ENV->rep_set_timeout"); return (EINVAL); } + if (PREFMAS_IS_SET(env) && + (which == DB_REP_HEARTBEAT_MONITOR || + which == DB_REP_HEARTBEAT_SEND) && + timeout == 0) { + __db_errx(env, DB_STR_A("3711", +"%s: cannot turn off heartbeat timeout in preferred master mode.", + "%s"), "DB_ENV->rep_set_timeout"); + return (EINVAL); + } + + ret = __rep_set_timeout_int(env, which, timeout); + + /* Setting a repmgr timeout makes this a repmgr application */ + if (ret == 0 && repmgr_timeout) + APP_SET_REPMGR(env); + return (ret); + +} + +/* + * PUBLIC: int __rep_set_timeout_int __P((ENV *, int, db_timeout_t)); + */ +int +__rep_set_timeout_int(env, which, timeout) + ENV *env; + int which; + db_timeout_t timeout; +{ + DB_REP *db_rep; + REP *rep; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; switch (which) { case DB_REP_CHECKPOINT_DELAY: @@ -1888,6 +2206,7 @@ __rep_set_timeout(dbenv, which, timeout) rep->ack_timeout = timeout; else db_rep->ack_timeout = timeout; + ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout); break; case DB_REP_CONNECTION_RETRY: if (REP_ON(env)) @@ -1919,10 +2238,6 @@ __rep_set_timeout(dbenv, which, timeout) "Unknown timeout type argument to DB_ENV->rep_set_timeout")); ret = EINVAL; } - - /* Setting a repmgr timeout makes this a repmgr application */ - if (ret == 0 && repmgr_timeout) - APP_SET_REPMGR(env); return (ret); } @@ -2099,6 +2414,144 @@ __rep_set_request(dbenv, min, max) } /* + * __rep_set_view -- + * Set the view/partial replication function. 
+ * + * PUBLIC: int __rep_set_view __P((DB_ENV *, + * PUBLIC: int (*)(DB_ENV *, const char *, int *, u_int32_t))); + */ +int +__rep_set_view(dbenv, f_partial) + DB_ENV *dbenv; + int (*f_partial) __P((DB_ENV *, + const char *, int *, u_int32_t)); +{ + DB_REP *db_rep; + ENV *env; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_view", DB_INIT_REP); + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->rep_set_view"); + + if (f_partial == NULL) + db_rep->partial = __rep_defview; + else + db_rep->partial = f_partial; + return (0); +} + +/* + * __rep_defview -- + * Default view function. Always replicate. + */ +static int +__rep_defview(dbenv, name, result, flags) + DB_ENV *dbenv; + const char *name; + int *result; + u_int32_t flags; +{ + COMPQUIET(dbenv, NULL); + COMPQUIET(name, NULL); + COMPQUIET(flags, 0); + *result = 1; + return (0); +} + +/* + * __rep_call_partial -- + * Calls the partial function, after doing some checks required for + * handling blobs. + * + * PUBLIC: int __rep_call_partial + * PUBLIC: __P((ENV *, const char *, int *, u_int32_t, DELAYED_BLOB_LIST **)); + */ +int +__rep_call_partial(env, name, result, flags, lsp) + ENV *env; + const char *name; + int *result; + u_int32_t flags; + DELAYED_BLOB_LIST **lsp; +{ + DB_LOG *dblp; + DB_REP *db_rep; + DELAYED_BLOB_LIST *dbl; + FNAME *fname; + db_seq_t blob_file_id; + char *file_name; + int ret; + + ret = 0; + blob_file_id = 0; + db_rep = env->rep_handle; + dblp = env->lg_handle; + fname = NULL; + + /* + * If the database being sent is a blob meta database or file, then the + * name of its associated database needs to be passed to the partial + * function. To do this, use the blob file id in the path to the + * file to look up the blob_file_id of the associated database. That + * can be used to look up the name of the associated database through + * dbreg. 
+ */ + if (db_rep->partial == __rep_defview || + (!IS_BLOB_META(name) && !IS_BLOB_FILE(name))) { + ret = db_rep->partial(env->dbenv, name, result, flags); + } else { + /* + * The top level blob meta database must always be replicated. + */ + if (strcmp(name, BLOB_META_FILE_NAME) == 0) { + *result = 1; + return (ret); + } + if ((ret = __blob_path_to_dir_ids( + env, name, &blob_file_id, NULL)) != 0) + return (ret); + DB_ASSERT(env, blob_file_id > 0); + + /* + * It is possible that the database that owns this blob meta + * database has not yet been processed on the client when + * processing the transaction, so assume it is not replicated. + * Return its information and process it later when its + * owning database is processed (which must happen in the + * same transaction). + */ + if (__dbreg_blob_file_to_fname( + dblp, blob_file_id, 0, &fname) != 0) { + if ((ret = __os_malloc( + env, sizeof(DELAYED_BLOB_LIST), &dbl)) != 0) + return (ret); + memset(dbl, 0, sizeof(DELAYED_BLOB_LIST)); + dbl->blob_file_id = blob_file_id; + if (*lsp == NULL) + *lsp = dbl; + else { + dbl->next = *lsp; + (*lsp)->prev = dbl; + *lsp = dbl; + } + *result = 0; + return (0); + } + + file_name = fname->fname_off == INVALID_ROFF ? + NULL : R_ADDR(&dblp->reginfo, fname->fname_off); + DB_ASSERT(env, file_name != NULL); + ret = db_rep->partial(env->dbenv, file_name, result, flags); + } + + return (ret); +} + +/* * __rep_set_transport_pp -- * Set the transport function for replication. * @@ -2288,25 +2741,46 @@ __rep_set_clockskew(dbenv, fast_clock, slow_clock) } /* - * __rep_flush -- + * __rep_flush_pp -- * Re-push the last log record to all clients, in case they've lost * messages and don't know it. 
* - * PUBLIC: int __rep_flush __P((DB_ENV *)); + * PUBLIC: int __rep_flush_pp __P((DB_ENV *)); */ int -__rep_flush(dbenv) +__rep_flush_pp (dbenv) DB_ENV *dbenv; { + ENV *env; + DB_THREAD_INFO *ip; + int ret; + + env = dbenv->env; + + ENV_ENTER(env, ip); + ret = __rep_flush_int(env); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __rep_flush_int -- + * Re-push the last log record to all clients, in case they've lost + * messages and don't know it. + * + * PUBLIC: int __rep_flush_int __P((ENV *)); + */ +int +__rep_flush_int(env) + ENV *env; +{ DBT rec; DB_LOGC *logc; DB_LSN lsn; DB_REP *db_rep; - DB_THREAD_INFO *ip; - ENV *env; int ret, t_ret; - env = dbenv->env; db_rep = env->rep_handle; ENV_REQUIRES_CONFIG_XX( @@ -2322,8 +2796,6 @@ __rep_flush(dbenv) return (EINVAL); } - ENV_ENTER(env, ip); - if ((ret = __log_cursor(env, &logc)) != 0) return (ret); @@ -2338,7 +2810,6 @@ __rep_flush(dbenv) err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) ret = t_ret; - ENV_LEAVE(env, ip); return (ret); } @@ -2693,7 +3164,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) */ if (commit_info->gen == gen) { ret = __rep_read_lsn_history(env, - ip, &txn, &dbc, gen, &hist, reasonp, DB_SET); + ip, &txn, &dbc, gen, &hist, reasonp, DB_SET, 1); if (ret == DB_NOTFOUND) { /* * We haven't yet received the LSN history of the @@ -2720,7 +3191,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) * masters at the same gen, and the txn of interest was * rolled back. */ - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto out; } @@ -2750,7 +3221,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) * description of the txn of interest doesn't match what we see * in the history available to us now. */ - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); } else if (commit_info->gen < gen || gen == 0) { /* @@ -2759,10 +3230,10 @@ __rep_check_applied(env, ip, commit_info, reasonp) * the token LSN is within the close/open range defined by * [base,next). 
*/ - ret = __rep_read_lsn_history(env, - ip, &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET); - t_ret = __rep_read_lsn_history(env, - ip, &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT); + ret = __rep_read_lsn_history(env, ip, + &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET, 1); + t_ret = __rep_read_lsn_history(env, ip, + &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT, 1); if (ret == DB_NOTFOUND) { /* * If the desired gen is not in our database, it could @@ -2812,7 +3283,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) * don't match, meaning the txn was written at a dup * master and that gen instance was rolled back. */ - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto out; } @@ -2837,7 +3308,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) LOG_COMPARE(&commit_info->lsn, &hist2.lsn) < 0) ret = 0; else - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); } else { /* * Token names a future gen. If we're a client and the LSN also @@ -2851,7 +3322,7 @@ __rep_check_applied(env, ip, commit_info, reasonp) reasonp->u.gen = commit_info->gen; return (DB_TIMEOUT); } - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); } out: @@ -2867,9 +3338,19 @@ out: /* * The txn and dbc handles are owned by caller, though we create them if * necessary. Caller is responsible for closing them. + * + * The use_cache option is enabled for the read-your-writes feature, which + * makes frequent requests for the cached information (envid and lsn) when it + * is in use. Callers that require information that is not cached (e.g. + * timestamp) should not set use_cache. 
+ * + * PUBLIC: int __rep_read_lsn_history __P((ENV *, DB_THREAD_INFO *, DB_TXN **, + * PUBLIC: DBC **, u_int32_t, __rep_lsn_hist_data_args *, + * PUBLIC: struct rep_waitgoal *, u_int32_t, int)); */ -static int -__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags) +int +__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags, + use_cache) ENV *env; DB_THREAD_INFO *ip; DB_TXN **txn; @@ -2878,6 +3359,7 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags) __rep_lsn_hist_data_args *gen_infop; struct rep_waitgoal *reasonp; u_int32_t flags; + int use_cache; { DB_REP *db_rep; REP *rep; @@ -2898,7 +3380,8 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags) /* Simply return cached info, if we already have it. */ desired_gen = flags == DB_SET ? gen : gen + 1; REP_SYSTEM_LOCK(env); - if (rep->gen == desired_gen && !IS_ZERO_LSN(rep->gen_base_lsn)) { + if (use_cache && rep->gen == desired_gen && + !IS_ZERO_LSN(rep->gen_base_lsn)) { gen_infop->lsn = rep->gen_base_lsn; gen_infop->envid = rep->master_envid; goto unlock; @@ -3005,8 +3488,14 @@ __rep_conv_vers(env, log_ver) /* * We can't use a switch statement, some of the DB_LOGVERSION_XX - * constants are the same + * constants are the same. */ + if (log_ver == DB_LOGVERSION_61) + return (DB_REPVERSION_61); + if (log_ver == DB_LOGVERSION_60p1) + return (DB_REPVERSION_60); + if (log_ver == DB_LOGVERSION_60) + return (DB_REPVERSION_60); if (log_ver == DB_LOGVERSION_53) return (DB_REPVERSION_53); if (log_ver == DB_LOGVERSION_52) diff --git a/src/rep/rep_record.c b/src/rep/rep_record.c index f4691974..b206e60e 100644 --- a/src/rep/rep_record.c +++ b/src/rep/rep_record.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,13 +9,17 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_am.h" #include "dbinc/lock.h" #include "dbinc/mp.h" #include "dbinc/txn.h" -static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *)); +static int __rep_collect_txn + __P((ENV *, DB_LSN *, LSN_COLLECTION *, DELAYED_BLOB_LIST **)); +static int __rep_remove_delayed_blobs + __P((ENV *, db_seq_t, u_int32_t ,DELAYED_BLOB_LIST **)); static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *)); static int __rep_fire_newmaster __P((ENV *, u_int32_t, int)); static int __rep_fire_startupdone __P((ENV *, u_int32_t, int)); @@ -153,6 +157,7 @@ __rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp) DB_LSN *ret_lsnp; { ENV *env; + DB_THREAD_INFO *ip; int ret; env = dbenv->env; @@ -193,7 +198,9 @@ __rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp) return (ret); } + ENV_ENTER(env, ip); ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp); + ENV_LEAVE(env, ip); __dbt_userfree(env, control, rec, NULL); return (ret); @@ -289,8 +296,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) if (ret_lsnp != NULL) ZERO_LSN(*ret_lsnp); - ENV_ENTER(env, ip); - + ENV_GET_THREAD_INFO(env, ip); REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0); /* * Check the version number for both rep and log. 
If it is @@ -303,8 +309,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) "%lu %d"), (u_long)rp->rep_version, DB_REPVERSION_MIN); - ret = EINVAL; - goto errlock; + return (EINVAL); } VPRINT(env, (env, DB_VERB_REP_MSGS, "Received record %lu with old rep version %lu", @@ -322,8 +327,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) __db_errx(env, DB_STR_A("3517", "unexpected replication message version %lu, expected %d", "%lu %d"), (u_long)rp->rep_version, DB_REPVERSION); - ret = EINVAL; - goto errlock; + return (EINVAL); } if (rp->log_version < DB_LOGVERSION) { @@ -332,8 +336,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) "unsupported old replication log version %lu, minimum version %d", "%lu %d"), (u_long)rp->log_version, DB_LOGVERSION_MIN); - ret = EINVAL; - goto errlock; + return (EINVAL); } VPRINT(env, (env, DB_VERB_REP_MSGS, "Received record %lu with old log version %lu", @@ -342,8 +345,7 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) __db_errx(env, DB_STR_A("3519", "unexpected log record version %lu, expected %d", "%lu %d"), (u_long)rp->log_version, DB_LOGVERSION); - ret = EINVAL; - goto errlock; + return (EINVAL); } /* @@ -465,9 +467,14 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) * accept the generation number and participate in future * elections and communication. Otherwise, I need to hear about * a new master and sync up. + * + * But do not do any of this if REP_F_HOLD_GEN is set. In + * this case we keep the site at its current gen until we + * clear this flag. 
*/ - if (rp->rectype == REP_ALIVE || - rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) { + if ((rp->rectype == REP_ALIVE || + rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) && + !F_ISSET(rep, REP_F_HOLD_GEN)) { REP_SYSTEM_LOCK(env); RPRINT(env, (env, DB_VERB_REP_MSGS, "Updating gen from %lu to %lu", @@ -593,6 +600,38 @@ __rep_process_message_int(env, control, rec, eid, ret_lsnp) ret = __rep_allreq(env, rp, eid); CLIENT_REREQ; break; + case REP_BLOB_ALL_REQ: + /* Blobs do not support peer-to-peer. */ + RECOVERING_SKIP; + MASTER_ONLY(rep, rp); + ret = __rep_blob_allreq(env, eid, rec); + CLIENT_REREQ; + break; + case REP_BLOB_CHUNK: + /* Handle even if in recovery. */ + CLIENT_ONLY(rep, rp); + ret = __rep_blob_chunk(env, eid, ip, rec); + if (ret == DB_REP_PAGEDONE) + ret = 0; + break; + case REP_BLOB_CHUNK_REQ: + /* Blobs do not support peer-to-peer. */ + RECOVERING_SKIP; + MASTER_ONLY(rep, rp); + ret = __rep_blob_chunk_req(env, eid, rec); + CLIENT_REREQ; + break; + case REP_BLOB_UPDATE: + CLIENT_ONLY(rep, rp); + ret = __rep_blob_update(env, eid, ip, rec); + break; + case REP_BLOB_UPDATE_REQ: + MASTER_ONLY(rep, rp); + infop = env->reginfo; + renv = infop->primary; + MASTER_UPDATE(env, renv); + ret = __rep_blob_update_req(env, ip, rec); + break; case REP_BULK_LOG: RECOVERING_LOG_SKIP; CLIENT_ONLY(rep, rp); @@ -1059,8 +1098,6 @@ out: *ret_lsnp = rp->lsn; ret = DB_REP_NOTPERM; } - __dbt_userfree(env, control, rec, NULL); - ENV_LEAVE(env, ip); return (ret); } @@ -1290,8 +1327,24 @@ gap_check: #endif } - if (ret == DB_KEYEXIST) + if (ret == DB_KEYEXIST) { + STAT(rep->stat.st_log_duplicated++); +#ifdef CONFIG_TEST + STAT(rep->stat.st_log_futuredup++); +#endif + if (is_dupp != NULL) { + *is_dupp = 1; + /* + * Could get overwritten by max_lsn later, + * but only when returning NOTPERM for a + * REPCTL_PERM record, in which case max_lsn + * is this log record. 
+ */ + if (ret_lsnp != NULL) + *ret_lsnp = lp->ready_lsn; + } ret = 0; + } if (ret != 0 && ret != ENOMEM) goto done; @@ -1337,10 +1390,11 @@ gap_check: * But max_lsn is guaranteed <= ready_lsn, so * it would be a more conservative LSN to return. */ - *ret_lsnp = lp->ready_lsn; + if (ret_lsnp != NULL) + *ret_lsnp = lp->ready_lsn; } LOGCOPY_32(env, &rectype, rec->data); - if (rectype == DB___txn_regop || rectype == DB___txn_ckp) + if (IS_PERM_RECTYPE(rectype)) max_lsn = lp->max_perm_lsn; /* * We check REPCTL_LEASE here, because this client may @@ -1536,6 +1590,7 @@ __rep_process_txn(env, rec) DB_REP *db_rep; DB_THREAD_INFO *ip; DB_TXNHEAD *txninfo; + DELAYED_BLOB_LIST *dblp, *dummy; LSN_COLLECTION lc; REP *rep; __txn_regop_args *txn_args; @@ -1548,12 +1603,12 @@ __rep_process_txn(env, rec) db_rep = env->rep_handle; rep = db_rep->region; logc = NULL; + dblp = dummy = NULL; txn_args = NULL; txn42_args = NULL; prep_args = NULL; txninfo = NULL; - ENV_ENTER(env, ip); memset(&data_dbt, 0, sizeof(data_dbt)); if (F_ISSET(env, ENV_THREAD)) F_SET(&data_dbt, DB_DBT_REALLOC); @@ -1618,8 +1673,19 @@ __rep_process_txn(env, rec) goto err; /* Phase 1. Get a list of the LSNs in this transaction, and sort it. */ - if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0) + if ((ret = __rep_collect_txn(env, &prev_lsn, &lc, &dblp)) != 0) goto err; + /* Deal with any child transactions that had to be delayed. */ + while (dblp != NULL) { + if ((ret = __rep_collect_txn( + env, &dblp->lsn, &lc, &dummy)) != 0) + goto err; + DB_ASSERT(env, dummy == NULL); + dummy = dblp; + dblp = dummy->next; + __os_free(env, dummy); + dummy = NULL; + } qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp); /* @@ -1627,6 +1693,7 @@ __rep_process_txn(env, rec) * records. Create a txnlist so that they can keep track of file * state between records. 
*/ + ENV_GET_THREAD_INFO(env, ip); if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0) goto err; @@ -1647,6 +1714,7 @@ __rep_process_txn(env, rec) (u_long)lsnp->file, (u_long)lsnp->offset); goto err; } + LOGCOPY_32(env, &rectype, data_dbt.data); } err: memset(&req, 0, sizeof(req)); @@ -1658,6 +1726,12 @@ err: memset(&req, 0, sizeof(req)); if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0) ret = t_ret; + while (dblp != NULL) { + dummy = dblp; + dblp = dummy->next; + __os_free(env, dummy); + } + err1: if (txn_args != NULL) __os_free(env, txn_args); if (txn42_args != NULL) @@ -1694,25 +1768,52 @@ err1: if (txn_args != NULL) * the entire transaction family at once. */ static int -__rep_collect_txn(env, lsnp, lc) +__rep_collect_txn(env, lsnp, lc, dbl) ENV *env; DB_LSN *lsnp; LSN_COLLECTION *lc; + DELAYED_BLOB_LIST **dbl; { + __dbreg_register_args *dbregargp; __txn_child_args *argp; DB_LOGC *logc; DB_LSN c_lsn; + DB_REP *db_rep; DBT data; - u_int32_t rectype; + db_seq_t blob_file_id; + u_int32_t child, rectype, skip_txnid; u_int nalloc; - int ret, t_ret; + int ret, t_ret, view_partial; + char *name; memset(&data, 0, sizeof(data)); F_SET(&data, DB_DBT_REALLOC); + skip_txnid = TXN_INVALID; if ((ret = __log_cursor(env, &logc)) != 0) return (ret); + /* + * For partial replication we assume a certain sequence of + * log records to detect a database create and skip it if + * desired. We are walking backward through the records of + * a single transaction right now. + * + * A create operation is done inside a BDB-owned child txn. + * Nothing else is done within this BDB-owned child txn. + * The last piece of a create operations is the dbreg_register + * log record that records the opening of the file. That + * log record contains the child txnid in the 'id' field, and + * the file name. At this point we invoke the partial callback + * to determine if this database should be replicated. 
If it + * should not be replicated, we need to avoid collecting the + * entire child txn referenced in the 'id' field. + * + * So if processing the dbreg_register record finds a database + * to skip, we store the child txnid in 'skip_txnid'. We use + * 'skip_txnid' to avoid processing log records or making + * recursive calls for that txnid. + */ while (!IS_ZERO_LSN(*lsnp) && (ret = __logc_get(logc, lsnp, &data, DB_SET)) == 0) { LOGCOPY_32(env, &rectype, data.data); @@ -1722,9 +1823,66 @@ __rep_collect_txn(env, lsnp, lc) goto err; c_lsn = argp->c_lsn; *lsnp = argp->prev_lsn; + child = argp->child; __os_free(env, argp); - ret = __rep_collect_txn(env, &c_lsn, lc); - } else { + + if (child == skip_txnid && *dbl != NULL && + (*dbl)->child == child) + (*dbl)->lsn = c_lsn; + /* + * If skip_txnid is set, it is the id of the child txnid + * that creates a database we should skip. So, if + * this is that child txn, do not collect it. + */ + if (skip_txnid == TXN_INVALID || child != skip_txnid) + ret = __rep_collect_txn(env, &c_lsn, lc, dbl); + } else if (IS_VIEW_SITE(env) && + rectype == DB___dbreg_register) { + db_rep = env->rep_handle; + /* + * If we are a view see if this is a file creation + * stream. On-disk files have the creating child txn + * in the 'id' field and the name. See if this view + * wants this file. + */ + if ((ret = __dbreg_register_read( + env, data.data, &dbregargp)) != 0) + goto err; + child = dbregargp->id; + name = (char *)dbregargp->name.data; + skip_txnid = TXN_INVALID; + if (child != TXN_INVALID && + (!IS_DB_FILE(name) || IS_BLOB_META(name))) { + /* + * The 'id' has a child txn so it is a create. 
+ */ + DB_ASSERT(env, db_rep->partial != NULL); + GET_LO_HI(env, dbregargp->blob_fid_lo, + dbregargp->blob_fid_hi, blob_file_id, ret); + if (ret != 0) + goto err; + if ((ret = __rep_call_partial(env, + name, &view_partial, 0, dbl)) != 0) { + VPRINT(env, (env, DB_VERB_REP_MISC, + "rep_collect_txn: partial cb err %d for %s", ret, name)); + __os_free(env, dbregargp); + goto err; + } + /* + * Save the child txnid for when we walk back + * into the txn_child record. + */ + if (view_partial == 0) { + skip_txnid = child; + if ((ret = + __rep_remove_delayed_blobs(env, + blob_file_id, child, dbl)) != 0) + goto err; + } + } + __os_free(env, dbregargp); + } + if (rectype != DB___txn_child) { if (lc->nalloc < lc->nlsns + 1) { nalloc = lc->nalloc == 0 ? 20 : lc->nalloc * 2; if ((ret = __os_realloc(env, @@ -1761,6 +1919,62 @@ err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) } /* + * __rep_remove_delayed_blobs -- + * + * If a blob meta database is opened in the same transaction as the database + * that owns it, then deciding whether it should be replicated or not needs + * to be delayed until after the rest of the transaction is processed. To do + * this, the transaction's information is added to a DELAYED_BLOB_LIST. When + * the owning database is processed, if it is not replicated then remove the + * entry of its blob meta database from the delayed list. + */ +static int +__rep_remove_delayed_blobs(env, blob_file_id, child, dbl) + ENV *env; + db_seq_t blob_file_id; + u_int32_t child; + DELAYED_BLOB_LIST **dbl; +{ + DELAYED_BLOB_LIST *ent, *next, *prev; + + if (*dbl == NULL) + return (0); + + /* + * If the child transaction has not been set, then a new entry was just + * added to the list. + */ + if ((*dbl)->child == 0) { + (*dbl)->child = child; + return (0); + } + + if (blob_file_id == 0) + return (0); + + /* + * This blob meta database should not be replicated if its associated + * database is not replicated. 
Remove it from the delayed + * list so it will not be processed at a later time. + */ + for (ent = *dbl; ent != NULL; ent = (DELAYED_BLOB_LIST *)ent->next) { + if (ent->blob_file_id == blob_file_id && ent->child != child) { + next = (DELAYED_BLOB_LIST *)ent->next; + prev = (DELAYED_BLOB_LIST *)ent->prev; + if (ent == *dbl) + *dbl = next; + if (prev != NULL) + prev->next = ent->next; + if (next != NULL) + next->prev = ent->prev; + __os_free(env, ent); + break; + } + } + return (0); +} + +/* * __rep_lsn_cmp -- * qsort-type-compatible wrapper for LOG_COMPARE. */ @@ -2138,9 +2352,13 @@ __rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp) ret = __rep_process_txn(env, rec); } while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED); - /* Now flush the log unless we're running TXN_NOSYNC. */ - if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC)) - ret = __log_flush(env, NULL); + /* Now write/flush the log as appropriate. */ + if (ret == 0) { + if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC)) + ret = __log_rep_write(env); + else if (!F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC)) + ret = __log_flush(env, NULL); + } if (ret != 0) { __db_errx(env, DB_STR_A("3526", "Error processing txn [%lu][%lu]", "%lu %lu"), @@ -2256,7 +2474,7 @@ __rep_resend_req(env, rereq) DB_REP *db_rep; LOG *lp; REP *rep; - int master, ret; + int blob_sync, master, ret; repsync_t sync_state; u_int32_t gapflags, msgtype, repflags, sendflags; @@ -2271,6 +2489,7 @@ __rep_resend_req(env, rereq) repflags = rep->flags; sync_state = rep->sync_state; + blob_sync = rep->blob_sync; /* * If we are delayed we do not rerequest anything. 
*/ @@ -2293,9 +2512,17 @@ __rep_resend_req(env, rereq) */ msgtype = REP_UPDATE_REQ; } else if (sync_state == SYNC_PAGE) { - REP_SYSTEM_LOCK(env); - ret = __rep_pggap_req(env, rep, NULL, gapflags); - REP_SYSTEM_UNLOCK(env); + if (blob_sync == 0) { + REP_SYSTEM_LOCK(env); + ret = __rep_pggap_req(env, rep, NULL, gapflags); + REP_SYSTEM_UNLOCK(env); + } else { + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + ret = __rep_blob_rereq(env, rep); + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + } } else { MUTEX_LOCK(env, rep->mtx_clientdb); ret = __rep_loggap_req(env, rep, NULL, gapflags); @@ -2397,9 +2624,20 @@ __rep_skip_msg(env, rep, eid, rectype) if (rep->master_id == DB_EID_INVALID) /* Case 1. */ (void)__rep_send_message(env, DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0); - else if (eid == rep->master_id) /* Case 2. */ - ret = __rep_resend_req(env, 0); - else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */ + else if (eid == rep->master_id) { /* Case 2. */ + /* + * When we receive log messages in the SYNC_PAGE stage + * and we decide to rerequest, it often means the pages + * we expect have been dropped. Send a rerequest with + * gapflags for better performance. + */ + if ((rectype == REP_LOG || rectype == REP_BULK_LOG || + rectype == REP_LOG_MORE) && + rep->sync_state == SYNC_PAGE) + ret = __rep_resend_req(env, 1); + else + ret = __rep_resend_req(env, 0); + } else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. 
*/ (void)__rep_send_message(env, eid, REP_REREQUEST, NULL, NULL, 0, 0); } @@ -2421,7 +2659,6 @@ __rep_check_missing(env, gen, master_perm_lsn) DB_LOG *dblp; DB_LSN *end_lsn; DB_REP *db_rep; - DB_THREAD_INFO *ip; LOG *lp; REGINFO *infop; REP *rep; @@ -2434,7 +2671,6 @@ __rep_check_missing(env, gen, master_perm_lsn) infop = env->reginfo; has_log_gap = has_page_gap = ret = 0; - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_clientdb); REP_SYSTEM_LOCK(env); /* @@ -2518,8 +2754,7 @@ __rep_check_missing(env, gen, master_perm_lsn) rep->msg_th--; REP_SYSTEM_UNLOCK(env); -out: ENV_LEAVE(env, ip); - return (ret); +out: return (ret); } static int diff --git a/src/rep/rep_region.c b/src/rep/rep_region.c index f1d69dff..72372bff 100644 --- a/src/rep/rep_region.c +++ b/src/rep/rep_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -14,6 +14,8 @@ static int __rep_egen_init __P((ENV *, REP *)); static int __rep_gen_init __P((ENV *, REP *)); +static int __rep_view_init __P((ENV *, REP *)); +static int __rep_viewfile_exists __P((ENV *, int *)); /* * __rep_open -- @@ -29,7 +31,7 @@ __rep_open(env) REGENV *renv; REGINFO *infop; REP *rep; - int i, ret; + int i, ret, view; char *p; char fname[sizeof(REP_DIAGNAME) + 3]; @@ -37,10 +39,15 @@ __rep_open(env) infop = env->reginfo; renv = infop->primary; ret = 0; + view = 0; DB_ASSERT(env, DBREP_DIAG_FILES < 100); if (renv->rep_off == INVALID_ROFF) { - /* Must create the region. */ + /* + * Must create the region. This environment either is being + * created for the first time or has just had its regions + * cleared by a recovery. 
+ */ if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0) return (ret); memset(rep, 0, sizeof(*rep)); @@ -108,6 +115,23 @@ __rep_open(env) return (ret); if ((ret = __rep_egen_init(env, rep)) != 0) return (ret); + /* + * Determine if this is a view site or not. It is a view + * if the callback is set. If the site was a view in the + * past, we mark it as a view, but will check consistency + * later when starting replication. + */ + if (db_rep->partial != NULL) { + rep->stat.st_view = 1; + if ((ret = __rep_view_init(env, rep)) != 0) + return (ret); + } else { + if ((ret = __rep_viewfile_exists(env, &view)) != 0) + return (ret); + if (view) + rep->stat.st_view = 1; + } + rep->gbytes = db_rep->gbytes; rep->bytes = db_rep->bytes; rep->request_gap = db_rep->request_gap; @@ -157,6 +181,32 @@ __rep_open(env) "process joining the environment")); return (EINVAL); } + /* + * If we are joining an existing environment and we + * have a view callback set, then the environment must + * already be a view. If not, error. + * + * The other mismatch is not an error here (no callback + * set, but environment is a view) because we may be a + * rep unaware process such as db_stat and that is allowed + * to proceed. There is additional checking in other rep + * functions like rep_start to confirm consistency before + * using replication. + */ + if (db_rep->partial != NULL) { + if ((ret = __rep_viewfile_exists(env, &view)) != 0) + return (ret); + /* + * If there is a callback, and we are not in-memory, + * there better be a view system file too. + */ + if (view == 0 && !FLD_ISSET(rep->config, REP_C_INMEM)) { + __db_errx(env, DB_STR("3688", + "Application environment and view mismatch " + "joining the environment")); + return (EINVAL); + } + } #ifdef HAVE_REPLICATION_THREADS if ((ret = __repmgr_join(env, rep)) != 0) return (ret); @@ -506,9 +556,8 @@ __rep_write_egen(env, rep, egen) * If running in-memory replication, return without any file * operations. 
*/ - if (FLD_ISSET(rep->config, REP_C_INMEM)) { + if (FLD_ISSET(rep->config, REP_C_INMEM)) return (0); - } if ((ret = __db_appname(env, DB_APP_META, REP_EGENNAME, NULL, &p)) != 0) @@ -591,9 +640,8 @@ __rep_write_gen(env, rep, gen) * If running in-memory replication, return without any file * operations. */ - if (FLD_ISSET(rep->config, REP_C_INMEM)) { + if (FLD_ISSET(rep->config, REP_C_INMEM)) return (0); - } if ((ret = __db_appname(env, DB_APP_META, REP_GENNAME, NULL, &p)) != 0) @@ -608,3 +656,105 @@ __rep_write_gen(env, rep, gen) __os_free(env, p); return (ret); } + +/* + * __rep_view_init -- + * Initialize the permanent view file to know this site is a view + * forever. The existence of the file is the record. + */ +static int +__rep_view_init(env, rep) + ENV *env; + REP *rep; +{ + DB_FH *fhp; + int ret; + char *p; + + /* + * If running in-memory replication, return without any file + * operations. + */ + if (FLD_ISSET(rep->config, REP_C_INMEM)) + return (0); + + if ((ret = __db_appname(env, + DB_APP_META, REPVIEW, NULL, &p)) != 0) + return (ret); + + /* + * If the file doesn't exist, create it. We just want to open + * and close the file. It doesn't have any content. + * If the file already exists, there is nothing else to do. + */ + if (__os_exists(env, p, NULL) != 0) { + RPRINT(env, (env, DB_VERB_REP_MISC, "View init: Create %s", p)); + if ((ret = __os_open(env, p, 0, + DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) != 0) + goto out; + (void)__os_closehandle(env, fhp); + } +out: __os_free(env, p); + return (ret); +} + +/* + * __rep_check_view -- + * Check consistency between the view file and the db_rep handle. + * + * PUBLIC: int __rep_check_view __P((ENV *)); + */ +int +__rep_check_view(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + int exist, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + /* + * If running in-memory replication, check without any file + * operations. 
We can only check what exists in the region, + * which is the st_view field from a previous open. + */ + if (FLD_ISSET(rep->config, REP_C_INMEM)) + exist = (int)rep->stat.st_view; + else if ((ret = __rep_viewfile_exists(env, &exist)) != 0) + return (ret); + + RPRINT(env, (env, DB_VERB_REP_MISC, "Check view. Exist %d, cb %d", + exist, (db_rep->partial != NULL))); + /* + * If view file exists, a partial function must be set. + * If view file does not exist, a partial function must not be set. + */ + if ((exist == 0 && db_rep->partial != NULL) || + (exist == 1 && db_rep->partial == NULL)) + ret = EINVAL; + return (ret); +} + +static int +__rep_viewfile_exists(env, existp) + ENV *env; + int *existp; +{ + char *p; + int ret; + + *existp = 0; + if ((ret = __db_appname(env, + DB_APP_META, REPVIEW, NULL, &p)) != 0) + return (ret); + + if (__os_exists(env, p, NULL) == 0) + *existp = 1; + + __os_free(env, p); + return (ret); + +} diff --git a/src/rep/rep_stat.c b/src/rep/rep_stat.c index addfee25..ffb9f262 100644 --- a/src/rep/rep_stat.c +++ b/src/rep/rep_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,13 @@ static const char *__rep_syncstate_to_string __P((repsync_t)); } \ } while (0) +#define PRINT_VIEW(sp) do { \ + if ((sp)->st_view != 0) \ + __db_msg(env, "Environment configured as view site"); \ + else \ + __db_msg(env, "Environment not configured as view site");\ +} while (0) + /* * __rep_stat_pp -- * ENV->rep_stat pre/post processing. 
@@ -120,7 +127,7 @@ __rep_stat(env, statp, flags) DB_REP_STAT *stats; LOG *lp; REP *rep; - u_int32_t startupdone; + u_int32_t startupdone, view; uintmax_t queued; int dolock, ret; @@ -177,10 +184,12 @@ __rep_stat(env, statp, flags) if (LF_ISSET(DB_STAT_CLEAR)) { queued = rep->stat.st_log_queued; startupdone = rep->stat.st_startup_complete; + view = rep->stat.st_view; memset(&rep->stat, 0, sizeof(rep->stat)); rep->stat.st_log_queued = rep->stat.st_log_queued_total = rep->stat.st_log_queued_max = queued; rep->stat.st_startup_complete = startupdone; + rep->stat.st_view = view; } /* @@ -377,6 +386,7 @@ __rep_print_stats(env, flags) __db_dl(env, "Number of page records missed and requested", (u_long)sp->st_pg_requested); PRINT_STARTUPCOMPLETE(sp); + PRINT_VIEW(sp); __db_dl(env, "Number of transactions applied", (u_long)sp->st_txns_applied); @@ -462,16 +472,20 @@ __rep_print_all(env, flags) u_int32_t flags; { static const FN rep_cfn[] = { - { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" }, - { REP_C_AUTOINIT, "REP_C_AUTOINIT" }, - { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" }, - { REP_C_BULK, "REP_C_BULK" }, - { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" }, - { REP_C_ELECTIONS, "REP_C_ELECTIONS" }, - { REP_C_INMEM, "REP_C_INMEM" }, - { REP_C_LEASE, "REP_C_LEASE" }, - { REP_C_NOWAIT, "REP_C_NOWAIT" }, - { 0, NULL } + { REP_C_2SITE_STRICT, "REP_C_2SITE_STRICT" }, + { REP_C_AUTOINIT, "REP_C_AUTOINIT" }, + { REP_C_AUTOROLLBACK, "REP_C_AUTOROLLBACK" }, + { REP_C_AUTOTAKEOVER, "REP_C_AUTOTAKEOVER" }, + { REP_C_BULK, "REP_C_BULK" }, + { REP_C_DELAYCLIENT, "REP_C_DELAYCLIENT" }, + { REP_C_ELECT_LOGLENGTH, "REP_C_ELECT_LOGLENGTH" }, + { REP_C_ELECTIONS, "REP_C_ELECTIONS" }, + { REP_C_INMEM, "REP_C_INMEM" }, + { REP_C_LEASE, "REP_C_LEASE" }, + { REP_C_NOWAIT, "REP_C_NOWAIT" }, + { REP_C_PREFMAS_CLIENT, "REP_C_PREFMAS_CLIENT" }, + { REP_C_PREFMAS_MASTER, "REP_C_PREFMAS_MASTER" }, + { 0, NULL } }; static const FN rep_efn[] = { { REP_E_PHASE0, "REP_E_PHASE0" }, @@ -481,19 +495,21 @@ 
__rep_print_all(env, flags) { 0, NULL } }; static const FN rep_fn[] = { - { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" }, - { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" }, - { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" }, - { REP_F_CLIENT, "REP_F_CLIENT" }, - { REP_F_DELAY, "REP_F_DELAY" }, - { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" }, - { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" }, - { REP_F_MASTER, "REP_F_MASTER" }, - { REP_F_MASTERELECT, "REP_F_MASTERELECT" }, - { REP_F_NEWFILE, "REP_F_NEWFILE" }, - { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" }, - { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" }, - { REP_F_START_CALLED, "REP_F_START_CALLED" }, + { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" }, + { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" }, + { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" }, + { REP_F_CLIENT, "REP_F_CLIENT" }, + { REP_F_DELAY, "REP_F_DELAY" }, + { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" }, + { REP_F_HOLD_GEN, "REP_F_HOLD_GEN" }, + { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" }, + { REP_F_MASTER, "REP_F_MASTER" }, + { REP_F_MASTERELECT, "REP_F_MASTERELECT" }, + { REP_F_NEWFILE, "REP_F_NEWFILE" }, + { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" }, + { REP_F_READONLY_MASTER, "REP_F_READONLY_MASTER" }, + { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" }, + { REP_F_START_CALLED, "REP_F_START_CALLED" }, { 0, NULL } }; static const FN rep_lfn[] = { @@ -523,15 +539,16 @@ __rep_print_all(env, flags) rep = db_rep->region; infop = env->reginfo; renv = infop->primary; - ENV_ENTER(env, ip); __db_msg(env, "%s", DB_GLOBAL(db_line)); __db_msg(env, "DB_REP handle information:"); if (db_rep->rep_db == NULL) STAT_ISSET("Bookkeeping database", db_rep->rep_db); - else + else { + ENV_GET_THREAD_INFO(env, ip); (void)__db_stat_print(db_rep->rep_db, ip, flags); + } __db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags"); @@ -604,7 +621,6 @@ __rep_print_all(env, flags) STAT_LONG("Maximum lease timestamp microseconds", lp->max_lease_ts.tv_nsec / NS_PER_US); MUTEX_UNLOCK(env, rep->mtx_clientdb); - 
ENV_LEAVE(env, ip); return (0); } @@ -648,8 +664,10 @@ __rep_stat_summary_print(env) ret = 0; if ((ret = __rep_stat(env, &sp, 0)) == 0) { PRINT_STATUS(sp, is_client); - if (is_client) + if (is_client) { PRINT_STARTUPCOMPLETE(sp); + PRINT_VIEW(sp); + } PRINT_MAXPERMLSN(sp); /* * Use the number of sites that is kept up-to-date most diff --git a/src/rep/rep_stub.c b/src/rep/rep_stub.c index 2d96ea59..51c79eb0 100644 --- a/src/rep/rep_stub.c +++ b/src/rep/rep_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -130,7 +130,7 @@ __rep_elect_pp(dbenv, nsites, nvotes, flags) } int -__rep_flush(dbenv) +__rep_flush_pp(dbenv) DB_ENV *dbenv; { return (__db_norep(dbenv->env)); @@ -201,7 +201,7 @@ __rep_get_nsites(dbenv, n) } int -__rep_set_priority(dbenv, priority) +__rep_set_priority_pp(dbenv, priority) DB_ENV *dbenv; u_int32_t priority; { @@ -219,7 +219,7 @@ __rep_get_priority(dbenv, priority) } int -__rep_set_timeout(dbenv, which, timeout) +__rep_set_timeout_pp(dbenv, which, timeout) DB_ENV *dbenv; int which; db_timeout_t timeout; @@ -342,6 +342,16 @@ __rep_set_transport_pp(dbenv, eid, f_send) } int +__rep_set_view(dbenv, f_partial) + DB_ENV *dbenv; + int (*f_partial) __P((DB_ENV *, + const char *, int *, u_int32_t)); +{ + COMPQUIET(f_partial, NULL); + return (__db_norep(dbenv->env)); +} + +int __rep_set_request(dbenv, min, max) DB_ENV *dbenv; u_int32_t min, max; diff --git a/src/rep/rep_util.c b/src/rep/rep_util.c index 0dfe6122..5ee2592f 100644 --- a/src/rep/rep_util.c +++ b/src/rep/rep_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -11,6 +11,7 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/db_am.h" +#include "dbinc/fop.h" #include "dbinc/mp.h" #include "dbinc/txn.h" @@ -437,7 +438,7 @@ __rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags) FLD_ISSET(ctlflags, REPCTL_LEASE | REPCTL_PERM)) { F_SET(&cntrl, REPCTL_LEASE); DB_ASSERT(env, rep->version == DB_REPVERSION); - __os_gettime(env, &msg_time, 1); + __os_gettime(env, &msg_time, 0); cntrl.msg_sec = (u_int32_t)msg_time.tv_sec; cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec; } @@ -591,6 +592,15 @@ __rep_new_master(env, cntrl, eid) ret = 0; logc = NULL; lockout_msg = 0; + + /* + * If REP_F_HOLD_GEN is set, we want to keep this site at its + * current gen. Do not process an incoming NEWMASTER, which + * would change the gen. + */ + if (F_ISSET(rep, REP_F_HOLD_GEN)) + return (ret); + REP_SYSTEM_LOCK(env); change = rep->gen != cntrl->gen || rep->master_id != eid; /* @@ -1128,6 +1138,8 @@ __env_db_rep_exit(env) rep = db_rep->region; REP_SYSTEM_LOCK(env); + /* If we have a reference, it better not already be 0. */ + DB_ASSERT(env, rep->handle_cnt != 0); rep->handle_cnt--; REP_SYSTEM_UNLOCK(env); @@ -1190,7 +1202,7 @@ __db_rep_enter(dbp, checkgen, checklock, return_now) * get an exclusive lock on this database. */ if (checkgen && dbp->mpf->mfp && IS_REP_CLIENT(env)) { - if (dbp->mpf->mfp->excl_lockout) + if (dbp->mpf->mfp->excl_lockout) return (DB_REP_HANDLE_DEAD); } @@ -1328,7 +1340,8 @@ __op_rep_exit(env) rep = db_rep->region; REP_SYSTEM_LOCK(env); - DB_ASSERT(env, rep->op_cnt > 0); + /* If we have a reference, it better not already be 0. 
*/ + DB_ASSERT(env, rep->op_cnt != 0); rep->op_cnt--; REP_SYSTEM_UNLOCK(env); @@ -1697,7 +1710,9 @@ __rep_msg_to_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * 4.2/DB_REPVERSION 1 no longer supported. */ @@ -1708,7 +1723,9 @@ __rep_msg_to_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * 4.3/DB_REPVERSION 2 no longer supported. */ @@ -1719,7 +1736,9 @@ __rep_msg_to_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * From 4.7 message number To 4.4/4.5 message number */ @@ -1727,6 +1746,11 @@ __rep_msg_to_old(version, rectype) 1, /* REP_ALIVE */ 2, /* REP_ALIVE_REQ */ 3, /* REP_ALL_REQ */ + REP_INVALID, /* REP_BLOB_ALL_REQ */ + REP_INVALID, /* REP_BLOB_CHUNK */ + REP_INVALID, /* REP_BLOB_CHUNK_REQ */ + REP_INVALID, /* REP_BLOB_UPDATE */ + REP_INVALID, /* REP_BLOB_UPDATE_REQ */ 4, /* REP_BULK_LOG */ 5, /* REP_BULK_PAGE */ 6, /* REP_DUPMASTER */ @@ -1765,6 +1789,11 @@ __rep_msg_to_old(version, rectype) 1, /* REP_ALIVE */ 2, /* REP_ALIVE_REQ */ 3, /* REP_ALL_REQ */ + REP_INVALID, /* REP_BLOB_ALL_REQ */ + REP_INVALID, /* REP_BLOB_CHUNK */ + 
REP_INVALID, /* REP_BLOB_CHUNK_REQ */ + REP_INVALID, /* REP_BLOB_UPDATE */ + REP_INVALID, /* REP_BLOB_UPDATE_REQ */ 4, /* REP_BULK_LOG */ 5, /* REP_BULK_PAGE */ 6, /* REP_DUPMASTER */ @@ -1803,6 +1832,11 @@ __rep_msg_to_old(version, rectype) 1, /* REP_ALIVE */ 2, /* REP_ALIVE_REQ */ 3, /* REP_ALL_REQ */ + REP_INVALID, /* REP_BLOB_ALL_REQ */ + REP_INVALID, /* REP_BLOB_CHUNK */ + REP_INVALID, /* REP_BLOB_CHUNK_REQ */ + REP_INVALID, /* REP_BLOB_UPDATE */ + REP_INVALID, /* REP_BLOB_UPDATE_REQ */ 4, /* REP_BULK_LOG */ 5, /* REP_BULK_PAGE */ 6, /* REP_DUPMASTER */ @@ -1841,6 +1875,53 @@ __rep_msg_to_old(version, rectype) 1, /* REP_ALIVE */ 2, /* REP_ALIVE_REQ */ 3, /* REP_ALL_REQ */ + REP_INVALID, /* REP_BLOB_ALL_REQ */ + REP_INVALID, /* REP_BLOB_CHUNK */ + REP_INVALID, /* REP_BLOB_CHUNK_REQ */ + REP_INVALID, /* REP_BLOB_UPDATE */ + REP_INVALID, /* REP_BLOB_UPDATE_REQ */ + 4, /* REP_BULK_LOG */ + 5, /* REP_BULK_PAGE */ + 6, /* REP_DUPMASTER */ + 7, /* REP_FILE */ + 8, /* REP_FILE_FAIL */ + 9, /* REP_FILE_REQ */ + 10, /* REP_LEASE_GRANT */ + 11, /* REP_LOG */ + 12, /* REP_LOG_MORE */ + 13, /* REP_LOG_REQ */ + 14, /* REP_MASTER_REQ */ + 15, /* REP_NEWCLIENT */ + 16, /* REP_NEWFILE */ + 17, /* REP_NEWMASTER */ + 18, /* REP_NEWSITE */ + 19, /* REP_PAGE */ + 20, /* REP_PAGE_FAIL */ + 21, /* REP_PAGE_MORE */ + 22, /* REP_PAGE_REQ */ + 23, /* REP_REREQUEST */ + 24, /* REP_START_SYNC */ + 25, /* REP_UPDATE */ + 26, /* REP_UPDATE_REQ */ + 27, /* REP_VERIFY */ + 28, /* REP_VERIFY_FAIL */ + 29, /* REP_VERIFY_REQ */ + 30, /* REP_VOTE1 */ + 31 /* REP_VOTE2 */ + }, + /* + * From 6.1 message number To 5.3 message number. Messages + * handling BLOBs were added. 
+ */ + { REP_INVALID, /* NO message 0 */ + 1, /* REP_ALIVE */ + 2, /* REP_ALIVE_REQ */ + 3, /* REP_ALL_REQ */ + REP_INVALID, /* REP_BLOB_ALL_REQ */ + REP_INVALID, /* REP_BLOB_CHUNK */ + REP_INVALID, /* REP_BLOB_CHUNK_REQ */ + REP_INVALID, /* REP_BLOB_UPDATE */ + REP_INVALID, /* REP_BLOB_UPDATE_REQ */ 4, /* REP_BULK_LOG */ 5, /* REP_BULK_PAGE */ 6, /* REP_DUPMASTER */ @@ -1901,7 +1982,9 @@ __rep_msg_from_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * 4.2/DB_REPVERSION 1 no longer supported. */ @@ -1912,7 +1995,9 @@ __rep_msg_from_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * 4.3/DB_REPVERSION 2 no longer supported. 
*/ @@ -1923,7 +2008,9 @@ __rep_msg_from_old(version, rectype) REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, - REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID }, /* * From 4.4/4.5 message number To 4.7 message number */ @@ -1931,36 +2018,41 @@ __rep_msg_from_old(version, rectype) 1, /* 1, REP_ALIVE */ 2, /* 2, REP_ALIVE_REQ */ 3, /* 3, REP_ALL_REQ */ - 4, /* 4, REP_BULK_LOG */ - 5, /* 5, REP_BULK_PAGE */ - 6, /* 6, REP_DUPMASTER */ - 7, /* 7, REP_FILE */ - 8, /* 8, REP_FILE_FAIL */ - 9, /* 9, REP_FILE_REQ */ - /* 10, REP_LEASE_GRANT doesn't exist */ - 11, /* 10, REP_LOG */ - 12, /* 11, REP_LOG_MORE */ - 13, /* 12, REP_LOG_REQ */ - 14, /* 13, REP_MASTER_REQ */ - 15, /* 14, REP_NEWCLIENT */ - 16, /* 15, REP_NEWFILE */ - 17, /* 16, REP_NEWMASTER */ - 18, /* 17, REP_NEWSITE */ - 19, /* 18, REP_PAGE */ - 20, /* 19, REP_PAGE_FAIL */ - 21, /* 20, REP_PAGE_MORE */ - 22, /* 21, REP_PAGE_REQ */ - 23, /* 22, REP_REREQUEST */ - /* 24, REP_START_SYNC doesn't exist */ - 25, /* 23, REP_UPDATE */ - 26, /* 24, REP_UPDATE_REQ */ - 27, /* 25, REP_VERIFY */ - 28, /* 26, REP_VERIFY_FAIL */ - 29, /* 27, REP_VERIFY_REQ */ - 30, /* 28, REP_VOTE1 */ - 31, /* 29, REP_VOTE2 */ + 9, /* 4, REP_BULK_LOG */ + 10, /* 5, REP_BULK_PAGE */ + 11, /* 6, REP_DUPMASTER */ + 12, /* 7, REP_FILE */ + 13, /* 8, REP_FILE_FAIL */ + 14, /* 9, REP_FILE_REQ */ + /* 15, REP_LEASE_GRANT doesn't exist */ + 16, /* 10, REP_LOG */ + 17, /* 11, REP_LOG_MORE */ + 18, /* 12, REP_LOG_REQ */ + 19, /* 13, REP_MASTER_REQ */ + 20, /* 14, REP_NEWCLIENT */ + 21, /* 15, REP_NEWFILE */ + 22, /* 16, REP_NEWMASTER */ + 23, /* 17, REP_NEWSITE */ + 24, /* 18, REP_PAGE */ + 25, /* 19, REP_PAGE_FAIL */ + 26, /* 20, REP_PAGE_MORE */ + 27, /* 21, REP_PAGE_REQ */ + 28, /* 22, REP_REREQUEST */ + /* 29, 
REP_START_SYNC doesn't exist */ + 30, /* 23, REP_UPDATE */ + 31, /* 24, REP_UPDATE_REQ */ + 32, /* 25, REP_VERIFY */ + 33, /* 26, REP_VERIFY_FAIL */ + 34, /* 27, REP_VERIFY_REQ */ + 35, /* 28, REP_VOTE1 */ + 36, /* 29, REP_VOTE2 */ REP_INVALID, /* 30, 4.4/4.5 no message */ - REP_INVALID /* 31, 4.4/4.5 no message */ + REP_INVALID, /* 31, 4.4/4.5 no message */ + REP_INVALID, /* 32, 4.4/4.5 no message */ + REP_INVALID, /* 33, 4.4/4.5 no message */ + REP_INVALID, /* 34, 4.4/4.5 no message */ + REP_INVALID, /* 35, 4.4/4.5 no message */ + REP_INVALID /* 36, 4.4/4.5 no message */ }, /* * From 4.6 message number To 4.7 message number. There are @@ -1971,34 +2063,39 @@ __rep_msg_from_old(version, rectype) 1, /* 1, REP_ALIVE */ 2, /* 2, REP_ALIVE_REQ */ 3, /* 3, REP_ALL_REQ */ - 4, /* 4, REP_BULK_LOG */ - 5, /* 5, REP_BULK_PAGE */ - 6, /* 6, REP_DUPMASTER */ - 7, /* 7, REP_FILE */ - 8, /* 8, REP_FILE_FAIL */ - 9, /* 9, REP_FILE_REQ */ - 10, /* 10, REP_LEASE_GRANT */ - 11, /* 11, REP_LOG */ - 12, /* 12, REP_LOG_MORE */ - 13, /* 13, REP_LOG_REQ */ - 14, /* 14, REP_MASTER_REQ */ - 15, /* 15, REP_NEWCLIENT */ - 16, /* 16, REP_NEWFILE */ - 17, /* 17, REP_NEWMASTER */ - 18, /* 18, REP_NEWSITE */ - 19, /* 19, REP_PAGE */ - 20, /* 20, REP_PAGE_FAIL */ - 21, /* 21, REP_PAGE_MORE */ - 22, /* 22, REP_PAGE_REQ */ - 23, /* 22, REP_REREQUEST */ - 24, /* 24, REP_START_SYNC */ - 25, /* 25, REP_UPDATE */ - 26, /* 26, REP_UPDATE_REQ */ - 27, /* 27, REP_VERIFY */ - 28, /* 28, REP_VERIFY_FAIL */ - 29, /* 29, REP_VERIFY_REQ */ - 30, /* 30, REP_VOTE1 */ - 31 /* 31, REP_VOTE2 */ + 9, /* 4, REP_BULK_LOG */ + 10, /* 5, REP_BULK_PAGE */ + 11, /* 6, REP_DUPMASTER */ + 12, /* 7, REP_FILE */ + 13, /* 8, REP_FILE_FAIL */ + 14, /* 9, REP_FILE_REQ */ + 15, /* 10, REP_LEASE_GRANT */ + 16, /* 11, REP_LOG */ + 17, /* 12, REP_LOG_MORE */ + 18, /* 13, REP_LOG_REQ */ + 19, /* 14, REP_MASTER_REQ */ + 20, /* 15, REP_NEWCLIENT */ + 21, /* 16, REP_NEWFILE */ + 22, /* 17, REP_NEWMASTER */ + 23, /* 18, REP_NEWSITE */ 
+ 24, /* 19, REP_PAGE */ + 25, /* 20, REP_PAGE_FAIL */ + 26, /* 21, REP_PAGE_MORE */ + 27, /* 22, REP_PAGE_REQ */ + 28, /* 22, REP_REREQUEST */ + 29, /* 24, REP_START_SYNC */ + 30, /* 25, REP_UPDATE */ + 31, /* 26, REP_UPDATE_REQ */ + 32, /* 27, REP_VERIFY */ + 33, /* 28, REP_VERIFY_FAIL */ + 34, /* 29, REP_VERIFY_REQ */ + 35, /* 30, REP_VOTE1 */ + 36, /* 31, REP_VOTE2 */ + REP_INVALID, /* 32, 4.6/4.7 no message */ + REP_INVALID, /* 33, 4.6/4.7 no message */ + REP_INVALID, /* 34, 4.6/4.7 no message */ + REP_INVALID, /* 35, 4.6/4.7 no message */ + REP_INVALID /* 36, 4.6/4.7 no message */ }, /* * From 4.7 message number To 5.2 message number. There are @@ -2009,34 +2106,39 @@ __rep_msg_from_old(version, rectype) 1, /* 1, REP_ALIVE */ 2, /* 2, REP_ALIVE_REQ */ 3, /* 3, REP_ALL_REQ */ - 4, /* 4, REP_BULK_LOG */ - 5, /* 5, REP_BULK_PAGE */ - 6, /* 6, REP_DUPMASTER */ - 7, /* 7, REP_FILE */ - 8, /* 8, REP_FILE_FAIL */ - 9, /* 9, REP_FILE_REQ */ - 10, /* 10, REP_LEASE_GRANT */ - 11, /* 11, REP_LOG */ - 12, /* 12, REP_LOG_MORE */ - 13, /* 13, REP_LOG_REQ */ - 14, /* 14, REP_MASTER_REQ */ - 15, /* 15, REP_NEWCLIENT */ - 16, /* 16, REP_NEWFILE */ - 17, /* 17, REP_NEWMASTER */ - 18, /* 18, REP_NEWSITE */ - 19, /* 19, REP_PAGE */ - 20, /* 20, REP_PAGE_FAIL */ - 21, /* 21, REP_PAGE_MORE */ - 22, /* 22, REP_PAGE_REQ */ - 23, /* 22, REP_REREQUEST */ - 24, /* 24, REP_START_SYNC */ - 25, /* 25, REP_UPDATE */ - 26, /* 26, REP_UPDATE_REQ */ - 27, /* 27, REP_VERIFY */ - 28, /* 28, REP_VERIFY_FAIL */ - 29, /* 29, REP_VERIFY_REQ */ - 30, /* 30, REP_VOTE1 */ - 31 /* 31, REP_VOTE2 */ + 9, /* 4, REP_BULK_LOG */ + 10, /* 5, REP_BULK_PAGE */ + 11, /* 6, REP_DUPMASTER */ + 12, /* 7, REP_FILE */ + 13, /* 8, REP_FILE_FAIL */ + 14, /* 9, REP_FILE_REQ */ + 15, /* 10, REP_LEASE_GRANT */ + 16, /* 11, REP_LOG */ + 17, /* 12, REP_LOG_MORE */ + 18, /* 13, REP_LOG_REQ */ + 19, /* 14, REP_MASTER_REQ */ + 20, /* 15, REP_NEWCLIENT */ + 21, /* 16, REP_NEWFILE */ + 22, /* 17, REP_NEWMASTER */ + 23, /* 18, 
REP_NEWSITE */ + 24, /* 19, REP_PAGE */ + 25, /* 20, REP_PAGE_FAIL */ + 26, /* 21, REP_PAGE_MORE */ + 27, /* 22, REP_PAGE_REQ */ + 28, /* 22, REP_REREQUEST */ + 29, /* 24, REP_START_SYNC */ + 30, /* 25, REP_UPDATE */ + 31, /* 26, REP_UPDATE_REQ */ + 32, /* 27, REP_VERIFY */ + 33, /* 28, REP_VERIFY_FAIL */ + 34, /* 29, REP_VERIFY_REQ */ + 35, /* 30, REP_VOTE1 */ + 36, /* 31, REP_VOTE2 */ + REP_INVALID, /* 32, 4.7/5.2 no message */ + REP_INVALID, /* 33, 4.7/5.2 no message */ + REP_INVALID, /* 34, 4.7/5.2 no message */ + REP_INVALID, /* 35, 4.7/5.2 no message */ + REP_INVALID /* 36, 4.7/5.2 no message */ }, /* * From 4.7 message number To 5.3 message number. There are @@ -2047,34 +2149,86 @@ __rep_msg_from_old(version, rectype) 1, /* 1, REP_ALIVE */ 2, /* 2, REP_ALIVE_REQ */ 3, /* 3, REP_ALL_REQ */ - 4, /* 4, REP_BULK_LOG */ - 5, /* 5, REP_BULK_PAGE */ - 6, /* 6, REP_DUPMASTER */ - 7, /* 7, REP_FILE */ - 8, /* 8, REP_FILE_FAIL */ - 9, /* 9, REP_FILE_REQ */ - 10, /* 10, REP_LEASE_GRANT */ - 11, /* 11, REP_LOG */ - 12, /* 12, REP_LOG_MORE */ - 13, /* 13, REP_LOG_REQ */ - 14, /* 14, REP_MASTER_REQ */ - 15, /* 15, REP_NEWCLIENT */ - 16, /* 16, REP_NEWFILE */ - 17, /* 17, REP_NEWMASTER */ - 18, /* 18, REP_NEWSITE */ - 19, /* 19, REP_PAGE */ - 20, /* 20, REP_PAGE_FAIL */ - 21, /* 21, REP_PAGE_MORE */ - 22, /* 22, REP_PAGE_REQ */ - 23, /* 22, REP_REREQUEST */ - 24, /* 24, REP_START_SYNC */ - 25, /* 25, REP_UPDATE */ - 26, /* 26, REP_UPDATE_REQ */ - 27, /* 27, REP_VERIFY */ - 28, /* 28, REP_VERIFY_FAIL */ - 29, /* 29, REP_VERIFY_REQ */ - 30, /* 30, REP_VOTE1 */ - 31 /* 31, REP_VOTE2 */ + 9, /* 4, REP_BULK_LOG */ + 10, /* 5, REP_BULK_PAGE */ + 11, /* 6, REP_DUPMASTER */ + 12, /* 7, REP_FILE */ + 13, /* 8, REP_FILE_FAIL */ + 14, /* 9, REP_FILE_REQ */ + 15, /* 10, REP_LEASE_GRANT */ + 16, /* 11, REP_LOG */ + 17, /* 12, REP_LOG_MORE */ + 18, /* 13, REP_LOG_REQ */ + 19, /* 14, REP_MASTER_REQ */ + 20, /* 15, REP_NEWCLIENT */ + 21, /* 16, REP_NEWFILE */ + 22, /* 17, REP_NEWMASTER */ 
+ 23, /* 18, REP_NEWSITE */ + 24, /* 19, REP_PAGE */ + 25, /* 20, REP_PAGE_FAIL */ + 26, /* 21, REP_PAGE_MORE */ + 27, /* 22, REP_PAGE_REQ */ + 28, /* 23, REP_REREQUEST */ + 29, /* 24, REP_START_SYNC */ + 30, /* 25, REP_UPDATE */ + 31, /* 26, REP_UPDATE_REQ */ + 32, /* 27, REP_VERIFY */ + 33, /* 28, REP_VERIFY_FAIL */ + 34, /* 29, REP_VERIFY_REQ */ + 35, /* 30, REP_VOTE1 */ + 36, /* 31, REP_VOTE2 */ + REP_INVALID, /* 32, 4.7/5.3 no message */ + REP_INVALID, /* 33, 4.7/5.3 no message */ + REP_INVALID, /* 34, 4.7/5.3 no message */ + REP_INVALID, /* 35, 4.7/5.3 no message */ + REP_INVALID /* 36, 4.7/5.3 no message */ + }, + /* + * From 5.3 message number To 6.1 message number. Messages to + * handle BLOBs were added. + */ + { REP_INVALID, /* NO message 0 */ + 1, /* 1, REP_ALIVE */ + 2, /* 2, REP_ALIVE_REQ */ + 3, /* 3, REP_ALL_REQ */ + /* 4, REP_BLOB_ALL_REQ doesn't exist */ + /* 5, REP_BLOB_CHUNK doesn't exist */ + /* 6, REP_BLOB_CHUNK_REQ doesn't exist */ + /* 7, REP_BLOB_UPDATE doesn't exist */ + /* 8, REP_BLOB_UPDATE_REQ doesn't exist */ + 9, /* 4, REP_BULK_LOG */ + 10, /* 5, REP_BULK_PAGE */ + 11, /* 6, REP_DUPMASTER */ + 12, /* 7, REP_FILE */ + 13, /* 8, REP_FILE_FAIL */ + 14, /* 9, REP_FILE_REQ */ + 15, /* 10, REP_LEASE_GRANT */ + 16, /* 11, REP_LOG */ + 17, /* 12, REP_LOG_MORE */ + 18, /* 13, REP_LOG_REQ */ + 19, /* 14, REP_MASTER_REQ */ + 20, /* 15, REP_NEWCLIENT */ + 21, /* 16, REP_NEWFILE */ + 22, /* 17, REP_NEWMASTER */ + 23, /* 18, REP_NEWSITE */ + 24, /* 19, REP_PAGE */ + 25, /* 20, REP_PAGE_FAIL */ + 26, /* 21, REP_PAGE_MORE */ + 27, /* 22, REP_PAGE_REQ */ + 28, /* 23, REP_REREQUEST */ + 29, /* 24, REP_START_SYNC */ + 30, /* 25, REP_UPDATE */ + 31, /* 26, REP_UPDATE_REQ */ + 32, /* 27, REP_VERIFY */ + 33, /* 28, REP_VERIFY_FAIL */ + 34, /* 29, REP_VERIFY_REQ */ + 35, /* 30, REP_VOTE1 */ + 36, /* 31, REP_VOTE2 */ + REP_INVALID, /* 32, 5.3/6.1 no message */ + REP_INVALID, /* 33, 5.3/6.1 no message */ + REP_INVALID, /* 34, 5.3/6.1 no message */ + 
REP_INVALID, /* 35, 5.3/6.1 no message */ + REP_INVALID /* 36, 5.3/6.1 no message */ } }; return (table[version][rectype]); @@ -2215,9 +2369,9 @@ __rep_print_int(env, verbose, fmt, ap) __os_id(env->dbenv, &pid, &tid); if (diag_msg) MUTEX_LOCK(env, rep->mtx_diag); - __os_gettime(env, &ts, 1); + __os_gettime(env, &ts, 0); __db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ", - (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US, + (u_long)ts.tv_sec, (u_long)ts.tv_nsec / NS_PER_US, env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s); __db_msgadd_ap(env, &mb, fmt, ap); @@ -2260,6 +2414,26 @@ __rep_print_message(env, eid, rp, str, flags) FLD_SET(verbflag, DB_VERB_REP_MISC); type = "all_req"; break; + case REP_BLOB_ALL_REQ: + FLD_SET(verbflag, DB_VERB_REP_MISC); + type = "all_blob_req"; + break; + case REP_BLOB_CHUNK: + FLD_SET(verbflag, DB_VERB_REP_MISC); + type = "blob_chunk"; + break; + case REP_BLOB_CHUNK_REQ: + FLD_SET(verbflag, DB_VERB_REP_MISC); + type = "blob_chunk_req"; + break; + case REP_BLOB_UPDATE: + FLD_SET(verbflag, DB_VERB_REP_MISC); + type = "blob_update"; + break; + case REP_BLOB_UPDATE_REQ: + FLD_SET(verbflag, DB_VERB_REP_MISC); + type = "blob_update_req"; + break; case REP_BULK_LOG: FLD_SET(verbflag, DB_VERB_REP_MISC); type = "bulk_log"; @@ -2650,9 +2824,19 @@ __rep_log_backup(env, logc, lsn, match) */ if ((match == REP_REC_COMMIT && rectype == DB___txn_regop) || - (match == REP_REC_PERM && - (rectype == DB___txn_ckp || rectype == DB___txn_regop))) + ((match == REP_REC_PERM || match == REP_REC_PERM_DEL) && + IS_PERM_RECTYPE(rectype))) break; + /* + * Break early if a file remove is discovered in the logs. + * BDB cannot restore a deleted database or blob file from + * logs, so trigger internal init to recover the file. + * Used by Instant Internal Init in replication. 
+ */ + if (match == REP_REC_PERM_DEL && rectype == DB___fop_remove) { + ret = DB_NOTFOUND; + break; + } } return (ret); } @@ -2671,7 +2855,6 @@ __rep_get_maxpermlsn(env, max_perm_lsnp) { DB_LOG *dblp; DB_REP *db_rep; - DB_THREAD_INFO *ip; LOG *lp; REP *rep; @@ -2680,11 +2863,9 @@ __rep_get_maxpermlsn(env, max_perm_lsnp) dblp = env->lg_handle; lp = dblp->reginfo.primary; - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_clientdb); *max_perm_lsnp = lp->max_perm_lsn; MUTEX_UNLOCK(env, rep->mtx_clientdb); - ENV_LEAVE(env, ip); return (0); } @@ -2724,12 +2905,13 @@ __rep_get_datagen(env, data_genp) u_int8_t data_buf[__REP_LSN_HIST_DATA_SIZE]; DBT key_dbt, data_dbt; u_int32_t flags; - int ret, t_ret, tries; + int ret, t_ret, tries, was_open; db_rep = env->rep_handle; ret = 0; *data_genp = 0; tries = 0; + was_open = 0; flags = DB_LAST; retry: if ((ret = __txn_begin(env, NULL, NULL, &txn, DB_IGNORE_LEASE)) != 0) @@ -2746,10 +2928,10 @@ retry: * That is not an error. */ ret = 0; - goto out; + goto noclose; } - db_rep->lsn_db = dbp; - } + } else + was_open = 1; if ((ret = __db_cursor(dbp, NULL, txn, &dbc, 0)) != 0) goto out; @@ -2784,8 +2966,126 @@ retry: &key, key_buf, __REP_LSN_HIST_KEY_SIZE, NULL)) == 0) *data_genp = key.gen; out: + if (!was_open && dbp != NULL && + (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; +noclose: if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0) ret = t_ret; err: return (ret); } + +/* + * __rep_become_readonly_master -- + * + * Put this master into a state where it no longer accepts writes but it + * is still a master that can respond to requests for missing messages. + * It fills in sync_lsn to provide a mechanism to know the LSN of the + * next log record expected on this site. Generally, this site should + * be restarted as a client shortly after becoming a readonly master. 
+ * + * PUBLIC: int __rep_become_readonly_master + * PUBLIC: __P((ENV *, u_int32_t *, DB_LSN *)); + */ +int +__rep_become_readonly_master(env, gen, sync_lsnp) + ENV *env; + u_int32_t *gen; + DB_LSN *sync_lsnp; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + int locked, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + *gen = 0; + ZERO_LSN(*sync_lsnp); + ret = 0; + locked = 0; + + REP_SYSTEM_LOCK(env); + /* + * Lock out replication message thread processing so that replication + * world won't change (e.g. restart, client sync). + */ + if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) { + /* There is already someone in msg lockout, return. */ + RPRINT(env, (env, DB_VERB_REP_MISC, + "Readonly master: thread already in msg lockout")); + goto errunlock; + } else if ((ret = __rep_lockout_msg(env, rep, 0)) != 0) + goto errclearlockouts; + + /* + * Lock out API to wait for active txn/mpool operations to complete + * and prevent new ones from starting. + */ + if ((ret = __rep_lockout_api(env, rep)) != 0) + goto errclearlockouts; + locked = 1; + + /* Make this site a readonly master and get master generation. */ + F_SET(rep, REP_F_READONLY_MASTER); + *gen = rep->gen; + REP_SYSTEM_UNLOCK(env); + + /* Get the next log record the logging subsystem expects to write. */ + LOG_SYSTEM_LOCK(env); + *sync_lsnp = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + + REP_SYSTEM_LOCK(env); +errclearlockouts: + FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG); + if (locked) + CLR_LOCKOUT_BDB(rep); +errunlock: + REP_SYSTEM_UNLOCK(env); + return (ret); +} + +/* + * __rep_get_lsnhist_data -- + * + * A utility function to get the full LSN history database record for a + * particular gen. 
+ * + * PUBLIC: int __rep_get_lsnhist_data __P((ENV *, DB_THREAD_INFO *, + * PUBLIC: u_int32_t, __rep_lsn_hist_data_args *)); + */ +int +__rep_get_lsnhist_data(env, ip, gen, lsnhist_data) + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t gen; + __rep_lsn_hist_data_args *lsnhist_data; +{ + DB_TXN *txn; + DBC *dbc; + struct rep_waitgoal reason; + int ret, t_ret; + + txn = NULL; + dbc = NULL; + + /* + * Cannot use cached LSN history values because we need the + * timestamp value here, which is not cached. + */ + ret = __rep_read_lsn_history(env, + ip, &txn, &dbc, gen, lsnhist_data, &reason, DB_SET, 0); + + if (dbc != NULL && + (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + if (txn != NULL && + (t_ret = __db_txn_auto_resolve(env, txn, 1, ret)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} diff --git a/src/rep/rep_verify.c b/src/rep/rep_verify.c index 5238f900..40a0dfce 100644 --- a/src/rep/rep_verify.c +++ b/src/rep/rep_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -119,8 +119,15 @@ __rep_verify(env, rp, rec, eid, savetime) goto out; } } + /* + * Search for a matching perm record. If none is found, + * or a database or file delete is encountered before the + * perm record, begin internal init. Database and blob file + * deletes cannot be undone once committed, so internal init + * must be used to re-create the files. 
+ */ if ((ret = __rep_log_backup(env, logc, &lsn, - REP_REC_PERM)) == 0) { + REP_REC_PERM_DEL)) == 0) { MUTEX_LOCK(env, rep->mtx_clientdb); lp->verify_lsn = lsn; __os_gettime(env, &lp->rcvd_ts, 1); @@ -205,8 +212,10 @@ __rep_internal_init(env, abbrev) u_int32_t abbrev; { REP *rep; + u_int32_t ctlflags; int master, ret; + ctlflags = 0; rep = env->rep_handle->region; REP_SYSTEM_LOCK(env); #ifdef HAVE_STATISTICS @@ -227,6 +236,7 @@ __rep_internal_init(env, abbrev) RPRINT(env, (env, DB_VERB_REP_SYNC, "send UPDATE_REQ, merely to check for NIMDB refresh")); F_SET(rep, REP_F_ABBREVIATED); + FLD_SET(ctlflags, REPCTL_INMEM_ONLY); } else F_CLR(rep, REP_F_ABBREVIATED); ZERO_LSN(rep->first_lsn); @@ -237,7 +247,7 @@ __rep_internal_init(env, abbrev) REP_SYSTEM_UNLOCK(env); if (ret == 0 && master != DB_EID_INVALID) (void)__rep_send_message(env, - master, REP_UPDATE_REQ, NULL, NULL, 0, 0); + master, REP_UPDATE_REQ, NULL, NULL, ctlflags, 0); return (ret); } @@ -504,8 +514,7 @@ __rep_dorecovery(env, lsnp, trunclsnp) */ DB_ASSERT(env, rep->op_cnt == 0); DB_ASSERT(env, rep->msg_th == 1); - if (rectype == DB___txn_regop || rectype == DB___txn_ckp || - rectype == DB___dbreg_register) + if (IS_PERM_RECTYPE(rectype) || rectype == DB___dbreg_register) skip_rec = 0; if (rectype == DB___txn_regop) { if (rep->version >= DB_REPVERSION_44) { @@ -653,8 +662,10 @@ __rep_verify_match(env, reclsnp, savetime) /* * Lockout the API and wait for operations to complete. */ - if ((ret = __rep_lockout_api(env, rep)) != 0) + if ((ret = __rep_lockout_api(env, rep)) != 0) { + FLD_CLR(rep->lockout_flags, REP_LOCKOUT_MSG); goto errunlock; + } /* OK, everyone is out, we can now run recovery. 
*/ REP_SYSTEM_UNLOCK(env); @@ -690,6 +701,10 @@ __rep_verify_match(env, reclsnp, savetime) */ if (db_rep->rep_db == NULL && (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { + REP_SYSTEM_LOCK(env); + FLD_CLR(rep->lockout_flags, + REP_LOCKOUT_API | REP_LOCKOUT_MSG | REP_LOCKOUT_OP); + REP_SYSTEM_UNLOCK(env); MUTEX_UNLOCK(env, rep->mtx_clientdb); goto out; } diff --git a/src/repmgr/repmgr.msg b/src/repmgr/repmgr.msg index 020f2e9c..ba544936 100644 --- a/src/repmgr/repmgr.msg +++ b/src/repmgr/repmgr.msg @@ -65,6 +65,11 @@ ARG port u_int16_t END BEGIN_MSG membership_data +ARG status u_int32_t +ARG flags u_int32_t +END + +BEGIN_MSG v4membership_data ARG flags u_int32_t END @@ -98,22 +103,51 @@ BEGIN_MSG membr_vers ARG version u_int32_t ARG gen u_int32_t END + BEGIN_MSG site_info check_length ARG host DBT ARG port u_int16_t +ARG status u_int32_t +ARG flags u_int32_t +END + +BEGIN_MSG v4site_info check_length +ARG host DBT +ARG port u_int16_t ARG flags u_int32_t END /* * If site A breaks or rejects a connection from site B, it first * tries to send B this message containing site A's currently known - * membership DB version. Site B can use this to decide what to do. - * If site B knows of a later version, it should retry the connection - * to site A later, polling at it until site A catches up. However, if - * site B's known version is less, it means that site B is no longer in - * the group, and so instead it should shut down and notify the application. + * membership DB version and site B's status in site A's membership DB. + * Site B can use them to decide what to do. If site B knows of a later + * version, it should retry the connection to site A later, polling + * until site A catches up. 
However, if site B's known version is + * less and site B's status is adding in site A's membership DB, it + * means that a badly-timed change of master may have caused the current + * master to lose B's membership DB update to present, so it should + * retry the connection to site A later, otherwise, site B is no longer + * in the group and it should shut down and notify the application. */ BEGIN_MSG connect_reject ARG version u_int32_t ARG gen u_int32_t +ARG status u_int32_t +END + +BEGIN_MSG v4connect_reject +ARG version u_int32_t +ARG gen u_int32_t +END + +/* + * For preferred master LSN history comparison between the sites. + * The next_gen_lsn is [0,0] if the next generation doesn't yet exist. + */ +BEGIN_MSG lsnhist_match +ARG lsn DB_LSN +ARG hist_sec u_int32_t +ARG hist_nsec u_int32_t +ARG next_gen_lsn DB_LSN END diff --git a/src/repmgr/repmgr.src b/src/repmgr/repmgr.src index 68d8c239..f42e159f 100644 --- a/src/repmgr/repmgr.src +++ b/src/repmgr/repmgr.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2010, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ DBPRIVATE diff --git a/src/repmgr/repmgr_automsg.c b/src/repmgr/repmgr_automsg.c index 90af08ff..31bc4c35 100644 --- a/src/repmgr/repmgr_automsg.c +++ b/src/repmgr/repmgr_automsg.c @@ -463,6 +463,7 @@ __repmgr_membership_data_marshal(env, argp, bp) __repmgr_membership_data_args *argp; u_int8_t *bp; { + DB_HTONL_COPYOUT(env, bp, argp->status); DB_HTONL_COPYOUT(env, bp, argp->flags); } @@ -481,6 +482,7 @@ __repmgr_membership_data_unmarshal(env, argp, bp, max, nextp) { if (max < __REPMGR_MEMBERSHIP_DATA_SIZE) goto too_few; + DB_NTOHL_COPYIN(env, argp->status, bp); DB_NTOHL_COPYIN(env, argp->flags, bp); if (nextp != NULL) @@ -494,6 +496,46 @@ too_few: } /* + * PUBLIC: void __repmgr_v4membership_data_marshal __P((ENV *, + * PUBLIC: __repmgr_v4membership_data_args *, u_int8_t *)); + */ +void +__repmgr_v4membership_data_marshal(env, argp, bp) + ENV *env; + __repmgr_v4membership_data_args *argp; + u_int8_t *bp; +{ + DB_HTONL_COPYOUT(env, bp, argp->flags); +} + +/* + * PUBLIC: int __repmgr_v4membership_data_unmarshal __P((ENV *, + * PUBLIC: __repmgr_v4membership_data_args *, u_int8_t *, size_t, + * PUBLIC: u_int8_t **)); + */ +int +__repmgr_v4membership_data_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __repmgr_v4membership_data_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REPMGR_V4MEMBERSHIP_DATA_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->flags, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __repmgr_v4membership_data message")); + return (EINVAL); +} + +/* * PUBLIC: void __repmgr_member_metadata_marshal __P((ENV *, * PUBLIC: __repmgr_member_metadata_args *, u_int8_t *)); */ @@ -669,6 +711,7 @@ __repmgr_site_info_marshal(env, argp, bp, max, lenp) bp += argp->host.size; } DB_HTONS_COPYOUT(env, bp, argp->port); + DB_HTONL_COPYOUT(env, bp, argp->status); DB_HTONL_COPYOUT(env, bp, argp->flags); *lenp = (size_t)(bp - start); @@ -702,6 
+745,7 @@ __repmgr_site_info_unmarshal(env, argp, bp, max, nextp) goto too_few; bp += argp->host.size; DB_NTOHS_COPYIN(env, argp->port, bp); + DB_NTOHL_COPYIN(env, argp->status, bp); DB_NTOHL_COPYIN(env, argp->flags, bp); if (nextp != NULL) @@ -715,6 +759,75 @@ too_few: } /* + * PUBLIC: int __repmgr_v4site_info_marshal __P((ENV *, + * PUBLIC: __repmgr_v4site_info_args *, u_int8_t *, size_t, size_t *)); + */ +int +__repmgr_v4site_info_marshal(env, argp, bp, max, lenp) + ENV *env; + __repmgr_v4site_info_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REPMGR_V4SITE_INFO_SIZE + + (size_t)argp->host.size) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->host.size); + if (argp->host.size > 0) { + memcpy(bp, argp->host.data, argp->host.size); + bp += argp->host.size; + } + DB_HTONS_COPYOUT(env, bp, argp->port); + DB_HTONL_COPYOUT(env, bp, argp->flags); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __repmgr_v4site_info_unmarshal __P((ENV *, + * PUBLIC: __repmgr_v4site_info_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__repmgr_v4site_info_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __repmgr_v4site_info_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + size_t needed; + + needed = __REPMGR_V4SITE_INFO_SIZE; + if (max < needed) + goto too_few; + DB_NTOHL_COPYIN(env, argp->host.size, bp); + if (argp->host.size == 0) + argp->host.data = NULL; + else + argp->host.data = bp; + needed += (size_t)argp->host.size; + if (max < needed) + goto too_few; + bp += argp->host.size; + DB_NTOHS_COPYIN(env, argp->port, bp); + DB_NTOHL_COPYIN(env, argp->flags, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __repmgr_v4site_info message")); + return (EINVAL); +} + +/* * PUBLIC: void __repmgr_connect_reject_marshal __P((ENV *, * PUBLIC: __repmgr_connect_reject_args *, u_int8_t *)); */ @@ 
-726,6 +839,7 @@ __repmgr_connect_reject_marshal(env, argp, bp) { DB_HTONL_COPYOUT(env, bp, argp->version); DB_HTONL_COPYOUT(env, bp, argp->gen); + DB_HTONL_COPYOUT(env, bp, argp->status); } /* @@ -744,6 +858,7 @@ __repmgr_connect_reject_unmarshal(env, argp, bp, max, nextp) goto too_few; DB_NTOHL_COPYIN(env, argp->version, bp); DB_NTOHL_COPYIN(env, argp->gen, bp); + DB_NTOHL_COPYIN(env, argp->status, bp); if (nextp != NULL) *nextp = bp; @@ -755,3 +870,94 @@ too_few: return (EINVAL); } +/* + * PUBLIC: void __repmgr_v4connect_reject_marshal __P((ENV *, + * PUBLIC: __repmgr_v4connect_reject_args *, u_int8_t *)); + */ +void +__repmgr_v4connect_reject_marshal(env, argp, bp) + ENV *env; + __repmgr_v4connect_reject_args *argp; + u_int8_t *bp; +{ + DB_HTONL_COPYOUT(env, bp, argp->version); + DB_HTONL_COPYOUT(env, bp, argp->gen); +} + +/* + * PUBLIC: int __repmgr_v4connect_reject_unmarshal __P((ENV *, + * PUBLIC: __repmgr_v4connect_reject_args *, u_int8_t *, size_t, + * PUBLIC: u_int8_t **)); + */ +int +__repmgr_v4connect_reject_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __repmgr_v4connect_reject_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REPMGR_V4CONNECT_REJECT_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->version, bp); + DB_NTOHL_COPYIN(env, argp->gen, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __repmgr_v4connect_reject message")); + return (EINVAL); +} + +/* + * PUBLIC: void __repmgr_lsnhist_match_marshal __P((ENV *, + * PUBLIC: __repmgr_lsnhist_match_args *, u_int8_t *)); + */ +void +__repmgr_lsnhist_match_marshal(env, argp, bp) + ENV *env; + __repmgr_lsnhist_match_args *argp; + u_int8_t *bp; +{ + DB_HTONL_COPYOUT(env, bp, argp->lsn.file); + DB_HTONL_COPYOUT(env, bp, argp->lsn.offset); + DB_HTONL_COPYOUT(env, bp, argp->hist_sec); + DB_HTONL_COPYOUT(env, bp, argp->hist_nsec); + DB_HTONL_COPYOUT(env, bp, argp->next_gen_lsn.file); + 
DB_HTONL_COPYOUT(env, bp, argp->next_gen_lsn.offset); +} + +/* + * PUBLIC: int __repmgr_lsnhist_match_unmarshal __P((ENV *, + * PUBLIC: __repmgr_lsnhist_match_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__repmgr_lsnhist_match_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __repmgr_lsnhist_match_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REPMGR_LSNHIST_MATCH_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->lsn.file, bp); + DB_NTOHL_COPYIN(env, argp->lsn.offset, bp); + DB_NTOHL_COPYIN(env, argp->hist_sec, bp); + DB_NTOHL_COPYIN(env, argp->hist_nsec, bp); + DB_NTOHL_COPYIN(env, argp->next_gen_lsn.file, bp); + DB_NTOHL_COPYIN(env, argp->next_gen_lsn.offset, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, DB_STR("3675", + "Not enough input bytes to fill a __repmgr_lsnhist_match message")); + return (EINVAL); +} + diff --git a/src/repmgr/repmgr_elect.c b/src/repmgr/repmgr_elect.c index 3a84694a..15a2de7b 100644 --- a/src/repmgr/repmgr_elect.c +++ b/src/repmgr/repmgr_elect.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -12,9 +12,9 @@ static db_timeout_t __repmgr_compute_response_time __P((ENV *)); static int __repmgr_elect __P((ENV *, u_int32_t, db_timespec *)); -static int __repmgr_elect_main __P((ENV *, REPMGR_RUNNABLE *)); +static int __repmgr_elect_main __P((ENV *, + DB_THREAD_INFO *, REPMGR_RUNNABLE *)); static void *__repmgr_elect_thread __P((void *)); -static int send_membership __P((ENV *)); /* * Starts an election thread. 
@@ -90,26 +90,39 @@ __repmgr_elect_thread(argsp) { REPMGR_RUNNABLE *th; ENV *env; + DB_THREAD_INFO *ip; int ret; th = argsp; env = th->env; + ip = NULL; + ret = 0; - RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting election thread")); + ENV_ENTER_RET(env, ip, ret); + if (ret == 0) + RPRINT(env, (env, + DB_VERB_REPMGR_MISC, "starting election thread")); - if ((ret = __repmgr_elect_main(env, th)) != 0) { + if (ret != 0 || (ret = __repmgr_elect_main(env, ip, th)) != 0) { __db_err(env, ret, "election thread failed"); + RPRINT(env, (env, + DB_VERB_REPMGR_MISC, "election thread is exiting")); + ENV_LEAVE(env, ip); (void)__repmgr_thread_failure(env, ret); } - - RPRINT(env, (env, DB_VERB_REPMGR_MISC, "election thread is exiting")); + if (ret == 0) { + RPRINT(env, (env, + DB_VERB_REPMGR_MISC, "election thread is exiting")); + ENV_LEAVE(env, ip); + } th->finished = TRUE; return (NULL); } static int -__repmgr_elect_main(env, th) +__repmgr_elect_main(env, ip, th) ENV *env; + DB_THREAD_INFO *ip; REPMGR_RUNNABLE *th; { DB_REP *db_rep; @@ -123,10 +136,13 @@ __repmgr_elect_main(env, th) db_timespec failtime, now, repstart_time, target, wait_til; db_timeout_t delay_time, response_time, tmp_time; u_long sec, usec; - u_int32_t flags; - int done_repstart, ret, suppress_election; + u_int32_t flags, max_tries, tries; + int client_detected, done_repstart, lsnhist_match, master_detected; + int ret, suppress_election; enum { ELECTION, REPSTART } action; + COMPQUIET(usec, 0); + COMPQUIET(max_tries, 0); COMPQUIET(action, ELECTION); db_rep = env->rep_handle; @@ -181,6 +197,120 @@ __repmgr_elect_main(env, th) UNLOCK_MUTEX(db_rep->mutex); /* + * In preferred master mode, the select thread signals when a + * client has lost its connection to the master via prefmas_pending, + * but the actual restart as temporary master is done here in an + * election thread. 
+ */ + if (IS_PREFMAS_MODE(env) && F_ISSET(rep, REP_F_CLIENT) && + db_rep->prefmas_pending == start_temp_master) { + db_rep->prefmas_pending = no_action; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master restart temp master")); + ret = __repmgr_become_master(env, 0); + goto out; + } + + /* Get preferred master wait limits for detecting the other site. */ + if (IS_PREFMAS_MODE(env) && + (ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0) + goto out; + + /* Preferred master mode master site start-up. */ + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) && + LF_ISSET(ELECT_F_STARTUP)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master site startup")); + client_detected = FALSE; + lsnhist_match = FALSE; + tries = 0; + while (!client_detected && tries < max_tries) { + __os_yield(env, 0, usec); + tries++; + client_detected = __repmgr_prefmas_connected(env); + } + if (client_detected) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master client detected")); + /* + * Restart remote site as a client. Depending on the + * outcome of lsnhist_match below, this site will + * either restart as master or it will start an + * election. In either case, the remote site should + * be running as a client. + * + * Then perform the lsnhist_match comparison. + */ + if ((ret = __repmgr_restart_site_as_client( + env, 1)) != 0 || + (ret = __repmgr_lsnhist_match(env, + ip, 1, &lsnhist_match)) != 0) + goto out; + /* + * An lsnhist_match means that we have a continuous + * set of transactions and it is safe to call a + * comparison election to preserve any temporary master + * transactions that were committed while this site + * was down. + */ + if (lsnhist_match) { + F_CLR(rep, REP_F_HOLD_GEN); + LF_SET(ELECT_F_IMMED); + LF_CLR(ELECT_F_STARTUP); + /* Continue on to election code below. 
*/ + } + } + /* + * If we didn't detect a client within a reasonable time or + * we failed the lsnhist_match (meaning we have conflicting + * sets of transactions), we start this site as a master and + * possibly force rollback of temporary master transactions. + */ + if (!client_detected || !lsnhist_match) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master site start master")); + ret = __repmgr_become_master(env, 0); + F_CLR(rep, REP_F_HOLD_GEN); + goto out; + } + } + + /* Preferred master mode client site start-up. */ + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) && + LF_ISSET(ELECT_F_STARTUP)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master client site startup")); + master_detected = FALSE; + tries = 0; + while (!master_detected && tries < max_tries) { + __os_yield(env, 0, usec); + tries++; + master_detected = __repmgr_prefmas_connected(env); + } + /* + * If we find the master, restart as client here so that we + * send a newclient message after we are connected to the + * master. The master will send a newmaster message so that + * we can start the client sync process. + * + * If we haven't found the master after the timeout, start as + * temporary master. + */ + if (master_detected) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master detected")); + ret = __repmgr_become_client(env); + } else { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "elect_main preferred master client start master")); + ret = __repmgr_become_master(env, 0); + } + goto out; + } + + /* * The 'done_repstart' flag keeps track of which was our most recent * operation (repstart or election), so that we can alternate * appropriately. There are a few different ways this thread can be @@ -188,7 +318,7 @@ __repmgr_elect_main(env, th) * called. The one exception is at initial start-up, where we * first probe for a master by sending out rep_start(CLIENT) calls. 
*/ - if (LF_ISSET(ELECT_F_IMMED)) { + if (LF_ISSET(ELECT_F_IMMED) && !IS_VIEW_SITE(env)) { /* * When the election succeeds, we've successfully completed * everything we need to do. If it fails in an unexpected way, @@ -256,11 +386,13 @@ __repmgr_elect_main(env, th) /* * See if it's time to retry the operation. Normally it's an * election we're interested in retrying. But we refrain from - * calling for elections if so configured. + * calling for elections if so configured or we are a view. */ - suppress_election = LF_ISSET(ELECT_F_STARTUP) ? + suppress_election = IS_VIEW_SITE(env) || + (LF_ISSET(ELECT_F_STARTUP) ? db_rep->init_policy == DB_REP_CLIENT : - !FLD_ISSET(rep->config, REP_C_ELECTIONS); + !FLD_ISSET(rep->config, REP_C_ELECTIONS)) || + LF_ISSET(ELECT_F_CLIENT_RESTART); repstart_time = db_rep->repstart_time; target = suppress_election ? repstart_time : failtime; TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait); @@ -343,7 +475,8 @@ __repmgr_elect_main(env, th) DB_ASSERT(env, action == REPSTART); db_rep->new_connection = FALSE; - if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0) + if ((ret = __repmgr_repstart(env, + DB_REP_CLIENT, 0)) != 0) goto out; done_repstart = TRUE; @@ -476,7 +609,20 @@ __repmgr_elect(env, flags, failtimep) case DB_REP_UNAVAIL: __os_gettime(env, failtimep, 1); DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL); - if ((t_ret = send_membership(env)) != 0) + /* + * If an election fails with DB_REP_UNAVAIL, it could be + * because a participating site has an obsolete, too-high + * notion of the group size. (This could happen if the site + * was down/disconnected during removal of some (other) sites.) + * To remedy this, broadcast a current copy of the membership + * list. Since all sites are doing this, and we always ratchet + * to the most up-to-date version, this should bring all sites + * up to date. 
We only do this after a failure, during what + * will normally be an idle period anyway, so that we don't + * slow down a first election following the loss of an active + * master. + */ + if ((t_ret = __repmgr_bcast_member_list(env)) != 0) ret = t_ret; break; @@ -498,40 +644,6 @@ __repmgr_elect(env, flags, failtimep) } /* - * If an election fails with DB_REP_UNAVAIL, it could be because a participating - * site has an obsolete, too-high notion of the group size. (This could happen - * if the site was down/disconnected during removal of some (other) sites.) To - * remedy this, broadcast a current copy of the membership list. Since all - * sites are doing this, and we always ratchet to the most up-to-date version, - * this should bring all sites up to date. We only do this after a failure, - * during what will normally be an idle period anyway, so that we don't slow - * down a first election following the loss of an active master. - */ -static int -send_membership(env) - ENV *env; -{ - DB_REP *db_rep; - u_int8_t *buf; - size_t len; - int ret; - - db_rep = env->rep_handle; - buf = NULL; - LOCK_MUTEX(db_rep->mutex); - if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) != 0) - goto out; - RPRINT(env, (env, DB_VERB_REPMGR_MISC, - "Broadcast latest membership list")); - ret = __repmgr_bcast_own_msg(env, REPMGR_SHARING, buf, len); -out: - UNLOCK_MUTEX(db_rep->mutex); - if (buf != NULL) - __os_free(env, buf); - return (ret); -} - -/* * Becomes master after we've won an election, if we can. 
* * PUBLIC: int __repmgr_claim_victory __P((ENV *)); @@ -543,7 +655,7 @@ __repmgr_claim_victory(env) int ret; env->rep_handle->takeover_pending = FALSE; - if ((ret = __repmgr_become_master(env)) == DB_REP_UNAVAIL) { + if ((ret = __repmgr_become_master(env, 0)) == DB_REP_UNAVAIL) { ret = 0; RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Won election but lost race with DUPMASTER client intent")); diff --git a/src/repmgr/repmgr_method.c b/src/repmgr/repmgr_method.c index 229cf650..729ba5ff 100644 --- a/src/repmgr/repmgr_method.c +++ b/src/repmgr/repmgr_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -29,19 +29,17 @@ static int get_channel_connection __P((CHANNEL *, REPMGR_CONNECTION **)); static int init_dbsite __P((ENV *, int, const char *, u_int, DB_SITE **)); static int join_group_at_site __P((ENV *, repmgr_netaddr_t *)); static int kick_blockers __P((ENV *, REPMGR_CONNECTION *, void *)); -static int make_request_conn __P((ENV *, - repmgr_netaddr_t *, REPMGR_CONNECTION **)); static int set_local_site __P((DB_SITE *, u_int32_t)); -static int read_own_msg __P((ENV *, - REPMGR_CONNECTION *, u_int32_t *, u_int8_t **, size_t *)); static int refresh_site __P((DB_SITE *)); static int __repmgr_await_threads __P((ENV *)); static int __repmgr_build_data_out __P((ENV *, DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp)); static int __repmgr_build_msg_out __P((ENV *, DBT *, u_int32_t, __repmgr_msg_metadata_args *, REPMGR_IOVECS **iovecsp)); +static int __repmgr_demote_site __P((ENV *, int)); static int repmgr_only __P((ENV *, const char *)); static int __repmgr_restart __P((ENV *, int, u_int32_t)); +static int __repmgr_remove_and_close_site __P((DB_SITE *)); static int __repmgr_remove_site __P((DB_SITE *)); static int __repmgr_remove_site_pp __P((DB_SITE 
*)); static int __repmgr_start_msg_threads __P((ENV *, u_int)); @@ -52,25 +50,21 @@ static int send_msg_self __P((ENV *, REPMGR_IOVECS *, u_int32_t)); static int site_by_addr __P((ENV *, const char *, u_int, DB_SITE **)); /* - * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t)); + * PUBLIC: int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t)); */ int -__repmgr_start(dbenv, nthreads, flags) +__repmgr_start_pp(dbenv, nthreads, flags) DB_ENV *dbenv; int nthreads; u_int32_t flags; { DB_REP *db_rep; - REP *rep; - REPMGR_SITE *me, *site; - DB_THREAD_INFO *ip; ENV *env; - int first, is_listener, locked, min, need_masterseek, ret, start_master; - u_int i, n; + DB_THREAD_INFO *ip; + int ret; env = dbenv->env; db_rep = env->rep_handle; - rep = db_rep->region; switch (flags) { case 0: @@ -102,7 +96,27 @@ __repmgr_start(dbenv, nthreads, flags) return (EINVAL); } - /* Check if it is a shut-down site, if so, clean the resources. */ + /* A view site cannot be started as MASTER or ELECTION. */ + if (IS_VIEW_SITE(env) && + (flags == DB_REP_MASTER || flags == DB_REP_ELECTION)) { + __db_errx(env, DB_STR("3694", + "A view site must be started with DB_REP_CLIENT")); + return (EINVAL); + } + + /* Must start site as client in preferred master mode. */ + if (PREFMAS_IS_SET(env) && + (flags == DB_REP_MASTER || flags == DB_REP_ELECTION)) { + __db_errx(env, DB_STR("3702", + "A preferred master site must be started with " + "DB_REP_CLIENT")); + return (EINVAL); + } + + /* + * Check if it is a shut-down site, if so, clean the resources and + * reset the status in order to get ready to start replication. + */ if (db_rep->repmgr_status == stopped) { if ((ret = __repmgr_stop(env)) != 0) { __db_errx(env, DB_STR("3638", @@ -112,7 +126,55 @@ __repmgr_start(dbenv, nthreads, flags) db_rep->repmgr_status = ready; } + /* Record the original configurations given by application. 
*/ + ENV_ENTER(env, ip); db_rep->init_policy = flags; + db_rep->config_nthreads = nthreads; + ret = __repmgr_start_int(env, nthreads, flags); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * Internal processing to start replication manager. + * + * PUBLIC: int __repmgr_start_int __P((ENV *, int, u_int32_t)); + */ +int +__repmgr_start_int(env, nthreads, flags) + ENV *env; + int nthreads; + u_int32_t flags; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + REPMGR_SITE *me, *site; + u_int32_t startopts; + int first, flags_error, is_listener, locked, min; + int need_masterseek, ret, start_master; + u_int i, n; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + flags_error = 0; + startopts = 0; + + /* + * For preferred master master site startup, we need to save the + * log location at the end of our previous transactions for + * the lsnhist_match comparisons. Starting repmgr adds a few + * more log records that we don't want to count in lsnhist_match. + */ + if (FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) { + LOG_SYSTEM_LOCK(env); + db_rep->prefmas_init_lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + } + if ((ret = __rep_set_transport_int(env, db_rep->self_eid, __repmgr_send)) != 0) return (ret); @@ -128,7 +190,8 @@ __repmgr_start(dbenv, nthreads, flags) if (db_rep->restored_list != NULL) { ret = __repmgr_refresh_membership(env, - db_rep->restored_list, db_rep->restored_list_length); + db_rep->restored_list, db_rep->restored_list_length, + DB_REPMGR_VERSION); __os_free(env, db_rep->restored_list); db_rep->restored_list = NULL; } else { @@ -145,9 +208,15 @@ __repmgr_start(dbenv, nthreads, flags) * join. 
*/ ret = __repmgr_join_group(env); + else if (VIEW_TO_PARTICIPANT(db_rep, me)) { + __db_errx(env, DB_STR("3695", + "A view site must be started with a view callback")); + return (EINVAL); + } } else if (ret == ENOENT) { - ENV_ENTER(env, ip); - if (FLD_ISSET(me->config, DB_GROUP_CREATOR)) + if (FLD_ISSET(me->config, DB_GROUP_CREATOR) || + (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER))) start_master = TRUE; /* * LEGACY is inconsistent with CREATOR, but start_master @@ -166,10 +235,12 @@ __repmgr_start(dbenv, nthreads, flags) continue; if ((ret = __repmgr_set_membership(env, site->net_addr.host, - site->net_addr.port, - SITE_PRESENT)) != 0) + site->net_addr.port, SITE_PRESENT, + site->gmdb_flags)) != 0) break; - n++; + if (!FLD_ISSET(site->gmdb_flags, + SITE_VIEW)) + n++; } ret = __rep_set_nsites_int(env, n); DB_ASSERT(env, ret == 0); @@ -180,30 +251,27 @@ __repmgr_start(dbenv, nthreads, flags) db_rep->member_version_gen = 1; if ((ret = __repmgr_set_membership(env, me->net_addr.host, me->net_addr.port, - SITE_PRESENT)) == 0) { + SITE_PRESENT, 0)) == 0) { ret = __rep_set_nsites_int(env, 1); DB_ASSERT(env, ret == 0); } UNLOCK_MUTEX(db_rep->mutex); } else ret = __repmgr_join_group(env); - ENV_LEAVE(env, ip); } else if (ret == DB_DELETED) ret = DB_REP_UNAVAIL; } if (ret != 0) return (ret); - DB_ASSERT(env, start_master || - SITE_FROM_EID(db_rep->self_eid)->membership == SITE_PRESENT); - /* - * If we're the first repmgr_start() call, we will have to start threads. - * Therefore, we require a flags value (to tell us how). + * Catch case where user defines a different local site address than + * the one in the restored_list from an ongoing internal init. 
*/ - if (db_rep->repmgr_status != running && flags == 0) { - __db_errx(env, DB_STR("3639", - "a non-zero flags value is required for initial repmgr_start() call")); + if (!start_master && + SITE_FROM_EID(db_rep->self_eid)->membership != SITE_PRESENT) { + __db_errx(env, DB_STR("3696", + "Current local site conflicts with earlier definition")); return (EINVAL); } @@ -214,37 +282,54 @@ __repmgr_start(dbenv, nthreads, flags) * * Then, in case there could be multiple processes, we're either the * main listener process or a subordinate process. On a "subsequent" - * repmgr_start() call we already have enough information to know which - * it is. Otherwise, negotiate with information in the shared region to - * claim the listener role if possible. + * repmgr_start() call, with a running main listener process, we already + * have enough information to know which it is. Otherwise, if there is + * no listener, negotiate with information in the shared region to claim + * the listener role if possible. Once we decide we're the listener, + * mark the listener id in the shared region, so that no other process + * thinks the same thing. * * To avoid a race, once we decide we're in the first call, mark the * handle as started, so that no other thread thinks the same thing. 
*/ + first = FALSE; + is_listener = FALSE; LOCK_MUTEX(db_rep->mutex); locked = TRUE; - if (db_rep->repmgr_status == running) { - first = FALSE; + if (db_rep->repmgr_status == running && !(rep->listener == 0 && + FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER))) is_listener = !IS_SUBORDINATE(db_rep); - } else { + else if (db_rep->repmgr_status != running && + rep->listener == 0 && flags == 0) + flags_error = 1; + else { first = TRUE; db_rep->repmgr_status = running; - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_repmgr); if (rep->listener == 0) { is_listener = TRUE; - __os_id(dbenv, &rep->listener, NULL); - } else { - is_listener = FALSE; + __os_id(env->dbenv, &rep->listener, NULL); + } else nthreads = 0; - } MUTEX_UNLOCK(env, rep->mtx_repmgr); - ENV_LEAVE(env, ip); } UNLOCK_MUTEX(db_rep->mutex); locked = FALSE; + /* + * The first repmgr_start() call for the main listener process + * requires a flags value to tell us how to start up the site. + * But we don't require a flags value for the repmgr_start() + * call for a subordinate process because the site is already + * started and we would only ignore the value anyway. + */ + if (flags_error) { + __db_errx(env, DB_STR("3639", + "A non-zero flags value is required for initial repmgr_start() call")); + return (EINVAL); + } + if (!first) { /* * Subsequent call is allowed when ELECTIONS are turned off, so @@ -266,7 +351,7 @@ __repmgr_start(dbenv, nthreads, flags) /* * The minimum legal number of threads is either 1 or 0, depending upon - * whether we're the main process or a subordinate. + * whether we're the listener process or a subordinate. */ min = is_listener ? 1 : 0; if (nthreads < min) { @@ -303,14 +388,24 @@ __repmgr_start(dbenv, nthreads, flags) * of rep_start calls even within an env region lifetime. */ if (start_master) { - ret = __repmgr_become_master(env); + ret = __repmgr_become_master(env, 0); /* No other repmgr threads running yet. 
*/ DB_ASSERT(env, ret != DB_REP_UNAVAIL); if (ret != 0) goto err; need_masterseek = FALSE; } else { - if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0) + /* + * The preferred master site cannot allow its gen + * to change until it has done its lsnhist_match to + * guarantee that no preferred master transactions + * will be rolled back. + */ + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) + startopts = REP_START_HOLD_CLIGEN; + if ((ret = __repmgr_repstart(env, + DB_REP_CLIENT, startopts)) != 0) goto err; /* * The repmgr election code starts elections only if @@ -352,6 +447,7 @@ __repmgr_start(dbenv, nthreads, flags) if ((ret = __repmgr_start_msg_threads(env, (u_int)nthreads)) != 0) goto err; + rep->listener_nthreads = (u_int)nthreads; if (need_masterseek) { /* @@ -374,10 +470,47 @@ __repmgr_start(dbenv, nthreads, flags) } UNLOCK_MUTEX(db_rep->mutex); locked = FALSE; + /* + * Turn on the DB_EVENT_REP_INQUEUE_FULL event firing. We only + * do this for the main listener process. For a subordinate + * process, it is always turned on. + */ + rep->inqueue_full_event_on = 1; + } + if (db_rep->selector == NULL) { + /* All processes (even non-listeners) need a select() thread. */ + if ((ret = __repmgr_start_selector(env)) == 0) { + /* + * A view callback is set but this site isn't yet a + * view in the internal site list. Do the view + * demotion here, which will update the internal + * site list. We need the select() thread for the + * demotion because the demotion performs gmdb + * operations. + */ + if (PARTICIPANT_TO_VIEW(db_rep, + SITE_FROM_EID(db_rep->self_eid)) && + (ret = __repmgr_demote_site(env, + db_rep->self_eid)) != 0) + goto err; + return (is_listener ? 0 : DB_REP_IGNORE); + } + } else { + /* + * If the selector thread already exists, the current process + * should be the new listener which has just finished a + * takeover. Now, all active connections need to be refreshed + * to notify remote sites about the new listener. 
If a new + * connection is established immediately, disable the existing + * main connection to the same site. Otherwise, schedule a + * second immediate attempt. If it still fails, disable the + * main connection and retry a connection as usual. + */ + DB_ASSERT(env, is_listener && + FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER)); + if ((ret = __repmgr_refresh_selector(env)) == 0) + return (0); } - /* All processes (even non-listeners) need a select() thread. */ - if ((ret = __repmgr_start_selector(env)) == 0) - return (is_listener ? 0 : DB_REP_IGNORE); err: /* If we couldn't succeed at everything, undo the parts we did do. */ @@ -392,6 +525,16 @@ err: if (!locked) LOCK_MUTEX(db_rep->mutex); (void)__repmgr_net_close(env); + /* Reset the listener when we fail before having a valid listen_fd. */ + if (first && is_listener) + rep->listener = 0; + /* + * Reset repmgr_status when we fail before starting a selector if the + * earlier call to __repmgr_stop_threads() hasn't already reset it to + * stopped. + */ + if (db_rep->repmgr_status == running) + db_rep->repmgr_status = ready; UNLOCK_MUTEX(db_rep->mutex); return (ret); } @@ -425,6 +568,53 @@ __repmgr_valid_config(env, flags) } /* + * Set priority, heartbeat and election_retry timeouts for preferred master + * mode. Turn on 2SITE_STRICT and ELECTIONS. Can be called whether or not + * REP_ON() is true + * + * PUBLIC: int __repmgr_prefmas_auto_config __P((DB_ENV *, u_int32_t *)); + */ +int __repmgr_prefmas_auto_config (dbenv, config_flags) + DB_ENV *dbenv; + u_int32_t *config_flags; +{ + ENV * env; + db_timeout_t timeout; + int ret; + + env = dbenv->env; + timeout = 0; + + /* Change heartbeat timeouts if they are not already set. 
*/ + if ((ret = __rep_get_timeout(dbenv, + DB_REP_HEARTBEAT_MONITOR, &timeout)) == 0 && + timeout == 0 && (ret = __rep_set_timeout_int(env, + DB_REP_HEARTBEAT_MONITOR, + DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR)) != 0) + return (ret); + if ((ret = __rep_get_timeout(dbenv, + DB_REP_HEARTBEAT_SEND, &timeout)) == 0 && + timeout == 0 && (ret = __rep_set_timeout_int(env, + DB_REP_HEARTBEAT_SEND, DB_REPMGR_PREFMAS_HEARTBEAT_SEND)) != 0) + return (ret); + + /* Change election_retry timeout if it is still the default value. */ + if ((ret = __rep_get_timeout(dbenv, + DB_REP_ELECTION_RETRY, &timeout)) == 0 && + timeout == DB_REPMGR_DEFAULT_ELECTION_RETRY && + (ret = __rep_set_timeout_int(env, + DB_REP_ELECTION_RETRY, DB_REPMGR_PREFMAS_ELECTION_RETRY)) != 0) + return (ret); + + if ((ret = __rep_set_priority_int(env, FLD_ISSET(*config_flags, + REP_C_PREFMAS_MASTER) ? DB_REPMGR_PREFMAS_PRIORITY_MASTER : + DB_REPMGR_PREFMAS_PRIORITY_CLIENT)) != 0) + return (ret); + FLD_SET(*config_flags, REP_C_ELECTIONS | REP_C_2SITE_STRICT); + return (0); +} + +/* * Starts message processing threads. On entry, the actual number of threads * already active is db_rep->nthreads; the desired number of threads is passed * as "n". 
@@ -473,7 +663,7 @@ __repmgr_restart(env, nthreads, flags) REP *rep; REPMGR_RUNNABLE **th; u_int32_t cur_repflags; - int locked, ret, t_ret; + int locked, ret, role_change, t_ret; u_int delta, i, min, nth; th = NULL; @@ -491,6 +681,7 @@ __repmgr_restart(env, nthreads, flags) } ret = 0; + role_change = 0; db_rep = env->rep_handle; DB_ASSERT(env, REP_ON(env)); rep = db_rep->region; @@ -498,11 +689,14 @@ __repmgr_restart(env, nthreads, flags) cur_repflags = F_ISSET(rep, REP_F_MASTER | REP_F_CLIENT); DB_ASSERT(env, cur_repflags); if (FLD_ISSET(cur_repflags, REP_F_MASTER) && - flags == DB_REP_CLIENT) + flags == DB_REP_CLIENT) { ret = __repmgr_become_client(env); - else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) && - flags == DB_REP_MASTER) - ret = __repmgr_become_master(env); + role_change = 1; + } else if (FLD_ISSET(cur_repflags, REP_F_CLIENT) && + flags == DB_REP_MASTER) { + ret = __repmgr_become_master(env, 0); + role_change = 1; + } if (ret != 0) return (ret); @@ -574,6 +768,9 @@ __repmgr_restart(env, nthreads, flags) } __os_free(env, th); } + /* We will always turn on the inqueue full event after role change. */ + if (role_change) + rep->inqueue_full_event_on = 1; out: if (locked) UNLOCK_MUTEX(db_rep->mutex); @@ -668,7 +865,8 @@ __repmgr_start_selector(env) * PUBLIC: int __repmgr_close __P((ENV *)); * * Close repmgr during env close. It stops repmgr, frees sites array and - * its addresses. + * its addresses. Note that it is possible for the sites array to exist + * and require deallocation independently of whether repmgr was started. */ int __repmgr_close(env) @@ -679,10 +877,15 @@ __repmgr_close(env) int ret; u_int i; - db_rep = env->rep_handle; + if ((db_rep = env->rep_handle) == NULL) + return (0); ret = 0; - ret = __repmgr_stop(env); + /* Stop repmgr and all of its threads if it was previously started. */ + if (IS_ENV_REPLICATED(env)) + ret = __repmgr_stop(env); + + /* Clean up sites array regardless of whether we could stop repmgr. 
*/ if (db_rep->sites != NULL) { for (i = 0; i < db_rep->site_cnt; i++) { site = &db_rep->sites[i]; @@ -756,9 +959,9 @@ __repmgr_set_ack_policy(dbenv, policy) DB_ENV *dbenv; int policy; { + ENV *env; DB_REP *db_rep; DB_THREAD_INFO *ip; - ENV *env; REP *rep; int ret; @@ -823,6 +1026,208 @@ __repmgr_get_ack_policy(dbenv, policy) } /* + * PUBLIC: int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, + * PUBLIC: u_int32_t)); + * + * Sets the maximum amount of dynamic memory used by the Replication Manager + * incoming queue. + */ +int +__repmgr_set_incoming_queue_max(dbenv, gbytes, bytes) + DB_ENV *dbenv; + u_int32_t gbytes, bytes; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->repmgr_set_incoming_queue_max", + DB_INIT_REP); + + if (APP_IS_BASEAPI(env)) { + __db_errx(env, "%s %s", + "DB_ENV->repmgr_set_incoming_queue_max:", + "cannot call from base replication application"); + return (EINVAL); + } + + /* + * If the caller provided 0 for the size, the size will be unlimited. + */ + if (gbytes == 0 && bytes == 0) { + gbytes = UINT32_MAX; + bytes = GIGABYTE - 1; + } + + while (bytes >= GIGABYTE) { + bytes -= GIGABYTE; + if (gbytes < UINT32_MAX) + gbytes++; + } + + if (REP_ON(env)) { + ENV_ENTER(env, ip); + MUTEX_LOCK(env, rep->mtx_repmgr); + rep->inqueue_max_gbytes = gbytes; + rep->inqueue_max_bytes = bytes; + __repmgr_set_incoming_queue_redzone(rep, gbytes, bytes); + MUTEX_UNLOCK(env, rep->mtx_repmgr); + ENV_LEAVE(env, ip); + } else { + db_rep->inqueue_max_gbytes = gbytes; + db_rep->inqueue_max_bytes = bytes; + } + + /* + * Setting incoming queue maximum sizes makes this a replication + * manager application. 
+ */ + APP_SET_REPMGR(env); + return (0); +} + +/* + * PUBLIC: int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, + * PUBLIC: u_int32_t *)); + * + * Gets the maximum amount of dynamic memory that can be used by the + * Replication Manager incoming queue. + */ +int +__repmgr_get_incoming_queue_max(dbenv, gbytesp, bytesp) + DB_ENV *dbenv; + u_int32_t *gbytesp, *bytesp; +{ + ENV *env; + DB_THREAD_INFO *ip; + DB_REP *db_rep; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + + if (REP_ON(env)) { + ENV_ENTER(env, ip); + MUTEX_LOCK(env, rep->mtx_repmgr); + *gbytesp = rep->inqueue_max_gbytes; + *bytesp = rep->inqueue_max_bytes; + MUTEX_UNLOCK(env, rep->mtx_repmgr); + ENV_LEAVE(env, ip); + } else { + *gbytesp = db_rep->inqueue_max_gbytes; + *bytesp = db_rep->inqueue_max_bytes; + } + + return (0); +} + +/* + * PUBLIC: void __repmgr_set_incoming_queue_redzone __P((void *, u_int32_t, + * PUBLIC: u_int32_t)); + * + * Sets the lower bound of the repmgr incoming queue red zone. + * !!! Assumes caller holds mtx_repmgr lock. + * + * Note that we can't simply get the REP* address from the env as we usually do, + * because at the time of this call it may not have been linked into there yet. + * Also note that, REP is not a public structure, so we use "void *" here. + */ +void __repmgr_set_incoming_queue_redzone(rep_, gbytes, bytes) + void *rep_; + u_int32_t gbytes, bytes; +{ + REP *rep; + double rdgbytes, rdbytes; + + rep = rep_; + + /* + * We use 'double' values to do the computation for precision, and + * to avoid overflow. 
+ */ + rdgbytes = gbytes * 1.00 * DB_REPMGR_INQUEUE_REDZONE_PERCENT / 100.00; + rdbytes = (rdgbytes - (u_int32_t)rdgbytes) * GIGABYTE; + rdbytes += bytes * 1.00 * DB_REPMGR_INQUEUE_REDZONE_PERCENT / 100.00; + if (rdbytes >= GIGABYTE) { + rdgbytes += 1; + rdbytes -= GIGABYTE; + } + rep->inqueue_rz_gbytes = (u_int32_t)rdgbytes; + rep->inqueue_rz_bytes = (u_int32_t)rdbytes; +} + +/* + * PUBLIC: int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, + * PUBLIC: u_int32_t *, u_int32_t *)); + * + * Gets the lower bound of the repmgr incoming queue red zone. + * This method must be called after environment open. + */ +int __repmgr_get_incoming_queue_redzone(dbenv, gbytesp, bytesp) + DB_ENV *dbenv; + u_int32_t *gbytesp, *bytesp; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + + ENV_REQUIRES_CONFIG( + env, db_rep->region, "__repmgr_get_incoming_queue_redzone", + DB_INIT_REP); + + ENV_ENTER(env, ip); + MUTEX_LOCK(env, rep->mtx_repmgr); + *gbytesp = rep->inqueue_rz_gbytes; + *bytesp = rep->inqueue_rz_bytes; + MUTEX_UNLOCK(env, rep->mtx_repmgr); + ENV_LEAVE(env, ip); + + return (0); +} + +/* + * PUBLIC: int __repmgr_get_incoming_queue_fullevent __P((DB_ENV *, + * PUBLIC: int *)); + * + * Return whether the DB_EVENT_REP_INQUEUE_FULL event firing is + * turned on or off. + * This method must be called after environment open. + */ +int __repmgr_get_incoming_queue_fullevent(dbenv, onoffp) + DB_ENV *dbenv; + int *onoffp; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + + ENV_REQUIRES_CONFIG( + env, db_rep->region, + "DB_ENV->__repmgr_get_incoming_queue_fullevent", + DB_INIT_REP); + + *onoffp = rep->inqueue_full_event_on ? 
1 : 0; + + return (0); +} + +/* * PUBLIC: int __repmgr_env_create __P((ENV *, DB_REP *)); */ int @@ -837,7 +1242,13 @@ __repmgr_env_create(env, db_rep) db_rep->connection_retry_wait = DB_REPMGR_DEFAULT_CONNECTION_RETRY; db_rep->election_retry_wait = DB_REPMGR_DEFAULT_ELECTION_RETRY; db_rep->config_nsites = 0; + ADJUST_AUTOTAKEOVER_WAITS(db_rep, DB_REPMGR_DEFAULT_ACK_TIMEOUT); db_rep->perm_policy = DB_REPMGR_ACKS_QUORUM; + db_rep->inqueue_max_gbytes = 0; + db_rep->inqueue_max_bytes = 0; +#ifdef HAVE_REPLICATION_LISTENER_TAKEOVER + FLD_SET(db_rep->config, REP_C_AUTOTAKEOVER); +#endif FLD_SET(db_rep->config, REP_C_ELECTIONS); FLD_SET(db_rep->config, REP_C_2SITE_STRICT); @@ -846,7 +1257,8 @@ __repmgr_env_create(env, db_rep) TAILQ_INIT(&db_rep->connections); TAILQ_INIT(&db_rep->retries); - db_rep->input_queue.size = 0; + db_rep->input_queue.gbytes = 0; + db_rep->input_queue.bytes = 0; STAILQ_INIT(&db_rep->input_queue.header); __repmgr_env_create_pf(db_rep); @@ -944,6 +1356,15 @@ __repmgr_await_threads(env) * of a connector thread. */ + /* Takeover thread. */ + if (db_rep->takeover_thread != NULL) { + if ((t_ret = __repmgr_thread_join(db_rep->takeover_thread)) != + 0 && ret == 0) + ret = t_ret; + __os_free(env, db_rep->takeover_thread); + db_rep->takeover_thread = NULL; + } + /* Message processing threads. 
*/ for (i = 0; i < db_rep->nthreads && db_rep->messengers[i] != NULL; i++) { @@ -1178,7 +1599,7 @@ get_shared_netaddr(env, eid, netaddr) MUTEX_LOCK(env, rep->mtx_repmgr); if ((u_int)eid >= rep->site_cnt) { - ret = DB_NOTFOUND; + ret = USR_ERR(env, DB_NOTFOUND); goto err; } DB_ASSERT(env, rep->siteinfo_off != INVALID_ROFF); @@ -1423,7 +1844,7 @@ send_msg_self(env, iovecs, nmsg) u_int32_t nmsg; { REPMGR_MESSAGE *msg; - size_t align, bodysize, structsize; + size_t align, bodysize, msgsize, structsize; u_int8_t *membase; int ret; @@ -1431,10 +1852,12 @@ send_msg_self(env, iovecs, nmsg) bodysize = iovecs->total_bytes - __REPMGR_MSG_HDR_SIZE; structsize = (size_t)DB_ALIGN((size_t)(sizeof(REPMGR_MESSAGE) + nmsg * sizeof(DBT)), align); - if ((ret = __os_malloc(env, structsize + bodysize, &membase)) != 0) + msgsize = structsize + bodysize; + if ((ret = __os_malloc(env, msgsize, &membase)) != 0) return (ret); msg = (void*)membase; + msg->size = msgsize; membase += structsize; /* @@ -1616,13 +2039,14 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags) } ENV_ENTER(env, ip); - ret = get_channel_connection(channel, &conn); - ENV_LEAVE(env, ip); - if (ret != 0) - return (ret); + if ((ret = get_channel_connection(channel, &conn)) != 0) + goto out; - if (conn == NULL) - return (request_self(env, request, nrequest, response, flags)); + /* If conn is NULL, call request_self and then we are done here. */ + if (conn == NULL) { + ret = request_self(env, request, nrequest, response, flags); + goto out; + } /* Find an available array slot, or grow the array if necessary. */ LOCK_MUTEX(db_rep->mutex); @@ -1670,7 +2094,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags) LOCK_MUTEX(db_rep->mutex); F_CLR(&conn->responses[i], RESP_IN_USE | RESP_THREAD_WAITING); UNLOCK_MUTEX(db_rep->mutex); - return (ret); + goto out; } timeout = timeout > 0 ? 
timeout : db_channel->timeout; @@ -1688,7 +2112,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags) * to wake up those threads, with a COMPLETE indication and an * error code. That's more than we want to tackle here. */ - return (ret); + goto out; } /* @@ -1732,7 +2156,7 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags) sz = conn->iovecs.vectors[0].iov_len; if ((ret = __os_malloc(env, sz, &dummy)) != 0) - goto out; + goto out_unlck; __repmgr_iovec_init(&conn->iovecs); DB_INIT_DBT(resp->dbt, dummy, sz); __repmgr_add_dbt(&conn->iovecs, &resp->dbt); @@ -1740,8 +2164,9 @@ __repmgr_send_request(db_channel, request, nrequest, response, timeout, flags) } } -out: +out_unlck: UNLOCK_MUTEX(db_rep->mutex); +out: ENV_LEAVE(env, ip); return (ret); } @@ -2168,6 +2593,7 @@ __repmgr_channel_close(dbchan, flags) { ENV *env; DB_REP *db_rep; + DB_THREAD_INFO *ip; REPMGR_CONNECTION *conn; CHANNEL *channel; u_int32_t i; @@ -2182,6 +2608,7 @@ __repmgr_channel_close(dbchan, flags) * Disable connection(s) (if not already done due to an error having * occurred previously); release our reference to conn struct(s). 
*/ + ENV_ENTER(env, ip); LOCK_MUTEX(db_rep->mutex); if (dbchan->eid >= 0) { conn = channel->c.conn; @@ -2218,6 +2645,7 @@ __repmgr_channel_close(dbchan, flags) __os_free(env, channel); __os_free(env, dbchan); + ENV_LEAVE(env, ip); return (ret); } @@ -2369,29 +2797,26 @@ join_group_at_site(env, addrp) repmgr_netaddr_t *addrp; { DB_REP *db_rep; + REP *rep; REPMGR_CONNECTION *conn; SITE_STRING_BUFFER addr_buf; repmgr_netaddr_t addr, myaddr; __repmgr_gm_fwd_args fwd; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; u_int8_t *p, *response_buf, siteinfo_buf[MAX_MSG_BUF]; char host_buf[MAXHOSTNAMELEN + 1], *host; u_int32_t gen, type; - size_t len; + size_t host_len, msg_len, req_len; int ret, t_ret; db_rep = env->rep_handle; + rep = db_rep->region; LOCK_MUTEX(db_rep->mutex); myaddr = SITE_FROM_EID(db_rep->self_eid)->net_addr; UNLOCK_MUTEX(db_rep->mutex); - len = strlen(myaddr.host) + 1; - DB_INIT_DBT(site_info.host, myaddr.host, len); - site_info.port = myaddr.port; - site_info.flags = 0; - ret = __repmgr_site_info_marshal(env, - &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len); - DB_ASSERT(env, ret == 0); + host_len = strlen(myaddr.host) + 1; conn = NULL; response_buf = NULL; @@ -2399,14 +2824,35 @@ join_group_at_site(env, addrp) RPRINT(env, (env, DB_VERB_REPMGR_MISC, "try join request to site %s", __repmgr_format_addr_loc(addrp, addr_buf))); retry: - if ((ret = make_request_conn(env, addrp, &conn)) != 0) + if ((ret = __repmgr_make_request_conn(env, addrp, &conn)) != 0) return (ret); + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + if (conn->version < 5) { + DB_INIT_DBT(v4site_info.host, myaddr.host, host_len); + v4site_info.port = myaddr.port; + v4site_info.flags = 0; + ret = __repmgr_v4site_info_marshal(env, + &v4site_info, siteinfo_buf, sizeof(siteinfo_buf), &req_len); + } else { + DB_INIT_DBT(site_info.host, myaddr.host, host_len); + site_info.port = myaddr.port; + site_info.status = 0; + site_info.flags = 
0; + if (IS_VIEW_SITE(env)) + FLD_SET(site_info.flags, SITE_VIEW); + if (rep->priority > 0) + FLD_SET(site_info.flags, SITE_JOIN_ELECTABLE); + ret = __repmgr_site_info_marshal(env, + &site_info, siteinfo_buf, sizeof(siteinfo_buf), &req_len); + } + DB_ASSERT(env, ret == 0); + /* Preserve separate request length in case there is a retry. */ if ((ret = __repmgr_send_sync_msg(env, conn, - REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0) + REPMGR_JOIN_REQUEST, siteinfo_buf, (u_int32_t)req_len)) != 0) goto err; - if ((ret = read_own_msg(env, - conn, &type, &response_buf, &len)) != 0) + if ((ret = __repmgr_read_own_msg(env, + conn, &type, &response_buf, &msg_len)) != 0) goto err; if (type == REPMGR_GM_FAILURE) { @@ -2429,7 +2875,7 @@ retry: goto err; ret = __repmgr_gm_fwd_unmarshal(env, &fwd, - response_buf, len, &p); + response_buf, msg_len, &p); DB_ASSERT(env, ret == 0); if (fwd.gen > gen) { if (fwd.host.size > MAXHOSTNAMELEN + 1) { @@ -2456,7 +2902,8 @@ retry: } } if (type == REPMGR_JOIN_SUCCESS) - ret = __repmgr_refresh_membership(env, response_buf, len); + ret = __repmgr_refresh_membership(env, response_buf, msg_len, + conn->version); else ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */ @@ -2476,129 +2923,6 @@ err: } /* - * Reads a whole message, when we expect to get a REPMGR_OWN_MSG. - */ -static int -read_own_msg(env, conn, typep, bufp, lenp) - ENV *env; - REPMGR_CONNECTION *conn; - u_int32_t *typep; - u_int8_t **bufp; - size_t *lenp; -{ - __repmgr_msg_hdr_args msg_hdr; - u_int8_t *buf; - u_int32_t type; - size_t size; - int ret; - - __repmgr_reset_for_reading(conn); - if ((ret = __repmgr_read_conn(conn)) != 0) - goto err; - ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr, - conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL); - DB_ASSERT(env, ret == 0); - - if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) { - ret = DB_REP_UNAVAIL; /* Protocol violation. 
*/ - goto err; - } - type = REPMGR_OWN_MSG_TYPE(msg_hdr); - if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) { - conn->reading_phase = DATA_PHASE; - __repmgr_iovec_init(&conn->iovecs); - - if ((ret = __os_malloc(env, size, &buf)) != 0) - goto err; - conn->input.rep_message = NULL; - - __repmgr_add_buffer(&conn->iovecs, buf, size); - if ((ret = __repmgr_read_conn(conn)) != 0) { - __os_free(env, buf); - goto err; - } - *bufp = buf; - } - - *typep = type; - *lenp = size; - -err: - return (ret); -} - -static int -make_request_conn(env, addr, connp) - ENV *env; - repmgr_netaddr_t *addr; - REPMGR_CONNECTION **connp; -{ - DBT vi; - __repmgr_msg_hdr_args msg_hdr; - __repmgr_version_confirmation_args conf; - REPMGR_CONNECTION *conn; - int alloc, ret, unused; - - alloc = FALSE; - if ((ret = __repmgr_connect(env, addr, &conn, &unused)) != 0) - return (ret); - conn->type = APP_CONNECTION; - - /* Read a handshake msg, to get version confirmation and parameters. */ - if ((ret = __repmgr_read_conn(conn)) != 0) - goto err; - /* - * We can only get here after having read the full 9 bytes that we - * expect, so this can't fail. - */ - DB_ASSERT(env, conn->reading_phase == SIZES_PHASE); - ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr, - conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL); - DB_ASSERT(env, ret == 0); - __repmgr_iovec_init(&conn->iovecs); - conn->reading_phase = DATA_PHASE; - - if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0) - goto err; - alloc = TRUE; - - if ((ret = __repmgr_read_conn(conn)) != 0) - goto err; - - /* - * Analyze the handshake msg, and stash relevant info. 
- */ - if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0) - goto err; - DB_ASSERT(env, vi.size > 0); - if ((ret = __repmgr_version_confirmation_unmarshal(env, - &conf, vi.data, vi.size, NULL)) != 0) - goto err; - - if (conf.version < GM_MIN_VERSION) { - ret = DB_REP_UNAVAIL; - goto err; - } - conn->version = conf.version; - -err: - if (alloc) { - DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0); - __os_free(env, conn->input.repmgr_msg.cntrl.data); - DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0); - __os_free(env, conn->input.repmgr_msg.rec.data); - } - __repmgr_reset_for_reading(conn); - if (ret == 0) - *connp = conn; - else { - (void)__repmgr_close_connection(env, conn); - (void)__repmgr_destroy_conn(env, conn); - } - return (ret); -} - -/* * PUBLIC: int __repmgr_site __P((DB_ENV *, * PUBLIC: const char *, u_int, DB_SITE **, u_int32_t)); */ @@ -2640,9 +2964,9 @@ site_by_addr(env, host, port, sitep) if ((ret = addr_chk(env, host, port)) != 0) return (ret); + ENV_ENTER(env, ip); if (REP_ON(env)) { LOCK_MUTEX(db_rep->mutex); - ENV_ENTER(env, ip); locked = TRUE; } else locked = FALSE; @@ -2654,10 +2978,9 @@ site_by_addr(env, host, port, sitep) * we want the DB_SITE handle to point to; just like site_by_eid() does. 
*/ host = site->net_addr.host; - if (locked) { - ENV_LEAVE(env, ip); + if (locked) UNLOCK_MUTEX(db_rep->mutex); - } + ENV_LEAVE(env, ip); if (ret != 0) return (ret); @@ -2723,7 +3046,7 @@ init_dbsite(env, eid, host, port, sitep) dbsite->get_address = __repmgr_get_site_address; dbsite->get_config = __repmgr_get_config; dbsite->get_eid = __repmgr_get_eid; - dbsite->set_config = __repmgr_site_config; + dbsite->set_config = __repmgr_site_config_pp; dbsite->remove = __repmgr_remove_site_pp; dbsite->close = __repmgr_site_close; @@ -2756,9 +3079,16 @@ __repmgr_get_eid(dbsite, eidp) DB_SITE *dbsite; int *eidp; { + DB_THREAD_INFO *ip; + ENV *env; int ret; - if ((ret = refresh_site(dbsite)) != 0) + env = dbsite->env; + + ENV_ENTER(env, ip); + ret = refresh_site(dbsite); + ENV_LEAVE(env, ip); + if (ret != 0) return (ret); if (F_ISSET(dbsite, DB_SITE_PREOPEN)) { @@ -2791,8 +3121,11 @@ __repmgr_get_config(dbsite, which, valuep) env = dbsite->env; db_rep = env->rep_handle; - if ((ret = refresh_site(dbsite)) != 0) + ENV_ENTER(env, ip); + if ((ret = refresh_site(dbsite)) != 0) { + ENV_LEAVE(env, ip); return (ret); + } LOCK_MUTEX(db_rep->mutex); DB_ASSERT(env, IS_VALID_EID(dbsite->eid)); site = SITE_FROM_EID(dbsite->eid); @@ -2800,32 +3133,52 @@ __repmgr_get_config(dbsite, which, valuep) rep = db_rep->region; infop = env->reginfo; - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_repmgr); sites = R_ADDR(infop, rep->siteinfo_off); site->config = sites[dbsite->eid].config; MUTEX_UNLOCK(env, rep->mtx_repmgr); - ENV_LEAVE(env, ip); } *valuep = FLD_ISSET(site->config, which) ? 
1 : 0; UNLOCK_MUTEX(db_rep->mutex); + ENV_LEAVE(env, ip); return (0); } /* - * PUBLIC: int __repmgr_site_config __P((DB_SITE *, u_int32_t, u_int32_t)); + * PUBLIC: int __repmgr_site_config_pp __P((DB_SITE *, u_int32_t, u_int32_t)); */ int -__repmgr_site_config(dbsite, which, value) +__repmgr_site_config_pp(dbsite, which, value) DB_SITE *dbsite; u_int32_t which; u_int32_t value; { - DB_REP *db_rep; DB_THREAD_INFO *ip; ENV *env; + int ret; + + env = dbsite->env; + + ENV_ENTER(env, ip); + ret = __repmgr_site_config_int(dbsite, which, value); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * PUBLIC: int __repmgr_site_config_int __P((DB_SITE *, u_int32_t, u_int32_t)); + */ +int +__repmgr_site_config_int(dbsite, which, value) + DB_SITE *dbsite; + u_int32_t which; + u_int32_t value; +{ + DB_REP *db_rep; + ENV *env; REGINFO *infop; REP *rep; REPMGR_SITE *site; @@ -2875,7 +3228,6 @@ __repmgr_site_config(dbsite, which, value) infop = env->reginfo; LOCK_MUTEX(db_rep->mutex); - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_repmgr); sites = R_ADDR(infop, rep->siteinfo_off); site = SITE_FROM_EID(dbsite->eid); @@ -2896,7 +3248,6 @@ __repmgr_site_config(dbsite, which, value) rep->siteinfo_seq++; } MUTEX_UNLOCK(env, rep->mtx_repmgr); - ENV_LEAVE(env, ip); UNLOCK_MUTEX(db_rep->mutex); } else { site = SITE_FROM_EID(dbsite->eid); @@ -2930,7 +3281,6 @@ set_local_site(dbsite, value) if (REP_ON(env)) { rep = db_rep->region; LOCK_MUTEX(db_rep->mutex); - ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_repmgr); locked = TRUE; /* Make sure we're in sync first. 
*/ @@ -2941,31 +3291,32 @@ set_local_site(dbsite, value) __db_errx(env, DB_STR("3666", "A previously given local site may not be unset")); ret = EINVAL; - } else if (IS_VALID_EID(db_rep->self_eid) && - db_rep->self_eid != dbsite->eid) { - __db_errx(env, DB_STR("3667", - "A (different) local site has already been set")); - ret = EINVAL; - } else { - DB_ASSERT(env, IS_VALID_EID(dbsite->eid)); - site = SITE_FROM_EID(dbsite->eid); - if (FLD_ISSET(site->config, - DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) { - __db_errx(env, DB_STR("3668", - "Local site cannot have HELPER or PEER attributes")); + } else if (value) { + if (IS_VALID_EID(db_rep->self_eid) && + db_rep->self_eid != dbsite->eid) { + __db_errx(env, DB_STR("3697", + "A (different) local site has already been set")); ret = EINVAL; + } else { + DB_ASSERT(env, IS_VALID_EID(dbsite->eid)); + site = SITE_FROM_EID(dbsite->eid); + if (FLD_ISSET(site->config, + DB_BOOTSTRAP_HELPER | DB_REPMGR_PEER)) { + __db_errx(env, DB_STR("3698", + "Local site cannot have HELPER or PEER attributes")); + ret = EINVAL; + } } } - if (ret == 0) { + if (ret == 0 && value) { db_rep->self_eid = dbsite->eid; if (locked) { - rep->self_eid = dbsite->eid; + rep->self_eid = db_rep->self_eid; rep->siteinfo_seq++; } } if (locked) { MUTEX_UNLOCK(env, rep->mtx_repmgr); - ENV_LEAVE(env, ip); UNLOCK_MUTEX(db_rep->mutex); } return (ret); @@ -2998,7 +3349,7 @@ refresh_site(dbsite) } static int -__repmgr_remove_site_pp(dbsite) +__repmgr_remove_and_close_site(dbsite) DB_SITE *dbsite; { int ret, t_ret; @@ -3011,6 +3362,23 @@ __repmgr_remove_site_pp(dbsite) */ if ((t_ret = __repmgr_site_close(dbsite)) != 0 && ret == 0) ret = t_ret; + + return (ret); +} + +static int +__repmgr_remove_site_pp(dbsite) + DB_SITE *dbsite; +{ + ENV *env; + DB_THREAD_INFO *ip; + int ret; + + env = dbsite->env; + + ENV_ENTER(env, ip); + ret = __repmgr_remove_and_close_site(dbsite); + ENV_LEAVE(env, ip); return (ret); } @@ -3024,6 +3392,7 @@ __repmgr_remove_site(dbsite) REPMGR_CONNECTION 
*conn; repmgr_netaddr_t addr; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; u_int8_t *response_buf, siteinfo_buf[MAX_MSG_BUF]; size_t len; u_int32_t type; @@ -3046,23 +3415,33 @@ __repmgr_remove_site(dbsite) DB_ASSERT(env, IS_VALID_EID(master)); addr = SITE_FROM_EID(master)->net_addr; UNLOCK_MUTEX(db_rep->mutex); - len = strlen(dbsite->host) + 1; - DB_INIT_DBT(site_info.host, dbsite->host, len); - site_info.port = dbsite->port; - site_info.flags = 0; - ret = __repmgr_site_info_marshal(env, - &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len); - DB_ASSERT(env, ret == 0); conn = NULL; response_buf = NULL; - if ((ret = make_request_conn(env, &addr, &conn)) != 0) + if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0) return (ret); + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + if (conn->version < 5) { + DB_INIT_DBT(v4site_info.host, dbsite->host, len); + v4site_info.port = dbsite->port; + v4site_info.flags = 0; + ret = __repmgr_v4site_info_marshal(env, + &v4site_info, siteinfo_buf, sizeof(siteinfo_buf), &len); + } else { + DB_INIT_DBT(site_info.host, dbsite->host, len); + site_info.port = dbsite->port; + site_info.status = 0; + site_info.flags = 0; + ret = __repmgr_site_info_marshal(env, + &site_info, siteinfo_buf, sizeof(siteinfo_buf), &len); + } + DB_ASSERT(env, ret == 0); + if ((ret = __repmgr_send_sync_msg(env, conn, REPMGR_REMOVE_REQUEST, siteinfo_buf, (u_int32_t)len)) != 0) goto err; - if ((ret = read_own_msg(env, + if ((ret = __repmgr_read_own_msg(env, conn, &type, &response_buf, &len)) != 0) goto err; ret = type == REPMGR_REMOVE_SUCCESS ? 0 : DB_REP_UNAVAIL; @@ -3090,3 +3469,82 @@ __repmgr_site_close(dbsite) __os_free(dbsite->env, dbsite); return (0); } + +/* + * Demotes a participant site to a view. This is a one-way and one-time + * operation. 
+ * + * The demotion occurs at the very end of repmgr_start() because it + * requires a select thread to perform the gmdb operations that remove + * the site from the replication group and immediately add the site back + * into the group as a view. The demotion also preserves any other threads + * created by repmgr_start() so that they are there to be used by the + * demoted site after it is re-added as a view site. + * + * We remove and re-add the site to propagate the site's change from + * participant to view to all sites in the replication group. This includes + * updates to each site's gmdb and in-memory site list. + */ +#define REPMGR_DEMOTION_MASTER_RETRIES 10 +#define REPMGR_DEMOTION_RETRY_USECS 500000 +static int +__repmgr_demote_site(env, eid) + ENV *env; + int eid; +{ + DB_REP *db_rep; + DB_SITE *dbsite; + REP *rep; + REPMGR_SITE *site; + int ret, t_ret, tries; + + db_rep = env->rep_handle; + rep = db_rep->region; + site = SITE_FROM_EID(eid); + dbsite = NULL; + + /* Inform other repmgr threads that a demotion is in progress. */ + db_rep->demotion_pending = TRUE; + + if ((ret = init_dbsite(env, eid, site->net_addr.host, + site->net_addr.port, &dbsite)) != 0) + goto err; + + /* + * We need a master to perform the gmdb updates. Poll periodically + * for a limited time to find one. + */ + tries = 0; + while (rep->master_id == DB_EID_INVALID) { + __os_yield(env, 0, REPMGR_DEMOTION_RETRY_USECS); + if (++tries >= REPMGR_DEMOTION_MASTER_RETRIES) { + ret = DB_REP_UNAVAIL; + goto err; + } + } + + /* Remove site from replication group. */ + if ((ret = __repmgr_remove_site(dbsite)) != 0) + goto err; + + /* + * Add site back into replication group as a view. This demotion is + * occurring because this site now has a view callback but its + * SITE_VIEW flag is not set. Now, __repmgr_join_group() will detect + * the view callback and set the SITE_VIEW flag before sending this + * site's information to the rest of the replication group. 
+ */ + if ((ret = __repmgr_join_group(env)) != 0) + goto err; + +err: + /* Deallocates dbsite. */ + if (dbsite != NULL) { + t_ret = __repmgr_site_close(dbsite); + if (ret == 0 && t_ret != 0) + ret = t_ret; + } + /* Must reset demotion_pending before leaving this routine. */ + db_rep->demotion_pending = FALSE; + return (ret); +} diff --git a/src/repmgr/repmgr_msg.c b/src/repmgr/repmgr_msg.c index 13537823..71cb2ada 100644 --- a/src/repmgr/repmgr_msg.c +++ b/src/repmgr/repmgr_msg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,15 +15,19 @@ #include "dbinc_auto/repmgr_auto.h" static int dispatch_app_message __P((ENV *, REPMGR_MESSAGE *)); -static int finish_gmdb_update __P((ENV *, - DB_THREAD_INFO *, DBT *, u_int32_t, u_int32_t, __repmgr_member_args *)); +static int finish_gmdb_update __P((ENV *, DB_THREAD_INFO *, + DBT *, u_int32_t, u_int32_t, u_int32_t, __repmgr_member_args *)); static int incr_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *)); -static void marshal_site_data __P((ENV *, u_int32_t, u_int8_t *, DBT *)); +static void marshal_site_data __P((ENV *, + u_int32_t, u_int32_t, u_int8_t *, DBT *)); static void marshal_site_key __P((ENV *, repmgr_netaddr_t *, u_int8_t *, DBT *, __repmgr_member_args *)); static int message_loop __P((ENV *, REPMGR_RUNNABLE *)); +static int preferred_master_takeover __P((ENV*)); static int process_message __P((ENV*, DBT*, DBT*, int)); static int reject_fwd __P((ENV *, REPMGR_CONNECTION *)); +static int rejoin_connections(ENV *); +static int rejoin_deferred_election(ENV *); static int rescind_pending __P((ENV *, DB_THREAD_INFO *, int, u_int32_t, u_int32_t)); static int resolve_limbo_int __P((ENV *, DB_THREAD_INFO *)); @@ -33,9 +37,13 @@ static int send_permlsn_conn __P((ENV *, REPMGR_CONNECTION *, u_int32_t, DB_LSN *)); 
static int serve_join_request __P((ENV *, DB_THREAD_INFO *, REPMGR_MESSAGE *)); +static int serve_lsnhist_request __P((ENV *, DB_THREAD_INFO *, + REPMGR_MESSAGE *)); +static int serve_readonly_master_request __P((ENV *, REPMGR_MESSAGE *)); static int serve_remove_request __P((ENV *, DB_THREAD_INFO *, REPMGR_MESSAGE *)); static int serve_repmgr_request __P((ENV *, REPMGR_MESSAGE *)); +static int serve_restart_client_request __P((ENV *, REPMGR_MESSAGE *)); /* * Map one of the phase-1/provisional membership status values to its @@ -72,6 +80,7 @@ message_loop(env, th) REPMGR_RUNNABLE *th; { DB_REP *db_rep; + DB_THREAD_INFO *ip; REP *rep; REPMGR_MESSAGE *msg; REPMGR_CONNECTION *conn; @@ -83,6 +92,7 @@ message_loop(env, th) COMPQUIET(membership, 0); db_rep = env->rep_handle; rep = db_rep->region; + ENV_ENTER(env, ip); LOCK_MUTEX(db_rep->mutex); while ((ret = __repmgr_queue_get(env, &msg, th)) == 0) { incremented = FALSE; @@ -141,7 +151,21 @@ message_loop(env, th) * detect it without the need for application * activity. */ - ret = __rep_flush(env->dbenv); + ret = __rep_flush_int(env); + } else if (db_rep->prefmas_pending == master_switch && + IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) && + F_ISSET(rep, REP_F_CLIENT)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, +"message_loop heartbeat preferred master switch")); + /* + * We are a preferred master site currently + * running as a client and we have finished + * syncing with the temporary master. It is + * now time to take over as master. 
+ */ + db_rep->prefmas_pending = no_action; + ret = preferred_master_takeover(env); } else { /* * Use heartbeat message to initiate rerequest @@ -162,6 +186,12 @@ message_loop(env, th) db_rep->non_rep_th--; if (ret != 0) goto out; + if (db_rep->view_mismatch) { + __db_errx(env, DB_STR("3699", + "Site is not recorded as a view in the group membership database")); + ret = EINVAL; + goto out; + } } /* * A return of DB_REP_UNAVAIL from __repmgr_queue_get() merely means we @@ -171,6 +201,7 @@ message_loop(env, th) ret = 0; out: UNLOCK_MUTEX(db_rep->mutex); + ENV_LEAVE(env, ip); return (ret); } @@ -341,16 +372,45 @@ process_message(env, control, rec, eid) break; case DB_REP_DUPMASTER: - /* - * Initiate an election if we're configured to be using - * elections, but only if we're *NOT* using leases. When using - * leases, there is never any uncertainty over which site is the - * rightful master, and only the loser gets the DUPMASTER return - * code. - */ - if ((ret = __repmgr_become_client(env)) == 0 && + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) { + /* + * The preferred master site must restart as a master + * so that it sends out a NEWMASTER to help the client + * sync. It must force a role change so that it + * advances its gen even though it is already master. + * This is needed if there was a temporary master at + * a higher gen that is now restarting as a client. + * A client won't process messages from a master at + * a lower gen than its own. + */ + ret = __repmgr_repstart(env, DB_REP_MASTER, + REP_START_FORCE_ROLECHG); + } else if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) && + (ret = __repmgr_become_client(env)) == 0) { + /* + * The preferred master client site must restart as + * client without any elections to enable the preferred + * master site to preserve its own transactions. 
It + * uses an election thread to repeatedly perform client + * startups so that it will perform its client sync + * when the preferred master's gen has caught up. + */ + LOCK_MUTEX(db_rep->mutex); + ret = __repmgr_init_election(env, + ELECT_F_CLIENT_RESTART); + UNLOCK_MUTEX(db_rep->mutex); + } else if ((ret = __repmgr_become_client(env)) == 0 && FLD_ISSET(rep->config, REP_C_LEASE | REP_C_ELECTIONS) == REP_C_ELECTIONS) { + /* + * Initiate an election if we're configured to be using + * elections, but only if we're *NOT* using leases. + * When using leases, there is never any uncertainty + * over which site is the rightful master, and only the + * loser gets the DUPMASTER return code. + */ LOCK_MUTEX(db_rep->mutex); ret = __repmgr_init_election(env, ELECT_F_IMMED); UNLOCK_MUTEX(db_rep->mutex); @@ -406,6 +466,14 @@ DB_TEST_RECOVERY_LABEL t_ret = __op_rep_exit(env); if (ret == ENOENT) ret = 0; + else if (ret == DB_DELETED && db_rep->demotion_pending) + /* + * If a demotion is in progress, we want to keep + * the repmgr threads instead of bowing out because + * they are needed when we rejoin the replication group + * immediately as a view. + */ + ret = 0; else if (ret == DB_DELETED) ret = __repmgr_bow_out(env); if (t_ret != 0 && ret == 0) @@ -428,8 +496,10 @@ __repmgr_handle_event(env, event, info) void *info; { DB_REP *db_rep; + REP *rep; db_rep = env->rep_handle; + rep = db_rep->region; if (db_rep->selector == NULL) { /* Repmgr is not in use, so all events go to application. */ @@ -457,9 +527,46 @@ __repmgr_handle_event(env, event, info) /* Application still needs to see this. */ break; + case DB_EVENT_REP_MASTER: + case DB_EVENT_REP_STARTUPDONE: + /* + * Detect a rare case where a dupmaster or incomplete gmdb + * operation has left the site's gmdb inconsistent with + * a view callback definition. 
The user would have correctly + * defined a view callback and called repmgr_start(), but the + * gmdb operation to update this site to a view would have been + * incomplete or rolled back. The site cannot operate in this + * inconsistent state, so set an indicator to cause a message + * thread to panic and terminate. + * + * The one exception is during a demotion to view, when + * this inconsistency is expected for a short time. + */ + if (IS_VALID_EID(db_rep->self_eid) && + PARTICIPANT_TO_VIEW(db_rep, + SITE_FROM_EID(db_rep->self_eid)) && + !db_rep->demotion_pending) + db_rep->view_mismatch = TRUE; + + /* + * In preferred master mode, when the preferred master site + * finishes synchronizing with the temporary master it must + * prepare to take over as master. This is detected by the + * next heartbeat in a message thread, where the takeover is + * actually performed. + */ + if (event == DB_EVENT_REP_STARTUPDONE && + IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "startupdone set preferred master switch")); + db_rep->prefmas_pending = master_switch; + } + break; default: break; } + COMPQUIET(info, NULL); return (DB_EVENT_NOT_HANDLED); } @@ -504,7 +611,7 @@ send_permlsn(env, generation, lsn) */ policy = site->ack_policy > 0 ? site->ack_policy : rep->perm_policy; - if (policy == DB_REPMGR_ACKS_NONE || + if (IS_VIEW_SITE(env) || policy == DB_REPMGR_ACKS_NONE || (IS_PEER_POLICY(policy) && rep->priority == 0)) ack = FALSE; else @@ -614,26 +721,149 @@ send_permlsn_conn(env, conn, generation, lsn) return (ret); } +/* + * Perform the steps on the preferred master site to take over again as + * preferred master from a temporary master. This routine should only be + * called after the preferred master has restarted as a client and finished + * a client sync with the temporary master. 
+ * + * This routine makes a best effort to wait until all temporary master + * transactions have been applied on this site before taking over. + */ +static int +preferred_master_takeover(env) + ENV *env; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + DB_LSN last_ready_lsn, ready_lsn, sync_lsn; + u_long usec; + u_int32_t gen, max_tries, tries; + int ret, synced; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + db_rep = env->rep_handle; + rep = db_rep->region; + gen = 0; + ZERO_LSN(sync_lsn); + ret = 0; + + if (!IS_PREFMAS_MODE(env)) + return (ret); + + /* + * Start by making the temporary master a readonly master so that we + * can know when we have applied all of its transactions on this + * site before taking over. + */ + if ((ret = __repmgr_make_site_readonly_master(env, + 1, &gen, &sync_lsn)) != 0) + return (ret); + DB_ASSERT(env, gen >= rep->gen); + + /* + * Make a best effort to wait until this site has all transactions + * from the temporary master. We want to preserve temporary master + * transactions, but we can't wait forever. If we exceed our wait, + * we restart this site as preferred master anyway. This may + * sacrifice some temporary master transactions in order to preserve + * repgroup write availability. + * + * We restart the number of tries each time we make progress in + * transactions applied, until either we apply through sync_lsn or + * we exceed max_tries without progress. + */ + if ((ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0) + return (ret); + tries = 0; + synced = 0; + ZERO_LSN(ready_lsn); + ZERO_LSN(last_ready_lsn); + while (!synced && tries < max_tries) { + __os_yield(env, 0, usec); + tries++; + /* + * lp->ready_lsn is the next LSN we expect to receive, + * which also indicates how much we've applied. sync_lsn + * is the lp->lsn (indicating the next log record expected) + * from the other site. 
+ */ + MUTEX_LOCK(env, rep->mtx_clientdb); + ready_lsn = lp->ready_lsn; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (gen == rep->gen && LOG_COMPARE(&ready_lsn, &sync_lsn) >= 0) + synced = 1; + else if (LOG_COMPARE(&ready_lsn, &last_ready_lsn) >= 0) { + /* We are making progress, restart number of tries. */ + last_ready_lsn = ready_lsn; + tries = 0; + } + } + + /* Restart the remote readonly temporary master as a client. */ + if ((ret = __repmgr_restart_site_as_client(env, 1)) != 0) + return (ret); + + /* Restart this site as the preferred master, waiting for + * REP_LOCKOUT_MSG. The NEWCLIENT message sent back from + * restarting the other site as client can briefly lock + * REP_LOCKOUT_MSG to do some cleanup. We don't want this + * to cause the rep_start_int() call to restart this site + * as master to return 0 without doing anything. + */ + ret = __repmgr_become_master(env, REP_START_WAIT_LOCKMSG); + return (ret); +} + static int serve_repmgr_request(env, msg) ENV *env; REPMGR_MESSAGE *msg; { - DB_THREAD_INFO *ip; + DB_REP *db_rep; DBT *dbt; + DB_THREAD_INFO *ip; REPMGR_CONNECTION *conn; + u_int32_t mtype; int ret, t_ret; - ENV_ENTER(env, ip); - switch (REPMGR_OWN_MSG_TYPE(msg->msg_hdr)) { + db_rep = env->rep_handle; + ENV_GET_THREAD_INFO(env, ip); + conn = msg->v.gmdb_msg.conn; + mtype = REPMGR_OWN_MSG_TYPE(msg->msg_hdr); + switch (mtype) { case REPMGR_JOIN_REQUEST: ret = serve_join_request(env, ip, msg); break; + case REPMGR_LSNHIST_REQUEST: + ret = serve_lsnhist_request(env, ip, msg); + break; + case REPMGR_READONLY_MASTER: + ret = serve_readonly_master_request(env, msg); + break; case REPMGR_REJOIN: RPRINT(env, (env, DB_VERB_REPMGR_MISC, "One try at rejoining group automatically")); if ((ret = __repmgr_join_group(env)) == DB_REP_UNAVAIL) ret = __repmgr_bow_out(env); + else if (ret == 0 && IS_PREFMAS_MODE(env)) { + /* + * For preferred master mode, we need to get + * a "regular" connection to the other site without + * calling an election prematurely here. 
+ */ + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Establishing connections after rejoin")); + ret = rejoin_connections(env); + } else if (ret == 0 && db_rep->rejoin_pending) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Calling deferred election after rejoin")); + ret = rejoin_deferred_election(env); + } + db_rep->rejoin_pending = FALSE; break; case REPMGR_REMOVE_REQUEST: ret = serve_remove_request(env, ip, msg); @@ -641,23 +871,32 @@ serve_repmgr_request(env, msg) case REPMGR_RESOLVE_LIMBO: ret = resolve_limbo_wrapper(env, ip); break; + case REPMGR_RESTART_CLIENT: + ret = serve_restart_client_request(env, msg); + break; case REPMGR_SHARING: dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_refresh_membership(env, dbt->data, dbt->size); + ret = __repmgr_refresh_membership(env, dbt->data, dbt->size, + (conn == NULL ? DB_REPMGR_VERSION : conn->version)); break; default: ret = __db_unknown_path(env, "serve_repmgr_request"); break; } - if ((conn = msg->v.gmdb_msg.conn) != NULL) { + if (conn != NULL) { + /* + * A site that removed itself may have already closed its + * connections. Do not return an error and panic if we + * can't close the one-shot GMDB connection for a remove + * request here. 
+ */ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 && - ret == 0) + ret == 0 && mtype != REPMGR_REMOVE_REQUEST) ret = t_ret; if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 && ret == 0) ret = t_ret; } - ENV_LEAVE(env, ip); return (ret); } @@ -674,8 +913,10 @@ serve_join_request(env, ip, msg) { DB_REP *db_rep; REPMGR_CONNECTION *conn; + REPMGR_SITE *site; DBT *dbt; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; u_int8_t *buf; char *host; size_t len; @@ -686,9 +927,18 @@ serve_join_request(env, ip, msg) COMPQUIET(status, 0); conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_site_info_unmarshal(env, - &site_info, dbt->data, dbt->size, NULL); + if (conn->version < 5) { + ret = __repmgr_v4site_info_unmarshal(env, + &v4site_info, dbt->data, dbt->size, NULL); + site_info.host = v4site_info.host; + site_info.port = v4site_info.port; + site_info.status = v4site_info.flags; + site_info.flags = 0; + } else + ret = __repmgr_site_info_unmarshal(env, + &site_info, dbt->data, dbt->size, NULL); host = site_info.host.data; host[site_info.host.size - 1] = '\0'; @@ -703,7 +953,23 @@ serve_join_request(env, ip, msg) LOCK_MUTEX(db_rep->mutex); if ((ret = __repmgr_find_site(env, host, site_info.port, &eid)) == 0) { DB_ASSERT(env, eid != db_rep->self_eid); - status = SITE_FROM_EID(eid)->membership; + site = SITE_FROM_EID(eid); + status = site->membership; + /* + * Remote site electability is usually exchanged when + * a connection is established, but when a new site + * joins the repgroup there is a brief gap between the + * join and the connection. Record electability for + * the joining site so that we are not overly conservative + * about the number of acks we require for a PERM + * transaction if the joining site is unelectable. 
+ */ + if (FLD_ISSET(site_info.flags, SITE_JOIN_ELECTABLE)) { + F_SET(site, SITE_ELECTABLE); + FLD_CLR(site_info.flags, SITE_JOIN_ELECTABLE); + } else + F_CLR(site, SITE_ELECTABLE); + F_SET(site, SITE_HAS_PRIO); } UNLOCK_MUTEX(db_rep->mutex); if (ret != 0) @@ -712,7 +978,8 @@ serve_join_request(env, ip, msg) switch (status) { case 0: case SITE_ADDING: - ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING); + ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING, + site_info.flags); break; case SITE_PRESENT: /* Already in desired state. */ @@ -729,7 +996,7 @@ serve_join_request(env, ip, msg) goto err; LOCK_MUTEX(db_rep->mutex); - ret = __repmgr_marshal_member_list(env, &buf, &len); + ret = __repmgr_marshal_member_list(env, conn->version, &buf, &len); UNLOCK_MUTEX(db_rep->mutex); if (ret != 0) goto err; @@ -760,6 +1027,7 @@ serve_remove_request(env, ip, msg) REPMGR_SITE *site; DBT *dbt; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; char *host; u_int32_t status, type; int eid, ret, t_ret; @@ -768,9 +1036,18 @@ serve_remove_request(env, ip, msg) db_rep = env->rep_handle; conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_site_info_unmarshal(env, - &site_info, dbt->data, dbt->size, NULL); + if (conn->version < 5) { + ret = __repmgr_v4site_info_unmarshal(env, + &v4site_info, dbt->data, dbt->size, NULL); + site_info.host = v4site_info.host; + site_info.port = v4site_info.port; + site_info.status = v4site_info.flags; + site_info.flags = 0; + } else + ret = __repmgr_site_info_unmarshal(env, + &site_info, dbt->data, dbt->size, NULL); host = site_info.host.data; host[site_info.host.size - 1] = '\0'; @@ -810,7 +1087,8 @@ serve_remove_request(env, ip, msg) break; case SITE_PRESENT: case SITE_DELETING: - ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING); + ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING, + 
site_info.flags); break; default: ret = __db_unknown_path(env, "serve_remove_request"); @@ -829,7 +1107,175 @@ err: default: return (ret); } - return (__repmgr_send_sync_msg(env, conn, type, NULL, 0)); + /* + * It is possible when a site removes itself that by now it has + * already acted on the first GMDB update and closed its connections. + * Do not return an error and panic if we can't send the final + * status of the remove operation. + */ + if ((ret = __repmgr_send_sync_msg(env, conn, type, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending remove site status message %d", ret)); + return (0); +} + +/* + * Serve the REPMGR_RESTART_CLIENT message by restarting this site as a + * client if it is not already a client. Always sends back a + * REPMGR_PREFMAS_SUCCESS message with an empty payload. + */ +static int +serve_restart_client_request(env, msg) + ENV *env; + REPMGR_MESSAGE *msg; +{ + DB_REP *db_rep; + REP * rep; + REPMGR_CONNECTION *conn; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Serving restart_client request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* No need to read payload - it is just a dummy byte. */ + + if (IS_PREFMAS_MODE(env) && !F_ISSET(rep, REP_F_CLIENT)) + ret = __repmgr_become_client(env); + + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_PREFMAS_SUCCESS, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending restart client success message %d", ret)); + + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for restart_client returning %d", ret)); + return (ret); +} + +/* + * Serve the REPMGR_READONLY_MASTER message by turning this site into a + * readonly master. Always sends back a REPMGR_READONLY_RESPONSE message with + * a payload containing this site's gen and next LSN expected. 
If there are + * any errors, the gen is 0 and the next LSN is [0,0]. + */ +static int +serve_readonly_master_request(env, msg) + ENV *env; + REPMGR_MESSAGE *msg; +{ + REPMGR_CONNECTION *conn; + __repmgr_permlsn_args permlsn; + u_int8_t buf[__REPMGR_PERMLSN_SIZE]; + int ret, t_ret; + + ret = 0; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Serving readonly_master request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* No need to read payload - it is just a dummy byte. */ + + if (IS_PREFMAS_MODE(env)) + ret = __rep_become_readonly_master(env, + &permlsn.generation, &permlsn.lsn); + + __repmgr_permlsn_marshal(env, &permlsn, buf); + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_READONLY_RESPONSE, buf, __REPMGR_PERMLSN_SIZE)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending readonly response message %d", ret)); + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for readonly_master returning %d", ret)); + return (ret); +} + +/* + * Serve the REPMGR_LSNHIST_REQUEST message by retrieving information from + * this site's LSN history database for the requested gen. If the requested + * gen exists at this site, sends back a REPMGR_LSNHIST_RESPONSE message + * containing the LSN and timestamp at the requested gen and the LSN for the + * next gen if that gen exists (next gen LSN is [0,0] if next gen doesn't + * yet exist at this site.) Sends back a PREFMAS_FAILURE message if the + * requested gen does not yet exist at this site or if there are any errors. 
+ */ +static int +serve_lsnhist_request(env, ip, msg) + ENV *env; + DB_THREAD_INFO *ip; + REPMGR_MESSAGE *msg; +{ + REPMGR_CONNECTION *conn; + DBT *dbt; + __repmgr_lsnhist_match_args lsnhist_match; + __rep_lsn_hist_data_args lsnhist_data, next_lsnhist_data; + __rep_lsn_hist_key_args key; + u_int8_t match_buf[__REPMGR_LSNHIST_MATCH_SIZE]; + DB_LSN next_gen_lsn; + int ret, t_ret; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Serving lsnhist request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* Read lsn_hist_key incoming payload to get gen being requested. */ + dbt = &msg->v.gmdb_msg.request; + if ((ret = __rep_lsn_hist_key_unmarshal(env, + &key, dbt->data, dbt->size, NULL)) != 0) + return (ret); + if (key.version != REP_LSN_HISTORY_FMT_VERSION) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "serve_lsnhist_request version mismatch")); + return (0); + } + + /* + * There's no need to retry if we don't find an lsnhist record for + * requested gen. This site is either a temporary master or a client, + * which means that if it doesn't already have an lsnhist record at + * this gen, it is highly unlikely to get one in the near future. 
+ */ + if ((ret = __rep_get_lsnhist_data(env, + ip, key.gen, &lsnhist_data)) == 0) { + + if ((t_ret = __rep_get_lsnhist_data(env, + ip, key.gen + 1, &next_lsnhist_data)) == 0) + next_gen_lsn = next_lsnhist_data.lsn; + else + ZERO_LSN(next_gen_lsn); + + lsnhist_match.lsn = lsnhist_data.lsn; + lsnhist_match.hist_sec = lsnhist_data.hist_sec; + lsnhist_match.hist_nsec = lsnhist_data.hist_nsec; + lsnhist_match.next_gen_lsn = next_gen_lsn; + __repmgr_lsnhist_match_marshal(env, &lsnhist_match, match_buf); + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_LSNHIST_RESPONSE, match_buf, + __REPMGR_LSNHIST_MATCH_SIZE)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending lsnhist response message %d", + ret)); + } else if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_PREFMAS_FAILURE, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending prefmas failure message %d", ret)); + + /* Do not return an error if LSN history record not found. */ + if (ret == DB_NOTFOUND) + ret = 0; + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for lsnhist returning %d", ret)); + return (ret); } /* @@ -917,7 +1363,13 @@ resolve_limbo_int(env, ip) if (orig_status == SITE_PRESENT || orig_status == 0) goto out; - if (IS_ZERO_LSN(db_rep->limbo_failure)) + /* + * It is possible after an autotakeover on a master to have no + * limbo_failure LSN but to have a limbo_victim that was found + * in the gmdb that still needs to be resolved. 
+ */ + if (IS_ZERO_LSN(db_rep->limbo_failure) && + !db_rep->limbo_resolution_needed) goto out; /* @@ -947,7 +1399,8 @@ resolve_limbo_int(env, ip) ip, NULL, &txn, DB_IGNORE_LEASE)) != 0) goto out; - marshal_site_data(env, orig_status, data_buf, &data_dbt); + marshal_site_data(env, + orig_status, site->gmdb_flags, data_buf, &data_dbt); ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0); if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && @@ -980,15 +1433,15 @@ resolve_limbo_int(env, ip) UNLOCK_MUTEX(db_rep->mutex); locked = FALSE; status = NEXT_STATUS(orig_status); - if ((ret = finish_gmdb_update(env, - ip, &key_dbt, orig_status, status, &logrec)) != 0) + if ((ret = finish_gmdb_update(env, ip, + &key_dbt, orig_status, status, site->gmdb_flags, &logrec)) != 0) goto out; /* Track modified membership status in our in-memory sites array. */ LOCK_MUTEX(db_rep->mutex); locked = TRUE; if ((ret = __repmgr_set_membership(env, - addr.host, addr.port, status)) != 0) + addr.host, addr.port, status, site->gmdb_flags)) != 0) goto out; __repmgr_set_sites(env); @@ -1005,14 +1458,15 @@ out: * status is inferred (ADDING -> PRESENT, or DELETING -> 0). * * PUBLIC: int __repmgr_update_membership __P((ENV *, - * PUBLIC: DB_THREAD_INFO *, int, u_int32_t)); + * PUBLIC: DB_THREAD_INFO *, int, u_int32_t, u_int32_t)); */ int -__repmgr_update_membership(env, ip, eid, pstatus) +__repmgr_update_membership(env, ip, eid, pstatus, site_flags) ENV *env; DB_THREAD_INFO *ip; int eid; u_int32_t pstatus; /* Provisional status. */ + u_int32_t site_flags; { DB_REP *db_rep; REPMGR_SITE *site; @@ -1092,7 +1546,7 @@ retry: * those seem even more confusing. 
*/ if ((ret = __repmgr_set_membership(env, - addr.host, addr.port, pstatus)) != 0) + addr.host, addr.port, pstatus, site_flags)) != 0) goto err; __repmgr_set_sites(env); @@ -1108,7 +1562,7 @@ retry: if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0) goto err; marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec); - marshal_site_data(env, pstatus, status_buf, &data_dbt); + marshal_site_data(env, pstatus, site_flags, status_buf, &data_dbt); if ((ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0)) != 0) goto err; @@ -1152,13 +1606,14 @@ retry: locked = FALSE; if ((ret = finish_gmdb_update(env, ip, - &key_dbt, pstatus, ult_status, &logrec)) != 0) + &key_dbt, pstatus, ult_status, site_flags, &logrec)) != 0) goto err; /* Track modified membership status in our in-memory sites array. */ LOCK_MUTEX(db_rep->mutex); locked = TRUE; - ret = __repmgr_set_membership(env, addr.host, addr.port, ult_status); + ret = __repmgr_set_membership(env, addr.host, addr.port, + ult_status, site_flags); __repmgr_set_sites(env); err: @@ -1173,7 +1628,7 @@ err: * that we keep in sync. */ (void)__repmgr_set_membership(env, - addr.host, addr.port, orig_status); + addr.host, addr.port, orig_status, site_flags); } if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 && ret == 0) @@ -1215,13 +1670,14 @@ retry: UNLOCK_MUTEX(db_rep->mutex); marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec); - if ((ret = finish_gmdb_update(env, - ip, &key_dbt, cur_status, new_status, &logrec)) != 0) + if ((ret = finish_gmdb_update(env, ip, + &key_dbt, cur_status, new_status, site->gmdb_flags, &logrec)) != 0) goto err; /* Track modified membership status in our in-memory sites array. 
*/ LOCK_MUTEX(db_rep->mutex); - ret = __repmgr_set_membership(env, addr.host, addr.port, new_status); + ret = __repmgr_set_membership(env, addr.host, addr.port, + new_status, site->gmdb_flags); __repmgr_set_sites(env); UNLOCK_MUTEX(db_rep->mutex); @@ -1301,11 +1757,11 @@ __repmgr_set_gm_version(env, ip, txn, version) * really deleted. */ static int -finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec) +finish_gmdb_update(env, ip, key_dbt, prev_status, status, flags, logrec) ENV *env; DB_THREAD_INFO *ip; DBT *key_dbt; - u_int32_t prev_status, status; + u_int32_t prev_status, status, flags; __repmgr_member_args *logrec; { DB_REP *db_rep; @@ -1324,7 +1780,7 @@ finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec) if (status == 0) ret = __db_del(db_rep->gmdb, ip, txn, key_dbt, 0); else { - marshal_site_data(env, status, data_buf, &data_dbt); + marshal_site_data(env, status, flags, data_buf, &data_dbt); ret = __db_put(db_rep->gmdb, ip, txn, key_dbt, &data_dbt, 0); } if (ret != 0) @@ -1617,16 +2073,18 @@ marshal_site_key(env, addr, buf, dbt, logrec) } static void -marshal_site_data(env, status, buf, dbt) +marshal_site_data(env, status, flags, buf, dbt) ENV *env; u_int32_t status; + u_int32_t flags; u_int8_t *buf; DBT *dbt; { - __repmgr_membership_data_args member_status; + __repmgr_membership_data_args member_data; - member_status.flags = status; - __repmgr_membership_data_marshal(env, &member_status, buf); + member_data.status = status; + member_data.flags = flags; + __repmgr_membership_data_marshal(env, &member_data, buf); DB_INIT_DBT(*dbt, buf, __REPMGR_MEMBERSHIP_DATA_SIZE); } @@ -1640,16 +2098,107 @@ __repmgr_set_sites(env) ENV *env; { DB_REP *db_rep; + REP *rep; int ret; u_int32_t n; u_int i; db_rep = env->rep_handle; + rep = db_rep->region; for (i = 0, n = 0; i < db_rep->site_cnt; i++) { - if (db_rep->sites[i].membership > 0) + /* + * Views do not count towards nsites because they cannot + * vote in elections, become master or contribute 
to + * durability. + */ + if (db_rep->sites[i].membership > 0 && + !FLD_ISSET(db_rep->sites[i].gmdb_flags, SITE_VIEW)) n++; } ret = __rep_set_nsites_int(env, n); DB_ASSERT(env, ret == 0); + if (FLD_ISSET(rep->config, + REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT) && + rep->config_nsites > 2) + __db_errx(env, DB_STR("3701", + "More than two sites in preferred master replication group")); +} + +/* + * If a site is rejoining a 2-site repgroup with 2SITE_STRICT off + * and has a rejection because it needs to catch up with the latest + * group membership database, it cannot call an election right away + * because it would win with only its own vote and ignore an existing + * master in the repgroup. Instead, this routine is used to call the + * deferred election after the site has rejoined the repgroup successfully. + */ +static int +rejoin_deferred_election(env) + ENV *env; +{ + DB_REP *db_rep; + u_int32_t flags; + int eid, ret; + + db_rep = env->rep_handle; + LOCK_MUTEX(db_rep->mutex); + + /* + * First, retry all connections so that the election can communicate + * with the other sites. Normally there should only be one other + * site in the repgroup, but it is safest to retry all remote sites + * found in case the group membership changed while we were gone. + */ + FOR_EACH_REMOTE_SITE_INDEX(eid) { + if ((ret = + __repmgr_schedule_connection_attempt(env, eid, TRUE)) != 0) + break; + } + + /* + * Call an immediate, but not a fast, election because a fast + * election reduces the number of votes needed by 1. 
+ */ + flags = ELECT_F_EVENT_NOTIFY; + if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS)) + LF_SET(ELECT_F_IMMED); + else + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Deferred rejoin election, but no elections")); + ret = __repmgr_init_election(env, flags); + + UNLOCK_MUTEX(db_rep->mutex); + return (ret); +} +/* + * If a site is rejoining a preferred master replication group and has a + * rejection because it needs to catch up with the latest group membership + * database, it needs to establish its "regular" connection to the other site + * so that it can proceed through the preferred master startup sequence. + */ +static int +rejoin_connections(env) + ENV *env; +{ + DB_REP *db_rep; + int eid, ret; + + db_rep = env->rep_handle; + ret = 0; + LOCK_MUTEX(db_rep->mutex); + + /* + * Retry all connections. Normally there should only be one other + * site in the repgroup, but it is safest to retry all remote sites + * found in case the group membership changed while we were gone. + */ + FOR_EACH_REMOTE_SITE_INDEX(eid) { + if ((ret = + __repmgr_schedule_connection_attempt(env, eid, TRUE)) != 0) + break; + } + + UNLOCK_MUTEX(db_rep->mutex); + return (ret); } diff --git a/src/repmgr/repmgr_net.c b/src/repmgr/repmgr_net.c index 54e3d066..334fd150 100644 --- a/src/repmgr/repmgr_net.c +++ b/src/repmgr/repmgr_net.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -57,6 +57,7 @@ struct sending_msg { * whether the PERM message should be considered durable. */ struct repmgr_permanence { + u_int32_t gen; /* Master generation for LSN. */ DB_LSN lsn; /* LSN whose ack this thread is waiting for. */ u_int threshold; /* Number of client acks to wait for. */ u_int quorum; /* Durability threshold for QUORUM policy. 
*/ @@ -378,7 +379,7 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags) goto out; #undef SEND_ONE_CONNECTION - nsites_sent = 1; + nsites_sent = FLD_ISSET(site->gmdb_flags, SITE_VIEW) ? 0 : 1; npeers_sent = F_ISSET(site, SITE_ELECTABLE) ? 1 : 0; missed_peer = FALSE; } @@ -418,7 +419,13 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags) nclients = 0; else if ((policy == DB_REPMGR_ACKS_ONE || policy == DB_REPMGR_ACKS_ONE_PEER) && - nclients == 1) { + nclients < 2) { + /* + * Adjust to QUORUM when first other + * participant joins (nclients=1) or when there + * are no other participants but a view joins + * (nclients=0) to get enough acks. + */ nclients = 0; policy = DB_REPMGR_ACKS_QUORUM; } @@ -498,9 +505,16 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags) if (nclients > 1 || FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT) || - db_rep->active_gmdb_update == gmdb_primary) + db_rep->active_gmdb_update == gmdb_primary) { quorum = nclients / 2; - else + /* + * An unelectable master can't be part of the + * QUORUM policy quorum. + */ + if (rep->priority == 0 && + policy == DB_REPMGR_ACKS_QUORUM) + quorum++; + } else quorum = nclients; if (policy == DB_REPMGR_ACKS_ALL_AVAILABLE) { @@ -560,6 +574,7 @@ __repmgr_send(dbenv, control, rec, lsnp, eid, flags) /* In ALL_PEERS case, display of "needed" might be confusing. */ VPRINT(env, (env, DB_VERB_REPMGR_MISC, "will await acknowledgement: need %u", needed)); + perm.gen = rep->gen; perm.lsn = *lsnp; perm.threshold = needed; perm.policy = policy; @@ -734,8 +749,13 @@ __repmgr_send_broadcast(env, type, control, rec, nsitesp, npeersp, missingp) * useful to keep letting a removed site see updates so that it * learns of its own removal, and will know to rejoin at its * next reboot. + * + * We never count sends to views because views cannot + * contribute to durability, but we always do the sends. 
*/ - if (site->membership == SITE_PRESENT) + if (FLD_ISSET(site->gmdb_flags, SITE_VIEW)) + full_member = FALSE; + else if (site->membership == SITE_PRESENT) full_member = TRUE; else { full_member = FALSE; @@ -802,7 +822,9 @@ send_connection(env, type, conn, msg, sent) REPMGR_MAX_V1_MSG_TYPE, REPMGR_MAX_V2_MSG_TYPE, REPMGR_MAX_V3_MSG_TYPE, - REPMGR_MAX_V4_MSG_TYPE + REPMGR_MAX_V4_MSG_TYPE, + REPMGR_MAX_V5_MSG_TYPE, + REPMGR_MAX_V6_MSG_TYPE }; db_rep = env->rep_handle; @@ -1132,18 +1154,24 @@ got_acks(env, context) has_unacked_peer = FALSE; FOR_EACH_REMOTE_SITE_INDEX(eid) { site = SITE_FROM_EID(eid); - if (site->membership != SITE_PRESENT) + /* + * Do not count an ack from a view because a view cannot + * contribute to durability. + */ + if (FLD_ISSET(site->gmdb_flags, SITE_VIEW)) continue; if (!F_ISSET(site, SITE_HAS_PRIO)) { /* - * Never connected to this site: since we can't know - * whether it's a peer, assume the worst. + * We have not reconnected to this site since the last + * recovery. Since we don't yet know whether it's a + * peer, assume the worst. */ has_unacked_peer = TRUE; continue; } - if (LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) { + if (site->max_ack_gen == perm->gen && + LOG_COMPARE(&site->max_ack, &perm->lsn) >= 0) { sites_acked++; if (F_ISSET(site, SITE_ELECTABLE)) peers_acked++; @@ -1206,6 +1234,7 @@ __repmgr_bust_connection(env, conn) DB_REP *db_rep; REP *rep; REPMGR_SITE *site; + db_timespec now; u_int32_t flags; int ret, eid; @@ -1259,7 +1288,9 @@ __repmgr_bust_connection(env, conn) } else /* Subordinate connection. */ goto out; - if ((ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0) + /* Defer connection attempt if rejoining 2SITE_STRICT=off repgroup. */ + if (!db_rep->rejoin_pending && + (ret = __repmgr_schedule_connection_attempt(env, eid, FALSE)) != 0) goto out; /* @@ -1267,11 +1298,47 @@ __repmgr_bust_connection(env, conn) * master, assume that the master may have failed, and call for * an election. 
But only do this for the connection to the main * master process, not a subordinate one. And only do it if - * we're our site's main process, not a subordinate one. And + * we're our site's listener process, not a subordinate one. And * skip it if the application has configured us not to do * elections. */ if (!IS_SUBORDINATE(db_rep) && eid == rep->master_id) { + if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER)) { + /* + * When the connection is from master's listener, if + * there is any other connection from a master's + * subordinate process that could take over as + * listener, we delay the election to allow some time + * for a new master listener to start. At the end of + * the delay, if there is still no master listener, + * call an election. There is a slight chance that + * we will delay the election to wait for an inactive + * connection which would never become the next main + * connection. + */ + TAILQ_FOREACH(conn, &site->sub_conns, entries) { + if (conn->auto_takeover) { + if (!timespecisset( + &db_rep->m_listener_chk)) { + __os_gettime(env, &now, 1); + TIMESPEC_ADD_DB_TIMEOUT(&now, + db_rep->m_listener_wait); + db_rep->m_listener_chk = now; + } + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Master failure, but delay elections for takeover on master")); + return (0); + } + } + } + + /* Defer election if rejoining 2SITE_STRICT=off repgroup. */ + if (db_rep->rejoin_pending) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Deferring election after rejoin rejection")); + goto out; + } + /* * Even if we're not doing elections, defer the event * notification to later execution in the election @@ -1285,6 +1352,17 @@ __repmgr_bust_connection(env, conn) RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Master failure, but no elections")); + /* + * In preferred master mode, a client that has lost its + * connection to the master uses an election thread to + * restart as master. 
+ */ + if (IS_PREFMAS_MODE(env)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, +"bust_connection setting preferred master temp master")); + db_rep->prefmas_pending = start_temp_master; + } + if ((ret = __repmgr_init_election(env, flags)) != 0) goto out; } @@ -1340,25 +1418,59 @@ __repmgr_disable_connection(env, conn) REPMGR_CONNECTION *conn; { DB_REP *db_rep; - REPMGR_SITE *site; + REP *rep; REPMGR_RESPONSE *resp; + REPMGR_SITE *site; + SITEINFO *sites; u_int32_t i; - int eid, ret, t_ret; + int eid, is_subord, orig_state, ret, t_ret; db_rep = env->rep_handle; + rep = db_rep->region; ret = 0; + is_subord = 0; + orig_state = conn->state; conn->state = CONN_DEFUNCT; if (conn->type == REP_CONNECTION) { eid = conn->eid; if (IS_VALID_EID(eid)) { site = SITE_FROM_EID(eid); if (conn != site->ref.conn.in && - conn != site->ref.conn.out) - /* It's a subordinate connection. */ + conn != site->ref.conn.out) { + /* + * It is a subordinate connection to disable. + * Remove it from the subordinate connection + * list, and decrease the number of listener + * candidates by 1 if it is from a subordinate + * rep-aware process that allows takeover. + */ TAILQ_REMOVE(&site->sub_conns, conn, entries); + SET_LISTENER_CAND(conn->auto_takeover, --); + is_subord = 1; + } TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries); conn->ref_count++; + /* + * Do not decrease sites_avail for a subordinate + * connection. + */ + if (site->state == SITE_CONNECTED && !is_subord && + (orig_state == CONN_READY || + orig_state == CONN_CONGESTED)) { + /* + * Some thread orderings can cause a brief + * dip into a negative sites_avail value. + * Once it goes negative it stays negative, + * so avoid this. Future connections will + * be counted correctly. + */ + if (rep->sites_avail > 0) + rep->sites_avail--; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "disable_conn: EID %lu disabled. 
sites_avail %lu", + (u_long)eid, (u_long)rep->sites_avail)); + } } conn->eid = -1; } else if (conn->type == APP_CONNECTION) { @@ -1646,8 +1758,10 @@ flatten(env, msg) } /* - * Scan the list of remote sites, returning the first one that is a peer, - * is not the current master, and is available. + * Scan the list of remote sites, returning the first participant that is a + * peer, is not the current master, and is available. If there are no + * available participant peers but there is an available view peer, return the + * first available view peer. */ static REPMGR_SITE * __repmgr_find_available_peer(env) @@ -1656,23 +1770,28 @@ __repmgr_find_available_peer(env) DB_REP *db_rep; REP *rep; REPMGR_CONNECTION *conn; - REPMGR_SITE *site; - u_int i; + REPMGR_SITE *site, *view; + u_int avail, i; db_rep = env->rep_handle; rep = db_rep->region; + view = NULL; FOR_EACH_REMOTE_SITE_INDEX(i) { site = &db_rep->sites[i]; - if (FLD_ISSET(site->config, DB_REPMGR_PEER) && - EID_FROM_SITE(site) != rep->master_id && - site->state == SITE_CONNECTED && + avail = (site->state == SITE_CONNECTED && (((conn = site->ref.conn.in) != NULL && conn->state == CONN_READY) || ((conn = site->ref.conn.out) != NULL && - conn->state == CONN_READY))) + conn->state == CONN_READY))); + if (FLD_ISSET(site->config, DB_REPMGR_PEER) && + !FLD_ISSET(site->gmdb_flags, SITE_VIEW) && + EID_FROM_SITE(site) != rep->master_id && avail) return (site); + if (!view && FLD_ISSET(site->config, DB_REPMGR_PEER) && + FLD_ISSET(site->gmdb_flags, SITE_VIEW) && avail) + view = site; } - return (NULL); + return (view); } /* @@ -1852,6 +1971,7 @@ __repmgr_net_close(env) site->ref.conn.out = NULL; } } + rep->sites_avail = 0; if (db_rep->listen_fd != INVALID_SOCKET) { if (closesocket(db_rep->listen_fd) == SOCKET_ERROR && ret == 0) @@ -1870,22 +1990,28 @@ final_cleanup(env, conn, unused) void *unused; { DB_REP *db_rep; + REP *rep; REPMGR_SITE *site; - int ret, t_ret; + SITEINFO *sites; + int eid, ret, t_ret; COMPQUIET(unused, 
NULL); db_rep = env->rep_handle; + rep = db_rep->region; + eid = conn->eid; ret = __repmgr_close_connection(env, conn); /* Remove the connection from whatever list it's on, if any. */ - if (conn->type == REP_CONNECTION && IS_VALID_EID(conn->eid)) { - site = SITE_FROM_EID(conn->eid); + if (conn->type == REP_CONNECTION && IS_VALID_EID(eid)) { + site = SITE_FROM_EID(eid); if (site->state == SITE_CONNECTED && (conn == site->ref.conn.in || conn == site->ref.conn.out)) { /* Not on any list, so no need to do anything. */ - } else + } else { TAILQ_REMOVE(&site->sub_conns, conn, entries); + SET_LISTENER_CAND(conn->auto_takeover, --); + } t_ret = __repmgr_destroy_conn(env, conn); } else { diff --git a/src/repmgr/repmgr_posix.c b/src/repmgr/repmgr_posix.c index 0687681a..c49017ff 100644 --- a/src/repmgr/repmgr_posix.c +++ b/src/repmgr/repmgr_posix.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/repmgr/repmgr_queue.c b/src/repmgr/repmgr_queue.c index 6a381acf..3a51b32b 100644 --- a/src/repmgr/repmgr_queue.c +++ b/src/repmgr/repmgr_queue.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -22,13 +22,28 @@ __repmgr_queue_destroy(env) ENV *env; { DB_REP *db_rep; + REP *rep; REPMGR_MESSAGE *m; REPMGR_CONNECTION *conn; + u_int32_t mtype; int ret, t_ret; + COMPQUIET(mtype, 0); + db_rep = env->rep_handle; + rep = db_rep->region; ret = 0; + + /* + * Turn on the DB_EVENT_REP_INQUEUE_FULL event firing. We only do + * this for the main listener process. For a subordinate process, + * it is always turned on. 
+ */ + if (!STAILQ_EMPTY(&db_rep->input_queue.header) && + !IS_SUBORDINATE(db_rep)) + rep->inqueue_full_event_on = 1; + while (!STAILQ_EMPTY(&db_rep->input_queue.header)) { m = STAILQ_FIRST(&db_rep->input_queue.header); STAILQ_REMOVE_HEAD(&db_rep->input_queue.header, entries); @@ -38,8 +53,25 @@ __repmgr_queue_destroy(env) ret == 0) ret = t_ret; } + if (m->msg_hdr.type == REPMGR_OWN_MSG) { + mtype = REPMGR_OWN_MSG_TYPE(m->msg_hdr); + if ((conn = m->v.gmdb_msg.conn) != NULL) { + /* + * A site that removed itself may have already + * closed its connections. + */ + if ((t_ret = __repmgr_close_connection(env, + conn)) != 0 && ret == 0 && + mtype != REPMGR_REMOVE_REQUEST) + ret = t_ret; + if ((t_ret = __repmgr_decr_conn_ref(env, + conn)) != 0 && ret == 0) + ret = t_ret; + } + } __os_free(env, m); } + return (ret); } @@ -60,14 +92,17 @@ __repmgr_queue_get(env, msgp, th) REPMGR_RUNNABLE *th; { DB_REP *db_rep; + REP *rep; REPMGR_MESSAGE *m; #ifdef DB_WIN32 HANDLE wait_events[2]; #endif + u_int32_t msgsize; int ret; ret = 0; db_rep = env->rep_handle; + rep = db_rep->region; while ((m = available_work(env)) == NULL && db_rep->repmgr_status == running && !th->quit_requested) { @@ -104,10 +139,42 @@ __repmgr_queue_get(env, msgp, th) else { STAILQ_REMOVE(&db_rep->input_queue.header, m, __repmgr_message, entries); - db_rep->input_queue.size--; + msgsize = (u_int32_t)m->size; + while (msgsize >= GIGABYTE) { + DB_ASSERT(env, db_rep->input_queue.gbytes > 0); + db_rep->input_queue.gbytes--; + msgsize -= GIGABYTE; + } + if (db_rep->input_queue.bytes < msgsize) { + DB_ASSERT(env, db_rep->input_queue.gbytes > 0); + db_rep->input_queue.gbytes--; + db_rep->input_queue.bytes += GIGABYTE; + } + db_rep->input_queue.bytes -= msgsize; + + /* + * Check if current size is out of the red zone. + * If it is, we will turn on the DB_EVENT_REP_INQUEUE_FULL + * event firing. + * + * We only have the redzone mechanism for the main listener + * process. 
+ */ + if (!IS_SUBORDINATE(db_rep) && + rep->inqueue_full_event_on == 0) { + MUTEX_LOCK(env, rep->mtx_repmgr); + if (db_rep->input_queue.gbytes < + rep->inqueue_rz_gbytes || + (db_rep->input_queue.gbytes == + rep->inqueue_rz_gbytes && + db_rep->input_queue.bytes < + rep->inqueue_rz_bytes)) + rep->inqueue_full_event_on = 1; + MUTEX_UNLOCK(env, rep->mtx_repmgr); + } + *msgp = m; } - err: return (ret); } @@ -157,24 +224,55 @@ __repmgr_queue_put(env, msg) REPMGR_MESSAGE *msg; { DB_REP *db_rep; + REP *rep; + u_int32_t msgsize; db_rep = env->rep_handle; + rep = db_rep->region; + + /* + * Drop message if incoming queue contains more messages than the + * limit. See dbenv->repmgr_set_incoming_queue_max() for more + * information. + */ + MUTEX_LOCK(env, rep->mtx_repmgr); + if (db_rep->input_queue.gbytes > rep->inqueue_max_gbytes || + (db_rep->input_queue.gbytes == rep->inqueue_max_gbytes && + db_rep->input_queue.bytes >= rep->inqueue_max_bytes)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "incoming queue limit exceeded")); + STAT(rep->mstat.st_incoming_msgs_dropped++); + if (IS_SUBORDINATE(db_rep) || rep->inqueue_full_event_on) { + DB_EVENT(env, DB_EVENT_REP_INQUEUE_FULL, NULL); + /* + * We will always disable the event firing after + * the queue is full. It will be enabled again + * after the incoming queue size is out of the + * redzone. + * + * We only have the redzone mechanism for the main + * listener process. 
+ */ + if (!IS_SUBORDINATE(db_rep)) + rep->inqueue_full_event_on = 0; + } + MUTEX_UNLOCK(env, rep->mtx_repmgr); + __os_free(env, msg); + return (0); + } + MUTEX_UNLOCK(env, rep->mtx_repmgr); STAILQ_INSERT_TAIL(&db_rep->input_queue.header, msg, entries); - db_rep->input_queue.size++; + msgsize = (u_int32_t)msg->size; + while (msgsize >= GIGABYTE) { + msgsize -= GIGABYTE; + db_rep->input_queue.gbytes++; + } + db_rep->input_queue.bytes += msgsize; + if (db_rep->input_queue.bytes >= GIGABYTE) { + db_rep->input_queue.gbytes++; + db_rep->input_queue.bytes -= GIGABYTE; + } return (__repmgr_signal(&db_rep->msg_avail)); } - -/* - * PUBLIC: int __repmgr_queue_size __P((ENV *)); - * - * !!! - * Caller must hold repmgr->mutex. - */ -int -__repmgr_queue_size(env) - ENV *env; -{ - return (env->rep_handle->input_queue.size); -} diff --git a/src/repmgr/repmgr_rec.c b/src/repmgr/repmgr_rec.c index 41827aff..568df45d 100644 --- a/src/repmgr/repmgr_rec.c +++ b/src/repmgr/repmgr_rec.c @@ -1,3 +1,11 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2014, 2015 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + #include "db_config.h" #include "db_int.h" @@ -31,7 +39,7 @@ __repmgr_member_recover(env, dbtp, lsnp, op, info) /* * The annotation log record describes the update in enough detail for - * us to be able to optimize our tracking of it at clients sites. + * us to be able to optimize our tracking of it at client sites. * However, for now we just simply reread the whole (small) database * each time, since changes happen so seldom (and we need to have the * code for reading the whole thing anyway, for other cases). diff --git a/src/repmgr/repmgr_sel.c b/src/repmgr/repmgr_sel.c index ba14368f..c32dad25 100644 --- a/src/repmgr/repmgr_sel.c +++ b/src/repmgr/repmgr_sel.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -12,7 +12,7 @@ typedef int (*HEARTBEAT_ACTION) __P((ENV *)); -static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *)); +static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *, int *)); static int accept_v1_handshake __P((ENV *, REPMGR_CONNECTION *, char *)); static void check_min_log_file __P((ENV *)); static int dispatch_msgin __P((ENV *, REPMGR_CONNECTION *)); @@ -23,13 +23,18 @@ static int process_parameters __P((ENV *, static int read_version_response __P((ENV *, REPMGR_CONNECTION *)); static int record_permlsn __P((ENV *, REPMGR_CONNECTION *)); static int __repmgr_call_election __P((ENV *)); +static int __repmgr_check_listener __P((ENV *)); +static int __repmgr_check_master_listener __P((ENV *)); static int __repmgr_connector_main __P((ENV *, REPMGR_RUNNABLE *)); static void *__repmgr_connector_thread __P((void *)); static int __repmgr_next_timeout __P((ENV *, db_timespec *, HEARTBEAT_ACTION *)); +static int __repmgr_reset_last_rcvd __P((ENV *)); static int __repmgr_retry_connections __P((ENV *)); static int __repmgr_send_heartbeat __P((ENV *)); -static int __repmgr_try_one __P((ENV *, int)); +static int __repmgr_start_takeover __P((ENV *)); +static void *__repmgr_takeover_thread __P((void *)); +static int __repmgr_try_one __P((ENV *, int, int)); static int resolve_collision __P((ENV *, REPMGR_SITE *, REPMGR_CONNECTION *)); static int send_version_response __P((ENV *, REPMGR_CONNECTION *)); @@ -49,17 +54,24 @@ void * __repmgr_select_thread(argsp) void *argsp; { - REPMGR_RUNNABLE *args; ENV *env; + DB_THREAD_INFO *ip; int ret; + REPMGR_RUNNABLE *args; args = argsp; env = args->env; + ip = NULL; + ret = 0; - if ((ret = __repmgr_select_loop(env)) != 0) { + ENV_ENTER_RET(env, ip, ret); + if (ret != 0 || (ret = __repmgr_select_loop(env)) != 0) { __db_err(env, ret, DB_STR("3614", "select loop failed")); + ENV_LEAVE(env, ip); 
(void)__repmgr_thread_failure(env, ret); } + if (ret == 0) + ENV_LEAVE(env, ip); return (NULL); } @@ -71,12 +83,19 @@ __repmgr_bow_out(env) ENV *env; { DB_REP *db_rep; + REP *rep; int ret; db_rep = env->rep_handle; + rep = db_rep->region; LOCK_MUTEX(db_rep->mutex); ret = __repmgr_stop_threads(env); UNLOCK_MUTEX(db_rep->mutex); + /* + * Reset sites_avail so that it will be calculated correctly if this + * site rejoins the group in the future. + */ + rep->sites_avail = 0; DB_EVENT(env, DB_EVENT_REP_LOCAL_SITE_REMOVED, NULL); return (ret); } @@ -187,23 +206,53 @@ __repmgr_compute_timeout(env, timeout) db_rep = env->rep_handle; /* - * There are two factors to consider: are heartbeats in use? and, do we + * There are four factors to consider: are heartbeats in use? do we * have any sites with broken connections that we ought to retry? + * is there a listener process running locally? do we need to call + * an election if no master listener exists? */ have_timeout = __repmgr_next_timeout(env, &t, NULL); /* List items are in order, so we only have to examine the first one. */ if (!TAILQ_EMPTY(&db_rep->retries)) { retry = TAILQ_FIRST(&db_rep->retries); - if (have_timeout) { + if (have_timeout) /* Choose earliest timeout deadline. */ t = timespeccmp(&retry->time, &t, <) ? retry->time : t; - } else { + else { t = retry->time; have_timeout = TRUE; } } + /* Check listener every timeout in subordinate rep-aware process. */ + if (IS_LISTENER_CAND(db_rep)) { + if (!timespecisset(&db_rep->l_listener_chk)) { + __os_gettime(env, &now, 1); + TIMESPEC_ADD_DB_TIMEOUT(&now, db_rep->l_listener_wait); + db_rep->l_listener_chk = now; + } + if (have_timeout) + t = timespeccmp(&db_rep->l_listener_chk, &t, <) ? + db_rep->l_listener_chk : t; + else { + t = db_rep->l_listener_chk; + have_timeout = TRUE; + } + } + + /* Check master listener if needed. 
*/ + if (FLD_ISSET(db_rep->region->config, REP_C_AUTOTAKEOVER) && + timespecisset(&db_rep->m_listener_chk)) { + if (have_timeout) + t = timespeccmp(&db_rep->m_listener_chk, &t, <) ? + db_rep->m_listener_chk : t; + else { + t = db_rep->m_listener_chk; + have_timeout = TRUE; + } + } + if (have_timeout) { __os_gettime(env, &now, 1); if (timespeccmp(&now, &t, >=)) @@ -242,7 +291,17 @@ __repmgr_next_timeout(env, deadline, action) if (rep->master_id == db_rep->self_eid && rep->heartbeat_frequency > 0) { - t = db_rep->last_bcast; + /* + * A temporary master in preferred master mode must send + * regular heartbeats regardless of other activity because + * the preferred master requires a heartbeat to take over as + * master after it has synced with the temporary master. + */ + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT)) + t = db_rep->last_hbeat; + else + t = db_rep->last_bcast; TIMESPEC_ADD_DB_TIMEOUT(&t, rep->heartbeat_frequency); my_action = __repmgr_send_heartbeat; } else if ((master = __repmgr_connected_master(env)) != NULL && @@ -301,6 +360,24 @@ __repmgr_send_heartbeat(env) db_rep = env->rep_handle; rep = db_rep->region; + ret = 0; + + /* + * Check test hook preventing heartbeats and connection attempts. + * This is used to create and maintain a dupmaster condition in + * a test until the test hook is rescinded. + */ + DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT); + + /* + * Track last heartbeat for temporary master in preferred master + * mode so that it will send regular heartbeats regardless of + * other activity. 
+ */ + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) && + rep->master_id == db_rep->self_eid) + __os_gettime(env, &db_rep->last_hbeat, 1); permlsn.generation = rep->gen; if ((ret = __rep_get_maxpermlsn(env, &permlsn.lsn)) != 0) @@ -310,8 +387,11 @@ __repmgr_send_heartbeat(env) control.size = __REPMGR_PERMLSN_SIZE; DB_INIT_DBT(rec, NULL, 0); - return (__repmgr_send_broadcast(env, - REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3)); + ret =__repmgr_send_broadcast(env, + REPMGR_HEARTBEAT, &control, &rec, &unused1, &unused2, &unused3); + +DB_TEST_RECOVERY_LABEL + return (ret); } /* @@ -373,6 +453,8 @@ __repmgr_check_timeouts(env) HEARTBEAT_ACTION action; int ret; + ret = 0; + /* * Figure out the next heartbeat-related thing to be done. Then, if * it's time to do it, do so. @@ -384,7 +466,342 @@ __repmgr_check_timeouts(env) return (ret); } - return (__repmgr_retry_connections(env)); + /* Check the existence of local listener. */ + if ((ret = __repmgr_check_listener(env)) != 0) + return (ret); + + /* Check the existence of master listener. */ + if ((ret = __repmgr_check_master_listener(env)) != 0) + return (ret); + + /* + * Check test hook preventing heartbeats and connection attempts. + * This is used to create and maintain a dupmaster condition in + * a test until the test hook is rescinded. + */ + DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT); + + ret = __repmgr_retry_connections(env); + +DB_TEST_RECOVERY_LABEL + return (ret); +} + +/* + * Check the existence of the listener process on the local site. If one + * does not exist and the current process is a subordinate rep-aware process, + * then start a takeover thread to convert this process to the listener process. 
+ */ +static int +__repmgr_check_listener(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + SITEINFO *sites; + db_timespec t; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + /* + * Only subordinate rep-aware process can take over listener role, so + * no need to check listener in listener process or rep unaware process. + */ + if (!IS_LISTENER_CAND(db_rep)) + return (0); + + /* + * If the listener quits due to site removal, no subordinate process + * should take over as listener as the current site is not expected + * to be active in the group. Check the status from the site array + * in the shared region instead of that in the GMDB. We do this + * because the GMDB doesn't apply the change yet when replication + * is stopped on the removed site. + */ + sites = R_ADDR(env->reginfo, rep->siteinfo_off); + if (sites[rep->self_eid].status == SITE_DELETING) + return (0); + + /* + * Check the listener after timeout. If there is no listener, we + * take over. During takeover, we will refresh all connections. + * A subordinate process does not have an up-to-date site list, so sync + * up addresses from the in-memory site array before takeover. + */ + __os_gettime(env, &t, 1); + if (timespeccmp(&t, &db_rep->l_listener_chk, >=)) { + /* Compute the next timeout. */ + TIMESPEC_ADD_DB_TIMEOUT(&t, db_rep->l_listener_wait); + db_rep->l_listener_chk = t; + + /* Check if site address information needs to be refreshed. */ + if ((rep->siteinfo_seq > db_rep->siteinfo_seq) && + (ret = __repmgr_sync_siteaddr(env)) != 0) + return (ret); + + if (rep->listener == 0) + ret = __repmgr_start_takeover(env); + } + return (ret); +} + +/* + * Start a thread to take over the listener role in the current subordinate + * process. 
+ */ +static int +__repmgr_start_takeover(env) + ENV *env; +{ + DB_REP *db_rep; + REPMGR_RUNNABLE *th; + int ret; + + db_rep = env->rep_handle; + th = db_rep->takeover_thread; + if (th == NULL) { + if ((ret = __os_calloc(env, 1, sizeof(REPMGR_RUNNABLE), + &th)) != 0) + return (ret); + db_rep->takeover_thread = th; + } else if (th->finished) { + if ((ret = __repmgr_thread_join(th)) != 0) + return (ret); + } else { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "takeover thread still running")); + return (0); + } + th->run = __repmgr_takeover_thread; + if ((ret = __repmgr_thread_start(env, th)) != 0) { + __os_free(env, th); + db_rep->takeover_thread = NULL; + } + return (ret); +} + +/* + * Take over listener role in the current subordinate process. + */ +static void * +__repmgr_takeover_thread(argsp) + void *argsp; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + REPMGR_RUNNABLE *th; + int nthreads, ret, save_policy; + + th = argsp; + env = th->env; + db_rep = env->rep_handle; + ip = NULL; + rep = db_rep->region; + ret = 0; + + ENV_ENTER_RET(env, ip, ret); + if (ret != 0) + goto out; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting takeover thread")); + /* + * It is likely that there is an old heartbeat ready to expire + * immediately upon restarting repmgr, leading to an unnecessary + * election. Reset the expiration countdown here to avoid this. + */ + if ((ret = __repmgr_reset_last_rcvd(env)) != 0) + goto out; + /* + * If nthreads is set to be 0 in the current subordinate process, use + * the value in the last listener. The nthreads should be larger than + * 0 in listener. + */ + nthreads = db_rep->config_nthreads == 0 ? (int)rep->listener_nthreads : + db_rep->config_nthreads; + /* + * It is possible that this subordinate process does not have intact + * connections to the other sites. For most ack policies, restarting + * repmgr will wait for acks when it commits its transaction to reload + * the gmdb. 
Temporarily set the ack policy to NONE for the takeover + * so that it is not delayed waiting for acks that can never come. + */ + save_policy = rep->perm_policy; + rep->perm_policy = DB_REPMGR_ACKS_NONE; + /* + * Restart the repmgr as listener. If DB_REP_IGNORE is returned, + * the current process has become listener. If DB_REP_UNAVAIL is + * returned, the site has been removed from the group and no listener + * should be started. For any other error, if the replication is + * stopped because of the takeover thread, we will notify the + * application. + */ + ret = __repmgr_start_int(env, nthreads, F_ISSET(rep, REP_F_MASTER) ? + DB_REP_MASTER : DB_REP_CLIENT); + if (ret == 0 && !IS_SUBORDINATE(db_rep) && + db_rep->repmgr_status == running) { + STAT(rep->mstat.st_takeovers++); + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "finished takeover and became listener")); + } else if (ret != 0 && db_rep->repmgr_status == stopped) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "failed to take over, repmgr was stopped")); + DB_EVENT(env, DB_EVENT_REP_AUTOTAKEOVER_FAILED, NULL); + } else { + /* The current process is not changed to listener. */ + RPRINT(env, (env, DB_VERB_REPMGR_MISC, "failed to take over")); + } + rep->perm_policy = save_policy; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, "takeover thread is exiting")); + ENV_LEAVE(env, ip); +out: th->finished = TRUE; + return (NULL); +} + +/* + * Reset the last_rcvd_timestamp to restart the wait for a heartbeat + * monitor expiration. + */ +static int +__repmgr_reset_last_rcvd(env) + ENV *env; +{ + DB_REP *db_rep; + REPMGR_SITE *master; + + db_rep = env->rep_handle; + + LOCK_MUTEX(db_rep->mutex); + if ((master = __repmgr_connected_master(env)) != NULL) + __os_gettime(env, &master->last_rcvd_timestamp, 1); + UNLOCK_MUTEX(db_rep->mutex); + return (0); +} + +/* + * Monitor the connection to master listener. 
When the master listener is + * disconnected and some other master process might take over as listener + * soon, we will delay the election. After the delay if there is still no + * connection from master listener, call an election then. + */ +static int +__repmgr_check_master_listener(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + REPMGR_SITE *master; + db_timespec t; + u_int32_t flags; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + /* + * We only check for a master listener if m_listener_chk is set. + * The field is only set when __repmgr_bust_connection() previously + * detected the loss of our connection to the master listener. + * If rep->master_id is invalid, wait until it is ready to check. + */ + if (!FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) || + !timespecisset(&db_rep->m_listener_chk) || + !IS_VALID_EID(rep->master_id)) + return (0); + + __os_gettime(env, &t, 1); + if (timespeccmp(&t, &db_rep->m_listener_chk, >=)) { + master = SITE_FROM_EID(db_rep->region->master_id); + if (master->ref.conn.out == NULL && + master->ref.conn.in == NULL) { + flags = ELECT_F_EVENT_NOTIFY; + if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS)) + LF_SET(ELECT_F_IMMED | ELECT_F_FAST); + else + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Master failure, but no elections")); + + /* + * In preferred master mode, a client that has lost its + * connection to the master uses an election thread to + * restart as master. + */ + if (IS_PREFMAS_MODE(env)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, +"check_master_listener setting preferred master temp master")); + db_rep->prefmas_pending = start_temp_master; + } + + ret = __repmgr_init_election(env, flags); + } + /* + * If the delay has expired reset m_listener_chk. We reset + * it whether or not the master listener process comes back + * so that we will not continue checking for a master listener + * indefinitely. 
+ */ + timespecclear(&db_rep->m_listener_chk); + } + return (ret); +} + +/* + * Wake up I/O waiting in selector thread, refresh connections to all connected + * and present sites. + * + * PUBLIC: int __repmgr_refresh_selector __P((ENV *)); + */ +int +__repmgr_refresh_selector(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + REPMGR_RETRY *retry; + REPMGR_SITE *site; + SITEINFO *sites; + int eid, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + + if ((ret = __repmgr_wake_main_thread(env)) != 0) + return (ret); + + FOR_EACH_REMOTE_SITE_INDEX(eid) { + SET_LISTENER_CAND(1, = 0); + site = SITE_FROM_EID(eid); + + /* + * It is possible some sites were left in a paused state + * during the switch, so they have to be removed from the + * retry list. + */ + if (site->state == SITE_PAUSING) { + retry = site->ref.retry; + if (retry != NULL) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Removing site from retry list eid %lu", + (u_long)eid)); + TAILQ_REMOVE(&db_rep->retries, retry, entries); + __os_free(env, retry); + site->ref.retry = NULL; + } + + } + /* + * Try to connect to any site that is now PRESENT after + * rereading the gmdb. + */ + if (site->membership == SITE_PRESENT && + (ret = __repmgr_try_one(env, eid, TRUE)) != 0) + return (ret); + } + return (0); } /* @@ -415,10 +832,11 @@ __repmgr_retry_connections(env) __os_free(env, retry); DB_ASSERT(env, IS_VALID_EID(eid)); site = SITE_FROM_EID(eid); + site->ref.retry = NULL; DB_ASSERT(env, site->state == SITE_PAUSING); if (site->membership == SITE_PRESENT) { - if ((ret = __repmgr_try_one(env, eid)) != 0) + if ((ret = __repmgr_try_one(env, eid, FALSE)) != 0) return (ret); } else site->state = SITE_IDLE; @@ -437,11 +855,23 @@ __repmgr_first_try_connections(env) ENV *env; { DB_REP *db_rep; + REP *rep; REPMGR_SITE *site; + SITEINFO *sites; int eid, ret; db_rep = env->rep_handle; + rep = db_rep->region; + + /* + * Check test hook preventing heartbeats and connection attempts. 
+ * This is used to create and maintain a dupmaster condition in + * a test until the test hook is rescinded. + */ + DB_TEST_SET(env->test_abort, DB_TEST_REPMGR_HEARTBEAT); + FOR_EACH_REMOTE_SITE_INDEX(eid) { + SET_LISTENER_CAND(1, = 0); site = SITE_FROM_EID(eid); /* * Normally all sites would be IDLE here. But if a user thread @@ -453,19 +883,22 @@ __repmgr_first_try_connections(env) */ if (site->state == SITE_IDLE && site->membership == SITE_PRESENT && - (ret = __repmgr_try_one(env, eid)) != 0) + (ret = __repmgr_try_one(env, eid, FALSE)) != 0) return (ret); } +DB_TEST_RECOVERY_LABEL return (0); } /* - * Starts a thread to open a connection to the site at the given EID. + * Starts a thread to open a connection to the site at the given EID. We might + * have no connection to the site, or an existing connection to be replaced. */ static int -__repmgr_try_one(env, eid) +__repmgr_try_one(env, eid, refresh) ENV *env; int eid; + int refresh; { DB_REP *db_rep; REPMGR_SITE *site; @@ -488,13 +921,22 @@ __repmgr_try_one(env, eid) "eid %lu previous connector thread still running; will retry", (u_long)eid)); return (__repmgr_schedule_connection_attempt(env, - eid, FALSE)); + eid, refresh)); } site->state = SITE_CONNECTING; th->run = __repmgr_connector_thread; - th->args.eid = eid; + th->args.conn_th.eid = eid; + /* + * The flag CONNECT_F_REFRESH indicates an immediate connection attempt + * should be scheduled if the current connection attempt fails. It is + * turned on before the first attempt to refresh the connection but + * turned off if the first attempt fails. In this way, when refreshing + * the connection, there will be at most two immediate connection + * attempts, after that, retry as usual. + */ + th->args.conn_th.flags = refresh ? 
CONNECT_F_REFRESH : 0; if ((ret = __repmgr_thread_start(env, th)) != 0) { __os_free(env, th); site->connector = NULL; @@ -506,21 +948,33 @@ static void * __repmgr_connector_thread(argsp) void *argsp; { - REPMGR_RUNNABLE *th; ENV *env; + DB_THREAD_INFO *ip; + REPMGR_RUNNABLE *th; int ret; th = argsp; env = th->env; + ip = NULL; + ret = 0; - RPRINT(env, (env, DB_VERB_REPMGR_MISC, - "starting connector thread, eid %u", th->args.eid)); - if ((ret = __repmgr_connector_main(env, th)) != 0) { + ENV_ENTER_RET(env, ip, ret); + if (ret == 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "starting connector thread, eid %u", + th->args.conn_th.eid)); + if (ret != 0 || (ret = __repmgr_connector_main(env, th)) != 0) { __db_err(env, ret, DB_STR("3617", "connector thread failed")); + RPRINT(env, (env, + DB_VERB_REPMGR_MISC, "connector thread is exiting")); + ENV_LEAVE(env, ip); (void)__repmgr_thread_failure(env, ret); } - RPRINT(env, (env, DB_VERB_REPMGR_MISC, "connector thread is exiting")); - + if (ret == 0) { + RPRINT(env, (env, + DB_VERB_REPMGR_MISC, "connector thread is exiting")); + ENV_LEAVE(env, ip); + } th->finished = TRUE; return (NULL); } @@ -542,8 +996,8 @@ __repmgr_connector_main(env, th) ret = 0; LOCK_MUTEX(db_rep->mutex); - DB_ASSERT(env, IS_VALID_EID(th->args.eid)); - site = SITE_FROM_EID(th->args.eid); + DB_ASSERT(env, IS_VALID_EID(th->args.conn_th.eid)); + site = SITE_FROM_EID(th->args.conn_th.eid); if (site->state != SITE_CONNECTING && db_rep->repmgr_status == stopped) goto unlock; @@ -563,7 +1017,8 @@ __repmgr_connector_main(env, th) UNLOCK_MUTEX(db_rep->mutex); if ((ret = __repmgr_connect(env, &netaddr, &conn, &err)) == 0) { - DB_EVENT(env, DB_EVENT_REP_CONNECT_ESTD, &th->args.eid); + DB_EVENT(env, + DB_EVENT_REP_CONNECT_ESTD, &th->args.conn_th.eid); LOCK_MUTEX(db_rep->mutex); if ((ret = __repmgr_set_nonblock_conn(conn)) != 0) { __db_err(env, ret, DB_STR("3618", @@ -571,33 +1026,53 @@ __repmgr_connector_main(env, th) goto cleanup; } conn->type = REP_CONNECTION; - 
site = SITE_FROM_EID(th->args.eid); + site = SITE_FROM_EID(th->args.conn_th.eid); if (site->state != SITE_CONNECTING || db_rep->repmgr_status == stopped) goto cleanup; - conn->eid = th->args.eid; - site = SITE_FROM_EID(th->args.eid); - site->ref.conn.out = conn; + conn->eid = th->args.conn_th.eid; + site = SITE_FROM_EID(th->args.conn_th.eid); + /* + * If there is an existing outgoing connection, disable it and + * replace it with a new connection. The sites for a formerly + * subordinate handle that is now taking over might still be + * SITE_CONNECTING. Set to SITE_CONNECTED before disabling + * connection so that sites_avail is correctly maintained. + */ site->state = SITE_CONNECTED; + if (site->ref.conn.out != NULL) + (void)__repmgr_disable_connection(env, + site->ref.conn.out); + site->ref.conn.out = conn; __os_gettime(env, &site->last_rcvd_timestamp, 1); ret = __repmgr_wake_main_thread(env); } else if (ret == DB_REP_UNAVAIL) { /* Retryable error while trying to connect: retry later. */ - info.eid = th->args.eid; + info.eid = th->args.conn_th.eid; info.error = err; DB_EVENT(env, DB_EVENT_REP_CONNECT_TRY_FAILED, &info); STAT(db_rep->region->mstat.st_connect_fail++); LOCK_MUTEX(db_rep->mutex); - site = SITE_FROM_EID(th->args.eid); + site = SITE_FROM_EID(th->args.conn_th.eid); if (site->state != SITE_CONNECTING || db_rep->repmgr_status == stopped) { ret = 0; goto unlock; } + /* + * If it fails to create a new outgoing connection to replace + * the existing one in the first attempt, schedule another + * immediate attempt. If it is our second attempt, disable + * the existing connections and retry as normal. 
+ */ + if (site->ref.conn.out != NULL && th->args.conn_th.flags == 0) + (void)__repmgr_disable_connection(env, + site->ref.conn.out); ret = __repmgr_schedule_connection_attempt(env, - th->args.eid, FALSE); + th->args.conn_th.eid, + th->args.conn_th.flags == CONNECT_F_REFRESH); } else goto out; @@ -842,6 +1317,7 @@ prepare_input(env, conn) if ((ret = __os_malloc(env, memsize, &membase)) != 0) return (ret); conn->input.rep_message = membase; + conn->input.rep_message->size = memsize; conn->input.rep_message->msg_hdr = msg_hdr; conn->input.rep_message->v.repmsg.originating_eid = conn->eid; @@ -876,6 +1352,7 @@ prepare_input(env, conn) if ((ret = __os_malloc(env, memsize, &membase)) != 0) return (ret); conn->input.rep_message = membase; + conn->input.rep_message->size = memsize; conn->input.rep_message->msg_hdr = msg_hdr; conn->input.rep_message->v.appmsg.conn = conn; @@ -891,6 +1368,7 @@ prepare_input(env, conn) if ((ret = __os_malloc(env, size, &membase)) != 0) return (ret); conn->input.rep_message = membase; + conn->input.rep_message->size = size; conn->input.rep_message->msg_hdr = msg_hdr; /* @@ -1065,16 +1543,18 @@ dispatch_msgin(env, conn) ENV *env; REPMGR_CONNECTION *conn; { + DBT *dbt; DB_REP *db_rep; - REPMGR_SITE *site; - REPMGR_RUNNABLE *th; + REP *rep; REPMGR_RESPONSE *resp; - DBT *dbt; + REPMGR_RUNNABLE *th; + REPMGR_SITE *site; char *hostname; - int eid, ret; + int eid, ret, subord; DB_ASSERT(env, conn->reading_phase == DATA_PHASE); db_rep = env->rep_handle; + rep = db_rep->region; switch (conn->state) { case CONN_CONNECTED: @@ -1129,9 +1609,22 @@ dispatch_msgin(env, conn) dbt = &conn->input.repmgr_msg.rec; hostname = dbt->data; hostname[dbt->size-1] = '\0'; - if ((ret = accept_handshake(env, conn, hostname)) != 0) + if ((ret = accept_handshake(env, + conn, hostname, &subord)) != 0) return (ret); conn->state = CONN_READY; + site = SITE_FROM_EID(conn->eid); + /* + * Do not increase sites_avail redundantly for an + * incoming subordinate connection. 
+ */ + if (conn->type == REP_CONNECTION && + site->state == SITE_CONNECTED && !subord) { + rep->sites_avail++; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "msgin: EID %lu CONNECTED, READY. sites_avail %lu", + (u_long)conn->eid, (u_long)rep->sites_avail)); + } break; case REPMGR_OWN_MSG: /* @@ -1279,9 +1772,11 @@ process_own_msg(env, conn) REPMGR_SITE *site; REPMGR_MESSAGE *msg; __repmgr_connect_reject_args reject; + __repmgr_v4connect_reject_args v4reject; __repmgr_parm_refresh_args parms; int ret; + db_rep = env->rep_handle; ret = 0; /* * Set "msg" to point to the message struct. If we do all necessary @@ -1293,28 +1788,61 @@ process_own_msg(env, conn) switch (REPMGR_OWN_MSG_TYPE((msg = conn->input.rep_message)->msg_hdr)) { case REPMGR_CONNECT_REJECT: dbt = &msg->v.gmdb_msg.request; - if ((ret = __repmgr_connect_reject_unmarshal(env, - &reject, dbt->data, dbt->size, NULL)) != 0) - return (DB_REP_UNAVAIL); + if (conn->version < 5) { + if ((ret = __repmgr_v4connect_reject_unmarshal(env, + &v4reject, dbt->data, dbt->size, NULL)) != 0) + return (DB_REP_UNAVAIL); + reject.version = v4reject.version; + reject.gen = v4reject.gen; + reject.status = 0; + } else { + if ((ret = __repmgr_connect_reject_unmarshal(env, + &reject, dbt->data, dbt->size, NULL)) != 0) + return (DB_REP_UNAVAIL); + } /* * If we're being rejected by someone who has more up-to-date - * membership information than we do, it means we have been - * removed from the group. If we've just gotten started, we can - * make one attempt at automatically rejoining; otherwise we bow - * out gracefully. + * membership information than we do, it means we are not in + * the group. If we've just gotten started, or our status is + * adding, we can make one attempt at automatically rejoining; + * otherwise we bow out gracefully. 
*/ RPRINT(env, (env, DB_VERB_REPMGR_MISC, - "got rejection msg citing version %lu/%lu", - (u_long)reject.gen, (u_long)reject.version)); + "got rejection msg citing version %lu/%lu mine %lu/%lu membership %lu", + (u_long)reject.gen, (u_long)reject.version, + (u_long)db_rep->member_version_gen, + (u_long)db_rep->membership_version, + (u_long)reject.status)); if (__repmgr_gmdb_version_cmp(env, reject.gen, reject.version) > 0) { - if (env->rep_handle->seen_repmsg) + if (db_rep->seen_repmsg && reject.status != SITE_ADDING) ret = DB_DELETED; - else if ((ret = __repmgr_defer_op(env, - REPMGR_REJOIN)) == 0) - ret = DB_REP_UNAVAIL; + else { + /* + * If 2SITE_STRICT is off, we are likely to + * win an election with our own vote before + * discovering there is already a master. + * Set indicator to defer the election until + * after rejoining group. + * + * In preferred master mode, either site + * should defer the election (which + * executes the preferred master startup + * code and only calls an election if it is + * safe) and also avoid scheduling an extra + * reconnect attempt in bust_connection() + * by setting the indicator. 
+ */ + if (!FLD_ISSET(db_rep->region->config, + REP_C_2SITE_STRICT) || + IS_PREFMAS_MODE(env)) + db_rep->rejoin_pending = TRUE; + if ((ret = __repmgr_defer_op(env, + REPMGR_REJOIN)) == 0) + ret = DB_REP_UNAVAIL; + } } else ret = DB_REP_UNAVAIL; DB_ASSERT(env, ret != 0); @@ -1332,7 +1860,6 @@ process_own_msg(env, conn) if ((ret = __repmgr_parm_refresh_unmarshal(env, &parms, dbt->data, dbt->size, NULL)) != 0) return (DB_REP_UNAVAIL); - db_rep = env->rep_handle; DB_ASSERT(env, conn->type == REP_CONNECTION && IS_KNOWN_REMOTE_SITE(conn->eid)); site = SITE_FROM_EID(conn->eid); @@ -1348,8 +1875,15 @@ process_own_msg(env, conn) case REPMGR_GM_FORWARD: case REPMGR_JOIN_REQUEST: case REPMGR_JOIN_SUCCESS: + case REPMGR_LSNHIST_REQUEST: + case REPMGR_LSNHIST_RESPONSE: + case REPMGR_PREFMAS_FAILURE: + case REPMGR_PREFMAS_SUCCESS: + case REPMGR_READONLY_MASTER: + case REPMGR_READONLY_RESPONSE: case REPMGR_REMOVE_REQUEST: case REPMGR_RESOLVE_LIMBO: + case REPMGR_RESTART_CLIENT: default: __db_errx(env, DB_STR_A("3677", "unexpected msg type %lu in process_own_msg", "%lu"), @@ -1482,6 +2016,8 @@ __repmgr_send_handshake(env, conn, opt, optlen, flags) cntrl_len = __REPMGR_V3HANDSHAKE_SIZE; break; case 4: + case 5: + case 6: cntrl_len = __REPMGR_HANDSHAKE_SIZE; break; default: @@ -1513,6 +2049,8 @@ __repmgr_send_handshake(env, conn, opt, optlen, flags) __repmgr_v3handshake_marshal(env, &v3hs, p); break; case 4: + case 5: + case 6: hs.port = my_addr->port; hs.alignment = MEM_ALIGN; hs.ack_policy = (u_int32_t)rep->perm_policy; @@ -1551,11 +2089,14 @@ read_version_response(env, conn) DB_REP *db_rep; __repmgr_version_confirmation_args conf; DBT vi; + REP *rep; + REPMGR_SITE *site; char *hostname; u_int32_t flags; - int ret; + int ret, subord; db_rep = env->rep_handle; + rep = db_rep->region; if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0) return (ret); @@ -1581,14 +2122,37 @@ read_version_response(env, conn) return (DB_REP_UNAVAIL); } - if ((ret = accept_handshake(env, conn, 
hostname)) != 0) + if ((ret = accept_handshake(env, conn, hostname, &subord)) != 0) return (ret); - flags = IS_SUBORDINATE(db_rep) ? REPMGR_SUBORDINATE : 0; + if (!IS_SUBORDINATE(db_rep)) + flags = 0; + else { + flags = REPMGR_SUBORDINATE; + if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && + db_rep->repmgr_status == running) + /* + * Takeover is enabled in rep-aware subordinate + * process. + */ + flags |= REPMGR_AUTOTAKEOVER; + } if ((ret = __repmgr_send_handshake(env, conn, NULL, 0, flags)) != 0) return (ret); } conn->state = CONN_READY; + site = SITE_FROM_EID(conn->eid); + /* + * Do not increase sites_avail redundantly for a new outgoing + * connection from a subordinate process. + */ + if (conn->type == REP_CONNECTION && + site->state == SITE_CONNECTED && !IS_SUBORDINATE(db_rep)) { + rep->sites_avail++; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "vers_resp: EID %lu CONNECTED, READY. sites_avail %lu", + (u_long)conn->eid, (u_long)rep->sites_avail)); + } return (ret); } @@ -1641,10 +2205,11 @@ __repmgr_find_version_info(env, conn, vi) } static int -accept_handshake(env, conn, hostname) +accept_handshake(env, conn, hostname, subordinate) ENV *env; REPMGR_CONNECTION *conn; char *hostname; + int *subordinate; { __repmgr_handshake_args hs; __repmgr_v2handshake_args hs2; @@ -1653,6 +2218,7 @@ accept_handshake(env, conn, hostname) u_int32_t ack, flags; int electable; + *subordinate = 0; switch (conn->version) { case 2: if (__repmgr_v2handshake_unmarshal(env, &hs2, @@ -1674,6 +2240,8 @@ accept_handshake(env, conn, hostname) ack = 0; break; case 4: + case 5: + case 6: if (__repmgr_handshake_unmarshal(env, &hs, conn->input.repmgr_msg.cntrl.data, conn->input.repmgr_msg.cntrl.size, NULL) != 0) @@ -1682,6 +2250,8 @@ accept_handshake(env, conn, hostname) electable = F_ISSET(&hs, ELECTABLE_SITE); flags = hs.flags; ack = hs.ack_policy; + if (LF_ISSET(REPMGR_SUBORDINATE)) + *subordinate = 1; break; default: __db_errx(env, DB_STR_A("3679", @@ -1729,13 +2299,17 @@ 
process_parameters(env, conn, host, port, ack, electable, flags) u_int32_t ack, flags; { DB_REP *db_rep; + REP *rep; REPMGR_RETRY *retry; REPMGR_SITE *site; + SITEINFO *sites; __repmgr_connect_reject_args reject; + __repmgr_v4connect_reject_args v4reject; u_int8_t reject_buf[__REPMGR_CONNECT_REJECT_SIZE]; int eid, ret; db_rep = env->rep_handle; + rep = db_rep->region; /* Connection state can be used to discern incoming versus outgoing. */ if (conn->state == CONN_CONNECTED) { @@ -1785,6 +2359,13 @@ process_parameters(env, conn, host, port, ack, electable, flags) TAILQ_INSERT_TAIL(&site->sub_conns, conn, entries); conn->eid = eid; + conn->auto_takeover = + LF_ISSET(REPMGR_AUTOTAKEOVER) ? 1 : 0; + SET_LISTENER_CAND(conn->auto_takeover, ++); + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "handshake from subordinate %sconnection at site %s:%u EID %u", + LF_ISSET(REPMGR_AUTOTAKEOVER)? + "takeover ": "", host, port, eid)); } else { DB_EVENT(env, DB_EVENT_REP_CONNECT_ESTD, &eid); @@ -1797,6 +2378,7 @@ process_parameters(env, conn, host, port, ack, electable, flags) TAILQ_REMOVE(&db_rep->retries, retry, entries); __os_free(env, retry); + site->ref.retry = NULL; break; case SITE_CONNECTED: /* @@ -1821,6 +2403,16 @@ process_parameters(env, conn, host, port, ack, electable, flags) * don't have to do anything else here. */ break; + case SITE_IDLE: + /* + * This can occur after the heartbeat + * test hook artificially kept this + * site from first trying to connect. 
+ */ + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "handshake from idle site %s:%u EID %u", + host, port, eid)); + break; default: DB_ASSERT(env, FALSE); } @@ -1834,10 +2426,18 @@ process_parameters(env, conn, host, port, ack, electable, flags) RPRINT(env, (env, DB_VERB_REPMGR_MISC, "rejecting connection from unknown or provisional site %s:%u", host, port)); - reject.version = db_rep->membership_version; - reject.gen = db_rep->member_version_gen; - __repmgr_connect_reject_marshal(env, - &reject, reject_buf); + if (conn->version < 5) { + v4reject.version = db_rep->membership_version; + v4reject.gen = db_rep->member_version_gen; + __repmgr_v4connect_reject_marshal(env, + &v4reject, reject_buf); + } else { + reject.version = db_rep->membership_version; + reject.gen = db_rep->member_version_gen; + reject.status = (site) ? site->membership : 0; + __repmgr_connect_reject_marshal(env, + &reject, reject_buf); + } if ((ret = __repmgr_send_own_msg(env, conn, REPMGR_CONNECT_REJECT, reject_buf, @@ -1867,7 +2467,8 @@ process_parameters(env, conn, host, port, ack, electable, flags) */ if (!IS_SUBORDINATE(db_rep) && /* us */ !__repmgr_master_is_known(env) && - !LF_ISSET(REPMGR_SUBORDINATE)) { /* the remote site */ + !LF_ISSET(REPMGR_SUBORDINATE) && /* the remote site */ + !IS_PREFMAS_MODE(env)) { RPRINT(env, (env, DB_VERB_REPMGR_MISC, "handshake with no known master to wake election thread")); db_rep->new_connection = TRUE; @@ -1980,6 +2581,7 @@ record_permlsn(env, conn) */ if (ackp->lsn.file > site->max_ack.file) do_log_check = 1; + site->max_ack_gen = ackp->generation; memcpy(&site->max_ack, &ackp->lsn, sizeof(DB_LSN)); if (do_log_check) check_min_log_file(env); diff --git a/src/repmgr/repmgr_stat.c b/src/repmgr/repmgr_stat.c index fd6dabd3..215f4719 100644 --- a/src/repmgr/repmgr_stat.c +++ b/src/repmgr/repmgr_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -55,7 +55,9 @@ __repmgr_stat(env, statp, flags) { DB_REP *db_rep; DB_REPMGR_STAT *copy, *stats; - uintmax_t tmp; + REPMGR_SITE *site; + u_int32_t tmp; + u_int i; int ret; db_rep = env->rep_handle; @@ -73,6 +75,20 @@ __repmgr_stat(env, statp, flags) memset(stats, 0, sizeof(DB_REPMGR_STAT)); stats->st_max_elect_threads = tmp; } + stats->st_incoming_queue_gbytes = db_rep->input_queue.gbytes; + stats->st_incoming_queue_bytes = db_rep->input_queue.bytes; + LOCK_MUTEX(db_rep->mutex); + for (i = 0; i < db_rep->site_cnt; i++) { + site = SITE_FROM_EID(i); + if (site->membership != 0) { + copy->st_site_total++; + if (FLD_ISSET(site->gmdb_flags, SITE_VIEW)) + copy->st_site_views++; + else + copy->st_site_participants++; + } + } + UNLOCK_MUTEX(db_rep->mutex); *statp = copy; return (0); @@ -148,6 +164,11 @@ __repmgr_print_stats(env, flags) (u_long)sp->st_msgs_queued); __db_dl(env, "Number of messages discarded due to queue length", (u_long)sp->st_msgs_dropped); + __db_dlbytes(env, "Incoming message size in queue", + (u_long)sp->st_incoming_queue_gbytes, (u_long)0, + (u_long)sp->st_incoming_queue_bytes); + __db_dl(env, "Number of messages discarded due to incoming queue full", + (u_long)sp->st_incoming_msgs_dropped); __db_dl(env, "Number of existing connections dropped", (u_long)sp->st_connection_drop); __db_dl(env, "Number of failed new connection attempts", @@ -156,6 +177,14 @@ __repmgr_print_stats(env, flags) (u_long)sp->st_elect_threads); __db_dl(env, "Election threads for which space is reserved", (u_long)sp->st_max_elect_threads); + __db_dl(env, "Number of participant sites in replication group", + (u_long)sp->st_site_participants); + __db_dl(env, "Total number of sites in replication group", + (u_long)sp->st_site_total); + __db_dl(env, "Number of view sites in replication group", + (u_long)sp->st_site_views); + __db_dl(env, "Number of automatic replication process takeovers", + 
(u_long)sp->st_takeovers); __os_ufree(env, sp); @@ -171,7 +200,7 @@ __repmgr_print_sites(env) u_int count, i; int ret; - if ((ret = __repmgr_site_list(env->dbenv, &count, &list)) != 0) + if ((ret = __repmgr_site_list_int(env, &count, &list)) != 0) return (ret); if (count == 0) @@ -189,6 +218,9 @@ __repmgr_print_sites(env) list[i].status == DB_REPMGR_CONNECTED ? "" : "dis"); __db_msgadd(env, &mb, ", %speer", F_ISSET(&list[i], DB_REPMGR_ISPEER) ? "" : "non-"); + __db_msgadd(env, &mb, ", %s", + F_ISSET(&list[i], DB_REPMGR_ISVIEW) ? + "view" : "participant"); __db_msgadd(env, &mb, ")"); DB_MSGBUF_FLUSH(env, &mb); } @@ -238,26 +270,46 @@ __repmgr_stat_print_pp(dbenv, flags) #endif /* - * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); + * PUBLIC: int __repmgr_site_list_pp + * PUBLIC: __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); */ int -__repmgr_site_list(dbenv, countp, listp) +__repmgr_site_list_pp(dbenv, countp, listp) DB_ENV *dbenv; u_int *countp; DB_REPMGR_SITE **listp; { - DB_REP *db_rep; - REP *rep; - DB_REPMGR_SITE *status; ENV *env; DB_THREAD_INFO *ip; + int ret; + + env = dbenv->env; + + ENV_ENTER(env, ip); + ret = __repmgr_site_list_int(env, countp, listp); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * PUBLIC: int __repmgr_site_list_int __P((ENV *, u_int *, DB_REPMGR_SITE **)); + */ +int +__repmgr_site_list_int(env, countp, listp) + ENV *env; + u_int *countp; + DB_REPMGR_SITE **listp; +{ + DB_REP *db_rep; + DB_REPMGR_SITE *status; + REP *rep; REPMGR_SITE *site; size_t array_size, total_size; int eid, locked, ret; u_int count, i; char *name; - env = dbenv->env; db_rep = env->rep_handle; ret = 0; @@ -269,10 +321,8 @@ __repmgr_site_list(dbenv, countp, listp) LOCK_MUTEX(db_rep->mutex); locked = TRUE; - ENV_ENTER(env, ip); if (rep->siteinfo_seq > db_rep->siteinfo_seq) ret = __repmgr_sync_siteaddr(env); - ENV_LEAVE(env, ip); if (ret != 0) goto err; } else { @@ -329,6 +379,8 @@ __repmgr_site_list(dbenv, countp, listp) if 
(FLD_ISSET(site->config, DB_REPMGR_PEER)) F_SET(&status[i], DB_REPMGR_ISPEER); + if (FLD_ISSET(site->gmdb_flags, SITE_VIEW)) + F_SET(&status[i], DB_REPMGR_ISVIEW); /* * If we haven't started a communications thread, connection diff --git a/src/repmgr/repmgr_stub.c b/src/repmgr/repmgr_stub.c index 734c2240..999b759f 100644 --- a/src/repmgr/repmgr_stub.c +++ b/src/repmgr/repmgr_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -73,6 +73,69 @@ __repmgr_set_ack_policy(dbenv, policy) /* * PUBLIC: #ifndef HAVE_REPLICATION_THREADS + * PUBLIC: int __repmgr_get_incoming_queue_max __P((DB_ENV *, u_int32_t *, + * PUBLIC: u_int32_t *)); + * PUBLIC: #endif + */ +int +__repmgr_get_incoming_queue_max(dbenv, messagesp, bulk_messagesp) + DB_ENV *dbenv; + u_int32_t *messagesp; + u_int32_t *bulk_messagesp; +{ + COMPQUIET(messagesp, NULL); + COMPQUIET(bulk_messagesp, NULL); + return (__db_norepmgr(dbenv)); +} + +/* + * PUBLIC: #ifndef HAVE_REPLICATION_THREADS + * PUBLIC: int __repmgr_set_incoming_queue_max __P((DB_ENV *, u_int32_t, + * PUBLIC: u_int32_t)); + * PUBLIC: #endif + */ +int +__repmgr_set_incoming_queue_max(dbenv, messages, bulk_messages) + DB_ENV *dbenv; + u_int32_t messages; + u_int32_t bulk_messages; +{ + COMPQUIET(messages, 0); + COMPQUIET(bulk_messages, 0); + return (__db_norepmgr(dbenv)); +} + +/* + * PUBLIC: #ifndef HAVE_REPLICATION_THREADS + * PUBLIC: int __repmgr_get_incoming_queue_redzone __P((DB_ENV *, + * PUBLIC: u_int32_t *, u_int32_t *)); + * PUBLIC: #endif + */ +int __repmgr_get_incoming_queue_redzone(dbenv, gbytesp, bytesp) + DB_ENV *dbenv; + u_int32_t *gbytesp, *bytesp; +{ + COMPQUIET(gbytesp, NULL); + COMPQUIET(bytesp, NULL); + return (__db_norepmgr(dbenv)); +} + +/* + * PUBLIC: #ifndef HAVE_REPLICATION_THREADS + * PUBLIC: int 
__repmgr_get_incoming_queue_fullevent __P((DB_ENV *, + * PUBLIC: int *)); + * PUBLIC: #endif + */ +int __repmgr_get_incoming_queue_fullevent(dbenv, onoffp) + DB_ENV *dbenv; + int *onoffp; +{ + COMPQUIET(onoffp, NULL); + return (__db_norepmgr(dbenv)); +} + +/* + * PUBLIC: #ifndef HAVE_REPLICATION_THREADS * PUBLIC: int __repmgr_site * PUBLIC: __P((DB_ENV *, const char *, u_int, DB_SITE **, u_int32_t)); * PUBLIC: #endif @@ -125,11 +188,12 @@ __repmgr_local_site(dbenv, dbsitep) /* * PUBLIC: #ifndef HAVE_REPLICATION_THREADS - * PUBLIC: int __repmgr_site_list __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); + * PUBLIC: int __repmgr_site_list_pp + * PUBLIC: __P((DB_ENV *, u_int *, DB_REPMGR_SITE **)); * PUBLIC: #endif */ int -__repmgr_site_list(dbenv, countp, listp) +__repmgr_site_list_pp(dbenv, countp, listp) DB_ENV *dbenv; u_int *countp; DB_REPMGR_SITE **listp; @@ -141,11 +205,11 @@ __repmgr_site_list(dbenv, countp, listp) /* * PUBLIC: #ifndef HAVE_REPLICATION_THREADS - * PUBLIC: int __repmgr_start __P((DB_ENV *, int, u_int32_t)); + * PUBLIC: int __repmgr_start_pp __P((DB_ENV *, int, u_int32_t)); * PUBLIC: #endif */ int -__repmgr_start(dbenv, nthreads, flags) +__repmgr_start_pp(dbenv, nthreads, flags) DB_ENV *dbenv; int nthreads; u_int32_t flags; diff --git a/src/repmgr/repmgr_util.c b/src/repmgr/repmgr_util.c index c2439436..1c5ebe59 100644 --- a/src/repmgr/repmgr_util.c +++ b/src/repmgr/repmgr_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,9 +15,13 @@ #define INITIAL_SITES_ALLOCATION 3 /* Arbitrary guess. 
*/ +static int convert_gmdb(ENV *, DB_THREAD_INFO *, DB *, DB_TXN *); static int get_eid __P((ENV *, const char *, u_int, int *)); -static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *)); static int read_gmdb __P((ENV *, DB_THREAD_INFO *, u_int8_t **, size_t *)); +static int __repmgr_addrcmp __P((repmgr_netaddr_t *, repmgr_netaddr_t *)); +static int __repmgr_find_commit __P((ENV *, DB_LSN *, DB_LSN *, int *)); +static int __repmgr_remote_lsnhist(ENV *, int, u_int32_t, + __repmgr_lsnhist_match_args *); /* * Schedules a future attempt to re-establish a connection with the given site. @@ -43,6 +47,8 @@ __repmgr_schedule_connection_attempt(env, eid, immediate) REP *rep; REPMGR_RETRY *retry, *target; REPMGR_SITE *site; + SITEINFO *sites; + db_timeout_t timeout; db_timespec t; int ret; @@ -57,7 +63,24 @@ __repmgr_schedule_connection_attempt(env, eid, immediate) if (immediate) TAILQ_INSERT_HEAD(&db_rep->retries, retry, entries); else { - TIMESPEC_ADD_DB_TIMEOUT(&t, rep->connection_retry_wait); + /* + * Normally we retry a connection after connection retry + * timeout. In a subordinate rep-aware process, we retry sooner + * when there is a listener candidate on the disconnected site. + * The listener process will be connected from the new listener, + * but subordinate rep-aware process can only wait for retry. + * It matters when the subordinate process becomes listener and + * the disconnected site is master. The m_listener_wait is set + * to retry after enough time has passed for a takeover. The + * number of listener candidates is maintained in the listener + * process as it has connections to all subordinate processes + * from other sites. + */ + timeout = rep->connection_retry_wait; + CHECK_LISTENER_CAND(timeout, >0, db_rep->m_listener_wait, + timeout); + TIMESPEC_ADD_DB_TIMEOUT(&t, timeout); + /* * Insert the new "retry" on the (time-ordered) list in its * proper position. 
To do so, find the list entry ("target") @@ -284,6 +307,7 @@ __repmgr_new_site(env, sitep, host, port) site->net_addr.host = p; site->net_addr.port = (u_int16_t)port; + site->max_ack_gen = 0; ZERO_LSN(site->max_ack); site->ack_policy = 0; site->alignment = 0; @@ -295,6 +319,7 @@ __repmgr_new_site(env, sitep, host, port) site->state = SITE_IDLE; site->membership = 0; + site->gmdb_flags = 0; site->config = 0; *sitep = site; @@ -535,11 +560,14 @@ __repmgr_thread_failure(env, why) int why; { DB_REP *db_rep; + DB_THREAD_INFO *ip; db_rep = env->rep_handle; + ENV_ENTER(env, ip); LOCK_MUTEX(db_rep->mutex); (void)__repmgr_stop_threads(env); UNLOCK_MUTEX(db_rep->mutex); + ENV_LEAVE(env, ip); return (__env_panic(env, why)); } @@ -597,12 +625,13 @@ __repmgr_format_addr_loc(addr, buffer) } /* - * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t)); + * PUBLIC: int __repmgr_repstart __P((ENV *, u_int32_t, u_int32_t)); */ int -__repmgr_repstart(env, flags) +__repmgr_repstart(env, flags, startopts) ENV *env; u_int32_t flags; + u_int32_t startopts; { DBT my_addr; int ret; @@ -610,7 +639,11 @@ __repmgr_repstart(env, flags) /* Include "cdata" in case sending to old-version site. */ if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0) return (ret); - ret = __rep_start_int(env, &my_addr, flags); + /* + * force_role_chg and hold_client_gen are used by preferred master + * mode to help control site startup. 
+ */ + ret = __rep_start_int(env, &my_addr, flags, startopts); __os_free(env, my_addr.data); if (ret != 0) __db_err(env, ret, DB_STR("3673", "rep_start")); @@ -618,11 +651,12 @@ __repmgr_repstart(env, flags) } /* - * PUBLIC: int __repmgr_become_master __P((ENV *)); + * PUBLIC: int __repmgr_become_master __P((ENV *, u_int32_t)); */ int -__repmgr_become_master(env) +__repmgr_become_master(env, startopts) ENV *env; + u_int32_t startopts; { DB_REP *db_rep; DB_THREAD_INFO *ip; @@ -631,7 +665,7 @@ __repmgr_become_master(env) REPMGR_SITE *site; DBT key_dbt, data_dbt; __repmgr_membership_key_args key; - __repmgr_membership_data_args member_status; + __repmgr_membership_data_args member_data; repmgr_netaddr_t addr; u_int32_t status; u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE]; @@ -668,16 +702,23 @@ __repmgr_become_master(env) db_rep->client_intent = FALSE; UNLOCK_MUTEX(db_rep->mutex); - if ((ret = __repmgr_repstart(env, DB_REP_MASTER)) != 0) + if ((ret = __repmgr_repstart(env, DB_REP_MASTER, startopts)) != 0) return (ret); + /* + * Make sure member_version_gen is current so that this master + * can reject obsolete member lists from other sites. + */ + db_rep->member_version_gen = db_rep->region->gen; + + /* If there is already a gmdb, we are finished. */ if (db_rep->have_gmdb) return (0); - db_rep->member_version_gen = db_rep->region->gen; - ENV_ENTER(env, ip); + /* There isn't a gmdb. Create one from the in-memory site list. 
*/ if ((ret = __repmgr_hold_master_role(env, NULL)) != 0) goto leave; + ENV_GET_THREAD_INFO(env, ip); retry: if ((ret = __repmgr_setup_gmdb_op(env, ip, &txn, DB_CREATE)) != 0) goto err; @@ -705,8 +746,9 @@ retry: &key, key_buf, sizeof(key_buf), &len); DB_ASSERT(env, ret == 0); DB_INIT_DBT(key_dbt, key_buf, len); - member_status.flags = status; - __repmgr_membership_data_marshal(env, &member_status, data_buf); + member_data.status = status; + member_data.flags = site->gmdb_flags; + __repmgr_membership_data_marshal(env, &member_data, data_buf); DB_INIT_DBT(data_dbt, data_buf, __REPMGR_MEMBERSHIP_DATA_SIZE); if ((ret = __db_put(dbp, ip, txn, &key_dbt, &data_dbt, 0)) != 0) goto err; @@ -726,7 +768,6 @@ err: if ((t_ret = __repmgr_rlse_master_role(env)) != 0 && ret == 0) ret = t_ret; leave: - ENV_LEAVE(env, ip); return (ret); } @@ -840,6 +881,14 @@ __repmgr_open(env, rep_) rep->election_retry_wait = db_rep->election_retry_wait; rep->heartbeat_monitor_timeout = db_rep->heartbeat_monitor_timeout; rep->heartbeat_frequency = db_rep->heartbeat_frequency; + rep->inqueue_max_gbytes = db_rep->inqueue_max_gbytes; + rep->inqueue_max_bytes = db_rep->inqueue_max_bytes; + if (rep->inqueue_max_gbytes == 0 && rep->inqueue_max_bytes == 0) { + rep->inqueue_max_bytes = DB_REPMGR_DEFAULT_INQUEUE_MAX; + } + __repmgr_set_incoming_queue_redzone(rep, rep->inqueue_max_gbytes, + rep->inqueue_max_bytes); + return (ret); } @@ -958,6 +1007,18 @@ __repmgr_join(env, rep_) } db_rep->siteinfo_seq = rep->siteinfo_seq; + /* + * Update the incoming queue limit settings if necessary. 
+ */ + if ((db_rep->inqueue_max_gbytes != 0 || + db_rep->inqueue_max_bytes != 0) && + (db_rep->inqueue_max_gbytes != rep->inqueue_max_gbytes || + db_rep->inqueue_max_bytes != rep->inqueue_max_gbytes)) { + rep->inqueue_max_gbytes = db_rep->inqueue_max_gbytes; + rep->inqueue_max_bytes = db_rep->inqueue_max_bytes; + __repmgr_set_incoming_queue_redzone(rep, + rep->inqueue_max_gbytes, rep->inqueue_max_bytes); + } unlock: MUTEX_UNLOCK(env, rep->mtx_repmgr); return (ret); @@ -1073,6 +1134,7 @@ __repmgr_share_netaddrs(env, rep_, start, limit) shared_array[eid].addr.port = db_rep->sites[i].net_addr.port; shared_array[eid].config = db_rep->sites[i].config; shared_array[eid].status = db_rep->sites[i].membership; + shared_array[eid].flags = db_rep->sites[i].gmdb_flags; RPRINT(env, (env, DB_VERB_REPMGR_MISC, "EID %d is assigned for site %s:%lu", eid, host, (u_long)shared_array[eid].addr.port)); @@ -1134,6 +1196,7 @@ __repmgr_copy_in_added_sites(env) site = SITE_FROM_EID(i); site->config = p->config; site->membership = p->status; + site->gmdb_flags = p->flags; } out: @@ -1266,7 +1329,9 @@ __repmgr_stable_lsn(env, stable_lsn) db_rep = env->rep_handle; rep = db_rep->region; - if (rep->min_log_file != 0 && rep->min_log_file < stable_lsn->file) { + LOCK_MUTEX(db_rep->mutex); + if (rep->sites_avail != 0 && rep->min_log_file != 0 && + rep->min_log_file < stable_lsn->file) { /* * Returning an LSN to be consistent with the rest of the * log archiving processing. 
Construct LSN of format @@ -1276,12 +1341,91 @@ __repmgr_stable_lsn(env, stable_lsn) stable_lsn->offset = 0; } RPRINT(env, (env, DB_VERB_REPMGR_MISC, - "Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu]", - (u_long)stable_lsn->file, (u_long)stable_lsn->offset)); +"Repmgr_stable_lsn: Returning stable_lsn[%lu][%lu] sites_avail %lu min_log %lu", + (u_long)stable_lsn->file, (u_long)stable_lsn->offset, + (u_long)rep->sites_avail, (u_long)rep->min_log_file)); + UNLOCK_MUTEX(db_rep->mutex); return (0); } /* + * PUBLIC: int __repmgr_make_request_conn __P((ENV *, + * PUBLIC: repmgr_netaddr_t *, REPMGR_CONNECTION **)); + */ +int +__repmgr_make_request_conn(env, addr, connp) + ENV *env; + repmgr_netaddr_t *addr; + REPMGR_CONNECTION **connp; +{ + DBT vi; + __repmgr_msg_hdr_args msg_hdr; + __repmgr_version_confirmation_args conf; + REPMGR_CONNECTION *conn; + int alloc, ret, unused; + + alloc = FALSE; + if ((ret = __repmgr_connect(env, addr, &conn, &unused)) != 0) + return (ret); + conn->type = APP_CONNECTION; + + /* Read a handshake msg, to get version confirmation and parameters. */ + if ((ret = __repmgr_read_conn(conn)) != 0) + goto err; + /* + * We can only get here after having read the full 9 bytes that we + * expect, so this can't fail. + */ + DB_ASSERT(env, conn->reading_phase == SIZES_PHASE); + ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr, + conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL); + DB_ASSERT(env, ret == 0); + __repmgr_iovec_init(&conn->iovecs); + conn->reading_phase = DATA_PHASE; + + if ((ret = __repmgr_prepare_simple_input(env, conn, &msg_hdr)) != 0) + goto err; + alloc = TRUE; + + if ((ret = __repmgr_read_conn(conn)) != 0) + goto err; + + /* + * Analyze the handshake msg, and stash relevant info. 
+ */ + if ((ret = __repmgr_find_version_info(env, conn, &vi)) != 0) + goto err; + DB_ASSERT(env, vi.size > 0); + if ((ret = __repmgr_version_confirmation_unmarshal(env, + &conf, vi.data, vi.size, NULL)) != 0) + goto err; + + if (conf.version < GM_MIN_VERSION || + (IS_VIEW_SITE(env) && conf.version < VIEW_MIN_VERSION) || + (PREFMAS_IS_SET(env) && conf.version < PREFMAS_MIN_VERSION)) { + ret = DB_REP_UNAVAIL; + goto err; + } + conn->version = conf.version; + +err: + if (alloc) { + DB_ASSERT(env, conn->input.repmgr_msg.cntrl.size > 0); + __os_free(env, conn->input.repmgr_msg.cntrl.data); + DB_ASSERT(env, conn->input.repmgr_msg.rec.size > 0); + __os_free(env, conn->input.repmgr_msg.rec.data); + } + __repmgr_reset_for_reading(conn); + if (ret == 0) + *connp = conn; + else { + (void)__repmgr_close_connection(env, conn); + (void)__repmgr_destroy_conn(env, conn); + } + return (ret); +} + +/* * PUBLIC: int __repmgr_send_sync_msg __P((ENV *, REPMGR_CONNECTION *, * PUBLIC: u_int32_t, u_int8_t *, u_int32_t)); */ @@ -1311,15 +1455,511 @@ __repmgr_send_sync_msg(env, conn, type, buf, len) } /* + * Reads a whole message, when we expect to get a REPMGR_OWN_MSG. + */ +/* + * PUBLIC: int __repmgr_read_own_msg __P((ENV *, REPMGR_CONNECTION *, + * PUBLIC: u_int32_t *, u_int8_t **, size_t *)); + */ +int +__repmgr_read_own_msg(env, conn, typep, bufp, lenp) + ENV *env; + REPMGR_CONNECTION *conn; + u_int32_t *typep; + u_int8_t **bufp; + size_t *lenp; +{ + __repmgr_msg_hdr_args msg_hdr; + u_int8_t *buf; + u_int32_t type; + size_t size; + int ret; + + __repmgr_reset_for_reading(conn); + if ((ret = __repmgr_read_conn(conn)) != 0) + goto err; + ret = __repmgr_msg_hdr_unmarshal(env, &msg_hdr, + conn->msg_hdr_buf, __REPMGR_MSG_HDR_SIZE, NULL); + DB_ASSERT(env, ret == 0); + + if ((conn->msg_type = msg_hdr.type) != REPMGR_OWN_MSG) { + ret = DB_REP_UNAVAIL; /* Protocol violation. 
*/ + goto err; + } + type = REPMGR_OWN_MSG_TYPE(msg_hdr); + if ((size = (size_t)REPMGR_OWN_BUF_SIZE(msg_hdr)) > 0) { + conn->reading_phase = DATA_PHASE; + __repmgr_iovec_init(&conn->iovecs); + + if ((ret = __os_malloc(env, size, &buf)) != 0) + goto err; + conn->input.rep_message = NULL; + + __repmgr_add_buffer(&conn->iovecs, buf, size); + if ((ret = __repmgr_read_conn(conn)) != 0) { + __os_free(env, buf); + goto err; + } + *bufp = buf; + } + + *typep = type; + *lenp = size; + +err: + return (ret); +} + +/* + * Returns TRUE if we are connected to the other site in a preferred + * master replication group, FALSE otherwise. + * + * PUBLIC: int __repmgr_prefmas_connected __P((ENV *)); + */ +int +__repmgr_prefmas_connected(env) + ENV *env; +{ + DB_REP *db_rep; + REPMGR_CONNECTION *conn; + REPMGR_SITE *other_site; + + db_rep = env->rep_handle; + + /* + * Preferred master mode only has 2 sites, so the other site is + * always EID 1. + */ + if (!IS_PREFMAS_MODE(env) || !IS_KNOWN_REMOTE_SITE(1)) + return (FALSE); + + other_site = SITE_FROM_EID(1); + if (other_site->state == SITE_CONNECTED) + return (TRUE); + + if ((conn = other_site->ref.conn.in) != NULL && + IS_READY_STATE(conn->state)) + return (TRUE); + if ((conn = other_site->ref.conn.out) != NULL && + IS_READY_STATE(conn->state)) + return (TRUE); + + return (FALSE); +} + +/* + * Used by a preferred master site to restart the remote temporary master + * site as a client. This is used to help guarantee that the preferred master + * site's transactions are never rolled back. 
+ * + * PUBLIC: int __repmgr_restart_site_as_client __P((ENV *, int)); + */ +int +__repmgr_restart_site_as_client(env, eid) + ENV *env; + int eid; +{ + DB_REP *db_rep; + REPMGR_CONNECTION *conn; + repmgr_netaddr_t addr; + u_int32_t type; + size_t len; + u_int8_t any_value, *response_buf; + int ret, t_ret; + + COMPQUIET(any_value, 0); + db_rep = env->rep_handle; + conn = NULL; + + if (!IS_PREFMAS_MODE(env)) + return (0); + + LOCK_MUTEX(db_rep->mutex); + addr = SITE_FROM_EID(eid)->net_addr; + UNLOCK_MUTEX(db_rep->mutex); + if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0) + return (ret); + + /* + * No payload needed, but must send at least a dummy byte for the + * other side to recognize that a message has arrived. + */ + if ((ret = __repmgr_send_sync_msg(env, conn, + REPMGR_RESTART_CLIENT, VOID_STAR_CAST &any_value, 1)) != 0) + goto err; + + if ((ret = __repmgr_read_own_msg(env, + conn, &type, &response_buf, &len)) != 0) + goto err; + if (type != REPMGR_PREFMAS_SUCCESS) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "restart_site_as_client got unexpected message type %d", + type)); + ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */ + } +err: + if (conn != NULL) { + if ((t_ret = __repmgr_close_connection(env, + conn)) != 0 && ret != 0) + ret = t_ret; + if ((t_ret = __repmgr_destroy_conn(env, + conn)) != 0 && ret != 0) + ret = t_ret; + } + return (ret); +} + +/* + * Used by a preferred master site to make the remote temporary master + * site a readonly master. This is used to help preserve all temporary + * master transactions. 
+ * + * PUBLIC: int __repmgr_make_site_readonly_master __P((ENV *, int, + * PUBLIC: u_int32_t *, DB_LSN *)); + */ +int +__repmgr_make_site_readonly_master(env, eid, gen, sync_lsnp) + ENV *env; + int eid; + u_int32_t *gen; + DB_LSN *sync_lsnp; +{ + DB_REP *db_rep; + REPMGR_CONNECTION *conn; + repmgr_netaddr_t addr; + __repmgr_permlsn_args permlsn; + u_int32_t type; + size_t len; + u_int8_t any_value, *response_buf; + int ret, t_ret; + + COMPQUIET(any_value, 0); + db_rep = env->rep_handle; + conn = NULL; + response_buf = NULL; + *gen = 0; + ZERO_LSN(*sync_lsnp); + + if (!IS_PREFMAS_MODE(env)) + return (0); + + LOCK_MUTEX(db_rep->mutex); + addr = SITE_FROM_EID(eid)->net_addr; + UNLOCK_MUTEX(db_rep->mutex); + if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0) + return (ret); + + /* + * No payload needed, but must send at least a dummy byte for the + * other side to recognize that a message has arrived. + */ + if ((ret = __repmgr_send_sync_msg(env, conn, + REPMGR_READONLY_MASTER, VOID_STAR_CAST &any_value, 1)) != 0) + goto err; + + if ((ret = __repmgr_read_own_msg(env, + conn, &type, &response_buf, &len)) != 0) + goto err; + + if (type == REPMGR_READONLY_RESPONSE) { + if ((ret = __repmgr_permlsn_unmarshal(env, + &permlsn, response_buf, len, NULL)) != 0) + goto err; + *gen = permlsn.generation; + *sync_lsnp = permlsn.lsn; + } else { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "make_site_readonly_master got unexpected message type %d", + type)); + ret = DB_REP_UNAVAIL; /* Invalid response: protocol violation */ + } + +err: + if (conn != NULL) { + if ((t_ret = __repmgr_close_connection(env, + conn)) != 0 && ret != 0) + ret = t_ret; + if ((t_ret = __repmgr_destroy_conn(env, + conn)) != 0 && ret != 0) + ret = t_ret; + } + if (response_buf != NULL) + __os_free(env, response_buf); + return (ret); +} + +/* + * Used by a preferred master site to perform the LSN history comparisons to + * determine whether there is are continuous or conflicting sets of + * 
transactions between this site and the remote temporary master. + * + * PUBLIC: int __repmgr_lsnhist_match __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, int, int *)); + */ +int +__repmgr_lsnhist_match(env, ip, eid, match) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + int *match; +{ + DB_REP *db_rep; + REP *rep; + __rep_lsn_hist_data_args my_lsnhist; + __repmgr_lsnhist_match_args remote_lsnhist; + u_int32_t my_gen; + int found_commit, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + *match = FALSE; + my_gen = rep->gen; + found_commit = FALSE; + + if (!IS_PREFMAS_MODE(env)) + return (0); + + /* Get local LSN history information for comparison. */ + if ((ret = __rep_get_lsnhist_data(env, ip, my_gen, &my_lsnhist)) != 0) + return (ret); + + /* Get remote LSN history information for comparison. */ + ret = __repmgr_remote_lsnhist(env, eid, my_gen, &remote_lsnhist); + + /* + * If the current gen doesn't exist at the remote site, the match + * fails. + * + * If the remote LSN or timestamp at the current gen doesn't match + * ours, we probably had a whack-a-mole situation where each site + * as up and down in isolation one or more times and the match fails. + * + * If the remote LSN for the next generation is lower than this + * site's startup LSN and there are any commit operations between + * these LSNs, there are conflicting sets of transactions and the + * match fails. 
+ */ + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "lsnhist_match my_lsn [%lu][%lu] remote_lsn [%lu][%lu]", + (u_long)my_lsnhist.lsn.file, (u_long)my_lsnhist.lsn.offset, + (u_long)remote_lsnhist.lsn.file, + (u_long)remote_lsnhist.lsn.offset)); + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "lsnhist_match my_time %lu:%lu remote_time %lu:%lu", + (u_long)my_lsnhist.hist_sec, (u_long)my_lsnhist.hist_nsec, + (u_long)remote_lsnhist.hist_sec, (u_long)remote_lsnhist.hist_nsec)); + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "lsnhist_match pminit_lsn [%lu][%lu] next_gen_lsn [%lu][%lu]", + (u_long)db_rep->prefmas_init_lsn.file, + (u_long)db_rep->prefmas_init_lsn.offset, + (u_long)remote_lsnhist.next_gen_lsn.file, + (u_long)remote_lsnhist.next_gen_lsn.offset)); + if (ret != DB_REP_UNAVAIL && + LOG_COMPARE(&my_lsnhist.lsn, &remote_lsnhist.lsn) == 0 && + my_lsnhist.hist_sec == remote_lsnhist.hist_sec && + my_lsnhist.hist_nsec == remote_lsnhist.hist_nsec) { + /* + * If the remote site doesn't yet have the next gen or if + * our startup LSN is <= than the remote next gen LSN, we + * have a match. + * + * Otherwise, our startup LSN is higher than the remote + * next gen LSN. If we have any commit operations between + * these two LSNs, we have preferred master operations we + * must preserve and there is not a match. But if we just + * have uncommitted operations between these LSNs it doesn't + * matter if they are rolled back, so we call it a match and + * try to retain temporary master transactions if possible. + */ + if (IS_ZERO_LSN(remote_lsnhist.next_gen_lsn) || + LOG_COMPARE(&db_rep->prefmas_init_lsn, + &remote_lsnhist.next_gen_lsn) <= 0) + *match = TRUE; + else if ((ret = __repmgr_find_commit(env, + &remote_lsnhist.next_gen_lsn, + &db_rep->prefmas_init_lsn, &found_commit)) == 0 && + !found_commit) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "lsnhist_match !found_commit set match TRUE")); + *match = TRUE; + } + } + + /* Don't return an error if current gen didn't exist at remote site. 
*/ + if (ret == DB_REP_UNAVAIL) + ret = 0; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "lsnhist_match match %d returning %d", *match, ret)); + return (ret); +} + +/* + * Checks a range of log records from low_lsn to high_lsn for any + * commit operations. Sets found_commit to TRUE if a commit is + * found. + */ +static int +__repmgr_find_commit(env, low_lsn, high_lsn, found_commit) + ENV *env; + DB_LSN *low_lsn; + DB_LSN *high_lsn; + int *found_commit; +{ + DB_LOGC *logc; + DB_LSN lsn; + DBT rec; + __txn_regop_args *txn_args; + u_int32_t rectype; + int ret, t_ret; + + *found_commit = FALSE; + ret = 0; + + lsn = *low_lsn; + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + memset(&rec, 0, sizeof(rec)); + if (__logc_get(logc, &lsn, &rec, DB_SET) == 0) { + do { + LOGCOPY_32(env, &rectype, rec.data); + if (rectype == DB___txn_regop) { + if ((ret = __txn_regop_read( + env, rec.data, &txn_args)) != 0) + goto close_cursor; + if (txn_args->opcode == TXN_COMMIT) { + *found_commit = TRUE; + __os_free(env, txn_args); + break; + } + __os_free(env, txn_args); + } + } while ((ret = __logc_get(logc, &lsn, &rec, DB_NEXT)) == 0 && + LOG_COMPARE(&lsn, high_lsn) <= 0); + } +close_cursor: + if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * Used by a preferred master site to get remote LSN history information + * from the other site in the replication group. 
+ */ +static int +__repmgr_remote_lsnhist(env, eid, gen, lsnhist_match) + ENV *env; + int eid; + u_int32_t gen; + __repmgr_lsnhist_match_args *lsnhist_match; +{ + DB_REP *db_rep; + REPMGR_CONNECTION *conn; + repmgr_netaddr_t addr; + __rep_lsn_hist_key_args lsnhist_key; + u_int8_t lsnhist_key_buf[__REP_LSN_HIST_KEY_SIZE]; + u_int32_t type; + size_t len; + u_int8_t *response_buf; + int ret, t_ret; + + db_rep = env->rep_handle; + conn = NULL; + response_buf = NULL; + + if (!IS_KNOWN_REMOTE_SITE(eid)) + return (0); + + LOCK_MUTEX(db_rep->mutex); + addr = SITE_FROM_EID(eid)->net_addr; + UNLOCK_MUTEX(db_rep->mutex); + if ((ret = __repmgr_make_request_conn(env, &addr, &conn)) != 0) + return (ret); + + /* Marshal generation for which to request remote lsnhist data. */ + lsnhist_key.version = REP_LSN_HISTORY_FMT_VERSION; + lsnhist_key.gen = gen; + __rep_lsn_hist_key_marshal(env, &lsnhist_key, lsnhist_key_buf); + if ((ret = __repmgr_send_sync_msg(env, conn, REPMGR_LSNHIST_REQUEST, + lsnhist_key_buf, sizeof(lsnhist_key_buf))) != 0) + goto err; + + if ((ret = __repmgr_read_own_msg(env, + conn, &type, &response_buf, &len)) != 0) + goto err; + + /* Unmarshal remote lsnhist time and LSNs for comparison. */ + if (type == REPMGR_LSNHIST_RESPONSE) { + if ((ret = __repmgr_lsnhist_match_unmarshal(env, lsnhist_match, + response_buf, __REPMGR_LSNHIST_MATCH_SIZE, NULL)) != 0) + goto err; + } else { + /* + * If the other site sent back REPMGR_PREFMAS_FAILURE, it means + * no lsnhist record for the requested gen was found on other + * site. 
+ */ + if (type != REPMGR_PREFMAS_FAILURE) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "remote_lsnhist got unexpected message type %d", + type)); + ret = DB_REP_UNAVAIL; + } + +err: + if (conn != NULL) { + if ((t_ret = __repmgr_close_connection(env, + conn)) != 0 && ret != 0) + ret = t_ret; + if ((t_ret = __repmgr_destroy_conn(env, + conn)) != 0 && ret != 0) + ret = t_ret; + } + if (response_buf != NULL) + __os_free(env, response_buf); + return (ret); +} + +/* + * Returns the number of tries and the amount of time to yield the + * processor for preferred master waits. The total wait is the larger + * of 2 seconds or 3 * ack_timeout. + * + * PUBLIC: int __repmgr_prefmas_get_wait __P((ENV *, u_int32_t *, u_long *)); + */ +int +__repmgr_prefmas_get_wait(env, tries, yield_usecs) + ENV *env; + u_int32_t *tries; + u_long *yield_usecs; +{ + DB_REP *db_rep; + REP *rep; + db_timeout_t max_wait; + + db_rep = env->rep_handle; + rep = db_rep->region; + + *yield_usecs = 250000; + max_wait = DB_REPMGR_DEFAULT_ACK_TIMEOUT * 2; + if ((rep->ack_timeout * 3) > max_wait) + max_wait = rep->ack_timeout * 3; + *tries = max_wait / (u_int32_t)*yield_usecs; + return (0); +} + +/* * Produce a membership list from the known info currently in memory. * - * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int8_t **, size_t *)); + * PUBLIC: int __repmgr_marshal_member_list __P((ENV *, u_int32_t, + * PUBLIC: u_int8_t **, size_t *)); * * Caller must hold mutex. 
*/ int -__repmgr_marshal_member_list(env, bufp, lenp) +__repmgr_marshal_member_list(env, msg_version, bufp, lenp) ENV *env; + u_int32_t msg_version; u_int8_t **bufp; size_t *lenp; { @@ -1328,6 +1968,7 @@ __repmgr_marshal_member_list(env, bufp, lenp) REPMGR_SITE *site; __repmgr_membr_vers_args membr_vers; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; u_int8_t *buf, *p; size_t bufsize, len; u_int i; @@ -1353,14 +1994,24 @@ __repmgr_marshal_member_list(env, bufp, lenp) if (site->membership == 0) continue; - site_info.host.data = site->net_addr.host; - site_info.host.size = - (u_int32_t)strlen(site->net_addr.host) + 1; - site_info.port = site->net_addr.port; - site_info.flags = site->membership; - - ret = __repmgr_site_info_marshal(env, - &site_info, p, (size_t)(&buf[bufsize]-p), &len); + if (msg_version < 5) { + v4site_info.host.data = site->net_addr.host; + v4site_info.host.size = + (u_int32_t)strlen(site->net_addr.host) + 1; + v4site_info.port = site->net_addr.port; + v4site_info.flags = site->membership; + ret = __repmgr_v4site_info_marshal(env, + &v4site_info, p, (size_t)(&buf[bufsize]-p), &len); + } else { + site_info.host.data = site->net_addr.host; + site_info.host.size = + (u_int32_t)strlen(site->net_addr.host) + 1; + site_info.port = site->net_addr.port; + site_info.status = site->membership; + site_info.flags = site->gmdb_flags; + ret = __repmgr_site_info_marshal(env, + &site_info, p, (size_t)(&buf[bufsize]-p), &len); + } DB_ASSERT(env, ret == 0); p += len; } @@ -1387,7 +2038,7 @@ read_gmdb(env, ip, bufp, lenp) DBC *dbc; DBT key_dbt, data_dbt; __repmgr_membership_key_args key; - __repmgr_membership_data_args member_status; + __repmgr_membership_data_args member_data; __repmgr_member_metadata_args metadata; __repmgr_membr_vers_args membr_vers; __repmgr_site_info_args site_info; @@ -1435,8 +2086,13 @@ read_gmdb(env, ip, bufp, lenp) ret = __repmgr_member_metadata_unmarshal(env, &metadata, metadata_buf, data_dbt.size, NULL); 
DB_ASSERT(env, ret == 0); - DB_ASSERT(env, metadata.format == REPMGR_GMDB_FMT_VERSION); + DB_ASSERT(env, metadata.format >= REPMGR_GMDB_FMT_MIN_VERSION && + metadata.format <= REPMGR_GMDB_FMT_VERSION); DB_ASSERT(env, metadata.version > 0); + /* Automatic conversion of old format gmdb if needed. */ + if (metadata.format < REPMGR_GMDB_FMT_VERSION && + (ret = convert_gmdb(env, ip, dbp, txn)) != 0) + goto err; bufsize = 1000; /* Initial guess. */ if ((ret = __os_malloc(env, bufsize, &buf)) != 0) @@ -1459,13 +2115,14 @@ read_gmdb(env, ip, bufp, lenp) DB_ASSERT(env, key.port > 0); ret = __repmgr_membership_data_unmarshal(env, - &member_status, data_buf, data_dbt.size, NULL); + &member_data, data_buf, data_dbt.size, NULL); DB_ASSERT(env, ret == 0); - DB_ASSERT(env, member_status.flags != 0); + DB_ASSERT(env, member_data.status != 0); site_info.host = key.host; site_info.port = key.port; - site_info.flags = member_status.flags; + site_info.status = member_data.status; + site_info.flags = member_data.flags; if ((ret = __repmgr_site_info_marshal(env, &site_info, p, (size_t)(&buf[bufsize]-p), &len)) == ENOMEM) { bufsize *= 2; @@ -1501,28 +2158,129 @@ err: } /* + * Convert an older-format group membership database into the current format. 
+ */ +static int +convert_gmdb(env, ip, dbp, txn) + ENV *env; + DB_THREAD_INFO *ip; + DB *dbp; + DB_TXN *txn; +{ + DBC *dbc; + DBT key_dbt, data_dbt, v4data_dbt; + __repmgr_membership_key_args key; + __repmgr_membership_data_args member_data; + __repmgr_v4membership_data_args v4member_data; + __repmgr_member_metadata_args metadata; + u_int8_t data_buf[__REPMGR_MEMBERSHIP_DATA_SIZE]; + u_int8_t key_buf[MAX_MSG_BUF]; + u_int8_t metadata_buf[__REPMGR_MEMBER_METADATA_SIZE]; + u_int8_t v4data_buf[__REPMGR_V4MEMBERSHIP_DATA_SIZE]; + int ret, t_ret; + + dbc = NULL; + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + goto err; + + memset(&key_dbt, 0, sizeof(key_dbt)); + key_dbt.data = key_buf; + key_dbt.ulen = sizeof(key_buf); + F_SET(&key_dbt, DB_DBT_USERMEM); + memset(&data_dbt, 0, sizeof(data_dbt)); + data_dbt.data = metadata_buf; + data_dbt.ulen = sizeof(metadata_buf); + F_SET(&data_dbt, DB_DBT_USERMEM); + memset(&v4data_dbt, 0, sizeof(v4data_dbt)); + v4data_dbt.data = v4data_buf; + v4data_dbt.ulen = sizeof(v4data_buf); + F_SET(&v4data_dbt, DB_DBT_USERMEM); + + /* + * The first gmdb record is a special metadata record that contains + * an empty key and gmdb metadata (format and version) and has already + * been validated by the caller. We need to update its format value + * for this conversion but leave the version alone. 
+ */ + if ((ret = __dbc_get(dbc, &key_dbt, &data_dbt, DB_NEXT)) != 0) + goto err; + ret = __repmgr_membership_key_unmarshal(env, + &key, key_buf, key_dbt.size, NULL); + DB_ASSERT(env, ret == 0); + DB_ASSERT(env, key.host.size == 0); + DB_ASSERT(env, key.port == 0); + ret = __repmgr_member_metadata_unmarshal(env, + &metadata, metadata_buf, data_dbt.size, NULL); + DB_ASSERT(env, ret == 0); + DB_ASSERT(env, metadata.version > 0); + metadata.format = REPMGR_GMDB_FMT_VERSION; + __repmgr_member_metadata_marshal(env, &metadata, metadata_buf); + DB_INIT_DBT(data_dbt, metadata_buf, __REPMGR_MEMBER_METADATA_SIZE); + if ((ret = __dbc_put(dbc, &key_dbt, &data_dbt, DB_CURRENT)) != 0) + goto err; + + /* + * The rest of the gmdb records contain a key (host and port) and + * membership data (status and now flags). But the old format was + * using flags for the status value, so we need to transfer the + * old flags value to status and provide an empty flags value for + * this conversion. + */ + data_dbt.data = data_buf; + data_dbt.ulen = sizeof(data_buf); + while ((ret = __dbc_get(dbc, &key_dbt, &v4data_dbt, DB_NEXT)) == 0) { + /* Get membership data in old format. */ + ret = __repmgr_v4membership_data_unmarshal(env, + &v4member_data, v4data_buf, v4data_dbt.size, NULL); + DB_ASSERT(env, ret == 0); + DB_ASSERT(env, v4member_data.flags != 0); + + /* Convert membership data into current format and update. */ + member_data.status = v4member_data.flags; + member_data.flags = 0; + __repmgr_membership_data_marshal(env, &member_data, data_buf); + DB_INIT_DBT(data_dbt, data_buf, __REPMGR_MEMBERSHIP_DATA_SIZE); + if ((ret = __dbc_put(dbc, + &key_dbt, &data_dbt, DB_CURRENT)) != 0) + goto err; + } + if (ret == DB_NOTFOUND) + ret = 0; + +err: + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* * Refresh our sites array from the given membership list. 
* * PUBLIC: int __repmgr_refresh_membership __P((ENV *, - * PUBLIC: u_int8_t *, size_t)); + * PUBLIC: u_int8_t *, size_t, u_int32_t)); */ int -__repmgr_refresh_membership(env, buf, len) +__repmgr_refresh_membership(env, buf, len, version) ENV *env; u_int8_t *buf; size_t len; + u_int32_t version; { DB_REP *db_rep; + REP *rep; REPMGR_SITE *site; __repmgr_membr_vers_args membr_vers; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; char *host; u_int8_t *p; u_int16_t port; - u_int32_t i, n; + u_int32_t i, participants; int eid, ret; db_rep = env->rep_handle; + rep = db_rep->region; /* * Membership list consists of membr_vers followed by a number of @@ -1546,9 +2304,17 @@ __repmgr_refresh_membership(env, buf, len) for (i = 0; i < db_rep->site_cnt; i++) F_CLR(SITE_FROM_EID(i), SITE_TOUCHED); - for (n = 0; p < &buf[len]; ++n) { - ret = __repmgr_site_info_unmarshal(env, - &site_info, p, (size_t)(&buf[len] - p), &p); + for (participants = 0; p < &buf[len]; ) { + if (version < 5) { + ret = __repmgr_v4site_info_unmarshal(env, + &v4site_info, p, (size_t)(&buf[len] - p), &p); + site_info.host = v4site_info.host; + site_info.port = v4site_info.port; + site_info.status = v4site_info.flags; + site_info.flags = 0; + } else + ret = __repmgr_site_info_unmarshal(env, + &site_info, p, (size_t)(&buf[len] - p), &p); DB_ASSERT(env, ret == 0); host = site_info.host.data; @@ -1556,9 +2322,11 @@ __repmgr_refresh_membership(env, buf, len) (u_int8_t*)site_info.host.data + site_info.host.size <= p); host[site_info.host.size-1] = '\0'; port = site_info.port; + if (!FLD_ISSET(site_info.flags, SITE_VIEW)) + participants++; if ((ret = __repmgr_set_membership(env, - host, port, site_info.flags)) != 0) + host, port, site_info.status, site_info.flags)) != 0) goto err; if ((ret = __repmgr_find_site(env, host, port, &eid)) != 0) @@ -1566,8 +2334,13 @@ __repmgr_refresh_membership(env, buf, len) DB_ASSERT(env, IS_VALID_EID(eid)); F_SET(SITE_FROM_EID(eid), SITE_TOUCHED); } - ret = 
__rep_set_nsites_int(env, n); + ret = __rep_set_nsites_int(env, participants); DB_ASSERT(env, ret == 0); + if (FLD_ISSET(rep->config, + REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT) && + rep->config_nsites > 2) + __db_errx(env, DB_STR("3703", + "More than two sites in preferred master replication group")); /* Scan "touched" flags so as to notice sites that have been removed. */ for (i = 0; i < db_rep->site_cnt; i++) { @@ -1576,7 +2349,8 @@ __repmgr_refresh_membership(env, buf, len) continue; host = site->net_addr.host; port = site->net_addr.port; - if ((ret = __repmgr_set_membership(env, host, port, 0)) != 0) + if ((ret = __repmgr_set_membership(env, host, port, + 0, site->gmdb_flags)) != 0) goto err; } @@ -1597,13 +2371,13 @@ __repmgr_reload_gmdb(env) size_t len; int ret; - ENV_ENTER(env, ip); + ENV_GET_THREAD_INFO(env, ip); if ((ret = read_gmdb(env, ip, &buf, &len)) == 0) { env->rep_handle->have_gmdb = TRUE; - ret = __repmgr_refresh_membership(env, buf, len); + ret = __repmgr_refresh_membership(env, buf, len, + DB_REPMGR_VERSION); __os_free(env, buf); } - ENV_LEAVE(env, ip); return (ret); } @@ -1650,7 +2424,8 @@ __repmgr_init_save(env, dbt) dbt->data = NULL; dbt->size = 0; ret = 0; - } else if ((ret = __repmgr_marshal_member_list(env, &buf, &len)) == 0) { + } else if ((ret = __repmgr_marshal_member_list(env, + DB_REPMGR_VERSION, &buf, &len)) == 0) { dbt->data = buf; dbt->size = (u_int32_t)len; } @@ -1700,6 +2475,7 @@ __repmgr_defer_op(env, op) */ if ((ret = __os_calloc(env, 1, sizeof(*msg), &msg)) != 0) return (ret); + msg->size = sizeof(*msg); msg->msg_hdr.type = REPMGR_OWN_MSG; REPMGR_OWN_MSG_TYPE(msg->msg_hdr) = op; ret = __repmgr_queue_put(env, msg); @@ -1771,7 +2547,7 @@ __repmgr_become_client(env) if ((ret = __repmgr_await_gmdbop(env)) == 0) db_rep->client_intent = TRUE; UNLOCK_MUTEX(db_rep->mutex); - return (ret == 0 ? __repmgr_repstart(env, DB_REP_CLIENT) : ret); + return (ret == 0 ? 
__repmgr_repstart(env, DB_REP_CLIENT, 0) : ret); } /* @@ -1897,16 +2673,17 @@ get_eid(env, host, port, eidp) * accordingly. * * PUBLIC: int __repmgr_set_membership __P((ENV *, - * PUBLIC: const char *, u_int, u_int32_t)); + * PUBLIC: const char *, u_int, u_int32_t, u_int32_t)); * * Caller must host db_rep mutex, and be in ENV_ENTER context. */ int -__repmgr_set_membership(env, host, port, status) +__repmgr_set_membership(env, host, port, status, flags) ENV *env; const char *host; u_int port; u_int32_t status; + u_int32_t flags; { DB_REP *db_rep; REP *rep; @@ -1953,7 +2730,9 @@ __repmgr_set_membership(env, host, port, status) /* Set both private and shared copies of the info. */ site->membership = status; + site->gmdb_flags = flags; sites[eid].status = status; + sites[eid].flags = flags; } MUTEX_UNLOCK(env, rep->mtx_repmgr); @@ -1965,7 +2744,8 @@ __repmgr_set_membership(env, host, port, status) SELECTOR_RUNNING(db_rep)) { if (eid == db_rep->self_eid && status != SITE_PRESENT) - ret = DB_DELETED; + ret = (status == SITE_ADDING) ? + __repmgr_defer_op(env, REPMGR_REJOIN) : DB_DELETED; else if (orig != SITE_PRESENT && status == SITE_PRESENT && site->state == SITE_IDLE) { /* @@ -1981,10 +2761,11 @@ __repmgr_set_membership(env, host, port, status) * failure shouldn't hurt anything, because we'll just * naturally try again later. */ - ret = __repmgr_schedule_connection_attempt(env, - eid, TRUE); - if (eid != db_rep->self_eid) + if (eid != db_rep->self_eid) { + ret = __repmgr_schedule_connection_attempt(env, + eid, TRUE); DB_EVENT(env, DB_EVENT_REP_SITE_ADDED, &eid); + } } else if (orig != 0 && status == 0) DB_EVENT(env, DB_EVENT_REP_SITE_REMOVED, &eid); @@ -2084,3 +2865,73 @@ __repmgr_bcast_own_msg(env, type, buf, len) } return (0); } + +/* + * PUBLIC: int __repmgr_bcast_member_list __P((ENV *)); + * + * Broadcast membership list to all other sites in the replication group. + * + * Caller must hold mutex. 
+ */ +int +__repmgr_bcast_member_list(env) + ENV *env; +{ + DB_REP *db_rep; + REPMGR_CONNECTION *conn; + REPMGR_SITE *site; + u_int8_t *buf, *v4buf; + size_t len, v4len; + int ret; + u_int i; + + db_rep = env->rep_handle; + if (!SELECTOR_RUNNING(db_rep)) + return (0); + buf = NULL; + v4buf = NULL; + LOCK_MUTEX(db_rep->mutex); + /* + * Some of the other sites in the replication group might be at + * an older version, so we need to be able to send the membership + * list in the current or older format. + */ + if ((ret = __repmgr_marshal_member_list(env, + DB_REPMGR_VERSION, &buf, &len)) != 0 || + (ret = __repmgr_marshal_member_list(env, + 4, &v4buf, &v4len)) != 0) { + UNLOCK_MUTEX(db_rep->mutex); + goto out; + } + UNLOCK_MUTEX(db_rep->mutex); + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Broadcast latest membership list")); + FOR_EACH_REMOTE_SITE_INDEX(i) { + site = SITE_FROM_EID(i); + if (site->state != SITE_CONNECTED) + continue; + if ((conn = site->ref.conn.in) != NULL && + conn->state == CONN_READY && + (ret = __repmgr_send_own_msg(env, conn, REPMGR_SHARING, + (conn->version < 5 ? v4buf : buf), + (conn->version < 5 ? (u_int32_t) v4len : (u_int32_t)len))) + != 0 && + (ret = __repmgr_bust_connection(env, conn)) != 0) + goto out; + if ((conn = site->ref.conn.out) != NULL && + conn->state == CONN_READY && + (ret = __repmgr_send_own_msg(env, conn, REPMGR_SHARING, + (conn->version < 5 ? v4buf : buf), + (conn->version < 5 ? (u_int32_t)v4len : (u_int32_t)len))) + != 0 && + (ret = __repmgr_bust_connection(env, conn)) != 0) + goto out; + } +out: + if (buf != NULL) + __os_free(env, buf); + if (v4buf != NULL) + __os_free(env, v4buf); + return (ret); +} diff --git a/src/repmgr/repmgr_windows.c b/src/repmgr/repmgr_windows.c index d9c2a03d..8cf05960 100644 --- a/src/repmgr/repmgr_windows.c +++ b/src/repmgr/repmgr_windows.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -252,7 +252,7 @@ allocate_wait_slot(env, resultp, table) * the previous wait but before reacquiring the mutex, and this * extra signal would incorrectly cause the next wait to return * immediately. - */ + */ (void)WaitForSingleObject(w->event, 0); *resultp = i; return (0); @@ -639,31 +639,40 @@ __repmgr_select_loop(env) WSAEVENT listen_event; WSANETWORKEVENTS net_events; struct io_info io_info; - int i; + int accept_connect, i; db_rep = env->rep_handle; io_info.connections = connections; io_info.events = events; + accept_connect = FALSE; if ((listen_event = WSACreateEvent()) == WSA_INVALID_EVENT) { __db_err(env, net_errno, DB_STR("3590", "can't create event for listen socket")); return (net_errno); } - if (!IS_SUBORDINATE(db_rep) && - WSAEventSelect(db_rep->listen_fd, listen_event, FD_ACCEPT) == - SOCKET_ERROR) { - ret = net_errno; - __db_err(env, ret, DB_STR("3591", - "can't enable event for listener")); - (void)WSACloseEvent(listen_event); - goto out; - } LOCK_MUTEX(db_rep->mutex); if ((ret = __repmgr_first_try_connections(env)) != 0) goto unlock; for (;;) { + /* + * Set the event for this process to receive notification of + * incoming connections if this process is or has just taken + * over as the listener process. + */ + if (!IS_SUBORDINATE(db_rep) && !accept_connect) { + if (WSAEventSelect(db_rep->listen_fd, listen_event, + FD_ACCEPT) == SOCKET_ERROR) { + ret = net_errno; + __db_err(env, ret, DB_STR("3700", + "can't enable event for listener")); + (void)WSACloseEvent(listen_event); + goto out; + } + accept_connect = TRUE; + } + /* Start with the two events that we always wait for. 
*/ #define SIGNALER_INDEX 0 #define LISTENER_INDEX 1 @@ -714,6 +723,8 @@ __repmgr_select_loop(env) ret = net_errno; goto unlock; } + if (net_events.lNetworkEvents == 0) + continue; DB_ASSERT(env, net_events.lNetworkEvents & FD_ACCEPT); if ((ret = net_events.iErrorCode[FD_ACCEPT_BIT]) @@ -815,7 +826,16 @@ handle_completion(env, conn) /* Check both writing and reading. */ if (events.lNetworkEvents & FD_CLOSE) { error = events.iErrorCode[FD_CLOSE_BIT]; - goto report; + + /* + * There could be data for reading when we see FD_CLOSE, + * so we should try reading in this case. + */ + if (error != 0) + goto report; + else if ((ret = + __repmgr_read_from_site(env, conn)) != 0) + goto err; } if (events.lNetworkEvents & FD_WRITE) { @@ -823,7 +843,7 @@ handle_completion(env, conn) error = events.iErrorCode[FD_WRITE_BIT]; goto report; } else if ((ret = - __repmgr_write_some(env, conn)) != 0) + __repmgr_write_some(env, conn)) != 0) goto err; } @@ -832,7 +852,7 @@ handle_completion(env, conn) error = events.iErrorCode[FD_READ_BIT]; goto report; } else if ((ret = - __repmgr_read_from_site(env, conn)) != 0) + __repmgr_read_from_site(env, conn)) != 0) goto err; } diff --git a/src/sequence/seq_stat.c b/src/sequence/seq_stat.c index d5b9a401..28f61174 100644 --- a/src/sequence/seq_stat.c +++ b/src/sequence/seq_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -124,10 +124,12 @@ __seq_stat_print(seq, flags) DB *dbp; DB_THREAD_INFO *ip; ENV *env; + u_int32_t orig_flags; int handle_check, ret, t_ret; dbp = seq->seq_dbp; env = dbp->env; + ret = 0; SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->stat_print"); @@ -140,11 +142,16 @@ __seq_stat_print(seq, flags) goto err; } - if ((ret = __seq_print_stats(seq, flags)) != 0) - goto err; + orig_flags = flags; + LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM); + if (flags == 0 || LF_ISSET(DB_STAT_ALL)) { + ret = __seq_print_stats(seq, orig_flags); + if (flags == 0 || ret != 0) + goto err; + } if (LF_ISSET(DB_STAT_ALL) && - (ret = __seq_print_all(seq, flags)) != 0) + (ret = __seq_print_all(seq, orig_flags)) != 0) goto err; /* Release replication block. */ diff --git a/src/sequence/sequence.c b/src/sequence/sequence.c index 1c19f838..9ee31123 100644 --- a/src/sequence/sequence.c +++ b/src/sequence/sequence.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2004, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -53,24 +53,23 @@ } \ } while (0) -static int __seq_chk_cachesize __P((ENV *, int32_t, db_seq_t, db_seq_t)); -static int __seq_close __P((DB_SEQUENCE *, u_int32_t)); +static int __seq_chk_cachesize __P((ENV *, u_int32_t, db_seq_t, db_seq_t)); static int __seq_close_pp __P((DB_SEQUENCE *, u_int32_t)); -static int __seq_get - __P((DB_SEQUENCE *, DB_TXN *, int32_t, db_seq_t *, u_int32_t)); -static int __seq_get_cachesize __P((DB_SEQUENCE *, int32_t *)); +static int __seq_get_pp + __P((DB_SEQUENCE *, + DB_TXN *, u_int32_t, db_seq_t *, u_int32_t)); +static int __seq_get_cachesize __P((DB_SEQUENCE *, u_int32_t *)); static int __seq_get_db __P((DB_SEQUENCE *, DB **)); static int __seq_get_flags __P((DB_SEQUENCE *, u_int32_t *)); static int __seq_get_key __P((DB_SEQUENCE *, DBT *)); static int __seq_get_range __P((DB_SEQUENCE *, db_seq_t *, db_seq_t *)); -static int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t)); static int __seq_open_pp __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t)); static int __seq_remove __P((DB_SEQUENCE *, DB_TXN *, u_int32_t)); -static int __seq_set_cachesize __P((DB_SEQUENCE *, int32_t)); +static int __seq_set_cachesize __P((DB_SEQUENCE *, u_int32_t)); static int __seq_set_flags __P((DB_SEQUENCE *, u_int32_t)); static int __seq_set_range __P((DB_SEQUENCE *, db_seq_t, db_seq_t)); static int __seq_update - __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, int32_t, u_int32_t)); + __P((DB_SEQUENCE *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, u_int32_t)); /* * db_sequence_create -- @@ -113,7 +112,7 @@ db_sequence_create(seqp, dbp, flags) seq->seq_dbp = dbp; seq->close = __seq_close_pp; - seq->get = __seq_get; + seq->get = __seq_get_pp; seq->get_cachesize = __seq_get_cachesize; seq->set_cachesize = __seq_set_cachesize; seq->get_db = __seq_get_db; @@ -134,7 +133,7 @@ db_sequence_create(seqp, dbp, flags) } /* - * __seq_open -- + * __seq_open_pp -- * DB_SEQUENCE->open method. 
* */ @@ -146,21 +145,18 @@ __seq_open_pp(seq, txn, keyp, flags) u_int32_t flags; { DB *dbp; - DB_SEQ_RECORD *rp; DB_THREAD_INFO *ip; ENV *env; - u_int32_t tflags; - int handle_check, txn_local, ret, t_ret; + int handle_check, ret, t_ret; #define SEQ_OPEN_FLAGS (DB_CREATE | DB_EXCL | DB_THREAD) - dbp = seq->seq_dbp; - env = dbp->env; - txn_local = 0; - - STRIP_AUTO_COMMIT(flags); SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->open"); + env = seq->seq_dbp->env; + dbp = seq->seq_dbp; + ENV_ENTER(env, ip); + STRIP_AUTO_COMMIT(flags); /* Check for replication block. */ handle_check = IS_ENV_REPLICATED(env); @@ -174,6 +170,41 @@ __seq_open_pp(seq, txn, keyp, flags) "DB_SEQUENCE->open", flags, SEQ_OPEN_FLAGS)) != 0) goto err; + ret = __seq_open(seq, txn, keyp, flags); + + /* Release replication block. */ +err: if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __seq_open -- + * Internal open function. + * + * PUBLIC: int __seq_open __P((DB_SEQUENCE *, DB_TXN *, DBT *, u_int32_t)); + */ + +int +__seq_open(seq, txn, keyp, flags) + DB_SEQUENCE *seq; + DB_TXN *txn; + DBT *keyp; + u_int32_t flags; +{ + DB *dbp; + DB_SEQ_RECORD *rp; + DB_THREAD_INFO *ip; + ENV *env; + u_int32_t tflags; + int txn_local, ret, t_ret; + + dbp = seq->seq_dbp; + env = dbp->env; + txn_local = 0; + if (keyp->size == 0) { __db_errx(env, DB_STR("4001", "Zero length sequence key specified")); @@ -229,6 +260,7 @@ __seq_open_pp(seq, txn, keyp, flags) seq->seq_key.size = seq->seq_key.ulen = keyp->size; seq->seq_key.flags = DB_DBT_USERMEM; + ENV_GET_THREAD_INFO(env, ip); retry: if ((ret = __db_get(dbp, ip, txn, &seq->seq_key, &seq->seq_data, 0)) != 0) { if (ret == DB_BUFFER_SMALL && @@ -369,11 +401,6 @@ err: if (txn_local && __os_free(env, seq->seq_key.data); seq->seq_key.data = NULL; } - /* Release replication block. 
*/ - if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) - ret = t_ret; - - ENV_LEAVE(env, ip); __dbt_userfree(env, keyp, NULL, NULL); return (ret); } @@ -386,10 +413,8 @@ err: if (txn_local && static int __seq_get_cachesize(seq, cachesize) DB_SEQUENCE *seq; - int32_t *cachesize; + u_int32_t *cachesize; { - SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_cachesize"); - *cachesize = seq->seq_cache_size; return (0); } @@ -402,25 +427,9 @@ __seq_get_cachesize(seq, cachesize) static int __seq_set_cachesize(seq, cachesize) DB_SEQUENCE *seq; - int32_t cachesize; + u_int32_t cachesize; { - ENV *env; - int ret; - - env = seq->seq_dbp->env; - - if (cachesize < 0) { - __db_errx(env, DB_STR("4007", - "Cache size must be >= 0")); - return (EINVAL); - } - - /* - * It's an error to specify a cache larger than the range of sequences. - */ - if (SEQ_IS_OPEN(seq) && (ret = __seq_chk_cachesize(env, - cachesize, seq->seq_rp->seq_max, seq->seq_rp->seq_min)) != 0) - return (ret); + SEQ_ILLEGAL_AFTER_OPEN(seq, "DB_SEQUENCE->set_cachesize"); seq->seq_cache_size = cachesize; return (0); @@ -437,8 +446,6 @@ __seq_get_flags(seq, flagsp) DB_SEQUENCE *seq; u_int32_t *flagsp; { - SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_flags"); - *flagsp = F_ISSET(seq->seq_rp, SEQ_SET_FLAGS); return (0); } @@ -480,8 +487,10 @@ __seq_set_flags(seq, flags) * __seq_initial_value -- * DB_SEQUENCE->initial_value. 
* + * PUBLIC: int __seq_initial_value __P((DB_SEQUENCE *, db_seq_t)); + * */ -static int +int __seq_initial_value(seq, value) DB_SEQUENCE *seq; db_seq_t value; @@ -515,8 +524,6 @@ __seq_get_range(seq, minp, maxp) DB_SEQUENCE *seq; db_seq_t *minp, *maxp; { - SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get_range"); - *minp = seq->seq_rp->seq_min; *maxp = seq->seq_rp->seq_max; return (0); @@ -557,14 +564,13 @@ __seq_update(seq, ip, txn, delta, flags) DB_SEQUENCE *seq; DB_THREAD_INFO *ip; DB_TXN *txn; - int32_t delta; - u_int32_t flags; + u_int32_t delta, flags; { DB *dbp; DBT *data, ldata; DB_SEQ_RECORD *rp; ENV *env; - int32_t adjust; + db_seq_t adjust; int ret, txn_local, need_mutex; dbp = seq->seq_dbp; @@ -721,29 +727,36 @@ err: if (need_mutex) { env, txn, LF_ISSET(DB_TXN_NOSYNC), ret) : ret); } -static int +/* + * __seq_get -- + * Internal get function for sequence. + * + * PUBLIC: int __seq_get + * PUBLIC: __P((DB_SEQUENCE *, DB_TXN *, u_int32_t, db_seq_t *, u_int32_t)); + */ +int __seq_get(seq, txn, delta, retp, flags) DB_SEQUENCE *seq; DB_TXN *txn; - int32_t delta; + u_int32_t delta, flags; db_seq_t *retp; - u_int32_t flags; { DB *dbp; DB_SEQ_RECORD *rp; DB_THREAD_INFO *ip; ENV *env; - int handle_check, ret, t_ret; + int handle_check, ret; dbp = seq->seq_dbp; env = dbp->env; rp = seq->seq_rp; ret = 0; + ENV_GET_THREAD_INFO(env, ip); STRIP_AUTO_COMMIT(flags); SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->get"); - if (delta < 0 || (delta == 0 && !LF_ISSET(DB_CURRENT))) { + if (delta == 0 && !LF_ISSET(DB_CURRENT)) { __db_errx(env, "Sequence delta must be greater than 0"); return (EINVAL); } @@ -754,16 +767,9 @@ __seq_get(seq, txn, delta, retp, flags) return (EINVAL); } - ENV_ENTER(env, ip); - - /* Check for replication block. 
*/ - handle_check = IS_ENV_REPLICATED(env); - if (handle_check && - (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(txn))) != 0) - return (ret); - MUTEX_LOCK(env, seq->mtx_seq); + handle_check = IS_ENV_REPLICATED(env); if (handle_check && IS_REP_CLIENT(env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE)) { ret = __db_rdonly(env, "DB_SEQUENCE->get"); @@ -799,6 +805,31 @@ __seq_get(seq, txn, delta, retp, flags) } err: MUTEX_UNLOCK(env, seq->mtx_seq); + return (ret); +} + +static int +__seq_get_pp(seq, txn, delta, retp, flags) + DB_SEQUENCE *seq; + DB_TXN *txn; + u_int32_t delta, flags; + db_seq_t *retp; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = seq->seq_dbp->env; + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(seq->seq_dbp, 1, 0, IS_REAL_TXN(txn))) != 0) + return (ret); + + ret = __seq_get(seq, txn, delta, retp, flags); /* Release replication block. */ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) @@ -868,8 +899,9 @@ __seq_close_pp(seq, flags) * __seq_close -- * Close a sequence * + * PUBLIC: int __seq_close __P((DB_SEQUENCE *, u_int32_t)); */ -static int +int __seq_close(seq, flags) DB_SEQUENCE *seq; u_int32_t flags; @@ -916,19 +948,24 @@ __seq_remove(seq, txn, flags) dbp = seq->seq_dbp; env = dbp->env; + handle_check = 0; + ret = 0; txn_local = 0; - SEQ_ILLEGAL_BEFORE_OPEN(seq, "DB_SEQUENCE->remove"); + if (!SEQ_IS_OPEN(seq)) + ret = __db_mi_open(env, "DB_SEQUENCE->remove", 0); /* * Flags can only be 0, unless the database has DB_AUTO_COMMIT enabled. * Then DB_TXN_NOSYNC is allowed. */ - if (flags != 0 && + if (ret == 0 && flags != 0 && (flags != DB_TXN_NOSYNC || !IS_DB_AUTO_COMMIT(dbp, txn))) - return (__db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0)); + ret = __db_ferr(env, "DB_SEQUENCE->remove illegal flag", 0); ENV_ENTER(env, ip); + if (ret != 0) + goto err; /* Check for replication block. 
*/ handle_check = IS_ENV_REPLICATED(env); @@ -945,7 +982,7 @@ __seq_remove(seq, txn, flags) */ if (IS_DB_AUTO_COMMIT(dbp, txn)) { if ((ret = __txn_begin(env, ip, NULL, &txn, flags)) != 0) - return (ret); + goto err; txn_local = 1; } @@ -955,13 +992,14 @@ __seq_remove(seq, txn, flags) ret = __db_del(dbp, ip, txn, &seq->seq_key, 0); +err: if ((t_ret = __seq_close(seq, 0)) != 0 && ret == 0) ret = t_ret; /* Release replication block. */ if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) ret = t_ret; -err: if (txn_local && (t_ret = + if (txn_local && (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0) ret = t_ret; @@ -976,7 +1014,7 @@ err: if (txn_local && (t_ret = static int __seq_chk_cachesize(env, cachesize, max, min) ENV *env; - int32_t cachesize; + u_int32_t cachesize; db_seq_t max, min; { /* diff --git a/src/txn/txn.c b/src/txn/txn.c index 81225e5c..91652cb7 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1995, 1996 @@ -227,8 +227,15 @@ __txn_begin(env, ip, parent, txnpp, flags) if (LF_ISSET(DB_TXN_FAMILY)) F_SET(txn, TXN_FAMILY | TXN_INFAMILY | TXN_READONLY); if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) || - (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT))) - F_SET(txn, TXN_SNAPSHOT); + (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT))) { + if (IS_REP_CLIENT(env)) { + __db_errx(env, DB_STR("4572", + "DB_TXN_SNAPSHOT may not be used on a replication client")); + ret = (EINVAL); + goto err; + } else + F_SET(txn, TXN_SNAPSHOT); + } if (LF_ISSET(DB_IGNORE_LEASE)) F_SET(txn, TXN_IGNORE_LEASE); @@ -581,8 +588,7 @@ __txn_continue(env, txn, td, ip, add_to_list) txn->set_timeout = __txn_set_timeout; txn->set_txn_lsnp = __txn_set_txn_lsnp; - /* XXX Do we need to explicitly set a SYNC flag here? */ - txn->flags = TXN_MALLOC | + txn->flags = TXN_MALLOC | TXN_SYNC | (F_ISSET(td, TXN_DTL_NOWAIT) ? TXN_NOWAIT : 0); txn->xa_thr_status = TXN_XA_THREAD_NOTA; @@ -795,8 +801,9 @@ __txn_commit(txn, flags) if (ret == 0) { DB_LSN s_lsn; - DB_ASSERT(env, __log_current_lsn_int( - env, &s_lsn, NULL, NULL) == 0); + if ((ret = __log_current_lsn_int( + env, &s_lsn, NULL, NULL)) != 0) + goto err; DB_ASSERT(env, LOG_COMPARE( &td->visible_lsn, &s_lsn) <= 0); COMPQUIET(s_lsn.file, 0); @@ -890,17 +897,16 @@ static int __txn_close_cursors(txn) DB_TXN *txn; { - int ret, tret; + int ret, t_ret; DBC *dbc; - ret = tret = 0; + ret = t_ret = 0; dbc = NULL; if (txn == NULL) return (0); while ((dbc = TAILQ_FIRST(&txn->my_cursors)) != NULL) { - DB_ASSERT(dbc->env, txn == dbc->txn); /* @@ -913,21 +919,21 @@ __txn_close_cursors(txn) /* Removed from the active queue here. */ if (F_ISSET(dbc, DBC_ACTIVE)) - ret = __dbc_close(dbc); + t_ret = __dbc_close(dbc); dbc->txn = NULL; /* We have to close all cursors anyway, so continue on error. 
*/ - if (ret != 0) { - __db_err(dbc->env, ret, "__dbc_close"); - if (tret == 0) - tret = ret; + if (t_ret != 0) { + __db_err(dbc->env, t_ret, "__dbc_close"); + if (ret == 0) + ret = t_ret; } } txn->my_cursors.tqh_first = NULL; txn->my_cursors.tqh_last = NULL; - return (tret);/* Return the first error if any. */ + return (ret); /* Return the first error, if any. */ } /* @@ -1050,7 +1056,7 @@ __txn_abort(txn) * it, however make sure that it is aborted when the last process * tries to abort it. */ - if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) { + if (txn->xa_thr_status != TXN_XA_THREAD_NOTA && td->xa_ref > 1) { td->status = TXN_NEED_ABORT; return (0); } @@ -2165,5 +2171,5 @@ __txn_applied(env, ip, commit_info, timeout) if (renv->envid == commit_info->envid && LOG_COMPARE(&commit_info->lsn, &lsn) <= 0) return (0); - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); } diff --git a/src/txn/txn.src b/src/txn/txn.src index 7e82dc82..d9af5318 100644 --- a/src/txn/txn.src +++ b/src/txn/txn.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/txn/txn_chkpt.c b/src/txn/txn_chkpt.c index 73715b10..a909767f 100644 --- a/src/txn/txn_chkpt.c +++ b/src/txn/txn_chkpt.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1995, 1996 @@ -377,7 +377,7 @@ __txn_getckp(env, lsnp) TXN_SYSTEM_UNLOCK(env); if (IS_ZERO_LSN(lsn)) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); *lsnp = lsn; return (0); diff --git a/src/txn/txn_failchk.c b/src/txn/txn_failchk.c index b2007ad6..94f22ec2 100644 --- a/src/txn/txn_failchk.c +++ b/src/txn/txn_failchk.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -57,7 +57,7 @@ retry: TXN_SYSTEM_LOCK(env); if (F_ISSET(td, TXN_DTL_INMEMORY)) { TXN_SYSTEM_UNLOCK(env); - return (__db_failed(env, DB_STR("4501", + return (__db_failed(env, DB_STR("4573", "Transaction has in memory logs"), td->pid, td->tid)); } diff --git a/src/txn/txn_method.c b/src/txn/txn_method.c index 629eac04..357e78c6 100644 --- a/src/txn/txn_method.c +++ b/src/txn/txn_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/txn/txn_rec.c b/src/txn/txn_rec.c index b39d56d1..708af98a 100644 --- a/src/txn/txn_rec.c +++ b/src/txn/txn_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1996 @@ -210,11 +210,12 @@ __txn_prepare_recover(env, dbtp, lsnp, op, info) */ else if ((ret = __db_txnlist_remove(env, info, argp->txnp->txnid)) != 0) { -txn_err: __db_errx(env, +txn_err: + ret = USR_ERR(env, DB_NOTFOUND); + __db_errx(env, DB_STR_A("4515", "transaction not in list %lx", "%lx"), (u_long)argp->txnp->txnid); - ret = DB_NOTFOUND; } else if (IS_ZERO_LSN(headp->trunc_lsn) || LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) { if ((ret = __db_txnlist_add(env, diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index 67f24439..915a289f 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -305,8 +305,8 @@ __txn_openfiles(env, ip, min, force) if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0) goto err; - ret = __env_openfiles( - env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0); + ret = __env_openfiles(env, + logc, txninfo, &data, &open_lsn, NULL, (double)0, 0); if (txninfo != NULL) __db_txnlist_end(env, txninfo); diff --git a/src/txn/txn_region.c b/src/txn/txn_region.c index 6f43d45f..7fef66e6 100644 --- a/src/txn/txn_region.c +++ b/src/txn/txn_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -13,6 +13,7 @@ #include "dbinc/txn.h" static int __txn_init __P((ENV *, DB_TXNMGR *)); +static int lsn_hi_to_low __P((const void *, const void *)); /* * __txn_open -- @@ -57,12 +58,30 @@ __txn_open(env) env->tx_handle = mgr; return (0); -err: env->tx_handle = NULL; - if (mgr->reginfo.addr != NULL) - (void)__env_region_detach(env, &mgr->reginfo, 0); +err: (void)__mutex_free(env, &mgr->mutex); + (void)__txn_region_detach(env, mgr); - (void)__mutex_free(env, &mgr->mutex); - __os_free(env, mgr); + return (ret); +} + +/* + * __txn_region_detach -- + * + * PUBLIC: int __txn_region_detach __P((ENV *, DB_TXNMGR *)); + */ +int +__txn_region_detach(env, mgr) + ENV *env; + DB_TXNMGR *mgr; +{ + int ret; + + ret = 0; + if (mgr != NULL) { + ret = __env_region_detach(env, &mgr->reginfo, 0); + __os_free(env, mgr); + env->tx_handle = NULL; + } return (ret); } @@ -409,39 +428,101 @@ __txn_id_set(env, cur_txnid, max_txnid) } /* - * __txn_oldest_reader -- - * Find the oldest "read LSN" of any active transaction' - * MVCC changes older than this can safely be discarded from the cache. + * lsn_hi_to_low -- + * Compare lsns, sorting them from high to low. This is the opposite of + * __rep_lsn_cmp. + */ +static int +lsn_hi_to_low(lsn1, lsn2) + const void *lsn1, *lsn2; +{ + return (LOG_COMPARE((DB_LSN *)lsn2, (DB_LSN *)lsn1)); +} + +/* + * __txn_get_readers -- + * Find the read LSN of all active transactions. + * MVCC versions older than the oldest active transaction can safely be + * discarded from the cache. MVCC versions not quite so old can be + * discarded if they are not visible to any active transaction. * - * PUBLIC: int __txn_oldest_reader __P((ENV *, DB_LSN *)); + * Returns: + * An error code, or 0. + * If 0 was returned, *readers has been filled in with an __os_malloc()'d + * array of active transactions with read_lsns, sorted from newest + * (largest) to oldest (smallest). *ntxnsp indicates how many are there. 
+ * The last lsn is that of the oldest active mvcc-supporting transaction. + * The caller must __os_free() *readers whenever it is non-NULL. + * + * PUBLIC: int __txn_get_readers __P((ENV *, DB_LSN **, int *)); */ +#define TXN_READERS_SIZE 64 /* Initial number of LSNs to allocate. */ int -__txn_oldest_reader(env, lsnp) +__txn_get_readers(env, readers, ntxnsp) ENV *env; - DB_LSN *lsnp; + DB_LSN **readers; + int *ntxnsp; { - DB_LSN old_lsn; + DB_LSN current, *lsns; DB_TXNMGR *mgr; DB_TXNREGION *region; TXN_DETAIL *td; - int ret; + int cmp, is_sorted, ret; + unsigned count, txnmax; + + *ntxnsp = 0; + *readers = NULL; if ((mgr = env->tx_handle) == NULL) return (0); region = mgr->reginfo.primary; + lsns = NULL; + + if ((ret = __log_current_lsn_int(env, &current, NULL, NULL)) != 0) + return (ret); - if ((ret = __log_current_lsn_int(env, &old_lsn, NULL, NULL)) != 0) + txnmax = TXN_READERS_SIZE; + if ((ret = __os_malloc(env, txnmax * sizeof(lsns[0]), &lsns)) != 0) return (ret); TXN_SYSTEM_LOCK(env); - SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail) - if (LOG_COMPARE(&td->read_lsn, &old_lsn) < 0) - old_lsn = td->read_lsn; + /* The array always has at least the current lsn. */ + lsns[0] = current; + count = 1; + is_sorted = TRUE; - *lsnp = old_lsn; + /* + * Build up our array in most-recent (largest) to first-started (oldest) + * order. Delete adjacent dups. Detect when the txns need to be sorted. 
+ */ + SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail) { + if (IS_MAX_LSN(td->read_lsn) || + (cmp = LOG_COMPARE(&td->read_lsn, &lsns[count - 1])) == 0) + continue; + if (cmp > 0) + is_sorted = FALSE; + if (count >= txnmax) { + txnmax += txnmax; + if ((ret = __os_realloc(env, + txnmax * sizeof(lsns[0]), &lsns)) != 0) + goto err; + } + lsns[count] = td->read_lsn; + count++; + } + +err: TXN_SYSTEM_UNLOCK(env); - return (0); + if (ret != 0) + __os_free(env, lsns); + else { + if (!is_sorted) + qsort(lsns, count, sizeof(lsns[0]), lsn_hi_to_low); + *ntxnsp = (int)count; + *readers = lsns; + } + return (ret); } /* diff --git a/src/txn/txn_stat.c b/src/txn/txn_stat.c index 62fe622d..231ac3c5 100644 --- a/src/txn/txn_stat.c +++ b/src/txn/txn_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/txn/txn_util.c b/src/txn/txn_util.c index 0ecd7f6c..9f3b8cf6 100644 --- a/src/txn/txn_util.c +++ b/src/txn/txn_util.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/lock.h" #include "dbinc/mp.h" @@ -209,7 +210,7 @@ __txn_remlock(env, txn, lock, locker) for (e = TAILQ_FIRST(&txn->events); e != NULL; e = next_e) { next_e = TAILQ_NEXT(e, links); - if ((e->op != TXN_TRADE && e->op != TXN_TRADED && + if ((e->op != TXN_TRADE && e->op != TXN_TRADED && e->op != TXN_XTRADE) || (e->u.t.lock.off != lock->off && e->u.t.locker != locker)) continue; @@ -280,13 +281,21 @@ __txn_doevents(env, txn, opcode, preprocess) e != NULL; e = enext) { enext = TAILQ_NEXT(e, links); /* - * Move all exclusive handle locks and + * Move all exclusive handle locks and * read handle locks to the handle locker. */ if (!(opcode == TXN_COMMIT && e->op == TXN_XTRADE) && - (e->op != TXN_TRADE || - IS_WRITELOCK(e->u.t.lock.mode))) + (e->op != TXN_TRADE || + IS_WRITELOCK(e->u.t.lock.mode))) { + if (opcode == TXN_PREPARE && + e->op == TXN_REMOVE) { + __db_errx(env, DB_STR_A("4501", +"TXN->prepare is not allowed because this transaction removes \"%s\"", "%s"), + e->u.r.name); + return (EINVAL); + } continue; + } DO_TRADE; if (txn->parent != NULL) { TAILQ_REMOVE(&txn->events, e, links); @@ -321,17 +330,26 @@ __txn_doevents(env, txn, opcode, preprocess) ret = t_ret; break; case TXN_REMOVE: - if (txn->parent != NULL) + if (txn->parent != NULL) { TAILQ_INSERT_TAIL( &txn->parent->events, e, links); - else if (e->u.r.fileid != NULL) { + continue; + } else if (e->u.r.fileid != NULL) { if ((t_ret = __memp_nameop(env, e->u.r.fileid, NULL, e->u.r.name, NULL, e->u.r.inmem)) != 0 && ret == 0) ret = t_ret; - } else if ((t_ret = - __os_unlink(env, e->u.r.name, 0)) != 0 && ret == 0) - ret = t_ret; + } else if ((t_ret = __os_unlink( + env, e->u.r.name, 0)) != 0 && ret == 0) { + /* + * It is possible for blob files to be deleted + * multiple times when truncating a database, + * so ignore ENOENT errors with blob files. 
+ */ + if (t_ret != ENOENT || strstr( + e->u.r.name, BLOB_FILE_PREFIX) == NULL) + ret = t_ret; + } break; case TXN_TRADE: case TXN_XTRADE: @@ -371,8 +389,6 @@ dofree: /* Free resources here. */ switch (e->op) { case TXN_REMOVE: - if (txn->parent != NULL) - continue; if (e->u.r.fileid != NULL) __os_free(env, e->u.r.fileid); __os_free(env, e->u.r.name); @@ -548,9 +564,8 @@ __txn_reset_fe_watermarks(txn) { DB *db; - if (txn->parent) { + if (txn->parent) DB_ASSERT(txn->mgrp->env, TAILQ_FIRST(&txn->femfs) == NULL); - } while ((db = TAILQ_FIRST(&txn->femfs))) __clear_fe_watermark(txn, db); diff --git a/src/xa/xa.c b/src/xa/xa.c index ee75e792..5ce7842f 100644 --- a/src/xa/xa.c +++ b/src/xa/xa.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -233,8 +233,8 @@ __xa_put_txn(env, txnp) SH_TAILQ_REMOVE(&ip->dbth_xatxn, txnp, xa_links, __db_txn); TAILQ_REMOVE(&txnp->mgrp->txn_chain, txnp, links); td = txnp->td; - DB_ASSERT(env, td->xa_ref > 0); - td->xa_ref--; + if (td->xa_ref > 0) + td->xa_ref--; __os_free(env, txnp); ip->dbth_xa_status = TXN_XA_THREAD_UNASSOCIATED; } @@ -852,9 +852,9 @@ __db_xa_commit(xid, rmid, arg_flags) return (ret); /* - * Because this transaction is currently associated, commit will not free - * the transaction structure, which is good, because we need to do that - * in xa_put_txn below. + * Because this transaction is currently associated, commit will + * not free the transaction structure, which is good, because we + * need to do that in xa_put_txn below. */ if ((ret = txnp->commit(txnp, 0)) != 0) { dbenv->err(dbenv, ret, DB_STR("4563", diff --git a/src/xa/xa_map.c b/src/xa/xa_map.c index 4dcf4d75..9fd50185 100644 --- a/src/xa/xa_map.c +++ b/src/xa/xa_map.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ |