diff options
Diffstat (limited to 'src/db')
37 files changed, 2088 insertions, 395 deletions
diff --git a/src/db/crdel.src b/src/db/crdel.src index 70473899..a1cbc0ed 100644 --- a/src/db/crdel.src +++ b/src/db/crdel.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/crdel_rec.c b/src/db/crdel_rec.c index 08e7bae8..2c529627 100644 --- a/src/db/crdel_rec.c +++ b/src/db/crdel_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -81,7 +81,7 @@ __crdel_metasub_recover(env, dbtp, lsnp, op, info) /* * If this was an in-memory database and we are re-creating * and this is the meta-data page, then we need to set up a - * bunch of fields in the dbo as well. + * bunch of fields in the dbp as well. */ if (F_ISSET(file_dbp, DB_AM_INMEM) && argp->pgno == PGNO_BASE_MD && diff --git a/src/db/db.c b/src/db/db.c index 0d9d1e6e..ffeb6d2b 100644 --- a/src/db/db.c +++ b/src/db/db.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -41,6 +41,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc_auto/sequence_ext.h" #include "dbinc/db_page.h" #include "dbinc/db_swap.h" #include "dbinc/btree.h" @@ -92,6 +93,9 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp) if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0) return (ret); + /* Set the creation directory. */ + dbp->dirname = subdbp->dirname; + /* * It's always a btree. * Run in the transaction we've created. 
@@ -105,6 +109,20 @@ __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp) DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE)); /* + * If creating the master database, disable blobs, but assign it a + * blob file id if blobs are enabled in the subdatabase. This means + * that subdatabases can only support blobs if the first subdatabase + * supports blobs. This is a temporary restriction, but is needed at + * the moment to prevent an infinite loop. + */ + dbp->blob_threshold = 0; + if (LF_ISSET(DB_CREATE) && subdbp->blob_threshold != 0) { + if ((ret = __blob_generate_dir_ids( + dbp, txn, &dbp->blob_file_id)) != 0) + return (ret); + } + + /* * If there was a subdb specified, then we only want to apply * DB_EXCL to the subdb, not the actual file. We only got here * because there was a subdb specified. @@ -819,6 +837,21 @@ __db_refresh(dbp, txn, flags, deferred_closep, reuse) if (dbp->mpf == NULL) LF_SET(DB_NOSYNC); +#ifdef HAVE_64BIT_TYPES + /* Close the blob meta data databases. */ + if (dbp->blob_seq != NULL) { + if ((t_ret = __seq_close(dbp->blob_seq, 0)) != 0 && ret == 0) + ret = t_ret; + dbp->blob_seq = NULL; + } + if (dbp->blob_meta_db != NULL) { + if ((t_ret = __db_close( + dbp->blob_meta_db, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + dbp->blob_meta_db = NULL; + } +#endif + /* If never opened, or not currently open, it's easy. */ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) goto never_opened; @@ -1164,6 +1197,10 @@ never_opened: __os_free(dbp->env, dbp->dname); dbp->dname = NULL; } + if (dbp->blob_sub_dir != NULL) { + __os_free(dbp->env, dbp->blob_sub_dir); + dbp->blob_sub_dir = NULL; + } /* Discard any memory used to store returned data. 
*/ if (dbp->my_rskey.data != NULL) @@ -1235,8 +1272,11 @@ __db_disassociate(sdbp) sdbp->s_refcnt = 0; while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) - if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0) - ret = t_ret; + if ((t_ret = __dbc_destroy(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } F_CLR(sdbp, DB_AM_SECONDARY); return (ret); diff --git a/src/db/db.src b/src/db/db.src index 879c7856..4a90ac16 100644 --- a/src/db/db.src +++ b/src/db/db.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_am.c b/src/db/db_am.c index 1cf3a505..84bb04bb 100644 --- a/src/db/db_am.c +++ b/src/db/db_am.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -205,6 +205,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) /* Refresh the DBC structure. */ dbc->dbtype = dbtype; RESET_RET_MEM(dbc); + dbc->db_stream = __dbc_db_stream; dbc->set_priority = __dbc_set_priority; dbc->get_priority = __dbc_get_priority; dbc->priority = dbp->priority; @@ -314,11 +315,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) if (F2_ISSET(dbp, DB2_AM_EXCL)) { F_SET(dbc, DBC_DONTLOCK); if (IS_REAL_TXN(txn)&& !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) { - /* - * Exclusive databases can only have one active - * transaction at a time since there are no internal + /* + * Exclusive databases can only have one active + * transaction at a time since there are no internal * locks to prevent one transaction from reading and - * writing another's uncommitted changes. + * writing another's uncommitted changes. 
*/ if (dbp->cur_txn != NULL && dbp->cur_txn != txn) { __db_errx(env, DB_STR("0749", @@ -332,7 +333,7 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) memset(&req, 0, sizeof(req)); req.lock = dbp->handle_lock; req.op = DB_LOCK_TRADE; - if ((ret = __lock_vec(env, txn->locker, 0, + if ((ret = __lock_vec(env, txn->locker, 0, &req, 1, 0)) != 0) goto err; dbp->cur_txn = txn; @@ -397,10 +398,11 @@ __db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) if (ip != NULL) { dbc->thread_info = ip; #ifdef DIAGNOSTIC - if (dbc->locker != NULL) + if (dbc->locker != NULL) { + dbc->locker->prev_locker = ip->dbth_locker; ip->dbth_locker = R_OFFSET(&(env->lk_handle->reginfo), dbc->locker); - else + } else ip->dbth_locker = INVALID_ROFF; #endif } else if (txn != NULL) diff --git a/src/db/db_backup.c b/src/db/db_backup.c index 66d7382a..1c72e4d7 100644 --- a/src/db/db_backup.c +++ b/src/db/db_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -24,8 +24,9 @@ static int backup_read_data_dir __P((DB_ENV *, DB_THREAD_INFO *, const char *, const char *, u_int32_t)); static int backup_dir_clean __P((DB_ENV *, const char *, const char *, int *, u_int32_t)); -static int backup_data_copy - __P((DB_ENV *, const char *, const char *, const char *, int)); +static int backup_lgconf_chk __P((DB_ENV *)); +static int __db_backup + __P((DB_ENV *, const char *, DB_THREAD_INFO *, int, u_int32_t)); /* * __db_dbbackup_pp -- @@ -47,9 +48,9 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags) "DB_ENV->dbbackup", flags, DB_EXCL)) != 0) return (ret); ENV_ENTER(dbenv->env, ip); - - ret = __db_dbbackup(dbenv, ip, dbfile, target, flags); - + REPLICATION_WRAP(dbenv->env, + (__db_dbbackup( + dbenv, ip, dbfile, target, flags, 0, NULL)), 0, ret); ENV_LEAVE(dbenv->env, ip); return (ret); } @@ -58,15 +59,17 @@ __db_dbbackup_pp(dbenv, dbfile, target, flags) * __db_dbbackup -- * Copy a database file coordinated with mpool. * - * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, - * PUBLIC: const char *, const char *, u_int32_t)); + * PUBLIC: int __db_dbbackup __P((DB_ENV *, DB_THREAD_INFO *, const char *, + * PUBLIC: const char *, u_int32_t, u_int32_t, const char *)); */ int -__db_dbbackup(dbenv, ip, dbfile, target, flags) +__db_dbbackup(dbenv, ip, dbfile, target, flags, oflags, full_path) DB_ENV *dbenv; DB_THREAD_INFO *ip; const char *dbfile, *target; u_int32_t flags; + u_int32_t oflags; + const char *full_path; { DB *dbp; DB_FH *fp; @@ -77,8 +80,8 @@ __db_dbbackup(dbenv, ip, dbfile, target, flags) retry_count = 0; retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && - (ret = __db_open(dbp, ip, NULL, dbfile, NULL, - DB_UNKNOWN, DB_AUTO_COMMIT | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) { + (ret = __db_open(dbp, ip, NULL, dbfile, NULL, DB_UNKNOWN, + DB_AUTO_COMMIT | DB_RDONLY | oflags, 0, PGNO_BASE_MD)) != 0) { if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) { (void)__db_close(dbp, NULL, DB_NOSYNC); 
dbp = NULL; @@ -91,9 +94,16 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && } } + /* Hot backup requires DB_LOG_BLOB. */ + if (ret == 0 && dbp->blob_threshold != 0 && + (ret = backup_lgconf_chk(dbenv)) != 0) + goto err; + + if (full_path == NULL) + full_path = dbfile; if (ret == 0) { if ((ret = __memp_backup_open(dbenv->env, - dbp->mpf, dbfile, target, flags, &fp, &handle)) == 0) { + dbp->mpf, full_path, target, flags, &fp, &handle)) == 0) { if (dbp->type == DB_HEAP) ret = __heap_backup( dbenv, dbp, ip, fp, handle, flags); @@ -104,10 +114,21 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && fp, handle, flags); } if ((t_ret = __memp_backup_close(dbenv->env, - dbp->mpf, dbfile, fp, handle)) != 0 && ret == 0) + dbp->mpf, full_path, fp, handle)) != 0 && ret == 0) ret = t_ret; } + /* + * Copy blob files. Since no locking is done here, it is possible + * that a blob file may be copied in the middle of being written. + * This is not a problem since hotbackup requires DB_LOG_BLOB and + * catastrophic recovery, which will fix any inconsistencies in the + * blob files. + */ + if (ret == 0 && dbp->blob_threshold != 0 && + (t_ret = __blob_copy_all(dbp, target, flags)) != 0) + ret= t_ret; + #ifdef HAVE_QUEUE /* * For compatibility with the 5.2 and patch versions of db_copy @@ -117,7 +138,7 @@ retry: if ((ret = __db_create_internal(&dbp, dbenv->env, 0)) == 0 && ret = __qam_backup_extents(dbp, ip, target, flags); #endif - if (dbp != NULL && +err: if (dbp != NULL && (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) ret = t_ret; @@ -205,8 +226,11 @@ backup_dir_clean(dbenv, backup_dir, log_dir, remove_maxp, flags) /* * backup_data_copy -- * Copy a non-database file into the backup directory. 
+ * + * PUBLIC: int backup_data_copy __P(( + * PUBLIC: DB_ENV *, const char *, const char *, const char *, int)); */ -static int +int backup_data_copy(dbenv, file, from_dir, to_dir, log) DB_ENV *dbenv; const char *file, *from_dir, *to_dir; @@ -352,13 +376,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) ENV *env; FILE *savefile; int fcnt, ret; - size_t cnt; + size_t cnt, len; const char *bd; char **names, buf[DB_MAXPATHLEN], bbuf[DB_MAXPATHLEN]; + char fullpath[DB_MAXPATHLEN]; void (*savecall) (const DB_ENV *, const char *, const char *); env = dbenv->env; memset(bbuf, 0, sizeof(bbuf)); + memset(fullpath, 0, sizeof(fullpath)); + len = 0; bd = backup_dir; if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && dir != env->db_home) { @@ -401,6 +428,12 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) "%s: path too long", "%s"), buf); return (EINVAL); } + /* Save the original dir. */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR)) { + (void)snprintf(fullpath, sizeof(fullpath), + "%s%c%c", dir, PATH_SEPARATOR[0], '\0'); + len = strlen(fullpath); + } dir = buf; } /* Get a list of file names. */ @@ -449,7 +482,16 @@ backup_read_data_dir(dbenv, ip, dir, backup_dir, flags) savefile = dbenv->db_errfile; dbenv->db_errfile = NULL; - ret = __db_dbbackup(dbenv, ip, names[cnt], bd, flags); + /* + * If it is not backing up to a single directory, prefix + * the file with 'dir' so that the file and directory structure + * in the source and backup location will be the same. + */ + if (len != 0) + (void)snprintf(fullpath + len, + sizeof(fullpath) - len, "%s%c", names[cnt], '\0'); + ret = __db_dbbackup(dbenv, ip, names[cnt], + backup_dir, flags, 0, len != 0 ? fullpath : NULL); dbenv->db_errcall = savecall; dbenv->db_errfile = savefile; @@ -662,21 +704,22 @@ err: if (logd != dbenv->db_log_dir && logd != env->db_home) * __db_backup -- * Backup databases in the enviornment. 
* - * PUBLIC: int __db_backup __P((DB_ENV *, const char *, u_int32_t)); + * PUBLIC: int __db_backup_pp __P((DB_ENV *, const char *, u_int32_t)); */ int -__db_backup(dbenv, target, flags) +__db_backup_pp(dbenv, target, flags) DB_ENV *dbenv; const char *target; u_int32_t flags; { DB_THREAD_INFO *ip; ENV *env; - int copy_min, remove_max, ret; - char **dir; + u_int32_t bytes; + int remove_max, ret; env = dbenv->env; - remove_max = copy_min = 0; + bytes = 0; + remove_max = 0; #undef OKFLAGS #define OKFLAGS \ @@ -692,6 +735,11 @@ __db_backup(dbenv, target, flags) return (EINVAL); } + /* Hot backup requires DB_LOG_BLOB. */ + if ((ret = __env_get_blob_threshold_int(env, &bytes)) != 0 || + (bytes != 0 && (ret = backup_lgconf_chk(dbenv)) != 0)) + return (ret); + /* * If the target directory for the backup does not exist, create it * with mode read-write-execute for the owner. Ignore errors here, @@ -714,6 +762,30 @@ __db_backup(dbenv, target, flags) } ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__db_backup(dbenv, target, ip, remove_max, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_backup -- + * Backup databases in the enviornment. + */ +static int +__db_backup(dbenv, target, ip, remove_max, flags) + DB_ENV *dbenv; + const char *target; + DB_THREAD_INFO *ip; + int remove_max; + u_int32_t flags; +{ + ENV *env; + int copy_min, ret; + char **dir; + + env = dbenv->env; + copy_min = 0; /* * If the UPDATE option was not specified, copy all database @@ -724,6 +796,19 @@ __db_backup(dbenv, target, flags) goto end; F_SET(dbenv, DB_ENV_HOTBACKUP); if (!LF_ISSET(DB_BACKUP_UPDATE)) { + /* + * Don't allow absolute path of blob directory when + * it is not backing up to a single directory. 
+ */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + dbenv->db_blob_dir != NULL && + __os_abspath(dbenv->db_blob_dir)) { + __db_errx(env, DB_STR_A("0780", +"blob directory '%s' is absolute path, not permitted unless backup is to a single directory", + "%s"), dbenv->db_blob_dir); + ret = EINVAL; + goto err; + } if ((ret = backup_read_data_dir(dbenv, ip, env->db_home, target, flags)) != 0) goto err; @@ -734,8 +819,8 @@ __db_backup(dbenv, target, flags) * enviroment -- running recovery with them would * corrupt the source files. */ - if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) - && __os_abspath(*dir)) { + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + __os_abspath(*dir)) { __db_errx(env, DB_STR_A("0725", "data directory '%s' is absolute path, not permitted unless backup is to a single directory", "%s"), *dir); @@ -751,7 +836,17 @@ __db_backup(dbenv, target, flags) /* * Copy all log files found in the log directory. * The log directory defaults to the home directory. + * Don't allow absolute path of log directory when + * it is not backing up to a single directory. */ + if (!LF_ISSET(DB_BACKUP_SINGLE_DIR) && + dbenv->db_log_dir != NULL && __os_abspath(dbenv->db_log_dir)) { + __db_errx(env, DB_STR_A("0781", +"log directory '%s' is absolute path, not permitted unless backup is to a single directory", + "%s"), dbenv->db_log_dir); + ret = EINVAL; + goto err; + } if ((ret = backup_read_log_dir(dbenv, target, &copy_min, flags)) != 0) goto err; /* @@ -761,7 +856,7 @@ __db_backup(dbenv, target, flags) * cleanup. 
*/ if (LF_ISSET(DB_BACKUP_UPDATE) && remove_max < copy_min && - !(remove_max == 0 && copy_min == 1)) { + remove_max != 0 && copy_min != 1) { __db_errx(env, DB_STR_A("0743", "the largest log file removed (%d) must be greater than or equal the smallest log file copied (%d)", "%d %d"), remove_max, copy_min); @@ -770,6 +865,28 @@ __db_backup(dbenv, target, flags) err: F_CLR(dbenv, DB_ENV_HOTBACKUP); (void)__env_set_backup(env, 0); -end: ENV_LEAVE(env, ip); +end: return (ret); +} + +/* + * backup_lgconf_chk -- + * Log configuration check for backup when blob is enabled. + */ +static int +backup_lgconf_chk(dbenv) + DB_ENV *dbenv; +{ + int lgconf, ret; + + ret = 0; + + if (LOGGING_ON(dbenv->env) && ((ret = __log_get_config(dbenv, + DB_LOG_BLOB, &lgconf)) != 0 || lgconf == 0)) { + __db_errx(dbenv->env, DB_STR("0782", + "Hot backup requires DB_LOG_BLOB")); + if (ret == 0) + ret = EINVAL; + } + return (ret); } diff --git a/src/db/db_cam.c b/src/db/db_cam.c index 6ee8b579..1a330bdb 100644 --- a/src/db/db_cam.c +++ b/src/db/db_cam.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -11,6 +11,7 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" +#include "dbinc/fop.h" #include "dbinc/hash.h" #include "dbinc/heap.h" #include "dbinc/lock.h" @@ -83,6 +84,9 @@ __dbc_close(dbc) DB *dbp; DBC *opd; DBC_INTERNAL *cp; +#ifdef DIAGNOSTIC + DB_THREAD_INFO *ip; +#endif DB_TXN *txn; ENV *env; int ret, t_ret; @@ -149,6 +153,14 @@ __dbc_close(dbc) ret = t_ret; F_CLR(dbc, DBC_FAMILY); } +#ifdef DIAGNOSTIC + if (dbc->locker != NULL) { + ENV_GET_THREAD_INFO(env, ip); + if (ip != NULL) + ip->dbth_locker = dbc->locker->prev_locker; + dbc->locker->prev_locker = INVALID_ROFF; + } +#endif if ((txn = dbc->txn) != NULL) txn->cursors--; @@ -510,6 +522,305 @@ __dbc_idel(dbc, flags) return (ret); } +/* + * __dbc_db_stream -- + * + * DBC->db_stream + * + * PUBLIC: int __dbc_db_stream __P((DBC *, DB_STREAM **, u_int32_t)); + */ +int +__dbc_db_stream(dbc, dbsp, flags) + DBC *dbc; + DB_STREAM **dbsp; + u_int32_t flags; +{ + ENV *env; + int ret; + u_int32_t oflags; + + env = dbc->env; + oflags = flags; + + if ((ret = __db_fchk( + env, "DBC->db_stream", flags, + DB_STREAM_READ | DB_STREAM_WRITE | DB_STREAM_SYNC_WRITE)) != 0) + return (ret); + + if (DB_IS_READONLY(dbc->dbp)) { + LF_SET(DB_STREAM_READ); + oflags |= DB_STREAM_READ; + } + if (LF_ISSET(DB_STREAM_READ) && LF_ISSET(DB_STREAM_WRITE)) { + ret = EINVAL; + __db_errx(env, DB_STR("0750", + "Error, cannot set both DB_STREAM_WRITE and DB_STREAM_READ.")); + goto err; + } + + if (oflags & DB_STREAM_READ) + LF_SET(DB_FOP_READONLY); + else + LF_SET(DB_FOP_WRITE); + if (oflags & DB_STREAM_SYNC_WRITE) + LF_SET(DB_FOP_SYNC_WRITE); + + ret = __db_stream_init(dbc, dbsp, flags); + +err: return (ret); +} + +/* + * __dbc_get_blob_id -- + * + * Returns the blob id stored in the data record to which the cursor currently + * points. Returns EINVAL if the cursor does not point to a blob record. 
+ * + * PUBLIC: int __dbc_get_blob_id __P((DBC *, db_seq_t *)); + */ +int +__dbc_get_blob_id(dbc, blob_id) + DBC *dbc; + db_seq_t *blob_id; +{ + DBT key, data; + BBLOB bl; + HBLOB hbl; + HEAPBLOBHDR bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + if (data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bl, data.data, BBLOB_SIZE); + if (B_TYPE(bl.type) != B_BLOB) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)bl.id; + break; + case DB_HEAP: + if (data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE); + if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)bhdr.id; + break; + case DB_HASH: + if (data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&hbl, data.data, HBLOB_SIZE); + if (HPAGE_PTYPE(&hbl) != H_BLOB) { + ret = EINVAL; + goto err; + } + *blob_id = (db_seq_t)hbl.id; + break; + default: + ret = EINVAL; + goto err; + } + +err: return (ret); +} + +/* + * __dbc_get_blob_size -- + * + * Returns the blob file size stored in the data record to which the cursor + * currently points. Returns EINVAL if the cursor does not point to a blob + * record. 
+ * + * PUBLIC: int __dbc_get_blob_size __P((DBC *, off_t *)); + */ +int +__dbc_get_blob_size(dbc, size) + DBC *dbc; + off_t *size; +{ + DBT key, data; + ENV *env; + BBLOB bl; + HBLOB hbl; + HEAPBLOBHDR bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + env = dbc->env; + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + if (data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bl, data.data, BBLOB_SIZE); + if (B_TYPE(bl.type) != B_BLOB) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, bl, *size, ret); + break; + case DB_HEAP: + if (data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&bhdr, data.data, HEAPBLOBREC_SIZE); + if (!F_ISSET(&bhdr.std_hdr, HEAP_RECBLOB)) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, bhdr, *size, ret); + break; + case DB_HASH: + if (data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + memcpy(&hbl, data.data, HBLOB_SIZE); + if (HPAGE_PTYPE(&hbl) != H_BLOB) { + ret = EINVAL; + goto err; + } + GET_BLOB_SIZE(env, hbl, *size, ret); + break; + default: + ret = EINVAL; + goto err; + } + +err: return (ret); +} + +/* + * __dbc_set_blob_size -- + * + * Sets the blob file size in the data record to which the cursor + * currently points. Returns EINVAL if the cursor does not point to a blob + * record. 
+ * + * PUBLIC: int __dbc_set_blob_size __P((DBC *, off_t)); + */ +int +__dbc_set_blob_size(dbc, size) + DBC *dbc; + off_t size; +{ + DBT key, data; + BBLOB *bl; + HBLOB *hbl; + HEAPBLOBHDR *bhdr; + int ret; + + if (dbc->dbtype != DB_BTREE && + dbc->dbtype != DB_HEAP && dbc->dbtype != DB_HASH) { + return (EINVAL); + } + + ret = 0; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* Get the blob database record instead of the blob. */ + data.flags |= DB_DBT_BLOB_REC; + + /* + * It would be great if there was a more efficient way to do this, but + * the complexities of getting a page from a database, especially + * when taking into account things like partitions and compression, + * make that more trouble than it is worth. + */ + if ((ret = __dbc_get(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + + switch (dbc->dbtype) { + case DB_BTREE: + bl = (BBLOB *)data.data; + if (bl == NULL || + B_TYPE(bl->type) != B_BLOB || data.size != BBLOB_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE(bl, size, BBLOB); + break; + case DB_HEAP: + bhdr = (HEAPBLOBHDR *)data.data; + if (bhdr == NULL || + !F_ISSET(&bhdr->std_hdr, HEAP_RECBLOB) || + data.size != HEAPBLOBREC_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE(bhdr, size, HEAPBLOBHDR); + break; + case DB_HASH: + hbl = data.data; + if (hbl == NULL || + HPAGE_PTYPE(hbl) != H_BLOB || data.size != HBLOB_SIZE) { + ret = EINVAL; + goto err; + } + SET_BLOB_SIZE((HBLOB *)hbl, size, HBLOB); + break; + default: + ret = EINVAL; + goto err; + } + + if ((ret = __dbc_put(dbc, &key, &data, DB_CURRENT)) != 0) + goto err; + +err: return (ret); +} + #ifdef HAVE_COMPRESSION /* * __dbc_bulk_del -- @@ -632,6 +943,12 @@ __dbc_idup(dbc_orig, dbcp, flags) int_n->stream_off = int_orig->stream_off; int_n->stream_curr_pgno = int_orig->stream_curr_pgno; +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) { + if ((ret = __partc_dup(dbc_orig, dbc_n)) != 0) + goto err; + } else +#endif switch (dbc_orig->dbtype) { case 
DB_QUEUE: if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0) @@ -859,7 +1176,11 @@ __dbc_iget(dbc, key, data, flags) * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. If the DB_RMW flag was specified and the get * operation is done in an off-page duplicate tree, call the primary - * cursor's upgrade routine first. + * cursor's upgrade routine first. We fetch the primary tree's data + * page to follow the buffer latching order rules for btrees: latch from + * the top of the main tree down, even when also searching OPD trees. + * Deadlocks could otherwise occur if we need to fetch the main page + * while an OPD page is latched. [#22532] */ cp = dbc->internal; if (cp->opd != NULL && @@ -868,6 +1189,10 @@ __dbc_iget(dbc, key, data, flags) flags == DB_PREV || flags == DB_PREV_DUP)) { if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0) goto err; + if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + goto err; + if (F_ISSET(dbc, DBC_TRANSIENT)) opd = cp->opd; else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0) @@ -1660,7 +1985,7 @@ __dbc_put_secondaries(dbc, tskeyp, &oldpkey, rmw | DB_SET); if (ret == 0) { cmp = __bam_defcmp(sdbp, - &oldpkey, pkey); + &oldpkey, pkey, NULL); __os_ufree(env, oldpkey.data); /* * If the secondary key is unchanged, @@ -1868,7 +2193,7 @@ __dbc_put_primary(dbc, key, data, flags) olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM; ret = __dbc_get(dbc, key, &olddata, DB_SET); if (ret == 0) { - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); goto done; } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) goto err; @@ -2100,7 +2425,7 @@ __dbc_iput(dbc, key, data, flags) if (dbc->dbtype == DB_HASH && F_ISSET( ((BTREE_CURSOR *)(dbc->internal->opd->internal)), C_DELETED)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -2228,7 +2553,7 @@ __dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata) */ for (i = 0, tskeyp = skey; i < 
nskey; i++, tskeyp++) if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, - toldskeyp, tskeyp) == 0) { + toldskeyp, tskeyp, NULL) == 0) { nsame++; F_CLR(tskeyp, DB_DBT_ISSET); break; @@ -2382,12 +2707,14 @@ __dbc_cleanup(dbc, dbc_n, failed) * cursors. */ if (!failed && ret == 0) { + MUTEX_LOCK(dbp->env, dbp->mutex); if (opd != NULL) opd->internal->pdbc = dbc; if (internal->opd != NULL) internal->opd->internal->pdbc = dbc_n; dbc->internal = dbc_n->internal; dbc_n->internal = internal; + MUTEX_UNLOCK(dbp->env, dbp->mutex); } /* @@ -3501,6 +3828,32 @@ __db_check_skeyset(sdbp, skeyp) for (key2 = key1 + 1; key2 < last_key; key2++) DB_ASSERT(env, ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, - key1, key2) != 0); + key1, key2, NULL) != 0); +} +#endif + +#ifdef HAVE_ERROR_HISTORY +/* + * __dbc_diags + * Save the context which triggers the "first notice" of an error code; + * i.e., its creation. It doesn't touch anything when err == 0. + * + * PUBLIC: int __dbc_diags __P((DBC *, int)); + */ + int + __dbc_diags(dbc, err) + DBC *dbc; + int err; +{ + DB_MSGBUF *mb; + + if (err != 0 && dbc->env != NULL && + (mb = __db_deferred_get()) != NULL) { + (void)__db_remember_context(dbc->env, mb, err); + __db_msgadd(dbc->env, mb, "DB: %s:%s\n" , + dbc->dbp->fname == NULL ? "in-mem" : dbc->dbp->fname, + dbc->dbp->dname == NULL ? "" : dbc->dbp->fname); + } + return (err); } #endif diff --git a/src/db/db_cds.c b/src/db/db_cds.c index 185d5487..d3cc990a 100644 --- a/src/db/db_cds.c +++ b/src/db/db_cds.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -43,7 +43,15 @@ static int __cdsgroup_abort(txn) DB_TXN *txn; { - return (__cdsgroup_notsup(txn->mgrp->env, "abort")); + ENV *env; + + env = txn->mgrp->env; + /* + * As the txn handle can not be used any more, we call + * __cdsgroup_commit to release the lock and destroy the handle. + */ + (void)__cdsgroup_commit(txn, 0); + return (__cdsgroup_notsup(env, "abort")); } static int @@ -83,8 +91,16 @@ static int __cdsgroup_discard(txn, flags) DB_TXN *txn; u_int32_t flags; { + ENV *env; + COMPQUIET(flags, 0); - return (__cdsgroup_notsup(txn->mgrp->env, "discard")); + env = txn->mgrp->env; + /* + * As the txn handle can not be used any more, we call + * __cdsgroup_commit to release the lock and destroy the handle. + */ + (void)__cdsgroup_commit(txn, 0); + return (__cdsgroup_notsup(env, "discard")); } static u_int32_t __cdsgroup_id(txn) diff --git a/src/db/db_compact.c b/src/db/db_compact.c index d0f4801e..afe5a997 100644 --- a/src/db/db_compact.c +++ b/src/db/db_compact.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -262,9 +262,11 @@ err: if (txn_local && txn != NULL) { done: if (LF_ISSET(DB_FREE_SPACE)) { DBMETA *meta; db_pgno_t pgno; + int pgs_done; pgno = PGNO_BASE_MD; isdone = 1; + pgs_done = 0; if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) && __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) { isdone = meta->free == PGNO_INVALID; @@ -281,7 +283,8 @@ done: if (LF_ISSET(DB_FREE_SPACE)) { } else #endif if (!isdone) - ret = __bam_truncate_ipages(dbp, ip, txn_orig, c_data); + ret = __bam_truncate_ipages(dbp, + ip, txn_orig, c_data, &pgs_done); /* Clean up the free list. 
*/ if (list != NULL) @@ -387,17 +390,26 @@ err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) #endif /* - * __db_exchange_page -- swap a page with a lower numbered page. - * The routine will optionally free the higher numbered page. The cursor - * has a stack which includes at least the immediate parent of this page. - * PUBLIC: int __db_exchange_page __P((DBC *, PAGE **, PAGE *, db_pgno_t, int)); + * __db_exchange_page -- try to move a page 'down', to earlier in the file. + * + * This tries to move a page to a lower location the file, by swapping it + * with an earlier free page. The free page comes either from the free list or + * the newpgno parameter (e.g., __ham_compact_hash()). If the new page turns + * out to be higher than the original one, the allocation is undone and + * the caller is left unchanged. After a successful swap, this routine can + * optionally free the old, higher numbered page. + * The cursor's stack includes at least the immediate parent of this page. + * + * PUBLIC: int __db_exchange_page + * PUBLIC: __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *)); */ int -__db_exchange_page(dbc, pgp, opg, newpgno, flags) +__db_exchange_page(dbc, pgp, opg, newpgno, flags, pgs_donep) DBC *dbc; PAGE **pgp, *opg; db_pgno_t newpgno; int flags; + int *pgs_donep; { BTREE_CURSOR *cp; DB *dbp; @@ -445,7 +457,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags) * are allocating at the same time, if so, just put it back. */ if (PGNO(newpage) > PGNO(*pgp)) { - /* Its unfortunate but you can't just free a new overflow. */ + /* It is unfortunate but you can't just free a new overflow. */ + /* XXX Is the above comment still true? */ + /* XXX Should __db_new(OVERFLOW) zero OV_LEN()? 
*/ if (TYPE(newpage) == P_OVERFLOW) OV_LEN(newpage) = 0; if ((ret = __LPUT(dbc, lock)) != 0) @@ -572,7 +586,9 @@ __db_exchange_page(dbc, pgp, opg, newpgno, flags) if ((ret = __TLPUT(dbc, lock)) != 0) return (ret); -done: return (0); +done: + (*pgs_donep)++; + return (0); err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority); (void)__TLPUT(dbc, lock); @@ -584,15 +600,16 @@ err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority); * Walk the pages of an overflow chain and swap out * high numbered pages. We are passed the first page * but only deal with the second and subsequent pages. - * PUBLIC: int __db_truncate_overflow __P((DBC *, - * PUBLIC: db_pgno_t, PAGE **, DB_COMPACT *)); + * PUBLIC: int __db_truncate_overflow __P((DBC *, db_pgno_t, + * PUBLIC: PAGE **, DB_COMPACT *, int *)); */ int -__db_truncate_overflow(dbc, pgno, ppg, c_data) +__db_truncate_overflow(dbc, pgno, ppg, c_data, pgs_donep) DBC *dbc; db_pgno_t pgno; PAGE **ppg; DB_COMPACT *c_data; + int *pgs_donep; { DB *dbp; DB_LOCK lock; @@ -618,7 +635,7 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data) return (ret); if (pgno <= c_data->compact_truncate) continue; - if (have_lock == 0) { + if (!have_lock) { DB_ASSERT(dbp->env, ppg != NULL); ppgno = PGNO(*ppg); if ((ret = __memp_fput(dbp->mpf, dbc->thread_info, @@ -635,30 +652,32 @@ __db_truncate_overflow(dbc, pgno, ppg, c_data) have_lock = 1; } if ((ret = __db_exchange_page(dbc, - &page, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) break; } err: if (page != NULL && - (t_ret = __memp_fput( dbp->mpf, + (t_ret = __memp_fput(dbp->mpf, dbc->thread_info, page, dbc->priority)) != 0 && ret == 0) ret = t_ret; if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) ret = t_ret; return (ret); } + /* * __db_truncate_root -- swap a root page for a lower numbered page. 
* PUBLIC: int __db_truncate_root __P((DBC *, - * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t)); + * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *)); */ int -__db_truncate_root(dbc, ppg, indx, pgnop, tlen) +__db_truncate_root(dbc, ppg, indx, pgnop, tlen, pgs_donep) DBC *dbc; PAGE *ppg; u_int32_t indx; db_pgno_t *pgnop; u_int32_t tlen; + int *pgs_donep; { DB *dbp; DBT orig; @@ -693,7 +712,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen) } else { LOCK_CHECK_OFF(dbc->thread_info); ret = __db_exchange_page(dbc, - &page, NULL, PGNO_INVALID, DB_EXCH_FREE); + &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep); LOCK_CHECK_ON(dbc->thread_info); if (ret != 0) goto err; @@ -705,8 +724,7 @@ __db_truncate_root(dbc, ppg, indx, pgnop, tlen) /* Update the reference. */ if (DBC_LOGGING(dbc)) { - if ((ret = __db_pgno_log(dbp, - dbc->txn, &LSN(ppg), 0, PGNO(ppg), + if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(ppg), 0, PGNO(ppg), &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0) goto err; } else @@ -780,13 +798,13 @@ __db_find_free(dbc, type, size, bstart, freep) goto err; if (nelems == 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } for (i = 0; i < nelems; i++) { if (list[i] > bstart) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } start = i; @@ -812,7 +830,7 @@ __db_find_free(dbc, type, size, bstart, freep) goto found; } } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; found: /* We have size range of pages. Remove them. */ @@ -1005,13 +1023,15 @@ err: if (np != NULL && np != otherp) * __db_move_metadata -- move a meta data page to a lower page number. * The meta data page must be exclusively latched on entry. 
* - * PUBLIC: int __db_move_metadata __P((DBC *, DBMETA **, DB_COMPACT *)); + * PUBLIC: int __db_move_metadata + * PUBLIC: __P((DBC *, DBMETA **, DB_COMPACT *, int *)); */ int -__db_move_metadata(dbc, metap, c_data) +__db_move_metadata(dbc, metap, c_data, pgs_donep) DBC *dbc; DBMETA **metap; DB_COMPACT *c_data; + int *pgs_donep; { BTREE *bt; DB *dbp, *mdbp; @@ -1023,7 +1043,7 @@ __db_move_metadata(dbc, metap, c_data) c_data->compact_pages_examine++; if ((ret = __db_exchange_page(dbc, - (PAGE**)metap, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + (PAGE **)metap, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) return (ret); if (PGNO(*metap) == dbp->meta_pgno) diff --git a/src/db/db_conv.c b/src/db/db_conv.c index 210b4d6e..77c6b760 100644 --- a/src/db/db_conv.c +++ b/src/db/db_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
*/ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -487,8 +487,12 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) { ENV *env; BINTERNAL *bi; + BBLOB *bl; BKEYDATA *bk; BOVERFLOW *bo; + HEAPBLOBHDR *bhdr; + HEAPHDR *hh; + HEAPSPLITHDR *hsh; RINTERNAL *ri; db_indx_t i, *inp, len, tmp; u_int8_t *end, *p, *pgend; @@ -500,8 +504,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) M_32_SWAP(h->lsn.file); M_32_SWAP(h->lsn.offset); M_32_SWAP(h->pgno); - M_32_SWAP(h->prev_pgno); - M_32_SWAP(h->next_pgno); + if (TYPE(h) == P_HEAP) { + M_32_SWAP(((HEAPPG *)h)->high_pgno); + M_16_SWAP(((HEAPPG *)h)->high_indx); + M_16_SWAP(((HEAPPG *)h)->free_indx); + } else { + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + } M_16_SWAP(h->entries); M_16_SWAP(h->hf_offset); } @@ -527,6 +537,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) continue; switch (HPAGE_TYPE(dbp, h, i)) { + case H_BLOB: + p = HBLOB_ID(P_ENTRY(dbp, h, i)); + SWAP64(p); /* id */ + SWAP64(p); /* size */ + p = HBLOB_FILE_ID(P_ENTRY(dbp, h, i)); + SWAP64(p); /* file id */ + SWAP64(p); /* sdb id */ + break; case H_KEYDATA: break; case H_DUPLICATE: @@ -599,6 +617,14 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) if ((u_int8_t *)bk >= pgend) continue; switch (B_TYPE(bk->type)) { + case B_BLOB: + bl = (BBLOB *)bk; + M_16_SWAP(bl->len); + M_64_SWAP(bl->id); /* id */ + M_64_SWAP(bl->size); /* size */ + M_64_SWAP(bl->file_id); /* file id */ + M_64_SWAP(bl->sdb_id); /* sdb id */ + break; case B_KEYDATA: M_16_SWAP(bk->len); break; @@ -663,6 +689,32 @@ __db_byteswap(dbp, pg, h, pagesize, pgin) } break; case P_HEAP: + for (i = 0; i <= HEAP_HIGHINDX(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + if (inp[i] == 0) + continue; + + hh = (HEAPHDR *)P_ENTRY(dbp, h, i); + if ((u_int8_t *)hh >= pgend) + continue; + M_16_SWAP(hh->size); + if (F_ISSET(hh, HEAP_RECSPLIT)) { + hsh = (HEAPSPLITHDR *)hh; + M_32_SWAP(hsh->tsize); + M_32_SWAP(hsh->nextpg); + M_16_SWAP(hsh->nextindx); + } else if (F_ISSET(hh, HEAP_RECBLOB)) { + bhdr = (HEAPBLOBHDR 
*)hh; + M_64_SWAP(bhdr->id); /* id */ + M_64_SWAP(bhdr->size); /* size */ + M_64_SWAP(bhdr->file_id); /* file id */ + } + + if (!pgin) + M_16_SWAP(inp[i]); + } + break; case P_IHEAP: case P_INVALID: case P_OVERFLOW: @@ -678,8 +730,14 @@ out: if (!pgin) { M_32_SWAP(h->lsn.file); M_32_SWAP(h->lsn.offset); M_32_SWAP(h->pgno); - M_32_SWAP(h->prev_pgno); - M_32_SWAP(h->next_pgno); + if (TYPE(h) == P_HEAP) { + M_32_SWAP(((HEAPPG *)h)->high_pgno); + M_16_SWAP(((HEAPPG *)h)->high_indx); + M_16_SWAP(((HEAPPG *)h)->free_indx); + } else { + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + } M_16_SWAP(h->entries); M_16_SWAP(h->hf_offset); } @@ -718,7 +776,10 @@ __db_pageswap(env, dbp, pp, len, pdata, pgin) case P_HASHMETA: return (__ham_mswap(env, pp)); - +#ifdef HAVE_HEAP + case P_HEAPMETA: + return (__heap_mswap(env, pp)); +#endif case P_QAMMETA: return (__qam_mswap(env, pp)); @@ -794,12 +855,17 @@ __db_recordswap(op, size, hdr, data, pgin) void *hdr, *data; u_int32_t pgin; { + BBLOB *bl; BKEYDATA *bk; BOVERFLOW *bo; BINTERNAL *bi; + DBT *dbt; + HEAPHDR *hh; + HEAPBLOBHDR bhdr; + HEAPSPLITHDR *hsh; RINTERNAL *ri; db_indx_t tmp; - u_int8_t *p, *end; + u_int8_t buf[HEAPBLOBREC_SIZE], *end, *p; if (size == 0) return; @@ -812,6 +878,14 @@ __db_recordswap(op, size, hdr, data, pgin) case B_KEYDATA: M_16_SWAP(bk->len); break; + case B_BLOB: + bl = (BBLOB *)bk; + M_16_SWAP(bl->len); + M_64_SWAP(bl->id); /* id */ + M_64_SWAP(bl->size); /* size */ + M_64_SWAP(bl->file_id); /* file id */ + M_64_SWAP(bl->sdb_id); /* sdb id */ + break; case B_DUPLICATE: case B_OVERFLOW: bo = (BOVERFLOW *)hdr; @@ -835,6 +909,7 @@ __db_recordswap(op, size, hdr, data, pgin) } else bo = (BOVERFLOW *)data; M_32_SWAP(bo->pgno); + M_32_SWAP(bo->tlen); } break; case P_IRECNO: @@ -867,10 +942,10 @@ __db_recordswap(op, size, hdr, data, pgin) SWAP16(p); } break; - /* These two record types include the full header. */ + /* These three record types include the full header. 
*/ case H_OFFDUP: p = (u_int8_t *)hdr; - p += SSZ(HOFFPAGE, pgno); + p += SSZ(HOFFDUP, pgno); SWAP32(p); /* pgno */ break; case H_OFFPAGE: @@ -879,11 +954,61 @@ __db_recordswap(op, size, hdr, data, pgin) SWAP32(p); /* pgno */ SWAP32(p); /* tlen */ break; + case H_BLOB: + p = HBLOB_ID(hdr); + SWAP64(p); /* id */ + SWAP64(p); /* size */ + p = HBLOB_FILE_ID(hdr); + SWAP64(p); /* file id */ + SWAP64(p); /* sdb id */ + break; default: DB_ASSERT(NULL, op != op); } break; - + case P_HEAP: + hh = (HEAPHDR *)hdr; + M_16_SWAP(hh->size); + if (F_ISSET(hh, HEAP_RECSPLIT)) { + hsh = (HEAPSPLITHDR *)hdr; + M_32_SWAP(hsh->tsize); + M_32_SWAP(hsh->nextpg); + M_16_SWAP(hsh->nextindx); + }else if (F_ISSET(hh, HEAP_RECBLOB)) { + /* + * Heap blob records are broken into two parts when + * logged, the shared header and the part that is + * unique to blob records, which is stored in the + * log data field. + */ + if (data != NULL) { + dbt = NULL; + if (pgin) { + dbt = data; + memcpy(buf + sizeof(HEAPHDR), + dbt->data, HEAPBLOBREC_DSIZE); + } else { + memcpy(buf + sizeof(HEAPHDR), + data, HEAPBLOBREC_DSIZE); + } + memcpy(&bhdr, buf, HEAPBLOBREC_SIZE); + M_64_SWAP(bhdr.id); /* id */ + M_64_SWAP(bhdr.size); /* size */ + M_64_SWAP(bhdr.file_id); /* file id */ + memcpy(buf, &bhdr, HEAPBLOBREC_SIZE); + if (pgin) { + memcpy(dbt->data, + HEAPBLOBREC_DATA(buf), + HEAPBLOBREC_DSIZE); + } else { + memcpy(data, + HEAPBLOBREC_DATA(buf), + HEAPBLOBREC_DSIZE); + } + } + break; + } + break; default: DB_ASSERT(NULL, op != op); } diff --git a/src/db/db_copy.c b/src/db/db_copy.c index 359c74be..d9786702 100644 --- a/src/db/db_copy.c +++ b/src/db/db_copy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/db/db_dispatch.c b/src/db/db_dispatch.c index 06de4ef7..7cb7f9ca 100644 --- a/src/db/db_dispatch.c +++ b/src/db/db_dispatch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1995, 1996 @@ -639,7 +639,7 @@ __db_txnlist_find(env, hp, txnid, statusp) DB_TXNLIST *entry; if (txnid == 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); return (__db_txnlist_find_internal(env, hp, TXNLIST_TXNID, txnid, &entry, 0, statusp)); @@ -666,7 +666,7 @@ __db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok) int ret; if (txnid == 0) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); ret = __db_txnlist_find_internal(env, hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status); @@ -715,7 +715,7 @@ __db_txnlist_find_internal(env, ret = 0; if (hp == NULL) - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); switch (type) { case TXNLIST_TXNID: @@ -759,7 +759,7 @@ __db_txnlist_find_internal(env, return (ret); } - return (DB_NOTFOUND); + return (USR_ERR(env, DB_NOTFOUND)); } /* diff --git a/src/db/db_dup.c b/src/db/db_dup.c index 9fd04791..e66ec92b 100644 --- a/src/db/db_dup.c +++ b/src/db/db_dup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_iface.c b/src/db/db_iface.c index 59e0ba53..da6140a4 100644 --- a/src/db/db_iface.c +++ b/src/db/db_iface.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -159,9 +159,15 @@ __db_associate_arg(dbp, sdbp, callback, flags) env = dbp->env; + if (dbp->blob_threshold || sdbp->blob_threshold) { + __db_errx(env, DB_STR("0751", + "Secondary and primary databases cannot support blobs.")); + return (EINVAL); + } + if (sdbp->type == DB_HEAP) { - __db_errx(env, - "Heap databases may not be used as secondary databases"); + __db_errx(env, DB_STR("0752", + "Heap databases may not be used as secondary databases")); return (EINVAL); } @@ -288,6 +294,7 @@ __db_cursor_pp(dbp, txn, dbcp, flags) int rep_blocked, ret; env = dbp->env; + (*dbcp) = NULL; DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor"); @@ -331,7 +338,8 @@ __db_cursor_pp(dbp, txn, dbcp, flags) * If a family transaction was passed in, the transaction handle in * the cursor may not match. */ - txn = (*dbcp)->txn; + if ((*dbcp) != NULL) + txn = (*dbcp)->txn; if (txn != NULL && ret == 0) TAILQ_INSERT_HEAD(&(txn->my_cursors), *dbcp, txn_cursors); @@ -434,6 +442,13 @@ __db_cursor_arg(dbp, flags) return (__db_fnl(env, "DB->cursor")); } + if (dbp->blob_threshold && + LF_ISSET(DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT)) { + __db_errx(dbp->env, DB_STR("0753", +"Blob enabled databases do not support READ_UNCOMMITTED and TXN_SNAPSHOT.")); + return (EINVAL); + } + LF_CLR(DB_CURSOR_BULK | DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT); @@ -828,6 +843,12 @@ __db_get_arg(dbp, key, data, flags) env = dbp->env; + if (dbp->blob_threshold && LF_ISSET(DB_READ_UNCOMMITTED)) { + __db_errx(env, DB_STR("0754", + "Blob enabled databases do not support DB_READ_UNCOMMITTED.")); + return (EINVAL); + } + /* * Check for read-modify-write validity. 
DB_RMW doesn't make sense * with CDB cursors since if you're going to write the cursor, you @@ -876,6 +897,9 @@ __db_get_arg(dbp, key, data, flags) break; case DB_CONSUME: case DB_CONSUME_WAIT: + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, + "DB->get CONSUME/CONSUME_WAIT")); if (dirty) { __db_errx(env, DB_STR_A("0583", "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT", @@ -1148,6 +1172,13 @@ __db_open_pp(dbp, txn, fname, dname, type, flags, mode) /* Save the current DB handle flags for refresh. */ dbp->orig_flags = dbp->flags; + if (fname == 0 && PREFMAS_IS_SET(env)) { + __db_errx(env, DB_STR("0783", "In-memory databases are not " + "supported in Replication Manager preferred master mode")); + ret = EINVAL; + goto err; + } + /* Check for replication block. */ handle_check = IS_ENV_REPLICATED(env); if (handle_check && @@ -1389,6 +1420,18 @@ __db_open_arg(dbp, txn, fname, dname, type, flags) return (EINVAL); } + if (LF_ISSET(DB_MULTIVERSION) && dbp->blob_threshold) { + __db_errx(env, DB_STR("0755", + "DB_MULTIVERSION illegal with blob enabled databases")); + return (EINVAL); + } + + if (LF_ISSET(DB_READ_UNCOMMITTED) && dbp->blob_threshold) { + __db_errx(env, DB_STR("0756", + "DB_READ_UNCOMMITTED illegal with blob enabled databases")); + return (EINVAL); + } + /* DB_TRUNCATE is neither transaction recoverable nor lockable. */ if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) { __db_errx(env, DB_STR_A("0599", @@ -1901,8 +1944,6 @@ __db_compact_pp(dbp, txn, start, stop, c_data, flags, end) ret = __db_compact_int(dbp, ip, txn, start, stop, dp, flags, end); break; - case DB_HEAP: - break; default: ret = __dbh_am_chk(dbp, DB_OK_BTREE); break; @@ -2893,7 +2934,7 @@ __dbt_ferr(dbp, name, dbt, check_thread) * database, without having to clear flags. 
*/ if ((ret = __db_fchk(env, name, dbt->flags, - DB_DBT_APPMALLOC | DB_DBT_BULK | DB_DBT_DUPOK | + DB_DBT_APPMALLOC | DB_DBT_BLOB | DB_DBT_BULK | DB_DBT_DUPOK | DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_PARTIAL | DB_DBT_READONLY)) != 0) return (ret); diff --git a/src/db/db_join.c b/src/db/db_join.c index 751cf9e2..24d5260e 100644 --- a/src/db/db_join.c +++ b/src/db/db_join.c @@ -1,7 +1,7 @@ /* * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -717,7 +717,6 @@ __db_join_close(dbc) DBC *dbc; { DB *dbp; - DB_THREAD_INFO *ip; ENV *env; JOIN_CURSOR *jc; int ret, t_ret; @@ -737,7 +736,6 @@ __db_join_close(dbc) TAILQ_REMOVE(&dbp->join_queue, dbc, links); MUTEX_UNLOCK(env, dbp->mutex); - ENV_ENTER(env, ip); /* * Close any open scratch cursors. In each case, there may * not be as many outstanding as there are cursors in @@ -757,7 +755,6 @@ __db_join_close(dbc) (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0) ret = t_ret; } - ENV_LEAVE(env, ip); __os_free(env, jc->j_exhausted); __os_free(env, jc->j_curslist); @@ -796,7 +793,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods) int ret, cmp; DB *dbp; DBT ldata; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); dbp = dbc->dbp; func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; @@ -812,7 +809,7 @@ __db_join_getnext(dbc, key, data, exhausted, opmods) if ((ret = __dbc_get(dbc, key, &ldata, opmods | DB_CURRENT)) != 0) break; - cmp = func(dbp, data, &ldata); + cmp = func(dbp, data, &ldata, NULL); if (cmp == 0) { /* * We have to return the real data value. 
Copy diff --git a/src/db/db_meta.c b/src/db/db_meta.c index 8f97ebd8..53cf77cc 100644 --- a/src/db/db_meta.c +++ b/src/db/db_meta.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -939,12 +939,14 @@ done: if (last_pgnop != NULL) *last_pgnop = meta->last_pgno; /* - * The truncate point is the number of pages in the free - * list back from the last page. The number of pages - * in the free list are the number that we can swap in. - * Adjust it down slightly so if we find higher numbered - * pages early and then free other pages later we can - * truncate them. + * Set the truncation point which determines which pages may be + * relocated. Pages above are candidates to be swapped with a lower one + * from the freelist by __db_exchange_page(); pages before the truncate + * point are not relocated. + * The truncation point starts as N pages less than the last_pgno, where + * N is the size of the free list. This is reduced by 1/4 in the hope + * that partially full pages will be coalesced together, creating + * additional free pages during the compact. */ if (c_data) { c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems; diff --git a/src/db/db_method.c b/src/db/db_method.c index 82d03e5f..d807bab6 100644 --- a/src/db/db_method.c +++ b/src/db/db_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" @@ -36,14 +37,15 @@ static int __db_set_alloc __P((DB *, void *(*)(size_t), static int __db_get_append_recno __P((DB *, int (**)(DB *, DBT *, db_recno_t))); static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t))); +static int __db_get_blob_dir __P((DB *, const char **)); +static int __db_set_blob_dir __P((DB *, const char *)); +static int __db_get_blob_sub_dir __P((DB *, const char **)); static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *)); static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int)); static int __db_get_create_dir __P((DB *, const char **)); static int __db_set_create_dir __P((DB *, const char *)); static int __db_get_dup_compare - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); -static int __db_set_dup_compare - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); static int __db_get_encrypt_flags __P((DB *, u_int32_t *)); static int __db_set_encrypt __P((DB *, const char *, u_int32_t)); static int __db_get_feedback __P((DB *, void (**)(DB *, int, int))); @@ -90,6 +92,12 @@ db_create(dbpp, dbenv, flags) ip = NULL; env = dbenv == NULL ? NULL : dbenv->env; +#ifdef HAVE_ERROR_HISTORY + /* Call thread local storage initializer at least once per process. */ + if (env == NULL) + __db_thread_init(); +#endif + /* Check for invalid function flags. 
*/ switch (flags) { case 0: @@ -206,12 +214,11 @@ __db_create_internal(dbpp, env, flags) err: if (dbp != NULL) { if (dbp->mpf != NULL) (void)__memp_fclose(dbp->mpf, 0); + if (F_ISSET(env, ENV_DBLOCAL)) + (void)__env_close(dbp->dbenv, 0); __os_free(env, dbp); } - if (dbp != NULL && F_ISSET(env, ENV_DBLOCAL)) - (void)__env_close(dbp->dbenv, 0); - return (ret); } @@ -225,6 +232,7 @@ __db_init(dbp, flags) u_int32_t flags; { int ret; + u_int32_t bytes; dbp->locker = NULL; dbp->alt_close = NULL; @@ -254,6 +262,9 @@ __db_init(dbp, flags) dbp->get_alloc = __db_get_alloc; dbp->get_append_recno = __db_get_append_recno; dbp->get_assoc_flags = __db_get_assoc_flags; + dbp->get_blob_dir = __db_get_blob_dir; + dbp->get_blob_sub_dir = __db_get_blob_sub_dir; + dbp->get_blob_threshold = __db_get_blob_threshold; dbp->get_byteswapped = __db_get_byteswapped; dbp->get_cachesize = __db_get_cachesize; dbp->get_create_dir = __db_get_create_dir; @@ -290,6 +301,8 @@ __db_init(dbp, flags) dbp->rename = __db_rename_pp; dbp->set_alloc = __db_set_alloc; dbp->set_append_recno = __db_set_append_recno; + dbp->set_blob_dir = __db_set_blob_dir; + dbp->set_blob_threshold = __db_set_blob_threshold; dbp->set_cachesize = __db_set_cachesize; dbp->set_create_dir = __db_set_create_dir; dbp->set_dup_compare = __db_set_dup_compare; @@ -316,7 +329,11 @@ __db_init(dbp, flags) dbp->verify = __db_verify_pp; /* DB PUBLIC HANDLE LIST END */ - /* Access method specific. */ + if ((ret = __env_get_blob_threshold_int(dbp->env, &bytes)) != 0) + return (ret); + dbp->blob_threshold = bytes; + + /* Access method specific. */ if ((ret = __bam_db_create(dbp)) != 0) return (ret); if ((ret = __ham_db_create(dbp)) != 0) @@ -535,6 +552,182 @@ __db_set_append_recno(dbp, func) } /* + * __db_get_blob_threshold -- + * Get the current threshold size at which records are stored as blobs. 
+ * + * PUBLIC: int __db_get_blob_threshold __P((DB *, u_int32_t *)); + */ +int +__db_get_blob_threshold(dbp, bytes) + DB *dbp; + u_int32_t *bytes; +{ + /* + * While shared, this value never changes after open, so it is safe + * to access it without mutex protection. + */ + *bytes = dbp->blob_threshold; + + return (0); +} + +/* + * __db_set_blob_threshold -- + * API to allow setting the threshold size at which records are stored + * as blobs rather than in database items. No flags currently supported. + * PUBLIC: int __db_set_blob_threshold __P((DB *, u_int32_t, u_int32_t)); + */ +int +__db_set_blob_threshold(dbp, bytes, flags) + DB *dbp; + u_int32_t bytes; + u_int32_t flags; +{ + if (__db_fchk(dbp->env, "DB->set_blob_threshold", flags, 0) != 0) + return (EINVAL); + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_threshold"); + + if (bytes != 0 && F_ISSET(dbp, + (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_DUP | DB_AM_DUPSORT))) { + __db_errx(dbp->env, DB_STR("0760", +"Cannot enable blobs in databases with checksum, encryption, or duplicates.")); + return (EINVAL); + } +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && bytes != 0) { + __db_errx(dbp->env, DB_STR("0761", + "Cannot enable blobs in databases with compression.")); + return (EINVAL); + } +#endif + + dbp->blob_threshold = bytes; + + return (0); +} + +/* + * __db_blobs_enabled -- + * + * Used to tell if the database is configured to support blobs. + * PUBLIC: int __db_blobs_enabled __P((DB *)); + */ +int +__db_blobs_enabled(dbp) + DB *dbp; +{ + /* Blob threshold must be non-0. */ + if (!dbp->blob_threshold) + return (0); + /* Blobs cannot support encryption or checksum, but that may change. */ + if (F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT))) + return (0); + /* Blobs do not support compression, but that may change. 
*/ +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) + return (0); +#endif + if (dbp->env->dbenv != NULL && + F_ISSET(dbp->env->dbenv, DB_ENV_TXN_SNAPSHOT)) + return (0); + /* Cannot support blobs in recno or queue. */ + if (dbp->type == DB_RECNO || dbp->type == DB_QUEUE) + return (0); + /* + * Cannot support dups because that would require comparing + * blob data items. + */ + if (F_ISSET(dbp, (DB_AM_DUP | DB_AM_DUPSORT))) + return (0); + /* No place to put blob files when using an in-memory db. */ + if (F_ISSET(dbp, (DB_AM_INMEM))) + return (0); + + /* BDB managed databases should not support blobs. */ + if ((dbp->fname != NULL && IS_DB_FILE(dbp->fname)) || + (dbp->dname != NULL && IS_DB_FILE(dbp->dname))) + return (0); + + return (1); +} + +/* + * __db_get_blob_sub_dir -- + * + * Returns the subdirectory of the blob directory in which the blob files + * for the given db are stored, or NULL if there is none. + * + */ +static int +__db_get_blob_sub_dir(dbp, dir) + DB *dbp; + const char **dir; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_blob_sub_dir"); + + *dir = dbp->blob_sub_dir; + + return (0); +} + +/* + * __db_get_blob_dir -- + * + * Get the blob directory for this database. + */ +static int +__db_get_blob_dir(dbp, dir) + DB *dbp; + const char **dir; +{ + DB_ENV *dbenv; + ENV *env; + + env = dbp->env; + dbenv = dbp->env->dbenv; + *dir = NULL; + + if (dbenv == NULL) + return (0); + + if (dbenv->db_blob_dir != NULL) + *dir = dbenv->db_blob_dir; + else if (env->db_home != NULL) + *dir = BLOB_DEFAULT_DIR; + + return (0); +} + +/* + * __db_set_blob_dir -- + * + * Set the blob directory in a local environment. 
+ */ +static int +__db_set_blob_dir(dbp, dir) + DB *dbp; + const char *dir; +{ + DB_ENV *dbenv; + ENV *env; + + DB_ILLEGAL_IN_ENV(dbp, "DB->set_blob_dir"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_blob_dir"); + env = dbp->env; + dbenv = dbp->env->dbenv; + + if (dbenv == NULL) + return (0); + + if (dbenv->db_blob_dir != NULL) + __os_free(env, dbenv->db_blob_dir); + dbenv->db_blob_dir = NULL; + + return (__os_strdup(env, dir, &dbenv->db_blob_dir)); +} + +/* * __db_get_cachesize -- * Get underlying cache size. */ @@ -607,7 +800,7 @@ __db_get_create_dir(dbp, dirp) static int __db_get_dup_compare(dbp, funcp) DB *dbp; - int (**funcp) __P((DB *, const DBT *, const DBT *)); + int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *)); { DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); @@ -628,11 +821,14 @@ __db_get_dup_compare(dbp, funcp) /* * __db_set_dup_compare -- * Set duplicate comparison routine. + * + * PUBLIC: int __db_set_dup_compare __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *))); */ -static int +int __db_set_dup_compare(dbp, func) DB *dbp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); { int ret; @@ -900,6 +1096,13 @@ __db_set_flags(dbp, flags) ENV_REQUIRES_CONFIG(env, env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN); + if (dbp->blob_threshold && + LF_ISSET(DB_CHKSUM | DB_ENCRYPT | DB_DUP | DB_DUPSORT)) { + __db_errx(dbp->env, DB_STR("0763", +"Cannot enable checksum, encryption, or duplicates with blob support.")); + return (EINVAL); + } + __db_map_flags(dbp, &flags, &dbp->flags); if ((ret = __bam_set_flags(dbp, &flags)) != 0) diff --git a/src/db/db_open.c b/src/db/db_open.c index fefda48f..21074b15 100644 --- a/src/db/db_open.c +++ b/src/db/db_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. 
All rights reserved. * * $Id$ */ @@ -119,6 +119,15 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno) goto err; /* + * Silently disabled blobs in databases that cannot support them. + * Most illegal configurations will have already been caught, this + * is to allow a user to set an environment wide blob threshold, but + * not have to explicitly turn it off for in-memory or queue databases. + */ + if (!__db_blobs_enabled(dbp)) + dbp->blob_threshold = 0; + + /* * If both fname and subname are NULL, it's always a create, so make * sure that we have both DB_CREATE and a type specified. It would * be nice if this checking were done in __db_open where most of the @@ -259,6 +268,11 @@ __db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno) if (ret != 0) goto err; + if (dbp->blob_file_id != 0) + if ((ret = __blob_make_sub_dir(env, &dbp->blob_sub_dir, + dbp->blob_file_id, dbp->blob_sdb_id)) != 0) + goto err; + #ifdef HAVE_PARTITION if (dbp->p_internal != NULL && (ret = __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0) @@ -432,8 +446,10 @@ err: return (ret); /* * __db_chk_meta -- - * Take a buffer containing a meta-data page and check it for a valid LSN, - * checksum (and verify the checksum if necessary) and possibly decrypt it. + * Validate a buffer containing a possible meta-data page. It is + * byte-swapped as necessary and checked for having a valid magic number. + * If it does, then it can validate the LSN, checksum (if necessary), + * and possibly decrypt it. * * Return 0 on success, >0 (errno). * @@ -447,44 +463,64 @@ __db_chk_meta(env, dbp, meta, flags) u_int32_t flags; { DB_LSN swap_lsn; - int is_hmac, ret, swapped; - u_int32_t magic, orig_chk; + int is_hmac, needs_swap, ret; + u_int32_t magic; u_int8_t *chksum; ret = 0; - swapped = 0; + needs_swap = 0; + /* + * We can verify that this is some kind of db now, before any potential + * decryption, because the first P_OVERHEAD() bytes of most pages are + * cleartext. 
This gets called both before and after swapping, so we + * need to check for byte swapping ourselves. + */ + magic = meta->magic; +magic_retry: + switch (magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_HEAPMAGIC: + case DB_QAMMAGIC: + case DB_RENAMEMAGIC: + break; + default: + if (needs_swap) + /* It's already been swapped, so it isn't a BDB file. */ + return (EINVAL); + M_32_SWAP(magic); + needs_swap = 1; + goto magic_retry; + } + + if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) { + swap_lsn = meta->lsn; + if (needs_swap) { + M_32_SWAP(swap_lsn.file); + M_32_SWAP(swap_lsn.offset); + } + if (!IS_REP_CLIENT(env) && !IS_NOT_LOGGED_LSN(swap_lsn) && + !IS_ZERO_LSN(swap_lsn) && (ret = + __log_check_page_lsn(env, dbp, &swap_lsn)) != 0) + return (ret); + } if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) { if (dbp != NULL) F_SET(dbp, DB_AM_CHKSUM); - - is_hmac = meta->encrypt_alg == 0 ? 0 : 1; - chksum = ((BTMETA *)meta)->chksum; - - /* - * If we need to swap, the checksum function overwrites the - * original checksum with 0, so we need to save a copy of the - * original for swapping later. - */ - orig_chk = *(u_int32_t *)chksum; - /* * We cannot add this to __db_metaswap because that gets done * later after we've verified the checksum or decrypted. 
*/ if (LF_ISSET(DB_CHK_META)) { - swapped = 0; -chk_retry: if ((ret = + is_hmac = meta->encrypt_alg != 0; + chksum = ((BTMETA *)meta)->chksum; + if (needs_swap && !is_hmac) + M_32_SWAP(*(u_int32_t *)chksum); + if ((ret = __db_check_chksum(env, NULL, env->crypto_handle, - chksum, meta, DBMETASIZE, is_hmac)) != 0) { - if (is_hmac || swapped) - return (DB_CHKSUM_FAIL); - - M_32_SWAP(orig_chk); - swapped = 1; - *(u_int32_t *)chksum = orig_chk; - goto chk_retry; - } + chksum, meta, DBMETASIZE, is_hmac)) != 0) + return (DB_CHKSUM_FAIL); } } else if (dbp != NULL) F_CLR(dbp, DB_AM_CHKSUM); @@ -492,44 +528,8 @@ chk_retry: if ((ret = #ifdef HAVE_CRYPTO if (__crypto_decrypt_meta(env, dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)) != 0) - ret = DB_CHKSUM_FAIL; - else + ret = DB_CHKSUM_FAIL; #endif - - /* Now that we're decrypted, we can check LSN. */ - if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) { - /* - * This gets called both before and after swapping, so we - * need to check ourselves. If we already swapped it above, - * we'll know that here. - */ - - swap_lsn = meta->lsn; - magic = meta->magic; -lsn_retry: - if (swapped) { - M_32_SWAP(swap_lsn.file); - M_32_SWAP(swap_lsn.offset); - M_32_SWAP(magic); - } - switch (magic) { - case DB_BTREEMAGIC: - case DB_HASHMAGIC: - case DB_HEAPMAGIC: - case DB_QAMMAGIC: - case DB_RENAMEMAGIC: - break; - default: - if (swapped) - return (EINVAL); - swapped = 1; - goto lsn_retry; - } - if (!IS_REP_CLIENT(env) && - !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn)) - /* Need to do check. */ - ret = __log_check_page_lsn(env, dbp, &swap_lsn); - } return (ret); } @@ -598,7 +598,6 @@ swap_retry: } /* - * We can only check the meta page if we are sure we have a meta page. * If it is random data, then this check can fail. So only now can we * checksum and decrypt. 
Don't distinguish between configuration and * checksum match errors here, because we haven't opened the database @@ -606,9 +605,9 @@ swap_retry: * If DB_SKIP_CHK is set, it means the checksum was already checked * and the page was already decrypted. */ - if (!LF_ISSET(DB_SKIP_CHK) && + if (!LF_ISSET(DB_SKIP_CHK) && (ret = __db_chk_meta(env, dbp, meta, flags)) != 0) { - if (ret == DB_CHKSUM_FAIL) + if (ret == DB_CHKSUM_FAIL) __db_errx(env, DB_STR_A("0640", "%s: metadata page checksum error", "%s"), name); goto bad_format; @@ -669,10 +668,9 @@ swap_retry: } if (FLD_ISSET(meta->metaflags, - DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) - if ((ret = - __partition_init(dbp, meta->metaflags)) != 0) - return (ret); + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && + (ret = __partition_init(dbp, meta->metaflags)) != 0) + return (ret); return (0); bad_format: diff --git a/src/db/db_overflow.c b/src/db/db_overflow.c index d992ec0d..22f349ed 100644 --- a/src/db/db_overflow.c +++ b/src/db/db_overflow.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -58,39 +58,26 @@ */ /* - * __db_goff -- - * Get an offpage item. + * __db_alloc_dbt * - * PUBLIC: int __db_goff __P((DBC *, - * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); + * Allocate enough space in the dbt to hold the data. Also used by the + * blob file API. 
+ * + * PUBLIC: int __db_alloc_dbt __P((ENV *, DBT *, u_int32_t, u_int32_t *, + * PUBLIC: u_int32_t *, void **, u_int32_t *)); */ int -__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) - DBC *dbc; +__db_alloc_dbt(env, dbt, tlen, nd, st, bpp, bpsz) + ENV *env; DBT *dbt; u_int32_t tlen; - db_pgno_t pgno; + u_int32_t *nd; + u_int32_t *st; void **bpp; u_int32_t *bpsz; { - DB *dbp; - DB_MPOOLFILE *mpf; - DB_TXN *txn; - DBC_INTERNAL *cp; - ENV *env; - PAGE *h; - DB_THREAD_INFO *ip; - db_indx_t bytes; - u_int32_t curoff, needed, start; - u_int8_t *p, *src; int ret; - - dbp = dbc->dbp; - cp = dbc->internal; - env = dbp->env; - ip = dbc->thread_info; - mpf = dbp->mpf; - txn = dbc->txn; + u_int32_t needed, start; /* * Check if the buffer is big enough; if it is not and we are @@ -110,6 +97,8 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) start = 0; needed = tlen; } + *nd = needed; + *st = start; /* * If the caller has not requested any data, return success. This @@ -123,7 +112,7 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) } if (F_ISSET(dbt, DB_DBT_USERCOPY)) - goto skip_alloc; + return (0); /* Allocate any necessary memory. */ if (F_ISSET(dbt, DB_DBT_USERMEM)) { @@ -152,7 +141,48 @@ __db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) return (DB_BUFFER_SMALL); } -skip_alloc: + return (0); +} + +/* + * __db_goff -- + * Get an offpage item. 
+ * + * PUBLIC: int __db_goff __P((DBC *, + * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); + */ +int +__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) + DBC *dbc; + DBT *dbt; + u_int32_t tlen; + db_pgno_t pgno; + void **bpp; + u_int32_t *bpsz; +{ + DB *dbp; + DB_MPOOLFILE *mpf; + DB_TXN *txn; + DBC_INTERNAL *cp; + ENV *env; + PAGE *h; + DB_THREAD_INFO *ip; + db_indx_t bytes; + u_int32_t curoff, needed, start; + u_int8_t *p, *src; + int ret; + + dbp = dbc->dbp; + cp = dbc->internal; + env = dbp->env; + ip = dbc->thread_info; + mpf = dbp->mpf; + txn = dbc->txn; + + if (((ret = __db_alloc_dbt( + env, dbt, tlen, &needed, &start, bpp, bpsz)) != 0) || needed == 0) + return (ret); + /* Set up a start page in the overflow chain if streaming. */ if (cp->stream_start_pgno != PGNO_INVALID && pgno == cp->stream_start_pgno && start >= cp->stream_off && @@ -485,28 +515,33 @@ __db_doff(dbc, pgno) /* * __db_moff -- - * Match on overflow pages. + * Match on overflow pages from a specific offset. * - * Given a starting page number and a key, return <0, 0, >0 to indicate if the - * key on the page is less than, equal to or greater than the key specified. - * We optimize this by doing chunk at a time comparison unless the user has - * specified a comparison function. In this case, we need to materialize - * the entire object and call their comparison routine. + * Given a starting page number and a key, store <0, 0, >0 in 'cmpp' to indicate + * if the key on the page is less than, equal to or greater than the key + * specified. We optimize this by doing a chunk at a time comparison unless the + * user has specified a comparison function. In this case, we need to + * materialize the entire object and call their comparison routine. + * + * We start the comparison at an offset and update the offset with the + * longest matching count after the comparison. * * __db_moff and __db_coff are generic functions useful in searching and * ordering off page items. 
__db_moff matches an overflow DBT with an offpage * item. __db_coff compares two offpage items for lexicographic sort order. * * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), + * PUBLIC: int *, size_t *)); */ int -__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) +__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp, locp) DBC *dbc; const DBT *dbt; db_pgno_t pgno; u_int32_t tlen; - int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp; + size_t *locp; { DB *dbp; DBT local_dbt; @@ -517,6 +552,7 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) u_int32_t bufsize, cmp_bytes, key_left; u_int8_t *p1, *p2; int ret; + size_t pos, start; dbp = dbc->dbp; ip = dbc->thread_info; @@ -535,39 +571,76 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) &local_dbt, tlen, pgno, &buf, &bufsize)) != 0) return (ret); /* Pass the key as the first argument */ - *cmpp = cmpfunc(dbp, dbt, &local_dbt); + *cmpp = cmpfunc(dbp, dbt, &local_dbt, NULL); __os_free(dbp->env, buf); return (0); } + /* + * We start the comparison from the location of 'locp' and store the + * last matching location into 'locp'. + */ + start = (locp == NULL ? 0 : *locp); + pos = 0; + + /* Subtract prefix length from lengths. */ + tlen -= (u_int32_t)start; + key_left = dbt->size - (u_int32_t)start; + p1 = (u_int8_t *)dbt->data + start; + /* While there are both keys to compare. */ - for (*cmpp = 0, p1 = dbt->data, - key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { + for (*cmpp = 0; key_left > 0 && + tlen > 0 && pgno != PGNO_INVALID;) { if ((ret = __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0) return (ret); - cmp_bytes = OV_LEN(pagep) < key_left ? 
OV_LEN(pagep) : key_left; - tlen -= cmp_bytes; - key_left -= cmp_bytes; - for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); - cmp_bytes-- > 0; ++p1, ++p2) - if (*p1 != *p2) { - *cmpp = (long)*p1 - (long)*p2; - break; + /* + * Figure out where to start comparison, and how many + * bytes to compare. + */ + if (pos >= start) { + p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); + cmp_bytes = OV_LEN(pagep); + } else if (pos + OV_LEN(pagep) > start) { + p2 = (u_int8_t *)pagep + + P_OVERHEAD(dbp) + (start - pos); + cmp_bytes = OV_LEN(pagep) - (u_int32_t)(start - pos); + } else { + p2 = NULL; + cmp_bytes = 0; + } + + pos += OV_LEN(pagep); + + if (cmp_bytes != 0) { + if (cmp_bytes > key_left) + cmp_bytes = key_left; + tlen -= cmp_bytes; + key_left -= cmp_bytes; + for (;cmp_bytes-- > 0; ++p1, ++p2) { + if (*p1 != *p2) { + *cmpp = (long)*p1 - (long)*p2; + break; + } + if (locp != NULL) + ++(*locp); } + + } pgno = NEXT_PGNO(pagep); if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0) return (ret); if (*cmpp != 0) return (0); } - if (key_left > 0) /* DBT is longer than the page key. */ - *cmpp = 1; - else if (tlen > 0) /* DBT is shorter than the page key. */ - *cmpp = -1; - else - *cmpp = 0; + + if (*cmpp == 0) { + if (key_left > 0) /* DBT is longer than the page key. */ + *cmpp = 1; + else if (tlen > 0) /* DBT is shorter than the page key. */ + *cmpp = -1; + } return (0); } @@ -587,13 +660,13 @@ __db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) * DBT type. 
* * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), int *)); */ int __db_coff(dbc, dbt, match, cmpfunc, cmpp) DBC *dbc; const DBT *dbt, *match; - int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *, size_t *)), *cmpp; { DB *dbp; DB_THREAD_INFO *ip; @@ -643,7 +716,7 @@ __db_coff(dbc, dbt, match, cmpfunc, cmpp) match_pgno, &match_buf, &match_bufsz)) != 0) goto err1; /* The key needs to be the first argument for sort order */ - *cmpp = cmpfunc(dbp, &local_key, &local_match); + *cmpp = cmpfunc(dbp, &local_key, &local_match, NULL); err1: if (dbt_buf != NULL) __os_free(dbp->env, dbt_buf); @@ -657,6 +730,7 @@ err1: if (dbt_buf != NULL) if ((ret = __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0) return (ret); + DB_ASSERT(dbc->env, TYPE(dbt_pagep) == P_OVERFLOW); if ((ret = __memp_fget(mpf, &match_pgno, ip, txn, 0, &match_pagep)) != 0) { @@ -664,6 +738,7 @@ err1: if (dbt_buf != NULL) mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED); return (ret); } + DB_ASSERT(dbc->env, TYPE(match_pagep) == P_OVERFLOW); cmp_bytes = page_space < max_data ? page_space : max_data; for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp), p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp); diff --git a/src/db/db_ovfl_vrfy.c b/src/db/db_ovfl_vrfy.c index fa630f7b..55eb2b70 100644 --- a/src/db/db_ovfl_vrfy.c +++ b/src/db/db_ovfl_vrfy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 diff --git a/src/db/db_pr.c b/src/db/db_pr.c index d95440f9..4933498e 100644 --- a/src/db/db_pr.c +++ b/src/db/db_pr.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -11,6 +11,7 @@ #include "db_int.h" #include "dbinc/db_page.h" #include "dbinc/btree.h" +#include "dbinc/fop.h" #include "dbinc/hash.h" #include "dbinc/heap.h" #include "dbinc/mp.h" @@ -25,6 +26,11 @@ static int __db_hmeta __P((ENV *, DB *, HMETA *, u_int32_t)); static void __db_meta __P((ENV *, DB *, DBMETA *, FN const *, u_int32_t)); static void __db_proff __P((ENV *, DB_MSGBUF *, void *)); static int __db_qmeta __P((ENV *, DB *, QMETA *, u_int32_t)); +static int __db_prblob __P((DBC *, DBT *, DBT *, int, const char *, + void *, int (*callback) __P((void *, const void *)), int, int)); +static int __db_prblob_id __P((DB *, db_seq_t, + off_t, DBT *, int, const char *, void *, + int (*callback) __P((void *, const void *)))); #ifdef HAVE_STATISTICS static void __db_prdb __P((DB *, u_int32_t)); static int __db_prtree __P((DB *, DB_TXN *, @@ -515,6 +521,11 @@ __db_bmeta(env, dbp, h, flags) __db_msg(env, "\tre_len: %#lx re_pad: %#lx", (u_long)h->re_len, (u_long)h->re_pad); __db_msg(env, "\troot: %lu", (u_long)h->root); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); + __db_msg(env, "\tblob_sdb_lo: %lu", (u_long)h->blob_sdb_lo); + __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi); return (0); } @@ -549,6 +560,11 @@ __db_hmeta(env, dbp, h, flags) __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor); __db_msg(env, "\tnelem: %lu", (u_long)h->nelem); __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); + __db_msg(env, "\tblob_sdb_lo: %lu", 
(u_long)h->blob_sdb_lo); + __db_msg(env, "\tblob_sdb_hi: %lu", (u_long)h->blob_sdb_hi); __db_msgadd(env, &mb, "\tspare points:\n\t"); for (i = 0; i < NCACHED; i++) { __db_msgadd(env, &mb, "%lu (%lu) ", (u_long)h->spares[i], @@ -604,6 +620,9 @@ __db_heapmeta(env, dbp, h, flags) __db_msg(env, "\tnregions: %lu", (u_long)h->nregions); __db_msg(env, "\tgbytes: %lu", (u_long)h->gbytes); __db_msg(env, "\tbytes: %lu", (u_long)h->bytes); + __db_msg(env, "\tblob_threshold: %lu", (u_long)h->blob_threshold); + __db_msg(env, "\tblob_file_lo: %lu", (u_long)h->blob_file_lo); + __db_msg(env, "\tblob_file_hi: %lu", (u_long)h->blob_file_hi); return (0); } @@ -682,14 +701,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) { BINTERNAL *bi; BKEYDATA *bk; + BBLOB bl; HOFFPAGE a_hkd; + HBLOB hblob; QAMDATA *qp, *qep; RINTERNAL *ri; HEAPHDR *hh; HEAPSPLITHDR *hs; + HEAPBLOBHDR bhdr; db_indx_t dlen, len, i, *inp, max; db_pgno_t pgno; db_recno_t recno; + off_t blob_size; + db_seq_t blob_id; u_int32_t qlen; u_int8_t *ep, *hk, *p; int deleted, ret; @@ -899,6 +923,23 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) (u_long)a_hkd.tlen, (u_long)a_hkd.pgno); DB_MSGBUF_FLUSH(env, mbp); break; + case H_BLOB: + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + /* + * No point printing the blob file, it is + * likely not readable by humans. + */ + DB_MSGBUF_FLUSH(env, mbp); + break; default: DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu", @@ -925,6 +966,7 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) __db_proff(env, mbp, bi->data); break; default: + /* B_BLOB does not appear on internal pages. 
*/ DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu", (u_long)B_TYPE(bi->type)); @@ -950,6 +992,19 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) case B_OVERFLOW: __db_proff(env, mbp, bk); break; + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + DB_MSGBUF_FLUSH(env, mbp); + break; default: DB_MSGBUF_FLUSH(env, mbp); __db_msg(env, @@ -961,9 +1016,27 @@ __db_prpage_int(env, mbp, dbp, lead, h, pagesize, data, flags) break; case P_HEAP: hh = sp; - if (!F_ISSET(hh,HEAP_RECSPLIT)) + if (!F_ISSET(hh,HEAP_RECSPLIT) && + !F_ISSET(hh, HEAP_RECBLOB)) hdata = (u_int8_t *)hh + sizeof(HEAPHDR); - else { + else if (F_ISSET(hh, HEAP_RECBLOB)) { + memcpy(&bhdr, hh, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + __db_msgadd(env, mbp, "blob: id: %llu ", + (long long)blob_id); + GET_BLOB_SIZE(env, bhdr, blob_size, ret); + if (ret != 0) + __db_msgadd(env, mbp, + "blob: blob_size overflow. "); + __db_msgadd(env, mbp, "blob: size: %llu", + (long long)blob_size); + /* + * No point printing the blob file, it is + * likely not readable by humans. 
+ */ + DB_MSGBUF_FLUSH(env, mbp); + break; + } else { hs = sp; __db_msgadd(env, mbp, "split: 0x%02x tsize: %lu next: %lu.%lu ", @@ -1276,10 +1349,16 @@ __db_dump(dbp, subname, callback, handle, pflag, keyflag) ENV *env; db_recno_t recno; int is_recno, is_heap, ret, t_ret; + u_int32_t blob_threshold; void *pointer; env = dbp->env; is_heap = 0; + memset(&dataret, 0, sizeof(DBT)); + memset(&keyret, 0, sizeof(DBT)); + + if ((ret = __db_get_blob_threshold(dbp, &blob_threshold)) != 0) + return (ret); if ((ret = __db_prheader( dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0) @@ -1317,8 +1396,8 @@ retry: while ((ret = !is_heap ? DB_NEXT | DB_MULTIPLE_KEY : DB_NEXT )) == 0) { if (is_heap) { /* Never dump keys for HEAP */ - if ((ret = __db_prdbt( - &data, pflag, " ", handle, callback, 0, 0)) != 0) + if ((ret = __db_prdbt(&data, + pflag, " ", handle, callback, 0, 0, 0)) != 0) goto err; continue; } @@ -1337,17 +1416,24 @@ retry: while ((ret = if ((keyflag && (ret = __db_prdbt(&keyret, pflag, " ", - handle, callback, is_recno, 0)) != 0) || + handle, callback, is_recno, 0, 0)) != 0) || (ret = __db_prdbt(&dataret, pflag, " ", - handle, callback, 0, 0)) != 0) + handle, callback, 0, 0, 0)) != 0) goto err; } } if (ret == DB_BUFFER_SMALL) { - data.size = (u_int32_t)DB_ALIGN(data.size, 1024); - if ((ret = __os_realloc(env, data.size, &data.data)) != 0) - goto err; - data.ulen = data.size; + if (blob_threshold != 0 && data.size >= blob_threshold) { + if ((ret = __db_prblob(dbcp, &key, &data, pflag, + " ", handle, callback, is_heap, keyflag)) != 0) + goto err; + } else { + data.size = (u_int32_t)DB_ALIGN(data.size, 1024); + if ((ret = __os_realloc( + env, data.size, &data.data)) != 0) + goto err; + data.ulen = data.size; + } goto retry; } if (ret == DB_NOTFOUND) @@ -1365,14 +1451,153 @@ err: if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0) } /* + * __db_prblob + * Print a blob file. 
+ */ +static int +__db_prblob(dbc, key, data, checkprint, + prefix, handle, callback, is_heap, keyflag) + DBC *dbc; + DBT *key; + DBT *data; + int checkprint; + const char *prefix; + void *handle; + int (*callback) __P((void *, const void *)); + int is_heap; + int keyflag; +{ + DBC *local; + DBT partial; + int ret, t_ret; + off_t blob_size; + db_seq_t blob_id; + + local = NULL; + memset(&partial, 0, sizeof(DBT)); + partial.flags = DB_DBT_PARTIAL; + + if ((ret = __dbc_idup(dbc, &local, DB_POSITION)) != 0) + goto err; + + /* Move the cursor to the blob. */ + if ((ret = __dbc_get(local, key, &partial, DB_NEXT)) != 0) + return (ret); + + if ((ret = __dbc_get_blob_id(local, &blob_id)) != 0) { + /* + * It is possible this is not a blob. Non-blob items that are + * larger than the blob threshold can exist if the item was + * smaller than the threshold when created, then later updated + * to larger than the threshold value. + */ + if (ret == EINVAL) { + ret = 0; + data->size = (u_int32_t)DB_ALIGN(data->size, 1024); + if ((ret = __os_realloc( + dbc->env, data->size, &data->data)) != 0) + goto err; + data->ulen = data->size; + } + goto err; + } + + if (data->ulen < MEGABYTE) { + if ((data->data = realloc( + data->data, data->ulen = MEGABYTE)) == NULL) { + ret = ENOMEM; + goto err; + } + } + + if ((ret = __dbc_get_blob_size(local, &blob_size)) != 0) + goto err; + + if (keyflag && !is_heap && (ret = __db_prdbt( + key, checkprint, " ", handle, callback, 0, 0, 0)) != 0) + goto err; + + if ((ret = __db_prblob_id(local->dbp, blob_id, blob_size, + data, checkprint, prefix, handle, callback)) != 0) + goto err; + + /* Move the cursor. */ + ret = __dbc_get(dbc, key, &partial, DB_NEXT); + +err: if (local != NULL) { + if ((t_ret = __dbc_close(local)) != 0 && ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __db_prblob_id -- + * Print a blob file identified by the given id. 
+ */ +static int +__db_prblob_id(dbp, blob_id, + blob_size, data, checkprint, prefix, handle, callback) + DB *dbp; + db_seq_t blob_id; + off_t blob_size; + DBT *data; + int checkprint; + const char *prefix; + void *handle; + int (*callback) __P((void *, const void *)); +{ + DB_FH *fhp; + const char *pre; + int ret, skip_newline, t_ret; + off_t left, offset; + + fhp = NULL; + offset = 0; + + if ((ret = __blob_file_open( + dbp, &fhp, blob_id, DB_FOP_READONLY, 1)) != 0) + goto err; + + left = blob_size; + while (left > 0) { + if ((ret = __blob_file_read( + dbp->env, fhp, data, offset, data->ulen)) != 0) + goto err; + if (offset == 0) + pre = prefix; + else + pre = NULL; + skip_newline = data->size < left ? 1 : 0; + if ((ret = __db_prdbt(data, checkprint, pre, + handle, callback, 0, 0, skip_newline)) != 0) + goto err; + if (data->size > left) + left = 0; + else + left = left - data->size; + offset = offset + data->size; + } + +err: if (fhp != NULL) { + if ((t_ret = __os_closehandle(dbp->env, fhp)) != 0 && ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* * __db_prdbt -- * Print out a DBT data element. 
* * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *, - * PUBLIC: int (*)(void *, const void *), int, int)); + * PUBLIC: int (*)(void *, const void *), int, int, int)); */ int -__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) +__db_prdbt(dbtp, checkprint, + prefix, handle, callback, is_recno, is_heap, no_newline) DBT *dbtp; int checkprint; const char *prefix; @@ -1380,16 +1605,17 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) int (*callback) __P((void *, const void *)); int is_recno; int is_heap; + int no_newline; { - static const u_char hex[] = "0123456789abcdef"; db_recno_t recno; DB_HEAP_RID rid; - size_t len; + size_t count, len; int ret; + u_int8_t *p; #define DBTBUFLEN 100 - u_int8_t *p, *hp; - char buf[DBTBUFLEN], hbuf[DBTBUFLEN]; + char buf[DBTBUFLEN], hexbuf[2 * DBTBUFLEN + 1]; + ret = 0; /* * !!! * This routine is the routine that dumps out items in the format @@ -1409,13 +1635,8 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) /* If we're printing data as hex, print keys as hex too. */ if (!checkprint) { - for (len = strlen(buf), p = (u_int8_t *)buf, - hp = (u_int8_t *)hbuf; len-- > 0; ++p) { - *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4]; - *hp++ = hex[*p & 0x0f]; - } - *hp = '\0'; - ret = callback(handle, hbuf); + (void)__db_tohex(buf, strlen(buf), hexbuf); + ret = callback(handle, hexbuf); } else ret = callback(handle, buf); @@ -1433,44 +1654,46 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, is_heap) /* If we're printing data as hex, print keys as hex too. 
*/ if (!checkprint) { - for (len = strlen(buf), p = (u_int8_t *)buf, - hp = (u_int8_t *)hbuf; len-- > 0; ++p) { - *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4]; - *hp++ = hex[*p & 0x0f]; - } - *hp = '\0'; - ret = callback(handle, hbuf); + (void)__db_tohex(buf, strlen(buf), hexbuf); + ret = callback(handle, hexbuf); } else ret = callback(handle, buf); if (ret != 0) return (ret); } else if (checkprint) { + /* + * Prepare buf for the 'isprint()' case: printable single char + * strings; prepare hexbuf for the other case '\<2 hex digits>'. + */ + buf[1] = '\0'; + hexbuf[0] = '\\'; for (len = dbtp->size, p = dbtp->data; len--; ++p) if (isprint((int)*p)) { if (*p == '\\' && (ret = callback(handle, "\\")) != 0) return (ret); - snprintf(buf, DBTBUFLEN, "%c", *p); + buf[0] = (char)*p; if ((ret = callback(handle, buf)) != 0) return (ret); } else { - snprintf(buf, DBTBUFLEN, "\\%c%c", - hex[(u_int8_t)(*p & 0xf0) >> 4], - hex[*p & 0x0f]); - if ((ret = callback(handle, buf)) != 0) + (void)__db_tohex(p, 1, hexbuf + 1); + if ((ret = callback(handle, hexbuf)) != 0) return (ret); } } else - for (len = dbtp->size, p = dbtp->data; len--; ++p) { - snprintf(buf, DBTBUFLEN, "%c%c", - hex[(u_int8_t)(*p & 0xf0) >> 4], - hex[*p & 0x0f]); - if ((ret = callback(handle, buf)) != 0) + for (len = dbtp->size, p = dbtp->data, count = DBTBUFLEN; + len > 0; len -= count, p += count) { + if (count > len) + count = len; + (void)__db_tohex(p, count, hexbuf); + if ((ret = callback(handle, hexbuf)) != 0) return (ret); } - - return (callback(handle, "\n")); + if (no_newline == 0) + return (callback(handle, "\n")); + else + return (ret); } /* @@ -1598,7 +1821,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) goto err; DB_INIT_DBT(dbt, subname, strlen(subname)); if ((ret = __db_prdbt(&dbt, 1, - NULL, handle, callback, 0, 0)) != 0) + NULL, handle, callback, 0, 0, 0)) != 0) goto err; } switch (dbtype) { @@ -1868,7 +2091,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, 
callback, vdp, meta_pgno) goto err; for (i = 0; i < tmp_u_int32 - 1; i++) if ((ret = __db_prdbt(&keys[i], - pflag, " ", handle, callback, 0, 0)) != 0) + pflag, " ", handle, callback, 0, 0, 0)) != 0) goto err; } } @@ -1954,3 +2177,33 @@ __db_dbtype_to_string(type) } return ("UNKNOWN TYPE"); } + +/* + * __db_tohex -- + * Generate a hex string representation of a byte array. + * The size of the destination must be at least 2*len + 1 bytes long, + * to allow for the '\0' terminator, which is always added. + * + * PUBLIC: char *__db_tohex __P((const void *, size_t, char *)); + */ +char * +__db_tohex(source, len, dest) + const void *source; + size_t len; + char *dest; +{ + static const char hex[] = "0123456789abcdef"; + const u_int8_t *s; + char *d; + + s = source; + d = dest; + while (len > 0) { + *d++ = hex[(*s & 0xf0) >> 4]; + *d++ = hex[*s & 0x0f]; + s++; + len--; + } + *d = '\0'; + return ((char *)dest); +} diff --git a/src/db/db_rec.c b/src/db/db_rec.c index 8ba1124e..98b29b22 100644 --- a/src/db/db_rec.c +++ b/src/db/db_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -1194,8 +1194,9 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info) DB_LSN copy_lsn; DB_MPOOLFILE *mpf; PAGE *pagep; - int cmp_n, cmp_p, ret, type; + int cmp_n, cmp_p, ret, t_ret, type; + pagep = NULL; ip = ((DB_TXNHEAD *)info)->thread_info; REC_PRINT(__db_pg_init_print); REC_INTRO(__db_pg_init_read, ip, 0); @@ -1247,11 +1248,12 @@ __db_pg_init_recover(env, dbtp, lsnp, op, info) memcpy((u_int8_t*)pagep + HOFFSET(pagep), argp->data.data, argp->data.size); } - if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) - goto out; done: *lsnp = argp->prev_lsn; out: + if (pagep != NULL && (t_ret = + __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; REC_CLOSE; } diff --git a/src/db/db_reclaim.c b/src/db/db_reclaim.c index b902769a..abae33d9 100644 --- a/src/db/db_reclaim.c +++ b/src/db/db_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -181,6 +181,7 @@ __db_truncate_callback(dbc, p, cookie, putp) switch (*H_PAIRDATA(dbp, p, indx)) { case H_OFFDUP: break; + case H_BLOB: case H_OFFPAGE: case H_KEYDATA: ++*countp; diff --git a/src/db/db_remove.c b/src/db/db_remove.c index 591a29b2..d6118fae 100644 --- a/src/db/db_remove.c +++ b/src/db/db_remove.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -18,7 +18,7 @@ #include "dbinc/txn.h" static int __db_dbtxn_remove __P((DB *, - DB_THREAD_INFO *, DB_TXN *, const char *, const char *)); + DB_THREAD_INFO *, DB_TXN *, const char *, const char *, APPNAME)); static int __db_subdb_remove __P((DB *, DB_THREAD_INFO *, DB_TXN *, const char *, const char *, u_int32_t)); @@ -264,7 +264,7 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags) /* Handle transactional file removes separately. */ if (IS_REAL_TXN(txn)) { - ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb); + ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb, DB_APP_DATA); goto err; } @@ -293,6 +293,10 @@ __db_remove_int(dbp, ip, txn, name, subdb, flags) (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0) goto err; + if (dbp->db_am_remove == NULL && + (ret = __blob_del_all(dbp, txn, 0)) != 0) + goto err; + ret = F_ISSET(dbp, DB_AM_INMEM) ? __db_inmem_remove(dbp, NULL, real_name) : __fop_remove(env, @@ -407,6 +411,10 @@ __db_subdb_remove(dbp, ip, txn, name, subdb, flags) txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0) goto err; + if (sdbp->blob_threshold != 0) + if ((ret = __blob_del_all(sdbp, txn, 0)) != 0) + goto err; + DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name); /* Have the handle locked so we will not lock pages. 
*/ @@ -460,18 +468,21 @@ err: } static int -__db_dbtxn_remove(dbp, ip, txn, name, subdb) +__db_dbtxn_remove(dbp, ip, txn, name, subdb, appname) DB *dbp; DB_THREAD_INFO *ip; DB_TXN *txn; const char *name, *subdb; + APPNAME appname; { ENV *env; int ret; char *tmpname; + u_int32_t flags; env = dbp->env; tmpname = NULL; + flags = DB_NOSYNC; /* * This is a transactional remove, so we have to keep the name @@ -488,7 +499,12 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb) DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name); if ((ret = __db_rename_int(dbp, - txn->thread_info, txn, name, subdb, tmpname, DB_NOSYNC)) != 0) + txn->thread_info, txn, name, subdb, tmpname, flags)) != 0) + goto err; + + /* Delete all blob files, if this database supports blobs. */ + if (appname != DB_APP_BLOB && (dbp->blob_file_id != 0 || + dbp->blob_sdb_id != 0) && (ret = __blob_del_all(dbp, txn, 0)) != 0) goto err; /* @@ -501,7 +517,7 @@ __db_dbtxn_remove(dbp, ip, txn, name, subdb) ret = F_ISSET(dbp, DB_AM_INMEM) ? __db_inmem_remove(dbp, txn, tmpname) : __fop_remove(env, - txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA, + txn, dbp->fileid, tmpname, &dbp->dirname, appname, F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0); DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name); diff --git a/src/db/db_rename.c b/src/db/db_rename.c index 2812b948..5b2bed42 100644 --- a/src/db/db_rename.c +++ b/src/db/db_rename.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -285,10 +285,11 @@ __db_rename_int(dbp, ip, txn, name, subdb, newname, flags) * taken care of in the fop layer. 
*/ if (IS_REAL_TXN(txn)) { - if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0) + if ((ret = + __fop_dummy(dbp, txn, old, newname, DB_APP_DATA)) != 0) goto err; } else { - if ((ret = __fop_dbrename(dbp, old, newname)) != 0) + if ((ret = __fop_dbrename(dbp, old, newname, DB_APP_DATA)) != 0) goto err; } diff --git a/src/db/db_ret.c b/src/db/db_ret.c index 709605f6..ddd0ef51 100644 --- a/src/db/db_ret.c +++ b/src/db/db_ret.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -29,18 +29,27 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) void **memp; u_int32_t *memsize; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; DB *dbp; + ENV *env; + HBLOB hblob; + HEAPBLOBHDR bhdr; HEAPHDR *hdr; + db_seq_t blob_id; + int ret; HOFFPAGE ho; + off_t blob_size; u_int32_t len; u_int8_t *hk; void *data; if (F_ISSET(dbt, DB_DBT_READONLY)) return (0); + ret = 0; dbp = dbc->dbp; + env = dbp->env; switch (TYPE(h)) { case P_HASH_UNSORTED: @@ -50,6 +59,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) memcpy(&ho, hk, sizeof(HOFFPAGE)); return (__db_goff(dbc, dbt, ho.tlen, ho.pgno, memp, memsize)); + } else if (HPAGE_PTYPE(hk) == H_BLOB) { + /* Get the record instead of the blob item. 
*/ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = HBLOB_SIZE; + break; + } + memcpy(&hblob, hk, HBLOB_SIZE); + blob_id = (db_seq_t)hblob.id; + GET_BLOB_SIZE(env, hblob, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); } len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx); data = HKEYDATA_DATA(hk); @@ -58,6 +81,21 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) hdr = (HEAPHDR *)P_ENTRY(dbp, h, indx); if (F_ISSET(hdr,(HEAP_RECSPLIT | HEAP_RECFIRST))) return (__heapc_gsplit(dbc, dbt, memp, memsize)); + else if (F_ISSET(hdr, HEAP_RECBLOB)) { + /* Get the record instead of the blob item. */ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = HEAPBLOBREC_SIZE; + break; + } + memcpy(&bhdr, hdr, HEAPBLOBREC_SIZE); + blob_id = (db_seq_t)bhdr.id; + GET_BLOB_SIZE(env, bhdr, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); + } len = hdr->size; data = (u_int8_t *)hdr + sizeof(HEAPHDR); break; @@ -69,6 +107,20 @@ __db_ret(dbc, h, indx, dbt, memp, memsize) bo = (BOVERFLOW *)bk; return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, memp, memsize)); + } else if (B_TYPE(bk->type) == B_BLOB) { + /* Get the record instead of the blob item. */ + if (F_ISSET(dbt, DB_DBT_BLOB_REC)) { + data = P_ENTRY(dbp, h, indx); + len = BBLOB_SIZE; + break; + } + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0) + return (ret); + return (__blob_get( + dbc, dbt, blob_id, blob_size, memp, memsize)); } len = bk->len; data = bk->data; @@ -167,3 +219,71 @@ __db_retcopy(env, dbt, data, len, memp, memsize) return (ret); } + +/* + * __db_dbt_clone -- + * Clone a DBT from another DBT. + * The input dest DBT must be a zero initialized DBT that will be populated. 
+ * The function does not allocate a dest DBT to allow for cloning into stack + * or locally allocated variables. It is the callers responsibility to free + * the memory allocated in dest->data. + * + * PUBLIC: int __db_dbt_clone __P((ENV *, DBT *, const DBT *)); + */ +int +__db_dbt_clone(env, dest, src) + ENV *env; + DBT *dest; + const DBT *src; +{ + u_int32_t err_flags; + int ret; + + DB_ASSERT(env, dest->data == NULL); + + ret = 0; + + /* The function does not support the following DBT flags. */ + err_flags = DB_DBT_MALLOC | DB_DBT_REALLOC | + DB_DBT_MULTIPLE | DB_DBT_PARTIAL; + if (F_ISSET(src, err_flags)) { + __db_errx(env, DB_STR("0758", + "Unsupported flags when cloning the DBT.")); + return (EINVAL); + } + + if ((ret = __os_malloc(env, src->size, &dest->data)) != 0) + return (ret); + + memcpy(dest->data, src->data, src->size); + dest->ulen = src->size; + dest->size = src->size; + dest->flags = DB_DBT_USERMEM; + + return (ret); +} + +/* + * __db_dbt_clone_free -- + * Free a DBT cloned by __db_dbt_clone + * + * PUBLIC: int __db_dbt_clone_free __P((ENV *, DBT *)); + */ +int +__db_dbt_clone_free(env, dbt) + ENV *env; + DBT *dbt; +{ + /* Currently only DB_DBT_USERMEM is supported. */ + if (dbt->flags != DB_DBT_USERMEM) { + __db_errx(env, DB_STR("0759", + "Unsupported flags when freeing the cloned DBT.")); + return (EINVAL); + } + + if (dbt->data != NULL) + __os_free(env, dbt->data); + dbt->size = dbt->ulen = 0; + + return (0); +} diff --git a/src/db/db_setid.c b/src/db/db_setid.c index 697c3ff7..5c61a139 100644 --- a/src/db/db_setid.c +++ b/src/db/db_setid.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/db/db_setlsn.c b/src/db/db_setlsn.c index 1a3280ed..acee80f6 100644 --- a/src/db/db_setlsn.c +++ b/src/db/db_setlsn.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_sort_multiple.c b/src/db/db_sort_multiple.c index c5e2e941..7facb80e 100644 --- a/src/db/db_sort_multiple.c +++ b/src/db/db_sort_multiple.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" @@ -34,7 +34,7 @@ __db_compare_both(db, akey, adata, bkey, bdata) t = (BTREE *)db->bt_internal; - cmp = t->bt_compare(db, akey, bkey); + cmp = t->bt_compare(db, akey, bkey, NULL); if (cmp != 0) return cmp; if (!F_ISSET(db, DB_AM_DUPSORT)) return (0); @@ -44,9 +44,9 @@ __db_compare_both(db, akey, adata, bkey, bdata) #ifdef HAVE_COMPRESSION if (DB_IS_COMPRESSED(db)) - return t->compress_dup_compare(db, adata, bdata); + return t->compress_dup_compare(db, adata, bdata, NULL); #endif - return db->dup_compare(db, adata, bdata); + return db->dup_compare(db, adata, bdata, NULL); } #define DB_SORT_SWAP(a, ad, b, bd) \ diff --git a/src/db/db_stati.c b/src/db/db_stati.c index 61744e81..b7367f37 100644 --- a/src/db/db_stati.c +++ b/src/db/db_stati.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ diff --git a/src/db/db_truncate.c b/src/db/db_truncate.c index 0eeb0c64..d57a23b2 100644 --- a/src/db/db_truncate.c +++ b/src/db/db_truncate.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -191,6 +191,10 @@ __db_truncate(dbp, ip, txn, countp) if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) ret = t_ret; + /* Delete all blob files. */ + if (ret == 0) + ret = __blob_del_all(dbp, txn, 1); + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL); DB_TEST_RECOVERY_LABEL diff --git a/src/db/db_upg.c b/src/db/db_upg.c index de5d0dc7..7dcc3b1c 100644 --- a/src/db/db_upg.c +++ b/src/db/db_upg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -13,6 +13,7 @@ #include "dbinc/db_swap.h" #include "dbinc/btree.h" #include "dbinc/hash.h" +#include "dbinc/heap.h" #include "dbinc/qam.h" /* @@ -98,6 +99,27 @@ static int (* const func_46_list[P_PAGETYPE_MAX]) NULL, /* P_IHEAP */ }; +static int (* const func_60_list[P_PAGETYPE_MAX]) + __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = { + NULL, /* P_INVALID */ + NULL, /* __P_DUPLICATE */ + NULL, /* P_HASH_UNSORTED */ + NULL, /* P_IBTREE */ + NULL, /* P_IRECNO */ + __bam_60_lbtree, /* P_LBTREE */ + NULL, /* P_LRECNO */ + NULL, /* P_OVERFLOW */ + __ham_60_hashmeta, /* P_HASHMETA */ + __bam_60_btreemeta, /* P_BTREEMETA */ + NULL, /* P_QAMMETA */ + NULL, /* P_QAMDATA */ + NULL, /* P_LDUP */ + __ham_60_hash, /* P_HASH */ + __heap_60_heapmeta, /* P_HEAPMETA */ + __heap_60_heap, /* P_HEAP */ + NULL, /* P_IHEAP */ +}; + static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const []) (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *)); static int __db_set_lastpgno __P((DB *, char *, DB_FH *)); @@ -181,6 +203,34 @@ __db_upgrade(dbp, fname, flags) goto err; /* FALLTHROUGH */ case 9: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. 
+ */ + meta = (DBMETA *)mbuf; + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0777", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 10: break; default: __db_errx(env, DB_STR_A("0666", @@ -307,6 +357,34 @@ __db_upgrade(dbp, fname, flags) /* FALLTHROUGH */ case 9: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + meta = (DBMETA*)mbuf; + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. + */ + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0778", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 10: break; default: __db_errx(env, DB_STR_A("0668", @@ -317,9 +395,45 @@ __db_upgrade(dbp, fname, flags) } break; case DB_HEAPMAGIC: - /* - * There's no upgrade needed for Heap yet. - */ + switch (((DBMETA *)mbuf)->version) { + case 1: + /* + * Various blob ids and size use two u_int32_t values + * to represent 64 bit integers in early 6.0. Change + * those values to 64 bit integers. + */ + meta = (DBMETA*)mbuf; + memcpy(&dbp->pgsize, + &meta->pagesize, sizeof(u_int32_t)); + /* + * Read the encrypt_alg and chksum fields from the + * metadata page. 
+ */ + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (meta->encrypt_alg != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, DB_STR("0779", +"Attempt to upgrade an encrypted database without providing a password.")); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + if ((ret = __db_page_pass(dbp, + real_name, flags, func_60_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 2: + break; + default: + __db_errx(env, DB_STR_A("0776", + "%s: unsupported heap version: %lu", + "%s %lu"), real_name, + (u_long)((DBMETA *)mbuf)->version); + ret = DB_OLD_VERSION; + goto err; + } break; case DB_QAMMAGIC: switch (((DBMETA *)mbuf)->version) { diff --git a/src/db/db_upg_opd.c b/src/db/db_upg_opd.c index 992115ad..6f6dfb71 100644 --- a/src/db/db_upg_opd.c +++ b/src/db/db_upg_opd.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -37,6 +37,9 @@ static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t)); * __db_31_offdup -- * Convert 3.0 off-page duplicates to 3.1 off-page duplicates. * + * This code and its descendants should be removed when support for + * upgrading from a 3.0 database format is removed. + * * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *)); */ int @@ -317,7 +320,7 @@ __db_build_ri(dbp, fhp, ipage, page, indx, nomemp) /* * __db_up_ovref -- - * Increment/decrement the reference count on an overflow page. + * Increment the reference count on an overflow page. */ static int __db_up_ovref(dbp, fhp, pgno) diff --git a/src/db/db_vrfy.c b/src/db/db_vrfy.c index 9cb94ad2..a8c80cae 100644 --- a/src/db/db_vrfy.c +++ b/src/db/db_vrfy.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -553,7 +553,7 @@ __db_vrfy_pagezero(dbp, vdp, fhp, name, flags) if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0) return (ret); - if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) { + if ((ret = __db_chk_meta(env, dbp, meta, DB_CHK_META)) != 0) { EPRINT((env, DB_STR_A("0522", "Page %lu: metadata page corrupted", "%lu"), (u_long)PGNO_BASE_MD)); @@ -920,7 +920,7 @@ err1: if (ret == 0) * If we've seen a Queue metadata page, we may need to walk Queue * extent pages that won't show up between 0 and vdp->last_pgno. */ - if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret = + if (F_ISSET(vdp, SALVAGE_QMETA_SET) && (t_ret = __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) { if (ret == 0) ret = t_ret; @@ -1563,6 +1563,10 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) * If we don't have FTRUNCATE then mpool could include some * zeroed pages at the end of the file, we assume the meta page * is correct. Queue does not update the meta page's last_pgno. + * + * We have seen one false positive after a failure while rolling the log + * forward, last_pgno was updated and the file had not yet been + * extended. [#18418] */ if (pgno == PGNO_BASE_MD && dbtype != DB_QUEUE && meta->last_pgno != vdp->last_pgno) { @@ -2401,6 +2405,15 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp) * length, so it's not possible to certify it as safe. */ switch (B_TYPE(bk->type)) { + case B_BLOB: + len = bk->len; + if (len != BBLOB_DSIZE) { + EPRINT((env, DB_STR_A("0771", + "Page %lu: item %lu illegal size.", + "%lu %lu"), (u_long)pgno, (u_long)i)); + return (DB_VERIFY_BAD); + } + break; case B_KEYDATA: len = bk->len; break; diff --git a/src/db/db_vrfy_stub.c b/src/db/db_vrfy_stub.c index 5037f33e..a9eed84c 100644 --- a/src/db/db_vrfy_stub.c +++ b/src/db/db_vrfy_stub.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/db/db_vrfyutil.c b/src/db/db_vrfyutil.c index d72e1188..3a64bd50 100644 --- a/src/db/db_vrfyutil.c +++ b/src/db/db_vrfyutil.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -43,6 +43,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp) if ((ret = __db_create_internal(&cdbp, env, 0)) != 0) goto err; + if ((ret = __db_set_blob_threshold(cdbp, 0, 0)) != 0) + goto err; + if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0) goto err; @@ -60,6 +63,9 @@ __db_vrfy_dbinfo_create(env, ip, pgsize, vdpp) if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0) goto err; + if ((ret = __db_set_blob_threshold(pgdbp, 0, 0)) != 0) + goto err; + if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0) goto err; @@ -928,5 +934,6 @@ __db_vrfy_prdbt(dbtp, checkprint, prefix, } return ( __db_prdbt(dbtp, checkprint, - prefix, handle, callback, is_recno, is_heap)); + prefix, handle, callback, is_recno, is_heap, + vdp != NULL && F_ISSET(vdp, SALVAGE_STREAM_BLOB) ? 1 : 0)); } diff --git a/src/db/partition.c b/src/db/partition.c index f8beaf16..86491ba3 100644 --- a/src/db/partition.c +++ b/src/db/partition.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -32,13 +32,12 @@ static int __partc_writelock __P((DBC*)); static int __partition_chk_meta __P((DB *, DB_THREAD_INFO *, DB_TXN *, u_int32_t)); static int __partition_setup_keys __P((DBC *, - DB_PARTITION *, DBMETA *, u_int32_t)); + DB_PARTITION *, u_int32_t, u_int32_t)); static int __part_key_cmp __P((const void *, const void *)); static inline void __part_search __P((DB *, DB_PARTITION *, DBT *, u_int32_t *)); -static char *Alloc_err = DB_STR_A("0644", - "Partition open failed to allocate %d bytes", "%d"); +#define ALLOC_ERR DB_STR_A("0764","Partition failed to allocate %d bytes","%d") /* * Allocate a partition cursor and copy flags to the partition cursor. @@ -70,20 +69,27 @@ static inline void __part_search(dbp, part, key, part_idp) { db_indx_t base, indx, limit; int cmp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); + size_t pos, pos_h, pos_l; DB_ASSERT(dbp->env, part->nparts != 0); COMPQUIET(cmp, 0); COMPQUIET(indx, 0); + pos_h = 0; + pos_l = 0; func = ((BTREE *)dbp->bt_internal)->bt_compare; DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) { + pos = pos_l > pos_h ? 
pos_h : pos_l; DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX); - cmp = func(dbp, key, &part->keys[indx]); + cmp = func(dbp, key, &part->keys[indx], &pos); if (cmp == 0) break; - if (cmp > 0) + if (cmp > 0) { DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX); + pos_l = pos; + } else + pos_h = pos; } if (cmp == 0) *part_idp = indx; @@ -146,7 +152,8 @@ __partition_set(dbp, parts, keys, callback) { DB_PARTITION *part; ENV *env; - int ret; + u_int32_t i; + int ret, t_ret; DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition"); env = dbp->dbenv->env; @@ -155,6 +162,11 @@ __partition_set(dbp, parts, keys, callback) __db_errx(env, DB_STR("0646", "Must specify at least 2 partitions.")); return (EINVAL); + } else if (parts > PART_MAXIMUM) { + __db_errx(env, DB_STR_A("0772", + "Must not specify more than %u partitions.", "%u"), + (unsigned int)PART_MAXIMUM); + return (EINVAL); } if (keys == NULL && callback == NULL) { @@ -178,11 +190,59 @@ bad: __db_errx(env, DB_STR("0648", (part->callback != NULL && keys != NULL)) goto bad; + /* + * Free a key array that was allocated by an earlier set_partition call. + */ + if (part->keys != NULL) { + for (i = 0; i < part->nparts - 1; i++) { + /* + * Always free all entries in the key array and return + * the first error code. + */ + if ((t_ret = __db_dbt_clone_free(dbp->env, + &part->keys[i])) != 0 && ret == 0) + ret = t_ret; + } + __os_free(dbp->env, part->keys); + part->keys = NULL; + } + + if (ret != 0) + return (ret); + part->nparts = parts; - part->keys = keys; part->callback = callback; - return (0); + /* + * Take a copy of the users key array otherwise we cannot be sure + * that the memory will still be valid when the database is opened. 
+ */ + if (keys != NULL) { + if ((ret = __os_calloc(dbp->env, + part->nparts - 1, sizeof(DBT), &part->keys)) != 0) + goto err; + + for (i = 0, parts = 0; i < part->nparts - 1; i++, parts++) + if ((ret = __db_dbt_clone(dbp->env, + &part->keys[i], &keys[i])) != 0) + goto err; + } + +err: if (ret != 0 && part->keys != NULL) { + /* + * Always free those entries cloned successfully in the key + * array and the one which fails in __db_dbt_clone, and + * return the first error code. As ret != 0 here, so it is + * safe to ignore any error from __db_dbt_clone_free. + */ + for (i = 0; i < parts; i++) + (void)__db_dbt_clone_free(dbp->env, &part->keys[i]); + if (parts < part->nparts - 1 && part->keys[parts].data != NULL) + __os_free(dbp->env, part->keys[parts].data); + __os_free(dbp->env, part->keys); + part->keys = NULL; + } + return (ret); } /* @@ -288,15 +348,16 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) if ((ret = __os_calloc(env, part->nparts, sizeof(*part->handles), &part->handles)) != 0) { - __db_errx(env, - Alloc_err, part->nparts * sizeof(*part->handles)); + __db_errx(env, ALLOC_ERR, + (int)(part->nparts * sizeof(*part->handles))); goto err; } DB_ASSERT(env, fname != NULL); if ((ret = __os_malloc(env, strlen(fname) + PART_LEN + 1, &name)) != 0) { - __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(fname) + PART_LEN + 1)); goto err; } @@ -330,6 +391,9 @@ __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) part_db->dup_compare = dbp->dup_compare; part_db->app_private = dbp->app_private; part_db->api_internal = dbp->api_internal; + part_db->blob_threshold = dbp->blob_threshold; + part_db->blob_file_id = dbp->blob_file_id; + part_db->blob_sdb_id = dbp->blob_sdb_id; if (dbp->type == DB_BTREE) __bam_copy_config(dbp, part_db, part->nparts); @@ -388,7 +452,8 @@ __partition_chk_meta(dbp, ip, txn, flags) DB_MPOOLFILE *mpf; ENV *env; db_pgno_t base_pgno; - int ret, t_ret; + int ret, 
set_keys, t_ret; + u_int32_t pgsize; dbc = NULL; meta = NULL; @@ -397,6 +462,14 @@ __partition_chk_meta(dbp, ip, txn, flags) mpf = dbp->mpf; env = dbp->env; ret = 0; + set_keys = 0; + + /* + * Just to fix the lint warning. + * The real value will be set later, and we will + * only use the value after being set properly. + */ + pgsize = dbp->pgsize; /* Get a cursor on the main db. */ dbp->p_internal = NULL; @@ -475,10 +548,12 @@ __partition_chk_meta(dbp, ip, txn, flags) } } else if (meta->magic != DB_BTREEMAGIC) { __db_errx(env, DB_STR("0658", - "Partitioning only supported on BTREE nad HASH.")); + "Partitioning only supported on BTREE and HASH.")); ret = EINVAL; - } else - ret = __partition_setup_keys(dbc, part, meta, flags); + } else { + set_keys = 1; + pgsize = meta->pagesize; + } err: /* Put the metadata page back. */ if (meta != NULL && (t_ret = __memp_fput(mpf, @@ -487,6 +562,15 @@ err: /* Put the metadata page back. */ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) ret = t_ret; + /* + * We can only call __partition_setup_keys after putting + * the meta page and releasing the meta lock, or self-deadlock + * will occur. + */ + if (ret == 0 && set_keys && (t_ret = + __partition_setup_keys(dbc, part, pgsize, flags)) != 0) + ret = t_ret; + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) ret = t_ret; @@ -502,7 +586,7 @@ err: /* Put the metadata page back. */ struct key_sort { DB *dbp; DBT *key; - int (*compare) __P((DB *, const DBT *, const DBT *)); + int (*compare) __P((DB *, const DBT *, const DBT *, size_t *)); }; static int __part_key_cmp(a, b) @@ -512,7 +596,7 @@ static int __part_key_cmp(a, b) ka = a; kb = b; - return (ka->compare(ka->dbp, ka->key, kb->key)); + return (ka->compare(ka->dbp, ka->key, kb->key, NULL)); } /* * __partition_setup_keys -- @@ -520,25 +604,22 @@ static int __part_key_cmp(a, b) * are creating a partitioned database. 
*/ static int -__partition_setup_keys(dbc, part, meta, flags) +__partition_setup_keys(dbc, part, pgsize, flags) DBC *dbc; DB_PARTITION *part; - DBMETA *meta; - u_int32_t flags; + u_int32_t flags, pgsize; { BTREE *t; DB *dbp; - DBT data, key, *keys, *kp; + DBT data, key, *keys, *kp, *okp; ENV *env; - u_int32_t ds, i, j; - u_int8_t *dd; + db_pgno_t last_pgno; + u_int32_t cgetflags, i, j; + size_t dsize; struct key_sort *ks; - int have_keys, ret; - int (*compare) __P((DB *, const DBT *, const DBT *)); - void *dp; + int have_keys, ret, t_ret; + int (*compare) __P((DB *, const DBT *, const DBT *, size_t *)); - COMPQUIET(dd, NULL); - COMPQUIET(ds, 0); memset(&data, 0, sizeof(data)); memset(&key, 0, sizeof(key)); ks = NULL; @@ -549,6 +630,9 @@ __partition_setup_keys(dbc, part, meta, flags) /* Need to just read the main database. */ dbp->p_internal = NULL; have_keys = 0; + dsize = 0; + + keys = part->keys; /* First verify that things what we expect. */ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) { @@ -581,11 +665,15 @@ __partition_setup_keys(dbc, part, meta, flags) } if (LF_ISSET(DB_CREATE) && have_keys == 0) { - /* Insert the keys into the master database. */ + /* + * Insert the keys into the master database. We will also + * compute the total size of the keys for later use. + */ for (i = 0; i < part->nparts - 1; i++) { if ((ret = __db_put(dbp, dbc->thread_info, dbc->txn, &part->keys[i], &data, 0)) != 0) goto err; + dsize += part->keys[i].size; } /* @@ -604,39 +692,71 @@ __partition_setup_keys(dbc, part, meta, flags) } done: if (F_ISSET(part, PART_RANGE)) { /* - * Allocate one page to hold the keys plus space at the - * end of the buffer to put an array of DBTs. If there - * is not enough space __dbc_get will return how much - * is needed and we realloc. + * If we just did the insert, we have known the total size of + * the keys. 
Otherwise, the keys must have been in the database, + * and we can calculate the size by checking the last pgno of + * the corresponding mpoolfile. + * + * We make the size aligned at 1024 for performance. */ + if (dsize == 0) { + ret = __memp_get_last_pgno(dbp->mpf, &last_pgno); + if (ret != 0) + goto err; + if (last_pgno > 1) + last_pgno--; + dsize = last_pgno * pgsize; + } + dsize = DB_ALIGN(dsize, 1024); + if ((ret = __os_malloc(env, - meta->pagesize + (sizeof(DBT) * part->nparts), + dsize + (sizeof(DBT) * part->nparts), &part->data)) != 0) { - __db_errx(env, Alloc_err, meta->pagesize); + __db_errx(env, ALLOC_ERR, (int)dsize); goto err; } + memset(part->data, 0, + dsize + (sizeof(DBT) * part->nparts)); + + kp = okp = (DBT *) + ((u_int8_t *)part->data + dsize); memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); - data.data = part->data; - data.ulen = meta->pagesize; data.flags = DB_DBT_USERMEM; -again: if ((ret = __dbc_get(dbc, &key, &data, - DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) { - if ((ret = __os_realloc(env, - data.size + (sizeof(DBT) * part->nparts), - &part->data)) != 0) + j = 0; + cgetflags = DB_FIRST; + while ((ret = __dbc_get(dbc, &key, &data, cgetflags)) == 0) { + /* It is an error if we get more keys than expect. */ + if ((u_int32_t)(kp - okp) > part->nparts) { + ret = EINVAL; goto err; - data.data = part->data; - data.ulen = data.size; - goto again; + } + kp->size = key.size; + kp->data = (u_int8_t *)part->data + j; + /* It is an error if the keys overflow the space. */ + if (j + kp->size > dsize) { + ret = EINVAL; + goto err; + } + memcpy(kp->data, key.data, kp->size); + j += kp->size; + cgetflags = DB_NEXT; + kp++; } + + /* + * We should get part->nparts keys back, otherwise it means + * the passed-in keys are not valid. + */ + if (ret == DB_NOTFOUND && (u_int32_t)(kp - okp) == part->nparts) + ret = 0; + if (ret == 0) { /* * They passed in keys, they must match. 
*/ - keys = NULL; compare = NULL; - if (have_keys == 1 && (keys = part->keys) != NULL) { + if (have_keys == 1 && keys != NULL) { t = dbc->dbp->bt_internal; compare = t->bt_compare; if ((ret = __os_malloc(env, (part->nparts - 1) @@ -651,20 +771,15 @@ again: if ((ret = __dbc_get(dbc, &key, &data, qsort(ks, (size_t)part->nparts - 1, sizeof(struct key_sort), __part_key_cmp); } - DB_MULTIPLE_INIT(dp, &data); part->keys = (DBT *) - ((u_int8_t *)part->data + data.size); + ((u_int8_t *)part->data + dsize); + F_SET(part, PART_KEYS_SETUP); j = 0; for (kp = part->keys; kp < &part->keys[part->nparts]; kp++, j++) { - DB_MULTIPLE_KEY_NEXT(dp, - &data, kp->data, kp->size, dd, ds); - if (dp == NULL) { - ret = DB_NOTFOUND; - break; - } - if (keys != NULL && j != 0 && - compare(dbc->dbp, ks[j - 1].key, kp) != 0) { + if (have_keys == 1 && keys != NULL && j != 0 && + compare(dbc->dbp, ks[j - 1].key, + kp, NULL) != 0) { if (kp->data == NULL && F_ISSET(dbp, DB_AM_RECOVER)) goto err; @@ -683,6 +798,24 @@ again: if ((ret = __dbc_get(dbc, &key, &data, err: dbp->p_internal = part; if (ks != NULL) __os_free(env, ks); + + /* + * We only free the original copy of the key array when + * the keys have been setup properly, otherwise we let + * the close function to free the memory. + */ + if (keys != NULL && F_ISSET(part, PART_KEYS_SETUP)) { + for (i = 0; i < part->nparts - 1; i++) + /* + * Always free all entries in the key array and return + * the first error code. 
+ */ + if ((t_ret = __db_dbt_clone_free(env, + &keys[i])) != 0 && ret == 0) + ret = t_ret; + __os_free(env, keys); + } + return (ret); } @@ -1183,6 +1316,15 @@ __partition_close(dbp, txn, flags) ret = t_ret; __os_free(env, part->handles); } + if (!F_ISSET(part, PART_KEYS_SETUP) && part->keys != NULL) { + for (i = 0; i < part->nparts - 1; i++) { + if (part->keys[i].data != NULL && (t_ret = + __db_dbt_clone_free(env, &part->keys[i])) != 0 && + ret == 0) + ret = t_ret; + } + __os_free(env, part->keys); + } if (part->dirs != NULL) __os_free(env, (char **)part->dirs); if (part->data != NULL) @@ -1471,7 +1613,8 @@ __part_fileid_reset(env, ip, fname, nparts, encrypted) if ((ret = __os_malloc(env, strlen(fname) + PART_LEN + 1, &name)) != 0) { - __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(fname) + PART_LEN + 1)); return (ret); } @@ -1747,7 +1890,8 @@ __part_rr(dbp, ip, txn, name, subdb, newname, flags) COMPQUIET(np, NULL); if (newname != NULL && (ret = __os_malloc(env, strlen(newname) + PART_LEN + 1, &np)) != 0) { - __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1); + __db_errx(env, ALLOC_ERR, + (int)(strlen(newname) + PART_LEN + 1)); goto err; } for (i = 0; i < part->nparts; i++, pdbp++) { @@ -1790,6 +1934,32 @@ err: /* } return (ret); } + +/* + * __partc_dup -- + * Duplicate a cursor on a partitioned database. + * + * PUBLIC: int __partc_dup __P((DBC *, DBC *)); + */ +int +__partc_dup(dbc_orig, dbc_n) + DBC *dbc_orig; + DBC *dbc_n; +{ + PART_CURSOR *orig, *new; + + orig = (PART_CURSOR *)dbc_orig->internal; + new = (PART_CURSOR *)dbc_n->internal; + + /* + * A cursor on a partitioned database contains the identifier + * of the underlying database and a regular cursor that points + * to the underlying database. Copy both pieces. + */ + new->part_id = orig->part_id; + + return (__dbc_dup(orig->sub_cursor, &new->sub_cursor, DB_POSITION)); +} #ifdef HAVE_VERIFY /* * __part_verify -- |