summaryrefslogtreecommitdiff
path: root/src/fileops/fop_util.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/fileops/fop_util.c')
-rw-r--r--src/fileops/fop_util.c1841
1 files changed, 1841 insertions, 0 deletions
diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c
new file mode 100644
index 00000000..1925ffd1
--- /dev/null
+++ b/src/fileops/fop_util.c
@@ -0,0 +1,1841 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
+static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
+static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
+ u_int32_t));
+static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
+static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
+ const char *, const char *, const char *, DB_LOCKER *));
+
+/*
+ * GET_ENVLOCK --
+ * Acquire the environment meta-data lock. The parameters are the
+ * environment (ENV), the locker to use in acquiring the lock (ID)
+ * and a pointer to the DB_LOCK that receives the lock (L).
+ *
+ * The lock is only requested when LOCKING_ON(ENV) is true. It is a
+ * DB_LOCK_WRITE lock on an object whose data is the u_int32_t value 1.
+ *
+ * The macro is not self-contained: it assigns the result of __lock_get
+ * to a variable named "ret" and jumps to a label named "err" when that
+ * result is non-zero, so both must exist in the calling function.
+ *
+ * !!!
+ * Turn off locking for Critical Path. The application must do its own
+ * synchronization of open/create. Two threads creating and opening a
+ * file at the same time may have unpredictable results. In that build
+ * (CRITICALPATH_10266 defined) the macro expands to (0) and no lock is
+ * taken.
+ */
+#ifdef CRITICALPATH_10266
+#define GET_ENVLOCK(ENV, ID, L) (0)
+#else
+#define GET_ENVLOCK(ENV, ID, L) do { \
+ DBT __dbt; \
+ u_int32_t __lockval; \
+ \
+ if (LOCKING_ON((ENV))) { \
+ __lockval = 1; \
+ __dbt.data = &__lockval; \
+ __dbt.size = sizeof(__lockval); \
+ if ((ret = __lock_get((ENV), (ID), \
+ 0, &__dbt, DB_LOCK_WRITE, (L))) != 0) \
+ goto err; \
+ } \
+} while (0)
+#endif
+
+/*
+ * RESET_MPF --
+ *	Discard the mpool file handle attached to DB handle D and replace
+ *	it with a freshly created one.  F is the flag value handed to
+ *	__memp_fclose; the result of that close is deliberately ignored.
+ *	DB_AM_OPEN_CALLED is cleared on D.
+ *
+ *	Like GET_ENVLOCK, this macro assigns to the caller's "ret" variable
+ *	and jumps to the caller's "err" label if __memp_fcreate fails.
+ */
+#define	RESET_MPF(D, F) do { \
+	(void)__memp_fclose((D)->mpf, (F)); \
+	F_CLR((D), DB_AM_OPEN_CALLED); \
+	(D)->mpf = NULL; \
+	ret = __memp_fcreate((D)->env, &(D)->mpf); \
+	if (ret != 0) \
+		goto err; \
+} while (0)
+
+/*
+ * CLOSE_HANDLE --
+ *	Close file handle F that was opened on behalf of DB handle D and
+ *	set F to NULL.
+ *
+ * If we open a file handle and our caller is doing fcntl(2) locking,
+ * we can't close the handle because that would discard the caller's
+ * lock. Save it until we close or refresh the DB handle.
+ *
+ * The macro depends on names in the calling function: LF_ISSET (which
+ * presumably tests the caller's "flags" variable -- confirm), the "ret"
+ * and "t_ret" variables and the "err" label.  A close failure is folded
+ * into ret (an earlier error is kept) and control transfers to err.
+ *
+ * F is cleared before that jump.  The err path of __fop_file_setup
+ * itself invokes CLOSE_HANDLE, so leaving F set would hand the very same
+ * handle to __os_closehandle again, and keep doing so for as long as
+ * the close fails.
+ * NOTE(review): this assumes __os_closehandle disposes of the handle
+ * even when it returns an error -- confirm against its implementation.
+ */
+#define	CLOSE_HANDLE(D, F) { \
+	if ((F) != NULL) { \
+		if (LF_ISSET(DB_FCNTL_LOCKING)) \
+			(D)->saved_open_fhp = (F); \
+		else if ((t_ret = \
+		    __os_closehandle((D)->env, (F))) != 0) { \
+			(F) = NULL; \
+			if (ret == 0) \
+				ret = t_ret; \
+			goto err; \
+		} \
+		(F) = NULL; \
+	} \
+}
+
+/*
+ * __fop_lock_handle --
+ *
+ * Get the handle lock for a database. If the envlock is specified, do this
+ * as a lock_vec call that releases the environment lock before acquiring the
+ * handle lock.
+ *
+ * The lock object is built from the handle's fileid and meta_pgno with
+ * type DB_HANDLE_LOCK, and the granted lock is stored in dbp->handle_lock.
+ *
+ * Nothing is locked and 0 is returned when locking is not configured for
+ * the environment or when the handle is flagged DB_AM_COMPENSATE or
+ * DB_AM_RECOVER. While the environment is recovering (and the handle is
+ * not DB2_AM_INTEXCL) no handle lock is taken either; the environment
+ * lock, if one was passed, is simply released and that result returned.
+ *
+ * Parameters:
+ * env - environment the lock is acquired in
+ * dbp - handle to lock; its fileid and meta_pgno are read here
+ * locker - locker that will own the handle lock
+ * mode - lock mode requested for the handle lock
+ * elockp - environment lock to release, or NULL if there is none
+ * flags - lock flags passed on to __lock_get/__lock_vec;
+ * DB_LOCK_IGNORE_REC is added for DB2_AM_INTEXCL handles
+ *
+ * When elockp is given it is reset with LOCK_INIT once the lock_vec call
+ * succeeds (unless it aliases dbp->handle_lock), and also when the call
+ * fails with ereq pointing past the first (put) request.
+ * dbp->cur_locker is set to locker whether or not the lock was granted.
+ *
+ * Returns 0 on success, otherwise the error from the lock subsystem.
+ *
+ * PUBLIC: int __fop_lock_handle __P((ENV *,
+ * PUBLIC: DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
+ */
+int
+__fop_lock_handle(env, dbp, locker, mode, elockp, flags)
+ ENV *env;
+ DB *dbp;
+ DB_LOCKER *locker;
+ db_lockmode_t mode;
+ DB_LOCK *elockp;
+ u_int32_t flags;
+{
+ DBT fileobj;
+ DB_LOCKREQ reqs[2], *ereq;
+ DB_LOCK_ILOCK lock_desc;
+ int ret;
+
+ if (!LOCKING_ON(env) ||
+ F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
+ return (0);
+
+ /*
+ * If we are in recovery, the only locking we should be
+ * doing is on the global environment. The one exception
+ * is if we are opening an exclusive database on a client
+ * syncing with the master.
+ */
+ if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL))
+ return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp));
+
+ memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
+ lock_desc.pgno = dbp->meta_pgno;
+ lock_desc.type = DB_HANDLE_LOCK;
+
+ memset(&fileobj, 0, sizeof(fileobj));
+ fileobj.data = &lock_desc;
+ fileobj.size = sizeof(lock_desc);
+ DB_TEST_SUBLOCKS(env, flags);
+ if (F2_ISSET(dbp, DB2_AM_INTEXCL))
+ flags |= DB_LOCK_IGNORE_REC;
+ if (elockp == NULL)
+ ret = __lock_get(env, locker,
+ flags, &fileobj, mode, &dbp->handle_lock);
+ else {
+ reqs[0].op = DB_LOCK_PUT; /* Drop the environment lock ... */
+ reqs[0].lock = *elockp;
+ reqs[1].op = DB_LOCK_GET; /* ... then take the handle lock. */
+ reqs[1].mode = mode;
+ reqs[1].obj = &fileobj;
+ reqs[1].timeout = 0;
+ if ((ret = __lock_vec(env,
+ locker, flags, reqs, 2, &ereq)) == 0) {
+ dbp->handle_lock = reqs[1].lock;
+ if (elockp != &dbp->handle_lock)
+ LOCK_INIT(*elockp);
+ } else if (ereq != reqs) /* The put itself did not fail. */
+ LOCK_INIT(*elockp);
+ }
+
+ dbp->cur_locker = locker;
+ return (ret);
+}
+
+/*
+ * __fop_file_setup --
+ *
+ * Perform all the needed checking and locking to open up or create a
+ * file.
+ *
+ * There's a reason we don't push this code down into the buffer cache.
+ * The problem is that there's no information external to the file that
+ * we can use as a unique ID. UNIX has dev/inode pairs, but they are
+ * not necessarily unique after reboot, if the file was mounted via NFS.
+ * Windows has similar problems, as the FAT filesystem doesn't maintain
+ * dev/inode numbers across reboot. So, we must get something from the
+ * file we can use to ensure that, even after a reboot, the file we're
+ * joining in the cache is the right file for us to join. The solution
+ * we use is to maintain a file ID that's stored in the database, and
+ * that's why we have to open and read the file before calling into the
+ * buffer cache or obtaining a lock (we use this unique fileid to lock
+ * as well as to identify like files in the cache).
+ *
+ * There are a couple of idiosyncrasies that this code must support, in
+ * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow
+ * DB_TRUNCATE in the presence of transactions, since opening a file with
+ * O_TRUNC will result in data being lost in an unrecoverable fashion.
+ * We also disallow DB_TRUNCATE if locking is enabled, because even in
+ * the presence of locking, we cannot avoid race conditions, so allowing
+ * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more
+ * details.
+ *
+ * However, if you are running with neither locking nor transactions, then
+ * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
+ * regardless of its contents.
+ *
+ * FCNTL locking introduces another set of complications. First, the only
+ * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility
+ * with programs like Sendmail and Postfix. In these cases, the caller may
+ * already have a lock on the file; we need to make sure that any file handles
+ * we open remain open, because if we were to close them, the lock held by the
+ * caller would go away. Furthermore, Sendmail and/or Postfix need the ability
+ * to create databases in empty files. So, when you're doing FCNTL locking,
+ * it's reasonable that you are trying to create a database into a 0-length
+ * file and we allow it, while under normal conditions, we do not create
+ * databases if the files already exist and are not Berkeley DB files.
+ *
+ * Parameters:
+ * dbp - handle being opened; its fileid, handle_lock and, if it has
+ * none yet, its locker are filled in here
+ * ip - thread information, passed through to __db_new_file
+ * txn - enclosing transaction, or NULL
+ * name - file (or in-memory database) name; NULL is treated as a file
+ * that does not exist
+ * mode - file mode for a created on-disk file; 0 selects DB_MODE_660
+ * flags - open flags (DB_CREATE, DB_EXCL, DB_RDONLY, DB_TRUNCATE,
+ * DB_FCNTL_LOCKING, ...)
+ * retidp - set to the id of the child transaction used for an on-disk
+ * create, otherwise to TXN_INVALID
+ *
+ * Returns 0 on success and a non-zero error value on failure. On the
+ * err path the file handle, child transaction, temporary file, locks and
+ * any locker created here are released before returning.
+ *
+ * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
+ * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
+ */
+int
+__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ int mode;
+ u_int32_t flags, *retidp;
+{
+ DBTYPE save_type;
+ DB_FH *fhp;
+ DB_LOCK elock;
+ DB_LOCKER *locker;
+ DB_TXN *stxn;
+ ENV *env;
+ size_t len;
+ APPNAME aflags;
+ u_int32_t dflags, oflags;
+ u_int8_t mbuf[DBMETASIZE];
+ int created_locker, create_ok, ret, retries, t_ret, tmp_created;
+ int was_inval;
+ char *real_name, *real_tmpname, *tmpname;
+ db_lockmode_t lockmode;
+
+ *retidp = TXN_INVALID;
+
+ env = dbp->env;
+ fhp = NULL;
+ LOCK_INIT(elock);
+ stxn = NULL;
+ created_locker = tmp_created = was_inval = 0;
+ real_name = real_tmpname = tmpname = NULL;
+ dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+ aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
+ (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
+ LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);
+
+ ret = 0;
+ retries = 0;
+ save_type = dbp->type;
+ if (F2_ISSET(dbp, DB2_AM_EXCL))
+ lockmode = DB_LOCK_WRITE;
+ else
+ lockmode = DB_LOCK_READ;
+
+ /*
+ * Get a lockerid for this handle. There are paths through queue
+ * rename and remove where this dbp already has a locker, so make
+ * sure we don't clobber it and conflict.
+ */
+ if (LOCKING_ON(env) &&
+ !F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) &&
+ dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ created_locker = 1;
+ }
+ LOCK_INIT(dbp->handle_lock);
+
+ if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) {
+ if ((ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ txn = NULL;
+ }
+
+ locker = txn == NULL ? dbp->locker : txn->locker;
+
+ oflags = 0;
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ real_name = (char *)name;
+ else {
+ /* Get the real backing file name. */
+ if ((ret = __db_appname(env,
+ aflags, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /* Fill in the default file mode. */
+ if (mode == 0)
+ mode = DB_MODE_660;
+
+ if (LF_ISSET(DB_RDONLY))
+ oflags |= DB_OSO_RDONLY;
+ if (LF_ISSET(DB_TRUNCATE))
+ oflags |= DB_OSO_TRUNC;
+ }
+
+ retries = 0;
+ create_ok = LF_ISSET(DB_CREATE);
+ LF_CLR(DB_CREATE);
+
+retry:
+ /*
+ * If we cannot create the file, only retry a few times. We
+ * think we might be in a race with another create, but it could
+ * be that the backup filename exists (that is, is left over from
+ * a previous crash). It is also possible to read the metadata
+ * page while it is being written and fail the checksum.
+ */
+ if (++retries > DB_RETRY) {
+ __db_errx(env, DB_STR_A("0002",
+ "__fop_file_setup: Retry limit (%d) exceeded", "%d"),
+ DB_RETRY);
+ /*
+ * The lock-wait path below jumps back here after a
+ * successful lock release, that is, with ret == 0. Giving
+ * up must never be reported as success, so make sure we
+ * return an error (the same one used when the checksum
+ * retries are exhausted).
+ */
+ if (ret == 0)
+ ret = EINVAL;
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
+ GET_ENVLOCK(env, locker, &elock);
+ if (name == NULL)
+ ret = ENOENT;
+ else if (F_ISSET(dbp, DB_AM_INMEM)) {
+ ret = __env_mpool(dbp, name, flags);
+ /*
+ * We are using __env_mpool as a check for existence.
+ * However, __env_mpool does an actual open and there
+ * are scenarios where the object exists, but cannot be
+ * opened, because our settings don't match those internally.
+ * We need to check for that explicitly. We'll need the
+ * mpool open to read the meta-data page, so we're going to
+ * have to temporarily turn this dbp into an UNKNOWN one.
+ */
+ if (ret == EINVAL) {
+ was_inval = 1;
+ save_type = dbp->type;
+ dbp->type = DB_UNKNOWN;
+ ret = __env_mpool(dbp, name, flags);
+ dbp->type = save_type;
+ }
+ } else
+ ret = __os_exists(env, real_name, NULL);
+
+ if (ret == 0) {
+ /*
+ * If the file exists, there are 5 possible cases:
+ * 1. DB_EXCL was specified so this is an error, unless
+ * this is a file left around after a rename and we
+ * are in the same transaction. This gets decomposed
+ * into several subcases, because we check for various
+ * errors before we know we're in rename.
+ * 2. We are truncating, and it doesn't matter what kind
+ * of file it is, we should open/create it.
+ * 3. It is 0-length, we are not doing transactions (i.e.,
+ * we are sendmail), we should open/create into it.
+ * -- on-disk files only!
+ * 4. Is it a Berkeley DB file and we should simply open it.
+ * 5. It is not a BDB file and we should return an error.
+ */
+
+ /* Open file (if there is one). */
+reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
+ __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
+ goto err;
+
+ /* Case 2: DB_TRUNCATE: we must do the creation in place. */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ if (LF_ISSET(DB_EXCL)) {
+ /* Case 1a: DB_EXCL and DB_TRUNCATE. */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ goto creat2;
+ }
+
+ /*
+ * Cases 1,3-5: we need to read the meta-data page.
+ *
+ * NOTE(review): the in-memory branches of this function leave
+ * with a bare return rather than through err, so the locks and
+ * any locker acquired above are not released on those paths --
+ * confirm that this is intended.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp,
+ txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY);
+ } else {
+ ret = __fop_read_meta(env, real_name, mbuf,
+ sizeof(mbuf), fhp,
+ LF_ISSET(DB_NOERROR) ||
+ (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
+ &len);
+
+ /* Case 3: 0-length, no txns. */
+ if (ret != 0 && len == 0 && txn == NULL) {
+ if (LF_ISSET(DB_EXCL)) {
+ /*
+ * Case 1b: DB_EXCL and
+ * 0-length file exists.
+ */
+ ret = EEXIST;
+ goto err;
+ }
+ tmpname = (char *)name;
+ if (create_ok)
+ goto creat2;
+ goto done;
+ }
+
+ /*
+ * Case 4: This is a valid file. Now check the
+ * checksum and decrypt the file so the file
+ * id can be obtained for the handle lock. Note that
+ * the checksum can fail if the database is being
+ * written (possible because the handle lock has
+ * not been obtained yet). So on checksum fail retry
+ * until the checksum succeeds or the number of
+ * retries is exhausted, then throw an error.
+ */
+ if (ret == 0 && (ret = __db_chk_meta(env, dbp,
+ (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) {
+ if ((t_ret = __ENV_LPUT(env, elock)) != 0) {
+ ret = t_ret;
+ goto err;
+ }
+ /*
+ * Retry unless the number of retries is
+ * exhausted.
+ */
+ if (!(retries < DB_RETRY)) {
+ __db_errx(env, DB_STR_A("0210",
+ "%s: metadata page checksum error", "%s"), real_name);
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ ret = ENOENT;
+ else
+ ret = EINVAL;
+ goto err;
+ }
+ /*
+ * Forget the handle as soon as it has been
+ * closed: the retry path can fail before the
+ * file is reopened, and the cleanup code must
+ * not close it a second time.
+ */
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+ goto retry;
+ }
+ /* Get the file id for the handle lock. */
+ if (ret == 0)
+ memcpy(dbp->fileid,
+ ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
+ }
+
+ /* Case 5: Invalid file. */
+ if (ret != 0)
+ goto err;
+
+ /* Now, get our handle lock. */
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) {
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ } else if (ret != DB_LOCK_NOTGRANTED ||
+ ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) ||
+ F2_ISSET(dbp, DB2_AM_NOWAIT)))
+ goto err;
+ else {
+ PERFMON3(env,
+ race, fop_file_setup, (char *) name, ret, flags);
+ /*
+ * We were unable to acquire the handle lock without
+ * blocking. The fact that we are blocking might mean
+ * that someone else is trying to delete the file.
+ * Since some platforms cannot delete files while they
+ * are open (Windows), we are going to have to close
+ * the file. This would be a problem if we were doing
+ * FCNTL locking, because our closing the handle would
+ * release the FCNTL locks. Fortunately, if we are
+ * doing FCNTL locking, then we should never fail to
+ * acquire our handle lock, so we should never get here.
+ * We assert it here to make sure we aren't destroying
+ * any application level FCNTL semantics.
+ */
+ DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
+ if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Clear fhp even if the close fails. */
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret = __fop_lock_handle(env,
+ dbp, locker, lockmode, &elock, 0)) != 0) {
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ RESET_MPF(dbp, 0);
+ goto err;
+ }
+
+ /*
+ * If we had to wait, we might be waiting on a
+ * dummy file used in create/destroy of a database.
+ * To be sure we have the correct information we
+ * try again.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, 0);
+ MAKE_INMEM(dbp);
+ }
+ if ((ret =
+ __ENV_LPUT(env, dbp->handle_lock)) != 0) {
+ LOCK_INIT(dbp->handle_lock);
+ goto err;
+ }
+ goto retry;
+
+ }
+
+ /*
+ * If we got here, then we have the handle lock, it is now
+ * safe to check the rest of the meta data, since the file
+ * will not be deleted out from under the handle.
+ */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_meta_setup(env, dbp, real_name,
+ (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
+ goto err;
+ }
+
+ /*
+ * Check for a file in the midst of a rename. If we find that
+ * the file is in the midst of a rename, it must be the case
+ * that it is in our current transaction (else we would still
+ * be blocking), so we can continue along and create a new file
+ * with the same name. In that case, we have to close the file
+ * handle because we reuse it below. This is a case where
+ * a 'was_inval' above is OK.
+ */
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ was_inval = 0;
+ if (create_ok) {
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ RESET_MPF(dbp, DB_MPOOL_DISCARD);
+ } else {
+ /*
+ * The create path can fail before
+ * it stores a new handle in fhp, so
+ * don't leave the closed one behind.
+ */
+ ret = __os_closehandle(env, fhp);
+ fhp = NULL;
+ if (ret != 0)
+ goto err;
+ }
+ LF_SET(DB_CREATE);
+ goto create;
+ } else {
+ ret = ENOENT;
+ goto err;
+ }
+ }
+
+ /* If we get here, a was_inval is bad. */
+ if (was_inval) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Now, case 1: check for DB_EXCL, because the file that exists
+ * is not in the middle of a rename, so we have an error. This
+ * is a weird case, but we need to make sure that we don't
+ * continue to hold the handle lock, since technically, we
+ * should not have been allowed to open it.
+ */
+ if (LF_ISSET(DB_EXCL)) {
+ ret = __ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+ if (ret == 0)
+ ret = EEXIST;
+ goto err;
+ }
+ goto done;
+ }
+
+ /* File does not exist. */
+#ifdef HAVE_VXWORKS
+ /*
+ * VxWorks can return file-system specific error codes if the
+ * file does not exist, not ENOENT.
+ */
+ if (!create_ok)
+#else
+ if (!create_ok || ret != ENOENT)
+#endif
+ goto err;
+ LF_SET(DB_CREATE);
+ /*
+ * If we were trying to open a non-existent master database
+ * readonly clear that here.
+ */
+ LF_CLR(DB_RDONLY);
+ F_CLR(dbp, DB_AM_RDONLY);
+ ret = 0;
+
+ /*
+ * We need to create file, which means that we need to set up the file,
+ * the fileid and the locks. Then we need to call the appropriate
+ * routines to create meta-data pages. For in-memory files, we retain
+ * the environment lock, while for on-disk files, we drop the env lock
+ * and create into a temporary.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ (ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+create: if (txn != NULL && IS_REP_CLIENT(env) &&
+ !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ __db_errx(env, DB_STR("0003",
+ "Transactional create on replication client disallowed"));
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if (LOGGING_ON(env) && (ret =
+ __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0)
+ return (ret);
+ if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
+ goto err;
+ if (TXN_ON(env) && txn != NULL &&
+ (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+ goto err;
+ if ((ret = __fop_create(env, stxn, &fhp,
+ tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) {
+ /*
+ * If no transactions, there is a race on creating the
+ * backup file, as the backup file name is the same for
+ * all processes. Wait for the other process to finish
+ * with the name.
+ */
+ if (!TXN_ON(env) && ret == EEXIST) {
+ PERFMON3(env,
+ race, fop_file_setup, tmpname, ret, flags);
+ __os_free(env, tmpname);
+ tmpname = NULL;
+ __os_yield(env, 1, 0);
+ goto retry;
+ }
+ goto err;
+ }
+ tmp_created = 1;
+ }
+
+creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __db_appname(env,
+ aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0)
+ goto err;
+
+ /* Set the pagesize if it isn't yet set. */
+ if (dbp->pgsize == 0 &&
+ (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
+ goto errmsg;
+
+ /* Construct a file_id. */
+ if ((ret =
+ __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
+ goto errmsg;
+ }
+
+ if ((ret = __db_new_file(dbp, ip,
+ F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
+ goto err;
+
+ /* Output the REOPEN record after we create. */
+ if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret =
+ __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to close the handle here on platforms where remove and
+ * rename fail if a handle is open (including Windows).
+ */
+ CLOSE_HANDLE(dbp, fhp);
+
+ /*
+ * Now move the file into place unless we are creating in place (because
+ * we created a database in a file that started out 0-length). If
+ * this is an in-memory file, we may or may not hold the environment
+ * lock depending on how we got here.
+ */
+ if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
+ GET_ENVLOCK(env, locker, &elock);
+
+ if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
+ F_CLR(dbp, DB_AM_IN_RENAME);
+ __txn_remrem(env, txn, real_name);
+ } else if (name == tmpname) {
+ /* We created it in place. */
+ } else if (!F_ISSET(dbp, DB_AM_INMEM) &&
+ __os_exists(env, real_name, NULL) == 0) {
+ /*
+ * Someone managed to create the file; remove our temp
+ * and try to open the file that now exists.
+ */
+ (void)__fop_remove(env, NULL,
+ dbp->fileid, tmpname, &dbp->dirname, aflags, dflags);
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ LOCK_INIT(dbp->handle_lock);
+
+ if (stxn != NULL) {
+ ret = __txn_abort(stxn);
+ stxn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ goto reopen;
+ }
+
+ if (name != NULL && (ret = __fop_lock_handle(env,
+ dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)|
+ (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+ if (tmpname != NULL &&
+ tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
+ name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0)
+ goto err;
+ if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+
+ if (stxn != NULL) {
+ *retidp = stxn->txnid;
+ ret = __txn_commit(stxn, 0);
+ stxn = NULL;
+ } else
+ *retidp = TXN_INVALID;
+
+ if (ret != 0)
+ goto err;
+
+ F_SET(dbp, DB_AM_CREATED);
+
+ if (0) {
+errmsg: __db_err(env, ret, "%s", name);
+
+err: CLOSE_HANDLE(dbp, fhp);
+ if (stxn != NULL)
+ (void)__txn_abort(stxn);
+ if (tmp_created && txn == NULL)
+ (void)__fop_remove(env,
+ NULL, NULL, tmpname, NULL, aflags, dflags);
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ (void)__ENV_LPUT(env, elock);
+ if (created_locker) {
+ (void)__lock_id_free(env, dbp->locker);
+ dbp->locker = NULL;
+ }
+ }
+
+done: /*
+ * Close the handle before releasing the names. CLOSE_HANDLE
+ * jumps back to err when the close fails; err still uses
+ * tmpname and then falls through to here again, so the names
+ * must not have been freed yet (and must be freed only once).
+ */
+ CLOSE_HANDLE(dbp, fhp);
+
+ /*
+ * There are cases where real_name and tmpname take on the
+ * exact same string, so we need to make sure that we do not
+ * free twice.
+ */
+ if (tmpname != NULL && tmpname != name)
+ __os_free(env, tmpname);
+ if (real_name != name && real_name != NULL)
+ __os_free(env, real_name);
+ if (real_tmpname != NULL)
+ __os_free(env, real_tmpname);
+
+ return (ret);
+}
+
+/*
+ * __fop_set_pgsize --
+ *	Pick a default page size for a database handle from the I/O size
+ *	reported for its backing file.
+ *
+ * The size reported by __os_ioinfo is forced into the range 512 bytes to
+ * 16KB, replaced by DB_DEF_IOSIZE if IS_VALID_PAGESIZE still rejects it,
+ * stored in dbp->pgsize, and the handle is flagged DB_AM_PGDEF.
+ *
+ * Returns 0 on success. If __os_ioinfo fails, its error is reported
+ * against the file name and returned, and the handle is left untouched.
+ */
+static int
+__fop_set_pgsize(dbp, fhp, name)
+	DB *dbp;
+	DB_FH *fhp;
+	const char *name;
+{
+	ENV *env;
+	u_int32_t pagesize;
+	int ret;
+
+	env = dbp->env;
+
+	/* Start from the filesystem's preferred I/O size. */
+	ret = __os_ioinfo(env, name, fhp, NULL, NULL, &pagesize);
+	if (ret != 0) {
+		__db_err(env, ret, "%s", name);
+		return (ret);
+	}
+
+	/*
+	 * Keep the default within 512 bytes .. 16KB.  Some filesystems
+	 * report 64K, which would make the default caches rather large.
+	 */
+	if (pagesize < 512)
+		pagesize = 512;
+	else if (pagesize > 16 * 1024)
+		pagesize = 16 * 1024;
+
+	/*
+	 * Page layout relies on a power-of-2 size that is also a multiple
+	 * of the sector size; if what we were given doesn't qualify, fall
+	 * back to the built-in default.
+	 */
+	if (!IS_VALID_PAGESIZE(pagesize))
+		pagesize = DB_DEF_IOSIZE;
+
+	F_SET(dbp, DB_AM_PGDEF);
+	dbp->pgsize = pagesize;
+
+	return (0);
+}
+
+/*
+ * __fop_subdb_setup --
+ *
+ * Subdb setup is significantly simpler than file setup. In terms of
+ * locking, for the duration of the operation/transaction, the locks on
+ * the meta-data page will suffice to protect us from simultaneous operations
+ * on the sub-database. Before we complete the operation though, we'll get a
+ * handle lock on the subdatabase so that no one else can try to remove it
+ * while we've got it open. We use an object that looks like the meta-data
+ * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle
+ * locks.
+ *
+ * The master database is opened here (read-only at first, and reopened
+ * read/write if the master update reports EBADF on a read-only master),
+ * and is always closed again before returning. The subdatabase handle
+ * takes over the master's page size, file id, locker and, when
+ * DB_FCNTL_LOCKING is set, its saved file handle.
+ *
+ * Parameters:
+ * dbp - subdatabase handle being set up
+ * ip - thread information, passed through to the master routines
+ * txn - enclosing transaction, or NULL
+ * mname - name of the master (containing) database
+ * name - subdatabase name; when NULL the master is not updated
+ * mode - file mode passed to __db_master_open
+ * flags - open flags
+ *
+ * Returns 0 on success, otherwise the first error encountered.
+ *
+ * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, const char *, int, u_int32_t));
+ */
+int
+__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *mname, *name;
+ int mode;
+ u_int32_t flags;
+{
+ DB *mdbp;
+ ENV *env;
+ db_lockmode_t lkmode;
+ u_int32_t mflags;
+ int ret, t_ret;
+
+ mdbp = NULL;
+ env = dbp->env;
+
+ mflags = flags | DB_RDONLY;
+retry: if ((ret = __db_master_open(dbp,
+ ip, txn, mname, mflags, mode, &mdbp)) != 0)
+ return (ret);
+ /*
+ * If we created this file, then we need to set the DISCARD flag so
+ * that if we fail in the middle of this routine, we discard from the
+ * mpool any pages that we just created.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED))
+ F_SET(mdbp, DB_AM_DISCARD);
+
+ /*
+ * We are going to close this instance of the master, so we can
+ * steal its handle instead of reopening a handle on the database.
+ */
+ if (LF_ISSET(DB_FCNTL_LOCKING)) {
+ dbp->saved_open_fhp = mdbp->saved_open_fhp;
+ mdbp->saved_open_fhp = NULL;
+ }
+
+ /* Copy the pagesize and set the sub-database flag. */
+ dbp->pgsize = mdbp->pgsize;
+ F_SET(dbp, DB_AM_SUBDB);
+
+ if (name != NULL && (ret = __db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
+ if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) {
+ /*
+ * We need to reopen the master R/W to do the create.
+ *
+ * NOTE(review): if this close fails we jump to err,
+ * which goes on to use mdbp and to close it again --
+ * confirm that mdbp is still usable after a failed
+ * __db_close.
+ */
+ if ((ret = __db_close(mdbp, txn, 0)) != 0)
+ goto err;
+ FLD_CLR(mflags, DB_RDONLY);
+ goto retry;
+ }
+ goto err;
+ }
+
+ /*
+ * Hijack the master's locker ID as well, so that our locks don't
+ * conflict with the master's. Since we're closing the master,
+ * that locker would just have been freed anyway. Once we've gotten
+ * the locker id, we need to acquire the handle lock for this
+ * subdatabase.
+ */
+ dbp->locker = mdbp->locker;
+ mdbp->locker = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
+
+ /*
+ * We copy our fileid from our master so that we all open
+ * the same file in mpool. We'll use the meta-pgno to lock
+ * so that we end up with different handle locks.
+ *
+ * The handle lock is a write lock if this call created the
+ * subdatabase, if DB_WRITEOPEN was requested or if the handle
+ * is exclusive (DB2_AM_EXCL); otherwise it is a read lock.
+ */
+
+ memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
+ lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ||
+ F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __fop_lock_handle(env, dbp,
+ txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
+ NOWAIT_FLAG(txn) |
+ (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
+ goto err;
+
+ if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
+ /*
+ * If there was no transaction and we created this database,
+ * then we need to undo the update of the master database.
+ */
+ if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
+ (void)__db_master_update(mdbp, dbp,
+ ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
+ F_CLR(dbp, DB_AM_CREATED);
+ goto err;
+ }
+
+ /*
+ * XXX
+ * This should have been done at the top of this routine. The problem
+ * is that __db_init_subdb() uses "standard" routines to process the
+ * meta-data page and set information in the DB handle based on it.
+ * Those routines have to deal with swapped pages and will normally set
+ * the DB_AM_SWAP flag. However, we use the master's metadata page and
+ * that has already been swapped, so they get the is-swapped test wrong.
+ */
+ F_CLR(dbp, DB_AM_SWAP);
+ F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));
+
+ /*
+ * In the file create case, these happen in separate places so we have
+ * two different tests. They end up in the same place for subdbs, but
+ * for compatibility with file testing, we put them both here anyway.
+ */
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);
+
+ /*
+ * File exists and we have the appropriate locks; we should now
+ * process a normal open.
+ */
+ if (F_ISSET(mdbp, DB_AM_CREATED)) {
+ F_SET(dbp, DB_AM_CREATED_MSTR);
+ F_CLR(mdbp, DB_AM_DISCARD);
+ }
+
+ if (0) {
+err:
+DB_TEST_RECOVERY_LABEL
+ if (txn == NULL)
+ (void)__ENV_LPUT(env, dbp->handle_lock);
+ }
+
+ /*
+ * The master's handle lock is under the control of the
+ * subdb (it acquired the master's locker). We want to
+ * keep the master's handle lock so that no one can remove
+ * the file while the subdb is open. If we register the
+ * trade event and then invalidate the copy of the lock
+ * in the master's handle, that will accomplish this. However,
+ * before we register this event, we'd better remove any
+ * events that we've already registered for the master.
+ *
+ * This code runs on both the success and the error path; an
+ * error already held in ret is never overwritten below.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
+ /* Unregister old master events. */
+ __txn_remlock(env,
+ txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ /* Now register the new event. */
+ if ((t_ret = __txn_lockevent(env, txn, dbp,
+ &mdbp->handle_lock, dbp->locker == NULL ?
+ mdbp->locker : dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ LOCK_INIT(mdbp->handle_lock);
+
+ /*
+ * If the master was created, we need to sync so that the metadata
+ * page is correct on disk for recovery, since it isn't read through
+ * mpool. If we're opening a subdb in an existing file, we can skip
+ * the sync.
+ */
+ if ((t_ret = __db_close(mdbp, txn,
+ F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __fop_remove_setup --
+ * Open handle appropriately and lock for removal of a database file.
+ *
+ * The file (or in-memory database) is opened under the environment lock,
+ * its meta-data is read to fill in the handle, and a write handle lock is
+ * then acquired. If that lock cannot be granted immediately, the file is
+ * closed, the lock is waited for, the handle is refreshed and the whole
+ * sequence starts over.
+ *
+ * Parameters:
+ * dbp - handle to prepare for the removal
+ * txn - enclosing transaction, or NULL
+ * name - name of the file (or in-memory database) to open
+ * flags - open flags
+ *
+ * Returns 0 on success (for on-disk files the handle is then flagged
+ * DB_AM_DISCARD), ENOENT if the file is in the middle of a rename, or the
+ * error from the failing open, read or lock call.
+ *
+ * PUBLIC: int __fop_remove_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t));
+ */
+int
+__fop_remove_setup(dbp, txn, name, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ DB_LOCK elock;
+ ENV *env;
+ u_int8_t mbuf[DBMETASIZE];
+ int ret;
+
+ COMPQUIET(flags, 0); /* NOTE(review): flags is still read below; confirm this does not reset it. */
+
+ env = dbp->env;
+
+ LOCK_INIT(elock);
+ fhp = NULL;
+ ret = 0;
+
+ /* Create locker if necessary. */
+retry: if (LOCKING_ON(env)) {
+ if (IS_REAL_TXN(txn))
+ dbp->locker = txn->locker;
+ else if (dbp->locker == DB_LOCK_INVALIDID) {
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto err;
+ if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) &&
+ (ret = __lock_addfamilylocker(env,
+ txn->txnid, dbp->locker->id, 1)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * We are about to open a file handle and then possibly close it.
+ * We cannot close handles if we are doing FCNTL locking. However,
+ * there is no way to pass the FCNTL flag into this routine via the
+ * user API. The only way we can get in here and be doing FCNTL
+ * locking is if we are trying to clean up an open that was called
+ * with FCNTL locking. In that case, the saved_open_fhp should
+ * already be set. So, we use that field to tell us if we need to
+ * make sure that we shouldn't close the handle.
+ */
+ fhp = dbp->saved_open_fhp;
+ DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
+
+ /*
+ * Lock environment to protect file open. That will enable us to
+ * read the meta-data page and get the fileid so that we can lock
+ * the handle.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+
+ /* Open database. */
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ if ((ret = __env_mpool(dbp, name, flags)) == 0)
+ ret = __os_strdup(env, name, &dbp->dname);
+ } else if (fhp == NULL)
+ ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
+ if (ret != 0)
+ goto err;
+
+ /* Get meta-data */
+ if (F_ISSET(dbp, DB_AM_INMEM))
+ ret = __fop_inmem_read_meta(
+ dbp, txn, name, flags, DB_CHK_META);
+ else if ((ret = __fop_read_meta(env,
+ name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
+ ret = __db_meta_setup(env, dbp,
+ name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Now, get the handle lock. We first try with NOWAIT, because if
+ * we have to wait, we're going to have to close the file and reopen
+ * it, so that if there is someone else removing it, our open doesn't
+ * prevent that.
+ */
+ if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
+ /*
+ * Close the file, block on the lock, clean up the dbp, and
+ * then start all over again.
+ *
+ * Any error other than DB_LOCK_NOTGRANTED, or a transaction
+ * that asked not to wait (TXN_NOWAIT), ends the attempt
+ * instead of blocking.
+ */
+ if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
+ (void)__os_closehandle(env, fhp);
+ fhp = NULL;
+ }
+ if (ret != DB_LOCK_NOTGRANTED ||
+ (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
+ goto err;
+ else if ((ret = __fop_lock_handle(env,
+ dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+ goto err;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ (void)__lock_put(env, &dbp->handle_lock);
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
+ } else {
+ if (txn != NULL)
+ dbp->locker = NULL;
+ (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
+ }
+ goto retry;
+ } else if ((ret = __ENV_LPUT(env, elock)) != 0)
+ goto err;
+ else if (F_ISSET(dbp, DB_AM_IN_RENAME))
+ ret = ENOENT;
+
+ if (0) {
+err: (void)__ENV_LPUT(env, elock);
+ }
+ if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
+ (void)__os_closehandle(env, fhp);
+ /*
+ * If this is a real file and we are going to proceed with the removal,
+ * then we need to make sure that we don't leave any pages around in the
+ * mpool since the file is closed and will be reopened again before
+ * access. However, this might be an in-memory file, in which case
+ * we will handle the discard from the mpool later as it's the "real"
+ * removal of the database.
+ */
+ if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
+ F_SET(dbp, DB_AM_DISCARD);
+ return (ret);
+}
+
+/*
+ * __fop_read_meta --
+ *	Read "size" bytes from an open file handle into buf; callers use
+ *	this to fetch a file's meta-data page.
+ *
+ *	name is used only in error messages.  If errok is non-zero no
+ *	error message is issued on failure.  If nbytesp is non-NULL it
+ *	receives the number of bytes actually read, even when an error is
+ *	returned.
+ *
+ *	Returns 0 on success, the error from __os_read if the read fails,
+ *	or EINVAL if the byte count read differs from "size".
+ *
+ * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
+ * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *));
+ */
+int
+__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
+	ENV *env;
+	const char *name;
+	u_int8_t *buf;
+	size_t size;
+	DB_FH *fhp;
+	int errok;
+	size_t *nbytesp;
+{
+	size_t nread;
+	int ret;
+
+	/* The caller's count is valid on every path, so start it at zero. */
+	nread = 0;
+	if (nbytesp != NULL)
+		*nbytesp = nread;
+
+	ret = __os_read(env, fhp, buf, size, &nread);
+	if (nbytesp != NULL)
+		*nbytesp = nread;
+
+	/* The read itself failed. */
+	if (ret != 0) {
+		if (!errok)
+			__db_err(env, ret, "%s", name);
+		return (ret);
+	}
+
+	/* A full read is the only successful outcome. */
+	if (nread == size)
+		return (0);
+
+	/* Short read: this cannot be a valid meta-data page. */
+	if (!errok)
+		__db_errx(env, DB_STR_A("0004",
+		    "fop_read_meta: %s: unexpected file type or format",
+		    "%s"), name);
+	return (EINVAL);
+}
+
+/*
+ * __fop_dummy --
+ *	Create a placeholder ("dummy") database under a backup name and swap
+ *	names so that "old" becomes "new" and the placeholder takes over
+ *	"old".  Both remove and rename are built on this; a remove is a
+ *	rename whose final delete is deferred.
+ *
+ *	The work is dispatched to the in-memory or on-disk helpers
+ *	depending on whether dbp has DB_AM_INMEM set.  Returns 0 on success
+ *	or the first error encountered.
+ *
+ * PUBLIC: int __fop_dummy __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *));
+ */
+int
+__fop_dummy(dbp, txn, old, new)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *old, *new;
+{
+	DB *tmpdbp;
+	DB_TXN *stxn;
+	ENV *env;
+	char *back;
+	int ret, t_ret;
+	u_int8_t mbuf[DBMETASIZE];
+
+	back = NULL;
+	stxn = NULL;
+	tmpdbp = NULL;
+	env = dbp->env;
+
+	DB_ASSERT(env, txn != NULL);
+
+	/*
+	 * The rename runs inside a child transaction.  Once it has been
+	 * handed to one of the swap routines, that routine is expected to
+	 * resolve it: commit on success, abort on error.
+	 */
+	if (TXN_ON(env) &&
+	    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
+		goto err;
+
+	/* Name of the placeholder file. */
+	if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
+		goto err;
+
+	/* Handle for the placeholder, matching dbp's durability setting. */
+	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+		goto err;
+	if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+	    (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+
+	/* Create the placeholder itself. */
+	memset(mbuf, 0, sizeof(mbuf));
+	if (F_ISSET(dbp, DB_AM_INMEM))
+		ret = __fop_inmem_dummy(tmpdbp, stxn, back, mbuf);
+	else
+		ret = __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
+	if (ret != 0)
+		goto err;
+
+	/* Swap the names; the child transaction is resolved by the callee. */
+	if (F_ISSET(dbp, DB_AM_INMEM))
+		ret = __fop_inmem_swap(
+		    dbp, tmpdbp, stxn, old, new, back, txn->locker);
+	else
+		ret = __fop_ondisk_swap(
+		    dbp, tmpdbp, stxn, old, new, back, txn->locker);
+	stxn = NULL;
+
+err:	if (stxn != NULL)
+		(void)__txn_abort(stxn);
+	if (tmpdbp != NULL &&
+	    (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (back != NULL)
+		__os_free(env, back);
+	return (ret);
+}
+
+/*
+ * __fop_dbrename --
+ * Do the appropriate file locking and file system operations
+ * to effect a dbrename in the absence of transactions (__fop_dummy
+ * and the subsequent calls in __db_rename do the work for the
+ * transactional case).
+ *
+ * dbp is the handle being renamed; old and new are the current and
+ * target names. For on-disk databases both names are resolved to full
+ * paths with __db_appname; for in-memory databases they are used as
+ * given. The environment lock is held (using dbp->locker) around the
+ * existence check and the __memp_nameop call.
+ *
+ * Returns EEXIST if an on-disk file already exists under the new name,
+ * otherwise the result of __memp_nameop or the first error hit while
+ * resolving names or releasing the environment lock.
+ *
+ * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
+ */
+int
+__fop_dbrename(dbp, old, new)
+ DB *dbp;
+ const char *old, *new;
+{
+ DB_LOCK elock;
+ ENV *env;
+ char *real_new, *real_old;
+ int ret, t_ret;
+
+ env = dbp->env;
+ real_new = NULL;
+ real_old = NULL;
+ LOCK_INIT(elock);
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ real_new = (char *)new;
+ real_old = (char *)old;
+ } else {
+ /* Get full names; these are allocated and freed at err. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
+ goto err;
+ }
+
+ /*
+ * It is an error to rename a file over one that already exists,
+ * as that wouldn't be transaction-safe. We check explicitly
+ * for on-disk files, but the check is left to __memp_nameop for
+ * in-memory ones: ret is forced to ENOENT for them so that the
+ * "exists" test below never fires.
+ */
+ GET_ENVLOCK(env, dbp->locker, &elock);
+ ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
+ __os_exists(env, real_new, NULL);
+
+ if (ret == 0) {
+ ret = EEXIST;
+ __db_errx(env, DB_STR_A("0005",
+ "rename: file %s exists", "%s"), real_new);
+ goto err;
+ }
+
+ ret = __memp_nameop(env,
+ dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
+
+err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
+ __os_free(env, real_old);
+ if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
+ __os_free(env, real_new);
+ return (ret);
+}
+
+/*
+ * __fop_inmem_create --
+ *	Set up dbp as a newly created in-memory database.
+ *
+ *	Marks the handle in-memory, defaults the page size to DB_DEF_IOSIZE
+ *	when none is set, builds dbp->fileid, passes that id to
+ *	__memp_set_fileid and calls __env_mpool.  For a named database,
+ *	when logging is configured (and a txn is supplied, unless built
+ *	with DEBUG_WOP), a __crdel_inmem_create log record is written.
+ *	DB_AM_CREATED is set on success.
+ *
+ *	Returns 0 on success or the first error encountered.
+ */
+static int
+__fop_inmem_create(dbp, name, txn, flags)
+	DB *dbp;
+	const char *name;
+	DB_TXN *txn;
+	u_int32_t flags;
+{
+	DBT fid_dbt, name_dbt;
+	DB_LSN lsn;
+	ENV *env;
+	int ret;
+	int32_t lfid;
+	u_int32_t dflags, *idp;
+
+	env = dbp->env;
+	dflags = 0;
+	if (F_ISSET(dbp, DB_AM_NOT_DURABLE))
+		dflags = DB_LOG_NOT_DURABLE;
+
+	MAKE_INMEM(dbp);
+
+	/* Fall back to the default page size if none was configured. */
+	if (dbp->pgsize == 0)
+		dbp->pgsize = DB_DEF_IOSIZE;
+
+	/*
+	 * Build the file id.
+	 *
+	 * A named database needs an id that serves both for locking and for
+	 * matching in the memory pool, so it is made from a unique id
+	 * (__os_unique_id) in the first 32 bits followed by as much of the
+	 * name as fits in the remaining bytes.  An unnamed database only
+	 * needs an id for locking, so a lock id is used.
+	 */
+	if (name != NULL) {
+		idp = (u_int32_t *)(&dbp->fileid[0]);
+		__os_unique_id(env, idp);
+		(void)strncpy((char *)(idp + 1),
+		    name, DB_FILE_ID_LEN - sizeof(u_int32_t));
+		dbp->preserve_fid = 1;
+
+		/* Keep the logging subsystem's copy of the id in step. */
+		if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+		    txn != NULL &&
+#endif
+		    dbp->log_filename != NULL)
+			memcpy(dbp->log_filename->ufid,
+			    dbp->fileid, DB_FILE_ID_LEN);
+	} else if (LOCKING_ON(env) && (ret =
+	    __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
+		return (ret);
+
+	/* Hand the id to the mpool file handle. */
+	if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+		return (ret);
+
+	if ((ret = __env_mpool(dbp, name, flags)) != 0)
+		return (ret);
+
+	/* Log the creation of a named in-memory database. */
+	if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+	    txn != NULL &&
+#endif
+	    name != NULL) {
+		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+		memset(&fid_dbt, 0, sizeof(fid_dbt));
+		fid_dbt.data = dbp->fileid;
+		fid_dbt.size = DB_FILE_ID_LEN;
+
+		lfid = DB_LOGFILEID_INVALID;
+		if (dbp->log_filename != NULL)
+			lfid = dbp->log_filename->id;
+
+		if ((ret = __crdel_inmem_create_log(env, txn,
+		    &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
+			return (ret);
+	}
+
+	F_SET(dbp, DB_AM_CREATED);
+	return (ret);
+}
+
+/*
+ * __fop_inmem_read_meta --
+ *	Fetch the meta-data page (PGNO_BASE_MD) of an in-memory database
+ *	from the mpool and check it.
+ *
+ *	With DB_CHK_ONLY set in chkflags the page is only verified with
+ *	__db_chk_meta and, on success, its uid is copied into dbp->fileid.
+ *	Otherwise the page is handed to __db_meta_setup.  The page is always
+ *	returned to the mpool; an error from that is reported only if
+ *	nothing failed earlier.
+ */
+static int
+__fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name;
+	u_int32_t flags;
+	u_int32_t chkflags;
+{
+	DBMETA *metap;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+	int ret, t_ret;
+
+	/* Use the transaction's thread info when there is a transaction. */
+	if (txn != NULL) {
+		ip = txn->thread_info;
+	} else {
+		ENV_GET_THREAD_INFO(dbp->env, ip);
+	}
+
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
+		return (ret);
+
+	if (!FLD_ISSET(chkflags, DB_CHK_ONLY))
+		ret = __db_meta_setup(
+		    dbp->env, dbp, name, metap, flags, chkflags);
+	else if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
+		memcpy(dbp->fileid, metap->uid, DB_FILE_ID_LEN);
+
+	/* Release the page; keep the earlier error if there was one. */
+	t_ret = __memp_fput(dbp->mpf, ip, metap, dbp->priority);
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __fop_ondisk_dummy --
+ *	Create the on-disk placeholder file "name" used by rename/remove.
+ *
+ *	The file is created, its file id is obtained with __os_fileid and
+ *	stored in the meta-data buffer together with DB_RENAMEMAGIC, and the
+ *	buffer (DBMETASIZE bytes) is written to the file.  On success the
+ *	id is also copied into dbp->fileid.  mbuf is supplied by the caller.
+ */
+static int
+__fop_ondisk_dummy(dbp, txn, name, mbuf)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name;
+	u_int8_t *mbuf;
+{
+	DBMETA *meta;
+	ENV *env;
+	char *realname;
+	int ret;
+	u_int32_t dflags;
+
+	env = dbp->env;
+	meta = (DBMETA *)mbuf;
+	realname = NULL;
+	dflags = 0;
+	if (F_ISSET(dbp, DB_AM_NOT_DURABLE))
+		dflags = DB_LOG_NOT_DURABLE;
+
+	/* Resolve the path, create the file, then fetch its file id. */
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, name, &dbp->dirname, &realname)) == 0 &&
+	    (ret = __fop_create(env,
+	    txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) == 0 &&
+	    (ret = __os_fileid(env, realname, 1, meta->uid)) == 0) {
+		/* Tag the page as a rename placeholder and write it out. */
+		meta->magic = DB_RENAMEMAGIC;
+		if ((ret = __fop_write(env, txn, name, dbp->dirname,
+		    DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1,
+		    dflags)) == 0)
+			memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
+	}
+
+	if (realname != NULL)
+		__os_free(env, realname);
+
+	return (ret);
+}
+
+/*
+ * __fop_inmem_dummy --
+ *	Create the in-memory placeholder database "name" used by
+ *	rename/remove.
+ *
+ *	The database is created with __fop_inmem_create and its meta-data
+ *	page is fetched (creating it if needed).  A non-zero magic number on
+ *	that page means the database already existed and EEXIST is returned;
+ *	otherwise the page is tagged with DB_RENAMEMAGIC.  In both cases
+ *	dbp->fileid is copied onto the page before it is released.  On
+ *	success the caller's mbuf is tagged with DB_RENAMEMAGIC as well.
+ */
+static int
+__fop_inmem_dummy(dbp, txn, name, mbuf)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name;
+	u_int8_t *mbuf;
+{
+	DBMETA *metap;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+	int ret, t_ret;
+
+	if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
+		return (ret);
+
+	/* Use the transaction's thread info when there is a transaction. */
+	if (txn != NULL) {
+		ip = txn->thread_info;
+	} else {
+		ENV_GET_THREAD_INFO(dbp->env, ip);
+	}
+
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
+	    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
+		return (ret);
+
+	/* A page that already carries a magic number was not just created. */
+	if (metap->magic == 0)
+		metap->magic = DB_RENAMEMAGIC;
+	else
+		ret = EEXIST;
+
+	/* Copy the fileid onto the meta-data page. */
+	memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
+
+	/* On failure release the page at very low priority. */
+	t_ret = __memp_fput(dbp->mpf, ip, metap,
+	    ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW);
+	if (ret == 0 && t_ret != 0)
+		ret = t_ret;
+
+	if (ret == 0)
+		((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
+
+	return (ret);
+}
+
+/*
+ * __fop_ondisk_swap --
+ *	On-disk half of the transactional rename/remove: rename "old" to
+ *	"new" and move the placeholder "back" into "old", all while holding
+ *	the environment (name space) lock.
+ *
+ *	dbp	handle for the database being renamed
+ *	tmpdbp	handle for the placeholder created by __fop_ondisk_dummy
+ *	txn	child transaction; this routine always resolves it (commit
+ *		on the success path, abort on the error path)
+ *	locker	locker used for the environment and handle locks
+ *
+ *	If "new" already exists, the rename is allowed only when the
+ *	existing file is itself a placeholder left by an earlier rename
+ *	(DB_AM_IN_RENAME gets set on tmpdbp) and its handle lock can be
+ *	obtained; otherwise EEXIST is returned.
+ *
+ *	After the child commits, a file_remove record is logged in the
+ *	parent and the delayed delete of the placeholder is registered with
+ *	the parent.  Returns 0 on success or the first error encountered,
+ *	including an error from the child commit or the parent log record.
+ */
+static int
+__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
+	DB *dbp, *tmpdbp;
+	DB_TXN *txn;
+	const char *old, *new, *back;
+	DB_LOCKER *locker;
+{
+	DBT fiddbt, namedbt, tmpdbt;
+	DB_FH *fhp;
+	DB_LOCK elock;
+	DB_LSN lsn;
+	DB_TXN *parent;
+	ENV *env;
+	u_int8_t mbuf[DBMETASIZE];
+	u_int32_t child_txnid, dflags;
+	int ret, t_ret;
+	char *realold, *realnew;
+
+	env = dbp->env;
+	DB_ASSERT(env, txn != NULL);
+	DB_ASSERT(env, old != NULL);
+
+	realold = realnew = NULL;
+	LOCK_INIT(elock);
+	fhp = NULL;
+	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
+
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
+		goto err;
+
+	/* Now, lock the name space while we initialize this file. */
+retry:	GET_ENVLOCK(env, locker, &elock);
+	if (__os_exists(env, realnew, NULL) == 0) {
+		/*
+		 * It is possible that the only reason this file exists is
+		 * because we've done a previous rename of it and we have
+		 * left a placeholder here.  We need to check for that case
+		 * and allow this rename to succeed if that's the case.
+		 */
+		if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
+			goto err;
+		if ((ret = __fop_read_meta(env,
+		    realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
+		    (ret = __db_meta_setup(env,
+		    tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
+			ret = EEXIST;
+			goto err;
+		}
+		ret = __os_closehandle(env, fhp);
+		fhp = NULL;
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * Now, try to acquire the handle lock.  If the handle is
+		 * locked by our current transaction, then we'll get it and
+		 * life is good.
+		 *
+		 * Alternately, it's not locked at all, we'll get the lock,
+		 * but we will realize it exists and consider this an error.
+		 *
+		 * However, if it's held by another transaction, then there
+		 * could be two different scenarios: 1) the file is in the
+		 * midst of being created or deleted and when that transaction
+		 * is over, we might be able to proceed. 2) the file is open
+		 * and exists and we should report an error.  In order to
+		 * distinguish these two cases, we do the following.  First,
+		 * we try to acquire a READLOCK.  If the handle is in the
+		 * midst of being created, then we'll block because a
+		 * writelock is held.  In that case, we should request a
+		 * blocking write, and when we get the lock, we should then go
+		 * back and check to see if the object exists and start all
+		 * over again.
+		 *
+		 * If we got the READLOCK, then either no one is holding the
+		 * lock or someone has an open handle and the fact that the
+		 * file exists is problematic.  So, in this case, we request
+		 * the WRITELOCK non-blocking -- if it succeeds, we're golden.
+		 * If it fails, then the file exists and we return EEXIST.
+		 */
+		if ((ret = __fop_lock_handle(env,
+		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+			/*
+			 * Someone holds a write-lock.  Wait for the write-lock
+			 * and after we get it, release it and start over.
+			 */
+			if ((ret = __fop_lock_handle(env, tmpdbp,
+			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+				goto err;
+			if ((ret =
+			    __lock_put(env, &tmpdbp->handle_lock)) != 0)
+				goto err;
+			if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
+				goto err;
+			goto retry;
+		}
+
+		/* We got the read lock; try to upgrade it. */
+		ret = __fop_lock_handle(env,
+		    tmpdbp, locker, DB_LOCK_WRITE,
+		    NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
+		if (ret != 0) {
+			/*
+			 * We did not get the writelock, so someone
+			 * has the handle open.  This is an error.
+			 */
+			(void)__lock_put(env, &tmpdbp->handle_lock);
+			ret = EEXIST;
+		} else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+			/* We got the lock and are renaming it. */
+			ret = 0;
+		else { /* We got the lock, but the file exists. */
+			(void)__lock_put(env, &tmpdbp->handle_lock);
+			ret = EEXIST;
+		}
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * While we have the namespace locked, do the renames and then
+	 * swap for the handle lock.
+	 */
+	if ((ret = __fop_rename(env, txn,
+	    old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
+		goto err;
+	if ((ret = __fop_rename(env, txn, back, old,
+	    &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
+		goto err;
+	if ((ret = __fop_lock_handle(env,
+	    tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
+		goto err;
+
+	/*
+	 * We just acquired a transactional lock on the tmp handle.
+	 * We need to null out the tmp handle's lock so that it
+	 * doesn't create problems for us in the close path.
+	 */
+	LOCK_INIT(tmpdbp->handle_lock);
+
+	/*
+	 * Commit the child.  From here on "ret" holds the first failure and
+	 * later steps report through t_ret, so that a commit or logging
+	 * error is not overwritten by a later successful call.
+	 */
+	child_txnid = txn->txnid;
+	parent = txn->parent;
+	ret = __txn_commit(txn, 0);
+	txn = NULL;
+
+	/*
+	 * If the new name is available because it was previously renamed
+	 * remove it from the remove list.
+	 */
+	if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+		__txn_remrem(env, parent, realnew);
+
+	/* Now log the child information in the parent. */
+	memset(&fiddbt, 0, sizeof(fiddbt));
+	fiddbt.data = dbp->fileid;
+	fiddbt.size = DB_FILE_ID_LEN;
+	memset(&tmpdbt, 0, sizeof(tmpdbt));
+	tmpdbt.data = tmpdbp->fileid;
+	tmpdbt.size = DB_FILE_ID_LEN;
+	DB_INIT_DBT(namedbt, old, strlen(old) + 1);
+	if ((t_ret = __fop_file_remove_log(env,
+	    parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
+	    (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* This is a delayed delete of the dummy file. */
+	if ((t_ret = __db_appname(env,
+	    DB_APP_DATA, old, &dbp->dirname, &realold)) != 0) {
+		if (ret == 0)
+			ret = t_ret;
+		goto err;
+	}
+
+	if ((t_ret =
+	    __txn_remevent(env, parent, realold, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	if (txn != NULL)	/* Ret must already be set, so void abort. */
+		(void)__txn_abort(txn);
+
+	(void)__ENV_LPUT(env, elock);
+
+	if (fhp != NULL &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (realnew != NULL)
+		__os_free(env, realnew);
+	if (realold != NULL)
+		__os_free(env, realold);
+	return (ret);
+}
+
+/*
+ * __fop_inmem_swap --
+ *	In-memory half of the transactional rename/remove: rename "old" to
+ *	"new" and move the placeholder "back" into "old" inside the mpool,
+ *	all while holding the environment (name space) lock.
+ *
+ *	olddbp	handle for the database being renamed
+ *	backdbp	handle for the placeholder created by __fop_inmem_dummy
+ *	txn	child transaction; must not be NULL.  This routine always
+ *		resolves it: commit on the success path, abort on every
+ *		error path, including failure to create the scratch handle.
+ *	locker	locker used for the environment and handle locks
+ *
+ *	If a database named "new" already exists, the rename is allowed
+ *	only when it is a placeholder left by an earlier rename
+ *	(DB_AM_IN_RENAME); otherwise EEXIST is returned.
+ *
+ *	Returns 0 on success or the first error encountered, including an
+ *	error from the child commit.
+ */
+static int
+__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
+	DB *olddbp, *backdbp;
+	DB_TXN *txn;
+	const char *old, *new, *back;
+	DB_LOCKER *locker;
+{
+	DB *tmpdbp;
+	DBT fid_dbt, n1_dbt, n2_dbt;
+	DB_LOCK elock;
+	DB_LSN lsn;
+	DB_TXN *parent;
+	ENV *env;
+	int ret, t_ret;
+
+	env = olddbp->env;
+	parent = txn->parent;
+retry:	LOCK_INIT(elock);
+	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) {
+		/*
+		 * The caller has handed the child transaction over to us and
+		 * will not touch it again, so resolve it here rather than
+		 * returning with it still open.
+		 */
+		(void)__txn_abort(txn);
+		return (ret);
+	}
+	MAKE_INMEM(tmpdbp);
+
+	GET_ENVLOCK(env, locker, &elock);
+	if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
+		/*
+		 * It is possible that the only reason this database exists is
+		 * because we've done a previous rename of it and we have
+		 * left a placeholder here.  We need to check for that case
+		 * and allow this rename to succeed if that's the case.
+		 */
+
+		if ((ret = __fop_inmem_read_meta(
+		    tmpdbp, txn, new, 0, DB_CHK_META)) != 0) {
+			ret = EEXIST;
+			goto err;
+		}
+
+		/*
+		 * Now, try to acquire the handle lock.  If it's from our txn,
+		 * then we'll get the lock.  If it's not, then someone else has
+		 * it locked.  See the comments in __fop_ondisk_swap for
+		 * details.
+		 */
+		if ((ret = __fop_lock_handle(env,
+		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
+			/*
+			 * Someone holds a writelock.  Try for the WRITELOCK
+			 * and after we get it, retry.
+			 */
+			if ((ret = __fop_lock_handle(env, tmpdbp,
+			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+				goto err;
+
+			/* We have the write lock; release it and start over. */
+			(void)__lock_put(env, &tmpdbp->handle_lock);
+			(void)__db_close(tmpdbp, NULL, DB_NOSYNC);
+			(void)__ENV_LPUT(env, elock);
+			goto retry;
+		} else {
+			(void)__lock_put(env, &tmpdbp->handle_lock);
+			if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
+				ret = EEXIST;
+		}
+		if (ret != 0)
+			goto err;
+	}
+
+	/* Log the renames. */
+	if (LOGGING_ON(env)
+#ifndef DEBUG_WOP
+	    && txn != NULL
+#endif
+	) {
+		/* Rename old to new. */
+		DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
+		DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
+		DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
+		if ((ret = __crdel_inmem_rename_log(
+		    env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
+			goto err;
+
+		/* Rename back to old */
+		fid_dbt.data = backdbp->fileid;
+		DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
+		if ((ret = __crdel_inmem_rename_log(
+		    env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
+			goto err;
+	}
+
+	/*
+	 * While we have the namespace locked, do the renames and then
+	 * swap for the handle lock.  If we ran into a file in the midst
+	 * of rename, then we need to delete it first, else nameop is
+	 * going to consider it an error.
+	 */
+	if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
+		if ((ret = __memp_nameop(env,
+		    tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
+			goto err;
+		__txn_remrem(env, parent, new);
+	}
+
+	if ((ret = __memp_nameop(
+	    env, olddbp->fileid, new, old, new, 1)) != 0)
+		goto err;
+	if ((ret = __memp_nameop(
+	    env, backdbp->fileid, old, back, old, 1)) != 0)
+		goto err;
+
+	if ((ret = __fop_lock_handle(env,
+	    tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
+		goto err;
+
+	/*
+	 * We just acquired a transactional lock on the tmp handle.
+	 * We need to null out the tmp handle's lock so that it
+	 * doesn't create problems for us in the close path.
+	 */
+	LOCK_INIT(tmpdbp->handle_lock);
+
+	DB_ASSERT(env, txn != NULL);
+
+	/*
+	 * Commit the child.  Keep a commit failure in "ret" and report the
+	 * removal through t_ret so that it cannot overwrite that failure.
+	 */
+	ret = __txn_commit(txn, 0);
+	txn = NULL;
+
+	if ((t_ret =
+	    __db_inmem_remove(backdbp, parent, old)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	(void)__ENV_LPUT(env, elock);
+
+	if (txn != NULL)
+		(void)__txn_abort(txn);
+
+	if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}