diff options
Diffstat (limited to 'src/fileops/fop_util.c')
-rw-r--r-- | src/fileops/fop_util.c | 1841 |
1 files changed, 1841 insertions, 0 deletions
diff --git a/src/fileops/fop_util.c b/src/fileops/fop_util.c new file mode 100644 index 00000000..1925ffd1 --- /dev/null +++ b/src/fileops/fop_util.c @@ -0,0 +1,1841 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/hash.h" +#include "dbinc/fop.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __fop_set_pgsize __P((DB *, DB_FH *, const char *)); +static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t)); +static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *)); +static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t, + u_int32_t)); +static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *, + const char *, const char *, const char *, DB_LOCKER *)); +static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *)); +static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *, + const char *, const char *, const char *, DB_LOCKER *)); + +/* + * Acquire the environment meta-data lock. The parameters are the + * environment (ENV), the locker id to use in acquiring the lock (ID) + * and a pointer to a DB_LOCK. + * + * !!! + * Turn off locking for Critical Path. The application must do its own + * synchronization of open/create. Two threads creating and opening a + * file at the same time may have unpredictable results. + */ +#ifdef CRITICALPATH_10266 +#define GET_ENVLOCK(ENV, ID, L) (0) +#else +#define GET_ENVLOCK(ENV, ID, L) do { \ + DBT __dbt; \ + u_int32_t __lockval; \ + \ + if (LOCKING_ON((ENV))) { \ + __lockval = 1; \ + __dbt.data = &__lockval; \ + __dbt.size = sizeof(__lockval); \ + if ((ret = __lock_get((ENV), (ID), \ + 0, &__dbt, DB_LOCK_WRITE, (L))) != 0) \ + goto err; \ + } \ +} while (0) +#endif + +#define RESET_MPF(D, F) do { \ + (void)__memp_fclose((D)->mpf, (F)); \ + (D)->mpf = NULL; \ + F_CLR((D), DB_AM_OPEN_CALLED); \ + if ((ret = __memp_fcreate((D)->env, &(D)->mpf)) != 0) \ + goto err; \ +} while (0) + +/* + * If we open a file handle and our caller is doing fcntl(2) locking, + * we can't close the handle because that would discard the caller's + * lock. Save it until we close or refresh the DB handle. + */ +#define CLOSE_HANDLE(D, F) { \ + if ((F) != NULL) { \ + if (LF_ISSET(DB_FCNTL_LOCKING)) \ + (D)->saved_open_fhp = (F); \ + else if ((t_ret = \ + __os_closehandle((D)->env, (F))) != 0) { \ + if (ret == 0) \ + ret = t_ret; \ + goto err; \ + } \ + (F) = NULL; \ + } \ +} + +/* + * __fop_lock_handle -- + * + * Get the handle lock for a database. If the envlock is specified, do this + * as a lock_vec call that releases the environment lock before acquiring the + * handle lock. + * + * PUBLIC: int __fop_lock_handle __P((ENV *, + * PUBLIC: DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t)); + * + */ +int +__fop_lock_handle(env, dbp, locker, mode, elockp, flags) + ENV *env; + DB *dbp; + DB_LOCKER *locker; + db_lockmode_t mode; + DB_LOCK *elockp; + u_int32_t flags; +{ + DBT fileobj; + DB_LOCKREQ reqs[2], *ereq; + DB_LOCK_ILOCK lock_desc; + int ret; + + if (!LOCKING_ON(env) || + F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER)) + return (0); + + /* + * If we are in recovery, the only locking we should be + * doing is on the global environment. The one exception + * is if we are opening an exclusive database on a client + * syncing with the master. + */ + if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL)) + return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp)); + + memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN); + lock_desc.pgno = dbp->meta_pgno; + lock_desc.type = DB_HANDLE_LOCK; + + memset(&fileobj, 0, sizeof(fileobj)); + fileobj.data = &lock_desc; + fileobj.size = sizeof(lock_desc); + DB_TEST_SUBLOCKS(env, flags); + if (F2_ISSET(dbp, DB2_AM_INTEXCL)) + flags |= DB_LOCK_IGNORE_REC; + if (elockp == NULL) + ret = __lock_get(env, locker, + flags, &fileobj, mode, &dbp->handle_lock); + else { + reqs[0].op = DB_LOCK_PUT; + reqs[0].lock = *elockp; + reqs[1].op = DB_LOCK_GET; + reqs[1].mode = mode; + reqs[1].obj = &fileobj; + reqs[1].timeout = 0; + if ((ret = __lock_vec(env, + locker, flags, reqs, 2, &ereq)) == 0) { + dbp->handle_lock = reqs[1].lock; + if (elockp != &dbp->handle_lock) + LOCK_INIT(*elockp); + } else if (ereq != reqs) + LOCK_INIT(*elockp); + } + + dbp->cur_locker = locker; + return (ret); +} + +/* + * __fop_file_setup -- + * + * Perform all the needed checking and locking to open up or create a + * file. + * + * There's a reason we don't push this code down into the buffer cache. + * The problem is that there's no information external to the file that + * we can use as a unique ID. UNIX has dev/inode pairs, but they are + * not necessarily unique after reboot, if the file was mounted via NFS. + * Windows has similar problems, as the FAT filesystem doesn't maintain + * dev/inode numbers across reboot. So, we must get something from the + * file we can use to ensure that, even after a reboot, the file we're + * joining in the cache is the right file for us to join. The solution + * we use is to maintain a file ID that's stored in the database, and + * that's why we have to open and read the file before calling into the + * buffer cache or obtaining a lock (we use this unique fileid to lock + * as well as to identify like files in the cache). + * + * There are a couple of idiosyncrasies that this code must support, in + * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow + * DB_TRUNCATE in the presence of transactions, since opening a file with + * O_TRUNC will result in data being lost in an unrecoverable fashion. + * We also disallow DB_TRUNCATE if locking is enabled, because even in + * the presence of locking, we cannot avoid race conditions, so allowing + * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more + * details. + * + * However, if you are running with neither locking nor transactions, then + * you can specify DB_TRUNCATE, and if you do so, we will truncate the file + * regardless of its contents. + * + * FCNTL locking introduces another set of complications. First, the only + * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility + * with programs like Sendmail and Postfix. In these cases, the caller may + * already have a lock on the file; we need to make sure that any file handles + * we open remain open, because if we were to close them, the lock held by the + * caller would go away. Furthermore, Sendmail and/or Postfix need the ability + * to create databases in empty files. So, when you're doing FCNTL locking, + * it's reasonable that you are trying to create a database into a 0-length + * file and we allow it, while under normal conditions, we do not create + * databases if the files already exist and are not Berkeley DB files. + * + * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, + * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *)); + */ +int +__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name; + int mode; + u_int32_t flags, *retidp; +{ + DBTYPE save_type; + DB_FH *fhp; + DB_LOCK elock; + DB_LOCKER *locker; + DB_TXN *stxn; + ENV *env; + size_t len; + APPNAME aflags; + u_int32_t dflags, oflags; + u_int8_t mbuf[DBMETASIZE]; + int created_locker, create_ok, ret, retries, t_ret, tmp_created; + int truncating, was_inval; + char *real_name, *real_tmpname, *tmpname; + db_lockmode_t lockmode; + + *retidp = TXN_INVALID; + + env = dbp->env; + fhp = NULL; + LOCK_INIT(elock); + stxn = NULL; + created_locker = tmp_created = truncating = was_inval = 0; + real_name = real_tmpname = tmpname = NULL; + dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; + aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META : + (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA); + LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB); + + ret = 0; + retries = 0; + save_type = dbp->type; + if (F2_ISSET(dbp, DB2_AM_EXCL)) + lockmode = DB_LOCK_WRITE; + else + lockmode = DB_LOCK_READ; + + /* + * Get a lockerid for this handle. There are paths through queue + * rename and remove where this dbp already has a locker, so make + * sure we don't clobber it and conflict. + */ + if (LOCKING_ON(env) && + !F_ISSET(dbp, DB_AM_COMPENSATE) && + !F_ISSET(dbp, DB_AM_RECOVER) && + dbp->locker == DB_LOCK_INVALIDID) { + if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0) + goto err; + created_locker = 1; + } + LOCK_INIT(dbp->handle_lock); + + if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) { + if ((ret = __lock_addfamilylocker(env, + txn->txnid, dbp->locker->id, 1)) != 0) + goto err; + txn = NULL; + } + + locker = txn == NULL ? dbp->locker : txn->locker; + + oflags = 0; + if (F_ISSET(dbp, DB_AM_INMEM)) + real_name = (char *)name; + else { + /* Get the real backing file name. */ + if ((ret = __db_appname(env, + aflags, name, &dbp->dirname, &real_name)) != 0) + goto err; + + /* Fill in the default file mode. */ + if (mode == 0) + mode = DB_MODE_660; + + if (LF_ISSET(DB_RDONLY)) + oflags |= DB_OSO_RDONLY; + if (LF_ISSET(DB_TRUNCATE)) + oflags |= DB_OSO_TRUNC; + } + + retries = 0; + create_ok = LF_ISSET(DB_CREATE); + LF_CLR(DB_CREATE); + +retry: + /* + * If we cannot create the file, only retry a few times. We + * think we might be in a race with another create, but it could + * be that the backup filename exists (that is, is left over from + * a previous crash). It is also possible to read the metadata + * page while it is being written and fail the checksum. + */ + if (++retries > DB_RETRY) { + __db_errx(env, DB_STR_A("0002", + "__fop_file_setup: Retry limit (%d) exceeded", "%d"), + DB_RETRY); + goto err; + } + if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER)) + GET_ENVLOCK(env, locker, &elock); + if (name == NULL) + ret = ENOENT; + else if (F_ISSET(dbp, DB_AM_INMEM)) { + ret = __env_mpool(dbp, name, flags); + /* + * We are using __env_open as a check for existence. + * However, __env_mpool does an actual open and there + * are scenarios where the object exists, but cannot be + * opened, because our settings don't match those internally. + * We need to check for that explicitly. We'll need the + * mpool open to read the meta-data page, so we're going to + * have to temporarily turn this dbp into an UNKNOWN one. + */ + if (ret == EINVAL) { + was_inval = 1; + save_type = dbp->type; + dbp->type = DB_UNKNOWN; + ret = __env_mpool(dbp, name, flags); + dbp->type = save_type; + } + } else + ret = __os_exists(env, real_name, NULL); + + if (ret == 0) { + /* + * If the file exists, there are 5 possible cases: + * 1. DB_EXCL was specified so this is an error, unless + * this is a file left around after a rename and we + * are in the same transaction. This gets decomposed + * into several subcases, because we check for various + * errors before we know we're in rename. + * 2. We are truncating, and it doesn't matter what kind + * of file it is, we should open/create it. + * 3. It is 0-length, we are not doing transactions (i.e., + * we are sendmail), we should open/create into it. + * -- on-disk files only! + * 4. Is it a Berkeley DB file and we should simply open it. + * 5. It is not a BDB file and we should return an error. + */ + + /* Open file (if there is one). */ +reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = + __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0) + goto err; + + /* Case 2: DB_TRUNCATE: we must do the creation in place. */ + if (LF_ISSET(DB_TRUNCATE)) { + if (LF_ISSET(DB_EXCL)) { + /* Case 1a: DB_EXCL and DB_TRUNCATE. */ + ret = EEXIST; + goto err; + } + tmpname = (char *)name; + goto creat2; + } + + /* Cases 1,3-5: we need to read the meta-data page. */ + if (F_ISSET(dbp, DB_AM_INMEM)) { + if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp, + txn, NULL, name, TXN_INVALID)) != 0) + return (ret); + ret = __fop_inmem_read_meta( + dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY); + } else { + ret = __fop_read_meta(env, real_name, mbuf, + sizeof(mbuf), fhp, + LF_ISSET(DB_NOERROR) || + (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0, + &len); + + /* Case 3: 0-length, no txns. */ + if (ret != 0 && len == 0 && txn == NULL) { + if (LF_ISSET(DB_EXCL)) { + /* + * Case 1b: DB_EXCL and + * 0-length file exists. + */ + ret = EEXIST; + goto err; + } + tmpname = (char *)name; + if (create_ok) + goto creat2; + goto done; + } + + /* + * Case 4: This is a valid file. Now check the + * checksum and decrypt the file so the file + * id can be obtained for the handle lock. Note that + * the checksum can fail if the database is being + * written (possible because the handle lock has + * not been obtained yet). So on checksum fail retry + * until the checksum succeeds or the number of + * retries is exhausted, then throw an error. + */ + if (ret == 0 && (ret = __db_chk_meta(env, dbp, + (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) { + if ((t_ret = __ENV_LPUT(env, elock)) != 0) { + ret = t_ret; + goto err; + } + /* + * Retry unless the number of retries is + * exhausted. + */ + if (!(retries < DB_RETRY)) { + __db_errx(env, DB_STR_A("0210", + "%s: metadata page checksum error", "%s"), real_name); + if (F_ISSET(dbp, DB_AM_RECOVER)) + ret = ENOENT; + else + ret = EINVAL; + goto err; + } + if ((ret = __os_closehandle(env, fhp)) != 0) + goto err; + goto retry; + } + /* Get the file id for the handle lock. */ + if (ret == 0) + memcpy(dbp->fileid, + ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN); + } + + /* Case 5: Invalid file. */ + if (ret != 0) + goto err; + + /* Now, get our handle lock. */ + if ((ret = __fop_lock_handle(env, + dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) { + if ((ret = __ENV_LPUT(env, elock)) != 0) + goto err; + } else if (ret != DB_LOCK_NOTGRANTED || + ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) || + F2_ISSET(dbp, DB2_AM_NOWAIT))) + goto err; + else { + PERFMON3(env, + race, fop_file_setup, (char *) name, ret, flags); + /* + * We were unable to acquire the handle lock without + * blocking. The fact that we are blocking might mean + * that someone else is trying to delete the file. + * Since some platforms cannot delete files while they + * are open (Windows), we are going to have to close + * the file. This would be a problem if we were doing + * FCNTL locking, because our closing the handle would + * release the FCNTL locks. Fortunately, if we are + * doing FCNTL locking, then we should never fail to + * acquire our handle lock, so we should never get here. + * We assert it here to make sure we aren't destroying + * any application level FCNTL semantics. + */ + DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING)); + if (!F_ISSET(dbp, DB_AM_INMEM)) { + if ((ret = __os_closehandle(env, fhp)) != 0) + goto err; + fhp = NULL; + } + if ((ret = __fop_lock_handle(env, + dbp, locker, lockmode, &elock, 0)) != 0) { + if (F_ISSET(dbp, DB_AM_INMEM)) + RESET_MPF(dbp, 0); + goto err; + } + + /* + * If we had to wait, we might be waiting on a + * dummy file used in create/destroy of a database. + * To be sure we have the correct information we + * try again. + */ + if (F_ISSET(dbp, DB_AM_INMEM)) { + RESET_MPF(dbp, 0); + MAKE_INMEM(dbp); + } + if ((ret = + __ENV_LPUT(env, dbp->handle_lock)) != 0) { + LOCK_INIT(dbp->handle_lock); + goto err; + } + goto retry; + + } + + /* + * If we got here, then we have the handle lock, it is now + * safe to check the rest of the meta data, since the file + * will not be deleted out from under the handle. + */ + if (F_ISSET(dbp, DB_AM_INMEM)) { + if ((ret = __fop_inmem_read_meta( + dbp, txn, name, flags, DB_SKIP_CHK)) != 0) + goto err; + } else { + if ((ret = __db_meta_setup(env, dbp, real_name, + (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0) + goto err; + } + + /* + * Check for a file in the midst of a rename. If we find that + * the file is in the midst of a rename, it must be the case + * that it is in our current transaction (else we would still + * be blocking), so we can continue along and create a new file + * with the same name. In that case, we have to close the file + * handle because we reuse it below. This is a case where + * a 'was_inval' above is OK. + */ + if (F_ISSET(dbp, DB_AM_IN_RENAME)) { + was_inval = 0; + if (create_ok) { + if (F_ISSET(dbp, DB_AM_INMEM)) { + RESET_MPF(dbp, DB_MPOOL_DISCARD); + } else if ((ret = + __os_closehandle(env, fhp)) != 0) + goto err; + LF_SET(DB_CREATE); + goto create; + } else { + ret = ENOENT; + goto err; + } + } + + /* If we get here, a was_inval is bad. */ + if (was_inval) { + ret = EINVAL; + goto err; + } + + /* + * Now, case 1: check for DB_EXCL, because the file that exists + * is not in the middle of a rename, so we have an error. This + * is a weird case, but we need to make sure that we don't + * continue to hold the handle lock, since technically, we + * should not have been allowed to open it. + */ + if (LF_ISSET(DB_EXCL)) { + ret = __ENV_LPUT(env, dbp->handle_lock); + LOCK_INIT(dbp->handle_lock); + if (ret == 0) + ret = EEXIST; + goto err; + } + goto done; + } + + /* File does not exist. */ +#ifdef HAVE_VXWORKS + /* + * VxWorks can return file-system specific error codes if the + * file does not exist, not ENOENT. + */ + if (!create_ok) +#else + if (!create_ok || ret != ENOENT) +#endif + goto err; + LF_SET(DB_CREATE); + /* + * If we were trying to open a non-existent master database + * readonly clear that here. + */ + LF_CLR(DB_RDONLY); + F_CLR(dbp, DB_AM_RDONLY); + ret = 0; + + /* + * We need to create file, which means that we need to set up the file, + * the fileid and the locks. Then we need to call the appropriate + * routines to create meta-data pages. For in-memory files, we retain + * the environment lock, while for on-disk files, we drop the env lock + * and create into a temporary. + */ + if (!F_ISSET(dbp, DB_AM_INMEM) && + (ret = __ENV_LPUT(env, elock)) != 0) + goto err; + +create: if (txn != NULL && IS_REP_CLIENT(env) && + !F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + __db_errx(env, DB_STR("0003", + "Transactional create on replication client disallowed")); + ret = EINVAL; + goto err; + } + + if (F_ISSET(dbp, DB_AM_INMEM)) { + if (LOGGING_ON(env) && (ret = + __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0) + return (ret); + if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0) + return (ret); + } else { + if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0) + goto err; + if (TXN_ON(env) && txn != NULL && + (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0) + goto err; + if ((ret = __fop_create(env, stxn, &fhp, + tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) { + /* + * If no transactions, there is a race on creating the + * backup file, as the backup file name is the same for + * all processes. Wait for the other process to finish + * with the name. + */ + if (!TXN_ON(env) && ret == EEXIST) { + PERFMON3(env, + race, fop_file_setup, tmpname, ret, flags); + __os_free(env, tmpname); + tmpname = NULL; + __os_yield(env, 1, 0); + goto retry; + } + goto err; + } + tmp_created = 1; + } + +creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) { + if ((ret = __db_appname(env, + aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0) + goto err; + + /* Set the pagesize if it isn't yet set. */ + if (dbp->pgsize == 0 && + (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0) + goto errmsg; + + /* Construct a file_id. */ + if ((ret = + __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0) + goto errmsg; + } + + if ((ret = __db_new_file(dbp, ip, + F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0) + goto err; + + /* Output the REOPEN record after we create. */ + if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret = + __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0) + return (ret); + + /* + * We need to close the handle here on platforms where remove and + * rename fail if a handle is open (including Windows). + */ + CLOSE_HANDLE(dbp, fhp); + + /* + * Now move the file into place unless we are creating in place (because + * we created a database in a file that started out 0-length). If + * this is an in-memory file, we may or may not hold the environment + * lock depending on how we got here. + */ + if (!F_ISSET(dbp, DB_AM_COMPENSATE) && + !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock)) + GET_ENVLOCK(env, locker, &elock); + + if (F_ISSET(dbp, DB_AM_IN_RENAME)) { + F_CLR(dbp, DB_AM_IN_RENAME); + __txn_remrem(env, txn, real_name); + } else if (name == tmpname) { + /* We created it in place. */ + } else if (!F_ISSET(dbp, DB_AM_INMEM) && + __os_exists(env, real_name, NULL) == 0) { + /* + * Someone managed to create the file; remove our temp + * and try to open the file that now exists. + */ + (void)__fop_remove(env, NULL, + dbp->fileid, tmpname, &dbp->dirname, aflags, dflags); + (void)__ENV_LPUT(env, dbp->handle_lock); + LOCK_INIT(dbp->handle_lock); + + if (stxn != NULL) { + ret = __txn_abort(stxn); + stxn = NULL; + } + if (ret != 0) + goto err; + goto reopen; + } + + if (name != NULL && (ret = __fop_lock_handle(env, + dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)| + (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0) + goto err; + if (tmpname != NULL && + tmpname != name && (ret = __fop_rename(env, stxn, tmpname, + name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0) + goto err; + if ((ret = __ENV_LPUT(env, elock)) != 0) + goto err; + + if (stxn != NULL) { + *retidp = stxn->txnid; + ret = __txn_commit(stxn, 0); + stxn = NULL; + } else + *retidp = TXN_INVALID; + + if (ret != 0) + goto err; + + F_SET(dbp, DB_AM_CREATED); + + if (0) { +errmsg: __db_err(env, ret, "%s", name); + +err: CLOSE_HANDLE(dbp, fhp); + if (stxn != NULL) + (void)__txn_abort(stxn); + if (tmp_created && txn == NULL) + (void)__fop_remove(env, + NULL, NULL, tmpname, NULL, aflags, dflags); + if (txn == NULL) + (void)__ENV_LPUT(env, dbp->handle_lock); + (void)__ENV_LPUT(env, elock); + if (created_locker) { + (void)__lock_id_free(env, dbp->locker); + dbp->locker = NULL; + } + } + +done: /* + * There are cases where real_name and tmpname take on the + * exact same string, so we need to make sure that we do not + * free twice. + */ + if (!truncating && tmpname != NULL && tmpname != name) + __os_free(env, tmpname); + if (real_name != name && real_name != NULL) + __os_free(env, real_name); + if (real_tmpname != NULL) + __os_free(env, real_tmpname); + CLOSE_HANDLE(dbp, fhp); + + return (ret); +} + +/* + * __fop_set_pgsize -- + * Set the page size based on file information. + */ +static int +__fop_set_pgsize(dbp, fhp, name) + DB *dbp; + DB_FH *fhp; + const char *name; +{ + ENV *env; + u_int32_t iopsize; + int ret; + + env = dbp->env; + + /* + * Use the filesystem's optimum I/O size as the pagesize if a pagesize + * not specified. Some filesystems have 64K as their optimum I/O size, + * but as that results in fairly large default caches, we limit the + * default pagesize to 16K. + */ + if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) { + __db_err(env, ret, "%s", name); + return (ret); + } + if (iopsize < 512) + iopsize = 512; + if (iopsize > 16 * 1024) + iopsize = 16 * 1024; + + /* + * Sheer paranoia, but we don't want anything that's not a power-of-2 + * (we rely on that for alignment of various types on the pages), and + * we want a multiple of the sector size as well. If the value + * we got out of __os_ioinfo looks bad, use a default instead. + */ + if (!IS_VALID_PAGESIZE(iopsize)) + iopsize = DB_DEF_IOSIZE; + + dbp->pgsize = iopsize; + F_SET(dbp, DB_AM_PGDEF); + + return (0); +} + +/* + * __fop_subdb_setup -- + * + * Subdb setup is significantly simpler than file setup. In terms of + * locking, for the duration of the operation/transaction, the locks on + * the meta-data page will suffice to protect us from simultaneous operations + * on the sub-database. Before we complete the operation though, we'll get a + * handle lock on the subdatabase so that on one else can try to remove it + * while we've got it open. We use an object that looks like the meta-data + * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle. + * locks. + * + * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: const char *, const char *, int, u_int32_t)); + */ +int +__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *mname, *name; + int mode; + u_int32_t flags; +{ + DB *mdbp; + ENV *env; + db_lockmode_t lkmode; + u_int32_t mflags; + int ret, t_ret; + + mdbp = NULL; + env = dbp->env; + + mflags = flags | DB_RDONLY; +retry: if ((ret = __db_master_open(dbp, + ip, txn, mname, mflags, mode, &mdbp)) != 0) + return (ret); + /* + * If we created this file, then we need to set the DISCARD flag so + * that if we fail in the middle of this routine, we discard from the + * mpool any pages that we just created. + */ + if (F_ISSET(mdbp, DB_AM_CREATED)) + F_SET(mdbp, DB_AM_DISCARD); + + /* + * We are going to close this instance of the master, so we can + * steal its handle instead of reopening a handle on the database. + */ + if (LF_ISSET(DB_FCNTL_LOCKING)) { + dbp->saved_open_fhp = mdbp->saved_open_fhp; + mdbp->saved_open_fhp = NULL; + } + + /* Copy the pagesize and set the sub-database flag. */ + dbp->pgsize = mdbp->pgsize; + F_SET(dbp, DB_AM_SUBDB); + + if (name != NULL && (ret = __db_master_update(mdbp, dbp, + ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) { + if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) { + /* We need to reopen the master R/W to do the create. */ + if ((ret = __db_close(mdbp, txn, 0)) != 0) + goto err; + FLD_CLR(mflags, DB_RDONLY); + goto retry; + } + goto err; + } + + /* + * Hijack the master's locker ID as well, so that our locks don't + * conflict with the master's. Since we're closing the master, + * that locker would just have been freed anyway. Once we've gotten + * the locker id, we need to acquire the handle lock for this + * subdatabase. + */ + dbp->locker = mdbp->locker; + mdbp->locker = NULL; + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname); + + /* + * We copy our fileid from our master so that we all open + * the same file in mpool. We'll use the meta-pgno to lock + * so that we end up with different handle locks. + */ + + memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN); + lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) || + F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __fop_lock_handle(env, dbp, + txn == NULL ? dbp->locker : txn->locker, lkmode, NULL, + NOWAIT_FLAG(txn) | + (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0) + goto err; + + if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) { + /* + * If there was no transaction and we created this database, + * then we need to undo the update of the master database. + */ + if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL) + (void)__db_master_update(mdbp, dbp, + ip, txn, name, dbp->type, MU_REMOVE, NULL, 0); + F_CLR(dbp, DB_AM_CREATED); + goto err; + } + + /* + * XXX + * This should have been done at the top of this routine. The problem + * is that __db_init_subdb() uses "standard" routines to process the + * meta-data page and set information in the DB handle based on it. + * Those routines have to deal with swapped pages and will normally set + * the DB_AM_SWAP flag. However, we use the master's metadata page and + * that has already been swapped, so they get the is-swapped test wrong. + */ + F_CLR(dbp, DB_AM_SWAP); + F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP)); + + /* + * In the file create case, these happen in separate places so we have + * two different tests. They end up in the same place for subdbs, but + * for compatibility with file testing, we put them both here anyway. + */ + DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname); + DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname); + + /* + * File exists and we have the appropriate locks; we should now + * process a normal open. + */ + if (F_ISSET(mdbp, DB_AM_CREATED)) { + F_SET(dbp, DB_AM_CREATED_MSTR); + F_CLR(mdbp, DB_AM_DISCARD); + } + + if (0) { +err: +DB_TEST_RECOVERY_LABEL + if (txn == NULL) + (void)__ENV_LPUT(env, dbp->handle_lock); + } + + /* + * The master's handle lock is under the control of the + * subdb (it acquired the master's locker). We want to + * keep the master's handle lock so that no one can remove + * the file while the subdb is open. If we register the + * trade event and then invalidate the copy of the lock + * in the master's handle, that will accomplish this. However, + * before we register this event, we'd better remove any + * events that we've already registered for the master. + */ + if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) { + /* Unregister old master events. */ + __txn_remlock(env, + txn, &mdbp->handle_lock, DB_LOCK_INVALIDID); + + /* Now register the new event. */ + if ((t_ret = __txn_lockevent(env, txn, dbp, + &mdbp->handle_lock, dbp->locker == NULL ? + mdbp->locker : dbp->locker)) != 0 && ret == 0) + ret = t_ret; + } + LOCK_INIT(mdbp->handle_lock); + + /* + * If the master was created, we need to sync so that the metadata + * page is correct on disk for recovery, since it isn't read through + * mpool. If we're opening a subdb in an existing file, we can skip + * the sync. + */ + if ((t_ret = __db_close(mdbp, txn, + F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __fop_remove_setup -- + * Open handle appropriately and lock for removal of a database file. + * + * PUBLIC: int __fop_remove_setup __P((DB *, + * PUBLIC: DB_TXN *, const char *, u_int32_t)); + */ +int +__fop_remove_setup(dbp, txn, name, flags) + DB *dbp; + DB_TXN *txn; + const char *name; + u_int32_t flags; +{ + DB_FH *fhp; + DB_LOCK elock; + ENV *env; + u_int8_t mbuf[DBMETASIZE]; + int ret; + + COMPQUIET(flags, 0); + + env = dbp->env; + + LOCK_INIT(elock); + fhp = NULL; + ret = 0; + + /* Create locker if necessary. */ +retry: if (LOCKING_ON(env)) { + if (IS_REAL_TXN(txn)) + dbp->locker = txn->locker; + else if (dbp->locker == DB_LOCK_INVALIDID) { + if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0) + goto err; + if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) && + (ret = __lock_addfamilylocker(env, + txn->txnid, dbp->locker->id, 1)) != 0) + goto err; + } + } + + /* + * We are about to open a file handle and then possibly close it. + * We cannot close handles if we are doing FCNTL locking. However, + * there is no way to pass the FCNTL flag into this routine via the + * user API. The only way we can get in here and be doing FCNTL + * locking is if we are trying to clean up an open that was called + * with FCNTL locking. In that case, the save_fhp should already be + * set. So, we use that field to tell us if we need to make sure + * that we shouldn't close the handle. + */ + fhp = dbp->saved_open_fhp; + DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL); + + /* + * Lock environment to protect file open. That will enable us to + * read the meta-data page and get the fileid so that we can lock + * the handle. + */ + GET_ENVLOCK(env, dbp->locker, &elock); + + /* Open database. */ + if (F_ISSET(dbp, DB_AM_INMEM)) { + if ((ret = __env_mpool(dbp, name, flags)) == 0) + ret = __os_strdup(env, name, &dbp->dname); + } else if (fhp == NULL) + ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp); + if (ret != 0) + goto err; + + /* Get meta-data */ + if (F_ISSET(dbp, DB_AM_INMEM)) + ret = __fop_inmem_read_meta( + dbp, txn, name, flags, DB_CHK_META); + else if ((ret = __fop_read_meta(env, + name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0) + ret = __db_meta_setup(env, dbp, + name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN); + if (ret != 0) + goto err; + + /* + * Now, get the handle lock. We first try with NOWAIT, because if + * we have to wait, we're going to have to close the file and reopen + * it, so that if there is someone else removing it, our open doesn't + * prevent that. + */ + if ((ret = __fop_lock_handle(env, + dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) { + /* + * Close the file, block on the lock, clean up the dbp, and + * then start all over again. + */ + if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) { + (void)__os_closehandle(env, fhp); + fhp = NULL; + } + if (ret != DB_LOCK_NOTGRANTED || + (txn != NULL && F_ISSET(txn, TXN_NOWAIT))) + goto err; + else if ((ret = __fop_lock_handle(env, + dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0) + goto err; + + if (F_ISSET(dbp, DB_AM_INMEM)) { + (void)__lock_put(env, &dbp->handle_lock); + (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1); + } else { + if (txn != NULL) + dbp->locker = NULL; + (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0); + } + goto retry; + } else if ((ret = __ENV_LPUT(env, elock)) != 0) + goto err; + else if (F_ISSET(dbp, DB_AM_IN_RENAME)) + ret = ENOENT; + + if (0) { +err: (void)__ENV_LPUT(env, elock); + } + if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING)) + (void)__os_closehandle(env, fhp); + /* + * If this is a real file and we are going to proceed with the removal, + * then we need to make sure that we don't leave any pages around in the + * mpool since the file is closed and will be reopened again before + * access. However, this might be an in-memory file, in which case + * we will handle the discard from the mpool later as it's the "real" + * removal of the database. + */ + if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM)) + F_SET(dbp, DB_AM_DISCARD); + return (ret); +} + +/* + * __fop_read_meta -- + * Read the meta-data page from a file and return it in buf. + * + * PUBLIC: int __fop_read_meta __P((ENV *, const char *, + * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *)); + */ +int +__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp) + ENV *env; + const char *name; + u_int8_t *buf; + size_t size; + DB_FH *fhp; + int errok; + size_t *nbytesp; +{ + size_t nr; + int ret; + + /* + * Our caller wants to know the number of bytes read, even if we + * return an error. + */ + if (nbytesp != NULL) + *nbytesp = 0; + + nr = 0; + ret = __os_read(env, fhp, buf, size, &nr); + if (nbytesp != NULL) + *nbytesp = nr; + + if (ret != 0) { + if (!errok) + __db_err(env, ret, "%s", name); + goto err; + } + + if (nr != size) { + if (!errok) + __db_errx(env, DB_STR_A("0004", + "fop_read_meta: %s: unexpected file type or format", + "%s"), name); + ret = EINVAL; + } + +err: + return (ret); +} + +/* + * __fop_dummy -- + * This implements the creation and name swapping of dummy files that + * we use for remove and rename (remove is simply a rename with a delayed + * remove). + * + * PUBLIC: int __fop_dummy __P((DB *, + * PUBLIC: DB_TXN *, const char *, const char *)); + */ +int +__fop_dummy(dbp, txn, old, new) + DB *dbp; + DB_TXN *txn; + const char *old, *new; +{ + DB *tmpdbp; + DB_TXN *stxn; + ENV *env; + char *back; + int ret, t_ret; + u_int8_t mbuf[DBMETASIZE]; + + env = dbp->env; + back = NULL; + stxn = NULL; + tmpdbp = NULL; + + DB_ASSERT(env, txn != NULL); + + /* + * Begin sub transaction to encapsulate the rename. Note that we + * expect the inmem_swap calls to complete the sub-transaction, + * aborting on error and committing on success. + */ + if (TXN_ON(env) && + (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0) + goto err; + + /* We need to create a dummy file as a place holder. */ + if ((ret = __db_backup_name(env, new, stxn, &back)) != 0) + goto err; + /* Create a dummy dbp handle. */ + if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) + goto err; + if (F_ISSET(dbp, DB_AM_NOT_DURABLE) && + (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0) + goto err; + memset(mbuf, 0, sizeof(mbuf)); + ret = F_ISSET(dbp, DB_AM_INMEM) ? + __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) : + __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf); + + if (ret != 0) + goto err; + + ret = F_ISSET(dbp, DB_AM_INMEM) ? + __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) : + __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker); + stxn = NULL; + if (ret != 0) + goto err; + +err: if (stxn != NULL) + (void)__txn_abort(stxn); + if (tmpdbp != NULL && + (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + if (back != NULL) + __os_free(env, back); + return (ret); +} + +/* + * __fop_dbrename -- + * Do the appropriate file locking and file system operations + * to effect a dbrename in the absence of transactions (__fop_dummy + * and the subsequent calls in __db_rename do the work for the + * transactional case). + * + * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *)); + */ +int +__fop_dbrename(dbp, old, new) + DB *dbp; + const char *old, *new; +{ + DB_LOCK elock; + ENV *env; + char *real_new, *real_old; + int ret, t_ret; + + env = dbp->env; + real_new = NULL; + real_old = NULL; + LOCK_INIT(elock); + + if (F_ISSET(dbp, DB_AM_INMEM)) { + real_new = (char *)new; + real_old = (char *)old; + } else { + /* Get full names. */ + if ((ret = __db_appname(env, + DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0) + goto err; + + if ((ret = __db_appname(env, + DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0) + goto err; + } + + /* + * It is an error to rename a file over one that already exists, + * as that wouldn't be transaction-safe. We check explicitly + * for ondisk files, but it's done memp_nameop for in-memory ones. + */ + GET_ENVLOCK(env, dbp->locker, &elock); + ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT : + __os_exists(env, real_new, NULL); + + if (ret == 0) { + ret = EEXIST; + __db_errx(env, DB_STR_A("0005", + "rename: file %s exists", "%s"), real_new); + goto err; + } + + ret = __memp_nameop(env, + dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM)); + +err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0) + ret = t_ret; + if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL) + __os_free(env, real_old); + if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL) + __os_free(env, real_new); + return (ret); +} + +static int +__fop_inmem_create(dbp, name, txn, flags) + DB *dbp; + const char *name; + DB_TXN *txn; + u_int32_t flags; +{ + DBT fid_dbt, name_dbt; + DB_LSN lsn; + ENV *env; + int ret; + int32_t lfid; + u_int32_t dflags, *p32; + + env = dbp->env; + dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; + + MAKE_INMEM(dbp); + + /* Set the pagesize if it isn't yet set. */ + if (dbp->pgsize == 0) + dbp->pgsize = DB_DEF_IOSIZE; + + /* + * Construct a file_id. + * + * If this file has no name, then we only need a fileid for locking. + * If this file has a name, we need the fileid both for locking and + * matching in the memory pool. So, with unnamed in-memory databases, + * use a lock_id. For named in-memory files, we need to find a value + * that we can use to uniquely identify a name/fid pair. We use a + * combination of a unique id (__os_unique_id) and a hash of the + * original name. + */ + if (name == NULL) { + if (LOCKING_ON(env) && (ret = + __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0) + goto err; + } else { + p32 = (u_int32_t *)(&dbp->fileid[0]); + __os_unique_id(env, p32); + p32++; + (void)strncpy( + (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t)); + dbp->preserve_fid = 1; + + if (DBENV_LOGGING(env) && +#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC) + txn != NULL && +#endif + dbp->log_filename != NULL) + memcpy(dbp->log_filename->ufid, + dbp->fileid, DB_FILE_ID_LEN); + } + + /* Now, set the fileid. */ + if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0) + goto err; + + if ((ret = __env_mpool(dbp, name, flags)) != 0) + goto err; + + if (DBENV_LOGGING(env) && +#if !defined(DEBUG_WOP) + txn != NULL && +#endif + name != NULL) { + DB_INIT_DBT(name_dbt, name, strlen(name) + 1); + memset(&fid_dbt, 0, sizeof(fid_dbt)); + fid_dbt.data = dbp->fileid; + fid_dbt.size = DB_FILE_ID_LEN; + lfid = dbp->log_filename == NULL ? + DB_LOGFILEID_INVALID : dbp->log_filename->id; + if ((ret = __crdel_inmem_create_log(env, txn, + &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0) + goto err; + } + + F_SET(dbp, DB_AM_CREATED); + +err: + return (ret); +} + +static int +__fop_inmem_read_meta(dbp, txn, name, flags, chkflags) + DB *dbp; + DB_TXN *txn; + const char *name; + u_int32_t flags; + u_int32_t chkflags; +{ + DBMETA *metap; + DB_THREAD_INFO *ip; + db_pgno_t pgno; + int ret, t_ret; + + if (txn == NULL) + ENV_GET_THREAD_INFO(dbp->env, ip); + else + ip = txn->thread_info; + + pgno = PGNO_BASE_MD; + if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0) + return (ret); + if (FLD_ISSET(chkflags, DB_CHK_ONLY)) { + if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0) + memcpy(dbp->fileid, + ((DBMETA *)metap)->uid, DB_FILE_ID_LEN); + } else + ret = __db_meta_setup( + dbp->env, dbp, name, metap, flags, chkflags); + + if ((t_ret = + __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0) + ret = t_ret; + + return (ret); +} + +static int +__fop_ondisk_dummy(dbp, txn, name, mbuf) + DB *dbp; + DB_TXN *txn; + const char *name; + u_int8_t *mbuf; +{ + ENV *env; + int ret; + char *realname; + u_int32_t dflags; + + realname = NULL; + env = dbp->env; + dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; + + if ((ret = __db_appname(env, + DB_APP_DATA, name, &dbp->dirname, &realname)) != 0) + goto err; + + if ((ret = __fop_create(env, + txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0) + goto err; + + if ((ret = + __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0) + goto err; + + ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC; + if ((ret = __fop_write(env, txn, name, dbp->dirname, + DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0) + goto err; + + memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN); + +err: if (realname != NULL) + __os_free(env, realname); + + return (ret); +} + +static int +__fop_inmem_dummy(dbp, txn, name, mbuf) + DB *dbp; + DB_TXN *txn; + const char *name; + u_int8_t *mbuf; +{ + DBMETA *metap; + DB_THREAD_INFO *ip; + db_pgno_t pgno; + int ret, t_ret; + + if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0) + return (ret); + if (txn == NULL) + ENV_GET_THREAD_INFO(dbp->env, ip); + else + ip = txn->thread_info; + + pgno = PGNO_BASE_MD; + if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, + DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0) + return (ret); + /* Check file existed. */ + if (metap->magic != 0) + ret = EEXIST; + else + metap->magic = DB_RENAMEMAGIC; + + /* Copy the fileid onto the meta-data page. */ + memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN); + + if ((t_ret = __memp_fput(dbp->mpf, ip, metap, + ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0) + goto err; + + ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC; + +err: return (ret); +} + +static int +__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker) + DB *dbp, *tmpdbp; + DB_TXN *txn; + const char *old, *new, *back; + DB_LOCKER *locker; +{ + DBT fiddbt, namedbt, tmpdbt; + DB_FH *fhp; + DB_LOCK elock; + DB_LSN lsn; + DB_TXN *parent; + ENV *env; + u_int8_t mbuf[DBMETASIZE]; + u_int32_t child_txnid, dflags; + int ret, t_ret; + char *realold, *realnew; + + env = dbp->env; + DB_ASSERT(env, txn != NULL); + DB_ASSERT(env, old != NULL); + + realold = realnew = NULL; + LOCK_INIT(elock); + fhp = NULL; + dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; + + if ((ret = __db_appname(env, + DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0) + goto err; + + /* Now, lock the name space while we initialize this file. */ +retry: GET_ENVLOCK(env, locker, &elock); + if (__os_exists(env, realnew, NULL) == 0) { + /* + * It is possible that the only reason this file exists is + * because we've done a previous rename of it and we have + * left a placeholder here. We need to check for that case + * and allow this rename to succeed if that's the case. + */ + if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0) + goto err; + if ((ret = __fop_read_meta(env, + realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 || + (ret = __db_meta_setup(env, + tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) { + ret = EEXIST; + goto err; + } + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0) + goto err; + + /* + * Now, try to acquire the handle lock. If the handle is locked + * by our current, transaction, then we'll get it and life is + * good. + * + * Alternately, it's not locked at all, we'll get the lock, but + * we will realize it exists and consider this an error. + * + * However, if it's held by another transaction, then there + * could be two different scenarios: 1) the file is in the + * midst of being created or deleted and when that transaction + * is over, we might be able to proceed. 2) the file is open + * and exists and we should report an error. In order to + * distinguish these two cases, we do the following. First, we + * try to acquire a READLOCK. If the handle is in the midst of + * being created, then we'll block because a writelock is held. + * In that case, we should request a blocking write, and when we + * get the lock, we should then go back and check to see if the + * object exists and start all over again. + * + * If we got the READLOCK, then either no one is holding the + * lock or someone has an open handle and the fact that the file + * exists is problematic. So, in this case, we request the + * WRITELOCK non-blocking -- if it succeeds, we're golden. If + * it fails, then the file exists and we return EEXIST. + */ + if ((ret = __fop_lock_handle(env, + tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) { + /* + * Someone holds a write-lock. Wait for the write-lock + * and after we get it, release it and start over. + */ + if ((ret = __fop_lock_handle(env, tmpdbp, + locker, DB_LOCK_WRITE, &elock, 0)) != 0) + goto err; + if ((ret = + __lock_put(env, &tmpdbp->handle_lock)) != 0) + goto err; + if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0) + goto err; + goto retry; + } + + /* We got the read lock; try to upgrade it. */ + ret = __fop_lock_handle(env, + tmpdbp, locker, DB_LOCK_WRITE, + NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT); + if (ret != 0) { + /* + * We did not get the writelock, so someone + * has the handle open. This is an error. + */ + (void)__lock_put(env, &tmpdbp->handle_lock); + ret = EEXIST; + } else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) + /* We got the lock and are renaming it. */ + ret = 0; + else { /* We got the lock, but the file exists. */ + (void)__lock_put(env, &tmpdbp->handle_lock); + ret = EEXIST; + } + if (ret != 0) + goto err; + } + + /* + * While we have the namespace locked, do the renames and then + * swap for the handle lock. + */ + if ((ret = __fop_rename(env, txn, + old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0) + goto err; + if ((ret = __fop_rename(env, txn, back, old, + &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0) + goto err; + if ((ret = __fop_lock_handle(env, + tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0) + goto err; + + /* + * We just acquired a transactional lock on the tmp handle. + * We need to null out the tmp handle's lock so that it + * doesn't create problems for us in the close path. + */ + LOCK_INIT(tmpdbp->handle_lock); + + /* Commit the child. */ + child_txnid = txn->txnid; + parent = txn->parent; + ret = __txn_commit(txn, 0); + txn = NULL; + + /* + * If the new name is available because it was previously renamed + * remove it from the remove list. + */ + if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) + __txn_remrem(env, parent, realnew); + + /* Now log the child information in the parent. */ + memset(&fiddbt, 0, sizeof(fiddbt)); + fiddbt.data = dbp->fileid; + fiddbt.size = DB_FILE_ID_LEN; + memset(&tmpdbt, 0, sizeof(fiddbt)); + tmpdbt.data = tmpdbp->fileid; + tmpdbt.size = DB_FILE_ID_LEN; + DB_INIT_DBT(namedbt, old, strlen(old) + 1); + if ((t_ret = __fop_file_remove_log(env, + parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt, + (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0) + ret = t_ret; + + /* This is a delayed delete of the dummy file. */ + if ((ret = __db_appname(env, + DB_APP_DATA, old, &dbp->dirname, &realold)) != 0) + goto err; + + if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0) + goto err; + +err: if (txn != NULL) /* Ret must already be set, so void abort. */ + (void)__txn_abort(txn); + + (void)__ENV_LPUT(env, elock); + + if (fhp != NULL && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + + if (realnew != NULL) + __os_free(env, realnew); + if (realold != NULL) + __os_free(env, realold); + return (ret); +} + +static int +__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker) + DB *olddbp, *backdbp; + DB_TXN *txn; + const char *old, *new, *back; + DB_LOCKER *locker; +{ + DB *tmpdbp; + DBT fid_dbt, n1_dbt, n2_dbt; + DB_LOCK elock; + DB_LSN lsn; + DB_TXN *parent; + ENV *env; + int ret, t_ret; + + env = olddbp->env; + parent = txn->parent; +retry: LOCK_INIT(elock); + if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) + return (ret); + MAKE_INMEM(tmpdbp); + + GET_ENVLOCK(env, locker, &elock); + if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) { + /* + * It is possible that the only reason this database exists is + * because we've done a previous rename of it and we have + * left a placeholder here. We need to check for that case + * and allow this rename to succeed if that's the case. + */ + + if ((ret = __fop_inmem_read_meta( + tmpdbp, txn, new, 0, DB_CHK_META)) != 0) { + ret = EEXIST; + goto err; + } + + /* + * Now, try to acquire the handle lock. If it's from our txn, + * then we'll get the lock. If it's not, then someone else has + * it locked. See the comments in __fop_ondisk_swap for + * details. + */ + if ((ret = __fop_lock_handle(env, + tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) { + /* + * Someone holds a writelock. Try for the WRITELOCK + * and after we get it, retry. + */ + if ((ret = __fop_lock_handle(env, tmpdbp, + locker, DB_LOCK_WRITE, &elock, 0)) != 0) + goto err; + + /* We have the write lock; release it and start over. */ + (void)__lock_put(env, &tmpdbp->handle_lock); + (void)__db_close(tmpdbp, NULL, DB_NOSYNC); + (void)__ENV_LPUT(env, elock); + goto retry; + } else { + (void)__lock_put(env, &tmpdbp->handle_lock); + if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME)) + ret = EEXIST; + } + if (ret != 0) + goto err; + } + + /* Log the renames. */ + if (LOGGING_ON(env) +#ifndef DEBUG_WOP + && txn != NULL +#endif + ) { + /* Rename old to new. */ + DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN); + DB_INIT_DBT(n1_dbt, old, strlen(old) + 1); + DB_INIT_DBT(n2_dbt, new, strlen(new) + 1); + if ((ret = __crdel_inmem_rename_log( + env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0) + goto err; + + /* Rename back to old */ + fid_dbt.data = backdbp->fileid; + DB_SET_DBT(n2_dbt, back, strlen(back) + 1); + if ((ret = __crdel_inmem_rename_log( + env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0) + goto err; + } + + /* + * While we have the namespace locked, do the renames and then + * swap for the handle lock. If we ran into a file in the midst + * of rename, then we need to delete it first, else nameop is + * going to consider it an error. + */ + if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) { + if ((ret = __memp_nameop(env, + tmpdbp->fileid, NULL, new, NULL, 1)) != 0) + goto err; + __txn_remrem(env, parent, new); + } + + if ((ret = __memp_nameop( + env, olddbp->fileid, new, old, new, 1)) != 0) + goto err; + if ((ret = __memp_nameop( + env, backdbp->fileid, old, back, old, 1)) != 0) + goto err; + + if ((ret = __fop_lock_handle(env, + tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0) + goto err; + + /* + * We just acquired a transactional lock on the tmp handle. + * We need to null out the tmp handle's lock so that it + * doesn't create problems for us in the close path. + */ + LOCK_INIT(tmpdbp->handle_lock); + + DB_ASSERT(env, txn != NULL); + + /* Commit the child. */ + ret = __txn_commit(txn, 0); + txn = NULL; + + if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0) + goto err; + +err: (void)__ENV_LPUT(env, elock); + + if (txn != NULL) + (void)__txn_abort(txn); + + if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} |