diff options
Diffstat (limited to 'src/db/partition.c')
-rw-r--r-- | src/db/partition.c | 2059 |
1 files changed, 2059 insertions, 0 deletions
diff --git a/src/db/partition.c b/src/db/partition.c new file mode 100644 index 00000000..f8beaf16 --- /dev/null +++ b/src/db/partition.c @@ -0,0 +1,2059 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#ifdef HAVE_HASH +#include "dbinc/hash.h" +#endif +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/txn.h" +#ifdef HAVE_PARTITION + +static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *, + const char *, const char *, const char *, u_int32_t)); +static int __partc_close __P((DBC *, db_pgno_t, int *)); +static int __partc_del __P((DBC*, u_int32_t)); +static int __partc_destroy __P((DBC*)); +static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t)); +static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __partc_writelock __P((DBC*)); +static int __partition_chk_meta __P((DB *, + DB_THREAD_INFO *, DB_TXN *, u_int32_t)); +static int __partition_setup_keys __P((DBC *, + DB_PARTITION *, DBMETA *, u_int32_t)); +static int __part_key_cmp __P((const void *, const void *)); +static inline void __part_search __P((DB *, + DB_PARTITION *, DBT *, u_int32_t *)); + +static char *Alloc_err = DB_STR_A("0644", + "Partition open failed to allocate %d bytes", "%d"); + +/* + * Allocate a partition cursor and copy flags to the partition cursor. + * Not passed: + * DBC_PARTITIONED -- the subcursors are not. + * DBC_OWN_LID -- the arg dbc owns the lock id. + * DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on + * the whole DB, not the partition. + */ +#define GET_PART_CURSOR(dbc, new_dbc, part_id) do { \ + DB *__part_dbp; \ + __part_dbp = part->handles[part_id]; \ + if ((ret = __db_cursor_int(__part_dbp, \ + (dbc)->thread_info, (dbc)->txn, __part_dbp->type, \ + PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0) \ + goto err; \ + (new_dbc)->flags = (dbc)->flags & \ + ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER); \ +} while (0) + +/* + * Search for the correct partition. + */ +static inline void __part_search(dbp, part, key, part_idp) + DB *dbp; + DB_PARTITION *part; + DBT *key; + u_int32_t *part_idp; +{ + db_indx_t base, indx, limit; + int cmp; + int (*func) __P((DB *, const DBT *, const DBT *)); + + DB_ASSERT(dbp->env, part->nparts != 0); + COMPQUIET(cmp, 0); + COMPQUIET(indx, 0); + + func = ((BTREE *)dbp->bt_internal)->bt_compare; + DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) { + DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX); + cmp = func(dbp, key, &part->keys[indx]); + if (cmp == 0) + break; + if (cmp > 0) + DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX); + } + if (cmp == 0) + *part_idp = indx; + else if ((*part_idp = base) != 0) + (*part_idp)--; +} + +/* + * __partition_init -- + * Initialize the partition structure. + * Called when the meta data page is read in during database open or + * when partition keys or a callback are set. + * + * PUBLIC: int __partition_init __P((DB *, u_int32_t)); + */ +int +__partition_init(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_PARTITION *part; + int ret; + + if ((part = dbp->p_internal) != NULL) { + if ((LF_ISSET(DBMETA_PART_RANGE) && + F_ISSET(part, PART_CALLBACK)) || + (LF_ISSET(DBMETA_PART_CALLBACK) && + F_ISSET(part, PART_RANGE))) { + __db_errx(dbp->env, DB_STR("0645", + "Cannot specify callback and range keys.")); + return (EINVAL); + } + } else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0) + return (ret); + + if (LF_ISSET(DBMETA_PART_RANGE)) + F_SET(part, PART_RANGE); + if (LF_ISSET(DBMETA_PART_CALLBACK)) + F_SET(part, PART_CALLBACK); + dbp->p_internal = part; + /* Set up AM-specific methods that do not require an open. */ + dbp->db_am_rename = __part_rename; + dbp->db_am_remove = __part_remove; + return (0); +} +/* + * __partition_set -- + * Set the partitioning keys or callback function. + * This routine must be called prior to creating the database. + * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *, + * PUBLIC: u_int32_t (*callback)(DB *, DBT *key))); + */ + +int +__partition_set(dbp, parts, keys, callback) + DB *dbp; + u_int32_t parts; + DBT *keys; + u_int32_t (*callback)(DB *, DBT *key); +{ + DB_PARTITION *part; + ENV *env; + int ret; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition"); + env = dbp->dbenv->env; + + if (parts < 2) { + __db_errx(env, DB_STR("0646", + "Must specify at least 2 partitions.")); + return (EINVAL); + } + + if (keys == NULL && callback == NULL) { + __db_errx(env, DB_STR("0647", + "Must specify either keys or a callback.")); + return (EINVAL); + } + if (keys != NULL && callback != NULL) { +bad: __db_errx(env, DB_STR("0648", + "May not specify both keys and a callback.")); + return (EINVAL); + } + + if ((ret = __partition_init(dbp, + keys != NULL ? + DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0) + return (ret); + part = dbp->p_internal; + + if ((part->keys != NULL && callback != NULL) || + (part->callback != NULL && keys != NULL)) + goto bad; + + part->nparts = parts; + part->keys = keys; + part->callback = callback; + + return (0); +} + +/* + * __partition_set_dirs -- + * Set the directories for creating the partition databases. + * They must be in the environment. + * PUBLIC: int __partition_set_dirs __P((DB *, const char **)); + */ +int +__partition_set_dirs(dbp, dirp) + DB *dbp; + const char **dirp; +{ + DB_ENV *dbenv; + DB_PARTITION *part; + ENV *env; + u_int32_t ndirs, slen; + int i, ret; + const char **dir; + char *cp, **part_dirs, **pd; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs"); + dbenv = dbp->dbenv; + env = dbp->env; + + ndirs = 1; + slen = 0; + for (dir = dirp; *dir != NULL; dir++) { + if (F_ISSET(env, ENV_DBLOCAL)) + slen += (u_int32_t)strlen(*dir) + 1; + ndirs++; + } + + slen += sizeof(char *) * ndirs; + if ((ret = __os_malloc(env, slen, &part_dirs)) != 0) + return (EINVAL); + memset(part_dirs, 0, slen); + + cp = (char *) part_dirs + (sizeof(char *) * ndirs); + pd = part_dirs; + for (dir = dirp; *dir != NULL; dir++, pd++) { + if (F_ISSET(env, ENV_DBLOCAL)) { + (void)strcpy(cp, *dir); + *pd = cp; + cp += strlen(*dir) + 1; + continue; + } + for (i = 0; i < dbenv->data_next; i++) + if (strcmp(*dir, dbenv->db_data_dir[i]) == 0) + break; + if (i == dbenv->data_next) { + __db_errx(dbp->env, DB_STR_A("0649", + "Directory not in environment list %s", + "%s"), *dir); + __os_free(env, part_dirs); + return (EINVAL); + } + *pd = dbenv->db_data_dir[i]; + } + + if ((part = dbp->p_internal) == NULL) { + if ((ret = __partition_init(dbp, 0)) != 0) + return (ret); + part = dbp->p_internal; + } + + part->dirs = (const char **)part_dirs; + + return (0); +} + +/* + * __partition_open -- + * Open/create a partitioned database. + * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int)); + */ +int +__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *fname; + DBTYPE type; + u_int32_t flags; + int mode, do_open; +{ + DB *part_db; + DB_PARTITION *part; + DBC *dbc; + ENV *env; + u_int32_t part_id; + int ret; + char *name, *sp; + const char **dirp, *np; + + part = dbp->p_internal; + env = dbp->dbenv->env; + name = NULL; + + if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open) + goto err; + + if ((ret = __os_calloc(env, + part->nparts, sizeof(*part->handles), &part->handles)) != 0) { + __db_errx(env, + Alloc_err, part->nparts * sizeof(*part->handles)); + goto err; + } + + DB_ASSERT(env, fname != NULL); + if ((ret = __os_malloc(env, + strlen(fname) + PART_LEN + 1, &name)) != 0) { + __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + goto err; + } + + sp = name; + np = __db_rpath(fname); + if (np == NULL) + np = fname; + else { + np++; + (void)strncpy(name, fname, (size_t)(np - fname)); + sp = name + (np - fname); + } + + if (F_ISSET(dbp, DB_AM_RECOVER)) + goto done; + dirp = part->dirs; + for (part_id = 0; part_id < part->nparts; part_id++) { + if ((ret = __db_create_internal( + &part->handles[part_id], dbp->env, 0)) != 0) + goto err; + + part_db = part->handles[part_id]; + part_db->flags = F_ISSET(dbp, + ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED)); + F_SET(part_db, DB_AM_PARTDB); + part_db->adj_fileid = dbp->adj_fileid; + part_db->pgsize = dbp->pgsize; + part_db->priority = dbp->priority; + part_db->db_append_recno = dbp->db_append_recno; + part_db->db_feedback = dbp->db_feedback; + part_db->dup_compare = dbp->dup_compare; + part_db->app_private = dbp->app_private; + part_db->api_internal = dbp->api_internal; + + if (dbp->type == DB_BTREE) + __bam_copy_config(dbp, part_db, part->nparts); +#ifdef HAVE_HASH + if (dbp->type == DB_HASH) + __ham_copy_config(dbp, part_db, part->nparts); +#endif + + (void)sprintf(sp, PART_NAME, np, part_id); + if (do_open) { + /* + * Cycle through the directory names passed in, + * if any. + */ + if (dirp != NULL && + (part_db->dirname = *dirp++) == NULL) { + part_db->dirname = *(dirp = part->dirs); + dirp++; + } + if ((ret = __db_open(part_db, ip, txn, + name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0) + goto err; + } else if ((ret = __os_strdup(env, name, &part_db->fname)) != 0) + goto err; + } + + /* Get rid of the cursor used to open the database its the wrong type */ +done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + if ((ret = __dbc_destroy(dbc)) != 0) + break; + + if (0) { +err: (void)__partition_close(dbp, txn, 0); + } + if (name != NULL) + __os_free(env, name); + return (ret); +} + +/* + * __partition_chk_meta -- + * Check for a consistent meta data page and parameters when opening a + * partitioned database. + */ +static int +__partition_chk_meta(dbp, ip, txn, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + u_int32_t flags; +{ + DBMETA *meta; + DB_PARTITION *part; + DBC *dbc; + DB_LOCK metalock; + DB_MPOOLFILE *mpf; + ENV *env; + db_pgno_t base_pgno; + int ret, t_ret; + + dbc = NULL; + meta = NULL; + LOCK_INIT(metalock); + part = dbp->p_internal; + mpf = dbp->mpf; + env = dbp->env; + ret = 0; + + /* Get a cursor on the main db. */ + dbp->p_internal = NULL; + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + goto err; + + /* Get the metadata page. */ + base_pgno = PGNO_BASE_MD; + if ((ret = + __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0) + goto err; + + if (meta->magic != DB_HASHMAGIC && + (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) { + __db_errx(env, DB_STR("0650", + "Partitioning may only specified on BTREE and HASH databases.")); + ret = EINVAL; + goto err; + } + if (!FLD_ISSET(meta->metaflags, + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) { + __db_errx(env, DB_STR("0651", + "Partitioning specified on a non-partitioned database.")); + ret = EINVAL; + goto err; + } + + if ((F_ISSET(part, PART_RANGE) && + FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) || + (F_ISSET(part, PART_CALLBACK) && + FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) { + __db_errx(env, DB_STR("0652", + "Incompatible partitioning specified.")); + ret = EINVAL; + goto err; + } + + if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) && + part->callback == NULL && !IS_RECOVERING(env) && + !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) { + __db_errx(env, DB_STR("0653", + "Partition callback not specified.")); + ret = EINVAL; + goto err; + } + + if (F_ISSET(dbp, DB_AM_RECNUM)) { + __db_errx(env, DB_STR("0654", + "Record numbers are not supported in partitioned databases.")); + ret = EINVAL; + goto err; + } + + if (part->nparts == 0) { + if (LF_ISSET(DB_CREATE) && meta->nparts == 0) { + __db_errx(env, DB_STR("0655", + "Zero paritions specified.")); + ret = EINVAL; + goto err; + } else + part->nparts = meta->nparts; + } else if (meta->nparts != 0 && part->nparts != meta->nparts) { + __db_errx(env, DB_STR("0656", + "Number of partitions does not match.")); + ret = EINVAL; + goto err; + } + + if (meta->magic == DB_HASHMAGIC) { + if (!F_ISSET(part, PART_CALLBACK)) { + __db_errx(env, DB_STR("0657", + "Hash database must specify a partition callback.")); + ret = EINVAL; + } + } else if (meta->magic != DB_BTREEMAGIC) { + __db_errx(env, DB_STR("0658", + "Partitioning only supported on BTREE nad HASH.")); + ret = EINVAL; + } else + ret = __partition_setup_keys(dbc, part, meta, flags); + +err: /* Put the metadata page back. */ + if (meta != NULL && (t_ret = __memp_fput(mpf, + ip, meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + dbp->p_internal = part; + return (ret); +} + +/* + * Support for sorting keys. Keys must be sorted using the btree + * compare function so if we call qsort in __partition_setup_keys + * we use this structure to pass the DBP and compare function. + */ +struct key_sort { + DB *dbp; + DBT *key; + int (*compare) __P((DB *, const DBT *, const DBT *)); +}; + +static int __part_key_cmp(a, b) + const void *a, *b; +{ + const struct key_sort *ka, *kb; + + ka = a; + kb = b; + return (ka->compare(ka->dbp, ka->key, kb->key)); +} +/* + * __partition_setup_keys -- + * Get the partition keys into memory, or put them to disk if we + * are creating a partitioned database. + */ +static int +__partition_setup_keys(dbc, part, meta, flags) + DBC *dbc; + DB_PARTITION *part; + DBMETA *meta; + u_int32_t flags; +{ + BTREE *t; + DB *dbp; + DBT data, key, *keys, *kp; + ENV *env; + u_int32_t ds, i, j; + u_int8_t *dd; + struct key_sort *ks; + int have_keys, ret; + int (*compare) __P((DB *, const DBT *, const DBT *)); + void *dp; + + COMPQUIET(dd, NULL); + COMPQUIET(ds, 0); + memset(&data, 0, sizeof(data)); + memset(&key, 0, sizeof(key)); + ks = NULL; + + dbp = dbc->dbp; + env = dbp->env; + + /* Need to just read the main database. */ + dbp->p_internal = NULL; + have_keys = 0; + + /* First verify that things what we expect. */ + if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) { + if (ret != DB_NOTFOUND) + goto err; + if (F_ISSET(part, PART_CALLBACK)) { + ret = 0; + goto done; + } + if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) && + !LF_ISSET(DB_RDWRMASTER)) { + __db_errx(env, DB_STR("0659", "No range keys found.")); + ret = EINVAL; + goto err; + } + } else { + if (F_ISSET(part, PART_CALLBACK)) { + __db_errx(env, DB_STR("0660", + "Keys found and callback set.")); + ret = EINVAL; + goto err; + } + if (key.size != 0) { + __db_errx(env, DB_STR("0661", + "Partition key 0 is not empty.")); + ret = EINVAL; + goto err; + } + have_keys = 1; + } + + if (LF_ISSET(DB_CREATE) && have_keys == 0) { + /* Insert the keys into the master database. */ + for (i = 0; i < part->nparts - 1; i++) { + if ((ret = __db_put(dbp, dbc->thread_info, + dbc->txn, &part->keys[i], &data, 0)) != 0) + goto err; + } + + /* + * Insert the "0" pointer. All records less than the first + * given key go into this partition. We must use the default + * compare to insert this key, otherwise it might not be first. + */ + t = dbc->dbp->bt_internal; + compare = t->bt_compare; + t->bt_compare = __bam_defcmp; + memset(&key, 0, sizeof(key)); + ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0); + t->bt_compare = compare; + if (ret != 0) + goto err; + } +done: if (F_ISSET(part, PART_RANGE)) { + /* + * Allocate one page to hold the keys plus space at the + * end of the buffer to put an array of DBTs. If there + * is not enough space __dbc_get will return how much + * is needed and we realloc. + */ + if ((ret = __os_malloc(env, + meta->pagesize + (sizeof(DBT) * part->nparts), + &part->data)) != 0) { + __db_errx(env, Alloc_err, meta->pagesize); + goto err; + } + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + data.data = part->data; + data.ulen = meta->pagesize; + data.flags = DB_DBT_USERMEM; +again: if ((ret = __dbc_get(dbc, &key, &data, + DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) { + if ((ret = __os_realloc(env, + data.size + (sizeof(DBT) * part->nparts), + &part->data)) != 0) + goto err; + data.data = part->data; + data.ulen = data.size; + goto again; + } + if (ret == 0) { + /* + * They passed in keys, they must match. + */ + keys = NULL; + compare = NULL; + if (have_keys == 1 && (keys = part->keys) != NULL) { + t = dbc->dbp->bt_internal; + compare = t->bt_compare; + if ((ret = __os_malloc(env, (part->nparts - 1) + * sizeof(struct key_sort), &ks)) != 0) + goto err; + for (j = 0; j < part->nparts - 1; j++) { + ks[j].dbp = dbc->dbp; + ks[j].compare = compare; + ks[j].key = &keys[j]; + } + + qsort(ks, (size_t)part->nparts - 1, + sizeof(struct key_sort), __part_key_cmp); + } + DB_MULTIPLE_INIT(dp, &data); + part->keys = (DBT *) + ((u_int8_t *)part->data + data.size); + j = 0; + for (kp = part->keys; + kp < &part->keys[part->nparts]; kp++, j++) { + DB_MULTIPLE_KEY_NEXT(dp, + &data, kp->data, kp->size, dd, ds); + if (dp == NULL) { + ret = DB_NOTFOUND; + break; + } + if (keys != NULL && j != 0 && + compare(dbc->dbp, ks[j - 1].key, kp) != 0) { + if (kp->data == NULL && + F_ISSET(dbp, DB_AM_RECOVER)) + goto err; + __db_errx(env, DB_STR_A("0662", + "Partition key %d does not match", + "%d"), j); + ret = EINVAL; + goto err; + } + } + } + } + if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER)) + ret = 0; + +err: dbp->p_internal = part; + if (ks != NULL) + __os_free(env, ks); + return (ret); +} + +/* + * __partition_get_callback -- + * Get the partition callback function. + * PUBLIC: int __partition_get_callback __P((DB *, + * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key))); + */ +int +__partition_get_callback(dbp, parts, callback) + DB *dbp; + u_int32_t *parts; + u_int32_t (**callback)(DB *, DBT *key); +{ + DB_PARTITION *part; + + part = dbp->p_internal; + /* Only return populated results if partitioned using callbacks. */ + if (part != NULL && !F_ISSET(part, PART_CALLBACK)) + part = NULL; + if (parts != NULL) + *parts = (part != NULL ? part->nparts : 0); + if (callback != NULL) + *callback = (part != NULL ? part->callback : NULL); + + return (0); +} + +/* + * __partition_get_keys -- + * Get partition keys. + * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **)); + */ +int +__partition_get_keys(dbp, parts, keys) + DB *dbp; + u_int32_t *parts; + DBT **keys; +{ + DB_PARTITION *part; + + part = dbp->p_internal; + /* Only return populated results if partitioned using ranges. */ + if (part != NULL && !F_ISSET(part, PART_RANGE)) + part = NULL; + if (parts != NULL) + *parts = (part != NULL ? part->nparts : 0); + if (keys != NULL) + *keys = (part != NULL ? &part->keys[1] : NULL); + + return (0); +} + +/* + * __partition_get_dirs -- + * Get partition dirs. + * PUBLIC: int __partition_get_dirs __P((DB *, const char ***)); + */ +int +__partition_get_dirs(dbp, dirpp) + DB *dbp; + const char ***dirpp; +{ + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret; + + env = dbp->env; + if ((part = dbp->p_internal) == NULL) { + *dirpp = NULL; + return (0); + } + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) { + *dirpp = part->dirs; + return (0); + } + + /* + * We build a list once when asked. The original directory list, + * if any, was discarded at open time. + */ + if ((*dirpp = part->dirs) != NULL) + return (0); + + if ((ret = __os_calloc(env, + sizeof(char *), part->nparts + 1, (void *) &part->dirs)) != 0) + return (ret); + + for (i = 0; i < part->nparts; i++) + part->dirs[i] = part->handles[i]->dirname; + + *dirpp = part->dirs; + return (0); +} + +/* + * __partc_init -- + * Initialize the access private portion of a cursor + * + * PUBLIC: int __partc_init __P((DBC *)); + */ +int +__partc_init(dbc) + DBC *dbc; +{ + ENV *env; + int ret; + + env = dbc->env; + + /* Allocate/initialize the internal structure. */ + if (dbc->internal == NULL && (ret = + __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0) + return (ret); + + /* Initialize methods. */ + dbc->close = dbc->c_close = __dbc_close_pp; + dbc->cmp = __dbc_cmp_pp; + dbc->count = dbc->c_count = __dbc_count_pp; + dbc->del = dbc->c_del = __dbc_del_pp; + dbc->dup = dbc->c_dup = __dbc_dup_pp; + dbc->get = dbc->c_get = __partc_get_pp; + dbc->pget = dbc->c_pget = __dbc_pget_pp; + dbc->put = dbc->c_put = __dbc_put_pp; + dbc->am_bulk = NULL; + dbc->am_close = __partc_close; + dbc->am_del = __partc_del; + dbc->am_destroy = __partc_destroy; + dbc->am_get = NULL; + dbc->am_put = __partc_put; + dbc->am_writelock = __partc_writelock; + + /* We avoid swapping partition cursors since we swap the sub cursors */ + F_SET(dbc, DBC_PARTITIONED); + + return (0); +} +/* + * __partc_get_pp -- + * cursor get opeartion on a partitioned database. + */ +static int +__partc_get_pp(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ignore_lease, ret; + + dbp = dbc->dbp; + env = dbp->env; + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); + + ret = __partc_get(dbc, key, data, flags); + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} +/* + * __partition_get -- + * cursor get opeartion on a partitioned database. + * + * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t)); + */ +int +__partc_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DBC *orig_dbc, *new_dbc; + DB_PARTITION *part; + PART_CURSOR *cp; + u_int32_t multi, part_id; + int ret, retry, search; + + dbp = dbc->dbp; + cp = (PART_CURSOR*)dbc->internal; + orig_dbc = cp->sub_cursor; + part = dbp->p_internal; + + new_dbc = NULL; + retry = search = 0; + part_id = cp->part_id; + multi = flags & ~DB_OPFLAGS_MASK; + + switch (flags & DB_OPFLAGS_MASK) { + case DB_CURRENT: + break; + case DB_FIRST: + part_id = 0; + retry = 1; + break; + case DB_GET_BOTH: + case DB_GET_BOTHC: + case DB_GET_BOTH_RANGE: + search = 1; + break; + case DB_SET_RANGE: + search = 1; + retry = 1; + break; + case DB_LAST: + part_id = part->nparts - 1; + retry = 1; + break; + case DB_NEXT: + case DB_NEXT_NODUP: + if (orig_dbc == NULL) + part_id = 0; + else + part_id = cp->part_id; + retry = 1; + break; + case DB_NEXT_DUP: + break; + case DB_PREV: + case DB_PREV_NODUP: + if (orig_dbc == NULL) + part_id = part->nparts - 1; + else + part_id = cp->part_id; + retry = 1; + break; + case DB_PREV_DUP: + break; + case DB_SET: + search = 1; + break; + default: + return (__db_unknown_flag(dbp->env, "__partc_get", flags)); + } + + /* + * If we need to find the partition to start on, then + * do a binary search of the in memory partition table. + */ + if (search == 1 && F_ISSET(part, PART_CALLBACK)) + part_id = part->callback(dbp, key) % part->nparts; + else if (search == 1) + __part_search(dbp, part, key, &part_id); + + /* Get a new cursor if necessary */ + if (orig_dbc == NULL || cp->part_id != part_id) { + GET_PART_CURSOR(dbc, new_dbc, part_id); + } else + new_dbc = orig_dbc; + + while ((ret = __dbc_get(new_dbc, + key, data, flags)) == DB_NOTFOUND && retry == 1) { + switch (flags & DB_OPFLAGS_MASK) { + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_NODUP: + case DB_SET_RANGE: + if (++part_id < part->nparts) { + flags = DB_FIRST | multi; + break; + } + goto err; + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + if (part_id-- > 0) { + flags = DB_LAST | multi; + break; + } + goto err; + default: + goto err; + } + + if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0) + goto err; + GET_PART_CURSOR(dbc, new_dbc, part_id); + } + + if (ret != 0) + goto err; + + /* Success: swap original and new cursors. */ + if (new_dbc != orig_dbc) { + if (orig_dbc != NULL) { + cp->sub_cursor = NULL; + if ((ret = __dbc_close(orig_dbc)) != 0) + goto err; + } + cp->sub_cursor = new_dbc; + cp->part_id = part_id; + } + + return (0); + +err: if (new_dbc != NULL && new_dbc != orig_dbc) + (void)__dbc_close(new_dbc); + return (ret); +} + +/* + * __partc_put -- + * cursor put opeartion on a partitioned cursor. + * + */ +static int +__partc_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + DB *dbp; + DB_PARTITION *part; + DBC *new_dbc; + PART_CURSOR *cp; + u_int32_t part_id; + int ret; + + dbp = dbc->dbp; + cp = (PART_CURSOR*)dbc->internal; + part_id = cp->part_id; + part = dbp->p_internal; + *pgnop = PGNO_INVALID; + + switch (flags) { + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + case DB_NOOVERWRITE: + case DB_OVERWRITE_DUP: + if (F_ISSET(part, PART_CALLBACK)) { + part_id = part->callback(dbp, key) % part->nparts; + break; + } + __part_search(dbp, part, key, &part_id); + break; + default: + break; + } + + if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) { + if ((ret = __db_cursor_int(part->handles[part_id], + dbc->thread_info, dbc->txn, part->handles[part_id]->type, + PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0) + goto err; + } + + if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR)) + F_SET(new_dbc, DBC_WRITER); + if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0) + goto err; + + if (new_dbc != cp->sub_cursor) { + if (cp->sub_cursor != NULL) { + if ((ret = __dbc_close(cp->sub_cursor)) != 0) + goto err; + cp->sub_cursor = NULL; + } + cp->sub_cursor = new_dbc; + cp->part_id = part_id; + } + + return (0); + +err: if (new_dbc != NULL && cp->sub_cursor != new_dbc) + (void)__dbc_close(new_dbc); + return (ret); +} + +/* + * __partc_del + * Delete interface to partitioned cursors. + * + */ +static int +__partc_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + PART_CURSOR *cp; + cp = (PART_CURSOR*)dbc->internal; + + if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR)) + F_SET(cp->sub_cursor, DBC_WRITER); + return (__dbc_del(cp->sub_cursor, flags)); +} + +/* + * __partc_writelock + * Writelock interface to partitioned cursors. + * + */ +static int +__partc_writelock(dbc) + DBC *dbc; +{ + PART_CURSOR *cp; + cp = (PART_CURSOR*)dbc->internal; + + return (cp->sub_cursor->am_writelock(cp->sub_cursor)); +} + +/* + * __partc_close + * Close interface to partitioned cursors. + * + */ +static int +__partc_close(dbc, root_pgno, rmroot) + DBC *dbc; + db_pgno_t root_pgno; + int *rmroot; +{ + PART_CURSOR *cp; + int ret; + + COMPQUIET(root_pgno, 0); + COMPQUIET(rmroot, NULL); + + cp = (PART_CURSOR*)dbc->internal; + + if (cp->sub_cursor == NULL) + return (0); + ret = __dbc_close(cp->sub_cursor); + cp->sub_cursor = NULL; + return (ret); +} + +/* + * __partc_destroy -- + * Destroy a single cursor. + */ +static int +__partc_destroy(dbc) + DBC *dbc; +{ + PART_CURSOR *cp; + ENV *env; + + cp = (PART_CURSOR *)dbc->internal; + env = dbc->env; + + /* Discard the structure. Don't recurse. */ + __os_free(env, cp); + + return (0); +} + +/* + * __partition_close + * Close a partitioned database. + * + * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t)); + */ +int +__partition_close(dbp, txn, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t flags; +{ + DB **pdbp; + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret, t_ret; + + if ((part = dbp->p_internal) == NULL) + return (0); + + env = dbp->env; + ret = 0; + + if ((pdbp = part->handles) != NULL) { + for (i = 0; i < part->nparts; i++, pdbp++) + if (*pdbp != NULL && (t_ret = + __db_close(*pdbp, txn, flags)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, part->handles); + } + if (part->dirs != NULL) + __os_free(env, (char **)part->dirs); + if (part->data != NULL) + __os_free(env, (char **)part->data); + __os_free(env, part); + dbp->p_internal = NULL; + + return (ret); +} + +/* + * __partition_sync + * Sync a partitioned database. + * + * PUBLIC: int __partition_sync __P((DB *)); + */ +int +__partition_sync(dbp) + DB *dbp; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret, t_ret; + + ret = 0; + part = dbp->p_internal; + + if ((pdbp = part->handles) != NULL) { + for (i = 0; i < part->nparts; i++, pdbp++) + if (*pdbp != NULL && + F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret = + __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0) + ret = t_ret; + } + if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __partition_stat + * Stat a partitioned database. + * + * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t)); + */ +int +__partition_stat(dbc, spp, flags) + DBC *dbc; + void *spp; + u_int32_t flags; +{ + DB *dbp, **pdbp; + DB_BTREE_STAT *fsp, *bsp; +#ifdef HAVE_HASH + DB_HASH_STAT *hfsp, *hsp; +#endif + DB_PARTITION *part; + DBC *new_dbc; + ENV *env; + u_int32_t i; + int ret; + + dbp = dbc->dbp; + part = dbp->p_internal; + env = dbp->env; + fsp = NULL; +#ifdef HAVE_HASH + hfsp = NULL; +#endif + + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) { + if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn, + (*pdbp)->type, PGNO_INVALID, + 0, dbc->locker, &new_dbc)) != 0) + goto err; + switch (new_dbc->dbtype) { + case DB_BTREE: + if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0) + goto err; + if (fsp == NULL) { + fsp = bsp; + *(DB_BTREE_STAT **)spp = fsp; + } else { + fsp->bt_nkeys += bsp->bt_nkeys; + fsp->bt_ndata += bsp->bt_ndata; + fsp->bt_pagecnt += bsp->bt_pagecnt; + if (fsp->bt_levels < bsp->bt_levels) + fsp->bt_levels = bsp->bt_levels; + fsp->bt_int_pg += bsp->bt_int_pg; + fsp->bt_leaf_pg += bsp->bt_leaf_pg; + fsp->bt_dup_pg += bsp->bt_dup_pg; + fsp->bt_over_pg += bsp->bt_over_pg; + fsp->bt_free += bsp->bt_free; + fsp->bt_int_pgfree += bsp->bt_int_pgfree; + fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree; + fsp->bt_dup_pgfree += bsp->bt_dup_pgfree; + fsp->bt_over_pgfree += bsp->bt_over_pgfree; + __os_ufree(env, bsp); + } + break; +#ifdef HAVE_HASH + case DB_HASH: + if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0) + goto err; + if (hfsp == NULL) { + hfsp = hsp; + *(DB_HASH_STAT **)spp = hfsp; + } else { + hfsp->hash_nkeys += hsp->hash_nkeys; + hfsp->hash_ndata += hsp->hash_ndata; + hfsp->hash_pagecnt += hsp->hash_pagecnt; + hfsp->hash_ffactor += hsp->hash_ffactor; + hfsp->hash_buckets += hsp->hash_buckets; + hfsp->hash_free += hsp->hash_free; + hfsp->hash_bfree += hsp->hash_bfree; + hfsp->hash_bigpages += hsp->hash_bigpages; + hfsp->hash_big_bfree += hsp->hash_big_bfree; + hfsp->hash_overflows += hsp->hash_overflows; + hfsp->hash_ovfl_free += hsp->hash_ovfl_free; + hfsp->hash_dup += hsp->hash_dup; + hfsp->hash_dup_free += hsp->hash_dup_free; + __os_ufree(env, hsp); + } + break; +#endif + default: + break; + } + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + } + return (0); + +err: + if (fsp != NULL) + __os_ufree(env, fsp); + *(DB_BTREE_STAT **)spp = NULL; + return (ret); +} + +/* + * __part_truncate -- + * Truncate a database. + * + * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *)); + */ +int +__part_truncate(dbc, countp) + DBC *dbc; + u_int32_t *countp; +{ + DB *dbp, **pdbp; + DB_PARTITION *part; + DBC *new_dbc; + u_int32_t count, i; + int ret, t_ret; + + dbp = dbc->dbp; + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + if (countp != NULL) + *countp = 0; + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) { + if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn, + (*pdbp)->type, PGNO_INVALID, + 0, dbc->locker, &new_dbc)) != 0) + break; + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_truncate(new_dbc, &count); + break; + case DB_HASH: +#ifdef HAVE_HASH + ret = __ham_truncate(new_dbc, &count); + break; +#endif + case DB_QUEUE: + case DB_UNKNOWN: + default: + ret = __db_unknown_type(dbp->env, + "DB->truncate", dbp->type); + count = 0; + break; + } + if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0) + ret = t_ret; + if (countp != NULL) + *countp += count; + } + + return (ret); +} +/* + * __part_compact -- compact a partitioned database. + * + * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *)); + */ +int +__part_compact(dbp, ip, txn, start, stop, c_data, flags, end) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *start, *stop; + DB_COMPACT *c_data; + u_int32_t flags; + DBT *end; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) { + switch (dbp->type) { + case DB_HASH: + case DB_BTREE: + case DB_RECNO: + ret = __db_compact_int(*pdbp, + ip, txn, start, stop, c_data, flags, end); + break; + + default: + ret = __dbh_am_chk(dbp, DB_OK_BTREE); + break; + } + } + return (ret); +} + +/* + * __part_lsn_reset -- + * reset the lsns on each partition. + * + * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *)); + */ +int +__part_lsn_reset(dbp, ip) + DB *dbp; + DB_THREAD_INFO *ip; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) + ret = __db_lsn_reset((*pdbp)->mpf, ip); + + return (ret); +} + +/* + * __part_fileid_reset -- + * reset the fileid on each partition. + * + * PUBLIC: int __part_fileid_reset + * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int)); + */ +int +__part_fileid_reset(env, ip, fname, nparts, encrypted) + ENV *env; + DB_THREAD_INFO *ip; + const char *fname; + u_int32_t nparts; + int encrypted; +{ + int ret; + u_int32_t part_id; + char *name, *sp; + const char *np; + + if ((ret = __os_malloc(env, + strlen(fname) + PART_LEN + 1, &name)) != 0) { + __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + return (ret); + } + + sp = name; + np = __db_rpath(fname); + if (np == NULL) + np = fname; + else { + np++; + (void)strncpy(name, fname, (size_t)(np - fname)); + sp = name + (np - fname); + } + + for (part_id = 0; ret == 0 && part_id < nparts; part_id++) { + (void)sprintf(sp, PART_NAME, np, part_id); + ret = __env_fileid_reset(env, ip, sp, encrypted); + } + + __os_free(env, name); + return (ret); +} + +/* + * __part_key_range -- + * Return proportion of keys relative to given key. + * + * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); + */ +int +__part_key_range(dbc, dbt, kp, flags) + DBC *dbc; + DBT *dbt; + DB_KEY_RANGE *kp; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DBC *new_dbc; + DB_PARTITION *part; + PAGE *h; + u_int32_t id, part_id; + u_int32_t elems, empty, less_elems, my_elems, greater_elems; + u_int32_t levels, max_levels, my_levels; + db_pgno_t root_pgno; + int ret; + double total_elems; + + COMPQUIET(flags, 0); + + part = dbc->dbp->p_internal; + + /* + * First we find the key range for the partition that contains the + * key. Then we scale based on estimates of the other partitions. + */ + if (F_ISSET(part, PART_CALLBACK)) + part_id = part->callback(dbc->dbp, dbt) % part->nparts; + else + __part_search(dbc->dbp, part, dbt, &part_id); + GET_PART_CURSOR(dbc, new_dbc, part_id); + + if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0) + goto err; + + cp = (BTREE_CURSOR *)new_dbc->internal; + + root_pgno = BAM_ROOT_PGNO(new_dbc); + if ((ret = __memp_fget(new_dbc->dbp->mpf, &root_pgno, + new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0) + goto c_err; + + my_elems = NUM_ENT(h); + my_levels = LEVEL(h); + max_levels = my_levels; + + if ((ret = __memp_fput(new_dbc->dbp->mpf, + new_dbc->thread_info, h, new_dbc->priority)) != 0) + goto c_err; + + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + /* + * We have the range within one subtree. Now estimate + * what part of the whole range that subtree is. Figure + * out how many levels each part has and how many entries + * in the level below the root. + */ + empty = less_elems = greater_elems = 0; + for (id = 0; id < part->nparts; id++) { + if (id == part_id) { + empty = 0; + continue; + } + GET_PART_CURSOR(dbc, new_dbc, id); + cp = (BTREE_CURSOR *)new_dbc->internal; + if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root, + new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0) + goto c_err; + + elems = NUM_ENT(h); + levels = LEVEL(h); + if (levels == 1) + elems /= 2; + + if ((ret = __memp_fput(new_dbc->dbp->mpf, + new_dbc->thread_info, h, new_dbc->priority)) != 0) + goto c_err; + + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + + /* If the tree is empty, ignore it. */ + if (elems == 0) { + empty++; + continue; + } + + /* + * If a tree has fewer levels than the max just count + * it as a single element in the higher level. + */ + if (id < part_id) { + if (levels > max_levels) { + max_levels = levels; + less_elems = id + elems - empty; + } else if (levels < max_levels) + less_elems++; + else + less_elems += elems; + } else { + if (levels > max_levels) { + max_levels = levels; + greater_elems = (id - part_id) + elems - empty; + } else if (levels < max_levels) + greater_elems++; + else + greater_elems += elems; + } + + } + + if (my_levels < max_levels) { + /* + * The subtree containing the key is not the tallest one. + * Reduce its share by the number of records at the highest + * level. Scale the greater and lesser components up + * by the number of records on either side of this + * subtree. + */ + total_elems = 1 + greater_elems + less_elems; + kp->equal /= total_elems; + kp->less /= total_elems; + kp->less += less_elems/total_elems; + kp->greater /= total_elems; + kp->greater += greater_elems/total_elems; + } else if (my_levels == max_levels) { + /* + * The key is in one of the tallest subtrees. We will + * scale the values by the ratio of the records at the + * top of this stubtree to the number of records at the + * highest level. + */ + total_elems = greater_elems + less_elems; + if (total_elems != 0) { + /* + * First scale down by the fraction of elements + * in this subtree. + */ + total_elems += my_elems; + kp->equal *= my_elems; + kp->equal /= total_elems; + kp->less *= my_elems; + kp->less /= total_elems; + kp->greater *= my_elems; + kp->greater /= total_elems; + /* + * Proportionally add weight from the subtrees to the + * left and right of this one. + */ + kp->less += less_elems / total_elems; + kp->greater += greater_elems / total_elems; + } + } + + if (0) { +c_err: (void)__dbc_close(new_dbc); + } + +err: return (ret); +} + +/* + * __part_remove -- + * Remove method for a partitioned database. + * + * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t)); + */ +int +__part_remove(dbp, ip, txn, name, subdb, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb; + u_int32_t flags; +{ + return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags)); +} + +/* + * __part_rename -- + * Rename method for a partitioned database. + * + * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, const char *, const char *)); + */ +int +__part_rename(dbp, ip, txn, name, subdb, newname) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb, *newname; +{ + return (__part_rr(dbp, ip, txn, name, subdb, newname, 0)); +} + +/* + * __part_rr -- + * Remove/Rename method for a partitioned database. + */ +static int +__part_rr(dbp, ip, txn, name, subdb, newname, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb, *newname; + u_int32_t flags; +{ + DB **pdbp, *ptmpdbp, *tmpdbp; + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret, t_ret; + char *np; + + env = dbp->env; + ret = 0; + + if (subdb != NULL && name != NULL) { + __db_errx(env, DB_STR("0663", + "A partitioned database can not be in a multiple databases file")); + return (EINVAL); + } + ENV_GET_THREAD_INFO(env, ip); + + /* + * Since rename no longer opens the database, we have + * to do it here. + */ + if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) + return (ret); + + /* + * We need to make sure we don't self-deadlock, so give + * this dbp the same locker as the incoming one. + */ + tmpdbp->locker = dbp->locker; + if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type, + DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) + goto err; + + part = tmpdbp->p_internal; + pdbp = part->handles; + COMPQUIET(np, NULL); + if (newname != NULL && (ret = __os_malloc(env, + strlen(newname) + PART_LEN + 1, &np)) != 0) { + __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1); + goto err; + } + for (i = 0; i < part->nparts; i++, pdbp++) { + if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0) + break; + ptmpdbp->locker = (*pdbp)->locker; + if (newname == NULL) + ret = __db_remove_int(ptmpdbp, + ip, txn, (*pdbp)->fname, NULL, flags); + else { + DB_ASSERT(env, np != NULL); + (void)sprintf(np, PART_NAME, newname, i); + ret = __db_rename_int(ptmpdbp, + ip, txn, (*pdbp)->fname, NULL, np, flags); + } + ptmpdbp->locker = NULL; + (void)__db_close(ptmpdbp, NULL, DB_NOSYNC); + if (ret != 0) + break; + } + + if (newname != NULL) + __os_free(env, np); + + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) { +err: /* + * Since we copied the locker ID from the dbp, we'd better not + * free it here. + */ + tmpdbp->locker = NULL; + + /* We need to remove the lock event we associated with this. */ + if (txn != NULL) + __txn_remlock(env, + txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID); + + if ((t_ret = __db_close(tmpdbp, + txn, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + } + return (ret); +} +#ifdef HAVE_VERIFY +/* + * __part_verify -- + * Verify a partitioned database. + * + * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *, + * PUBLIC: void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__part_verify(dbp, vdp, fname, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + const char *fname; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + BINTERNAL *lp, *rp; + DB **pdbp; + DB_PARTITION *part; + DBC *dbc; + DBT *key; + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t i; + int ret, t_ret; + + env = dbp->env; + lp = rp = NULL; + dbc = NULL; + ip = vdp->thread_info; + + if (dbp->type == DB_BTREE) { + if ((ret = __bam_open(dbp, ip, + NULL, fname, PGNO_BASE_MD, flags)) != 0) + goto err; + } +#ifdef HAVE_HASH + else if ((ret = __ham_open(dbp, ip, + NULL, fname, PGNO_BASE_MD, flags)) != 0) + goto err; +#endif + + /* + * Initalize partition db handles and get the names. Set DB_RDWRMASTER + * because we may not have the partition callback, but we can still + * look at the structure of the tree. + */ + if ((ret = __partition_open(dbp, + ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0) + goto err; + part = dbp->p_internal; + + if (LF_ISSET(DB_SALVAGE)) { + /* If we are being aggressive we don't want to dump the keys. */ + if (LF_ISSET(DB_AGGRESSIVE)) + dbp->p_internal = NULL; + ret = __db_prheader(dbp, + NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD); + dbp->p_internal = part; + if (ret != 0) + goto err; + } + + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + goto err; + + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) { + if (!F_ISSET(part, PART_RANGE) || part->keys == NULL) + goto vrfy; + if (lp != NULL) + __os_free(env, lp); + lp = rp; + rp = NULL; + if (i + 1 < part->nparts) { + key = &part->keys[i + 1]; + if ((ret = __os_malloc(env, + BINTERNAL_SIZE(key->size), &rp)) != 0) + goto err; + rp->len = key->size; + memcpy(rp->data, key->data, key->size); + B_TSET(rp->type, B_KEYDATA); + } +vrfy: if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname, + NULL, handle, callback, + lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0) + ret = t_ret; + } + +err: if (lp != NULL) + __os_free(env, lp); + if (rp != NULL) + __os_free(env, rp); + return (ret); +} +#endif + +#ifdef CONFIG_TEST +/* + * __part_testdocopy -- copy all partitions for testing purposes. + * + * PUBLIC: int __part_testdocopy __P((DB *, const char *)); + */ +int +__part_testdocopy(dbp, name) + DB *dbp; + const char *name; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + if ((ret = __db_testdocopy(dbp->env, name)) != 0) + return (ret); + + part = dbp->p_internal; + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) + if ((ret = __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0) + return (ret); + + return (0); +} +#endif +#else +/* + * __db_nopartition -- + * Error when a Berkeley DB build doesn't include partitioning. + * + * PUBLIC: int __db_no_partition __P((ENV *)); + */ +int +__db_no_partition(env) + ENV *env; +{ + __db_errx(env, DB_STR("0664", + "library build did not include support for the database partitioning")); + return (DB_OPNOTSUP); +} +/* + * __partition_set -- + * Set the partitioning keys or callback function. + * This routine must be called prior to creating the database. + * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *, + * PUBLIC: u_int32_t (*callback)(DB *, DBT *key))); + */ + +int +__partition_set(dbp, parts, keys, callback) + DB *dbp; + u_int32_t parts; + DBT *keys; + u_int32_t (*callback)(DB *, DBT *key); +{ + COMPQUIET(parts, 0); + COMPQUIET(keys, NULL); + COMPQUIET(callback, NULL); + + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_callback -- + * Set the partition callback function. This routine must be called + * prior to opening a partition database that requires a function. + * PUBLIC: int __partition_get_callback __P((DB *, + * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key))); + */ +int +__partition_get_callback(dbp, parts, callback) + DB *dbp; + u_int32_t *parts; + u_int32_t (**callback)(DB *, DBT *key); +{ + COMPQUIET(parts, NULL); + COMPQUIET(callback, NULL); + + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_dirs -- + * Get partition dirs. + * PUBLIC: int __partition_get_dirs __P((DB *, const char ***)); + */ +int +__partition_get_dirs(dbp, dirpp) + DB *dbp; + const char ***dirpp; +{ + COMPQUIET(dirpp, NULL); + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_keys -- + * Get partition keys. + * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **)); + */ +int +__partition_get_keys(dbp, parts, keys) + DB *dbp; + u_int32_t *parts; + DBT **keys; +{ + COMPQUIET(parts, NULL); + COMPQUIET(keys, NULL); + + return (__db_no_partition(dbp->env)); +} +/* + * __partition_init -- + * Initialize the partition structure. + * Called when the meta data page is read in during database open or + * when partition keys or a callback are set. + * + * PUBLIC: int __partition_init __P((DB *, u_int32_t)); + */ +int +__partition_init(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_no_partition(dbp->env)); +} +/* + * __part_fileid_reset -- + * reset the fileid on each partition. + * + * PUBLIC: int __part_fileid_reset + * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int)); + */ +int +__part_fileid_reset(env, ip, fname, nparts, encrypted) + ENV *env; + DB_THREAD_INFO *ip; + const char *fname; + u_int32_t nparts; + int encrypted; +{ + COMPQUIET(ip, NULL); + COMPQUIET(fname, NULL); + COMPQUIET(nparts, 0); + COMPQUIET(encrypted, 0); + + return (__db_no_partition(env)); +} +/* + * __partition_set_dirs -- + * Set the directories for creating the partition databases. + * They must be in the environment. + * PUBLIC: int __partition_set_dirs __P((DB *, const char **)); + */ +int +__partition_set_dirs(dbp, dirp) + DB *dbp; + const char **dirp; +{ + COMPQUIET(dirp, NULL); + + return (__db_no_partition(dbp->env)); +} +#endif |