diff options
author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2015-02-17 17:25:57 +0000 |
---|---|---|
committer | <> | 2015-03-17 16:26:24 +0000 |
commit | 780b92ada9afcf1d58085a83a0b9e6bc982203d1 (patch) | |
tree | 598f8b9fa431b228d29897e798de4ac0c1d3d970 /src/btree | |
parent | 7a2660ba9cc2dc03a69ddfcfd95369395cc87444 (diff) | |
download | berkeleydb-master.tar.gz |
Diffstat (limited to 'src/btree')
-rw-r--r-- | src/btree/bt_compact.c | 239 | ||||
-rw-r--r-- | src/btree/bt_compare.c | 105 | ||||
-rw-r--r-- | src/btree/bt_compress.c | 72 | ||||
-rw-r--r-- | src/btree/bt_conv.c | 9 | ||||
-rw-r--r-- | src/btree/bt_curadj.c | 2 | ||||
-rw-r--r-- | src/btree/bt_cursor.c | 94 | ||||
-rw-r--r-- | src/btree/bt_delete.c | 18 | ||||
-rw-r--r-- | src/btree/bt_method.c | 19 | ||||
-rw-r--r-- | src/btree/bt_open.c | 43 | ||||
-rw-r--r-- | src/btree/bt_put.c | 177 | ||||
-rw-r--r-- | src/btree/bt_rec.c | 2 | ||||
-rw-r--r-- | src/btree/bt_reclaim.c | 2 | ||||
-rw-r--r-- | src/btree/bt_recno.c | 30 | ||||
-rw-r--r-- | src/btree/bt_rsearch.c | 7 | ||||
-rw-r--r-- | src/btree/bt_search.c | 49 | ||||
-rw-r--r-- | src/btree/bt_split.c | 118 | ||||
-rw-r--r-- | src/btree/bt_stat.c | 12 | ||||
-rw-r--r-- | src/btree/bt_upgrade.c | 94 | ||||
-rw-r--r-- | src/btree/bt_verify.c | 261 | ||||
-rw-r--r-- | src/btree/btree.src | 2 |
20 files changed, 1040 insertions, 315 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index b455ff23..be4c6b01 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -22,13 +22,16 @@ static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int)); static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t)); static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t)); static int __bam_merge __P((DBC *, - DBC *, u_int32_t, DBT *, DB_COMPACT *,int *)); -static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *)); + DBC *, u_int32_t, DBT *, DB_COMPACT *, int *, int *)); +static int __bam_merge_internal __P((DBC *, + DBC *, int, DB_COMPACT *, int *, int *)); static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *)); -static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *)); -static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *)); +static int __bam_merge_records __P((DBC *, + DBC *, u_int32_t, DB_COMPACT *, int *)); +static int __bam_truncate_internal_overflow __P((DBC *, + PAGE *, DB_COMPACT *, int *)); static int __bam_truncate_root_page __P((DBC *, - PAGE *, u_int32_t, DB_COMPACT *)); + PAGE *, u_int32_t, DB_COMPACT *, int *)); #ifdef HAVE_FTRUNCATE static int __bam_savekey __P((DBC *, int, DBT *)); @@ -145,13 +148,13 @@ __bam_csearch(dbc, start, sflag, level) * PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *)); */ int -__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) +__bam_compact_int(dbc, start, stop, factor, spanp, c_data, isdonep) DBC *dbc; DBT *start, *stop; u_int32_t factor; int *spanp; DB_COMPACT *c_data; - int *donep; + int *isdonep; { BTREE_CURSOR *cp, *ncp; DB *dbp; @@ -168,7 +171,7 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) int check_dups, check_trunc, clear_root, do_commit, isdone; int merged, next_p, pgs_done, ret, t_ret, tdone; -#ifdef DEBUG +#ifdef DEBUG_WOP #define CTRACE(dbc, location, t, start, f) do { \ DBT __trace; \ DB_SET_DBT(__trace, t, strlen(t)); \ @@ -182,8 +185,8 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) CTRACE(dbc, location, __buf, start, f); \ } while (0) #else -#define CTRACE(dbc, location, t, start, f) -#define PTRACE(dbc, location, p, start, f) +#define CTRACE(dbc, location, t, start, f) NOP_STATEMENT +#define PTRACE(dbc, location, p, start, f) NOP_STATEMENT #endif ndbc = NULL; @@ -551,11 +554,10 @@ retry: pg = NULL; if (ret != 0) goto err1; } - pgs_done++; - /* Get a fresh low numbered page. */ + /* Try to swap to a lower numbered page. */ if ((ret = __db_exchange_page(dbc, &cp->csp->page, ncp->csp->page, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + PGNO_INVALID, DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; @@ -598,8 +600,8 @@ retry: pg = NULL; merged = 0; for (epg = cp->sp; epg != cp->csp; epg++) { PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0); - if ((ret = __bam_merge_internal(dbc, - ndbc, LEVEL(epg->page), c_data, &merged)) != 0) + if ((ret = __bam_merge_internal(dbc, ndbc, + LEVEL(epg->page), c_data, &merged, &pgs_done)) != 0) break; if (merged) break; @@ -627,7 +629,7 @@ retry: pg = NULL; } PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0); - /* if we remove the next page, then we need its next locked */ + /* If we remove the next page, then we need its next locked. */ npgno = NEXT_PGNO(ncp->csp->page); if (npgno != PGNO_INVALID) { TRY_LOCK2(dbc, ndbc, npgno, @@ -637,9 +639,8 @@ retry: pg = NULL; } /*lint -e{794} */ if ((ret = __bam_merge(dbc, - ndbc, factor, stop, c_data, &isdone)) != 0) + ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0) goto err1; - pgs_done++; /* * __bam_merge could have freed our stack if it * deleted a page possibly collapsing the tree. @@ -722,8 +723,8 @@ retry: pg = NULL; /* Get a fresh low numbered page. */ pgno = PGNO(pg); if ((ret = __db_exchange_page(dbc, - &cp->csp->page, NULL, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + &cp->csp->page, NULL, PGNO_INVALID, + DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; @@ -734,10 +735,7 @@ retry: pg = NULL; LOCK_INIT(next_lock); saved_pgno = PGNO_INVALID; pg = cp->csp->page; - if (pgno != PGNO(pg)) { - pgs_done++; - pgno = PGNO(pg); - } + pgno = PGNO(pg); } /* * If we are going to leave this parent commit @@ -752,7 +750,7 @@ retry: pg = NULL; goto next_page; } - /* If they have the same parent, just dup the cursor */ + /* If they have the same parent, just dup the cursor. */ if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0) goto err1; if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0) @@ -842,17 +840,15 @@ retry: pg = NULL; pgno = PGNO(pg); /* Get a fresh low numbered page. */ if ((ret = __db_exchange_page(dbc, &cp->csp->page, - npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + npg, PGNO_INVALID, + DB_EXCH_DEFAULT, &pgs_done)) != 0) goto err1; if ((ret = __TLPUT(dbc, prev_lock)) != 0) goto err1; LOCK_INIT(prev_lock); prev_pgno = PGNO_INVALID; pg = cp->csp->page; - if (pgno != PGNO(pg)) { - pgs_done++; - pgno = PGNO(pg); - } + pgno = PGNO(pg); } c_data->compact_pages_examine++; @@ -887,11 +883,9 @@ retry: pg = NULL; */ PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0); if ((ret = __bam_merge(dbc, - ndbc, factor, stop, c_data, &isdone)) != 0) + ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0) goto err1; - pgs_done++; - if ((ret = __TLPUT(dbc, nnext_lock)) != 0) goto err1; LOCK_INIT(nnext_lock); @@ -932,7 +926,7 @@ next_page: pg = NULL; if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0) goto err; - if (npgno != PGNO_INVALID && + if (npgno != PGNO_INVALID && !do_commit && (ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0) goto err; if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0) @@ -1010,9 +1004,6 @@ err: /* if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) - ret = t_ret; - if (pg != NULL && (t_ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority) != 0) && ret == 0) @@ -1022,7 +1013,11 @@ err: /* dbc->thread_info, npg, dbc->priority) != 0) && ret == 0) ret = t_ret; -out: *donep = isdone; +out: + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + *isdonep = isdone; /* For OPD trees return if we did anything in the span variable. */ if (F_ISSET(dbc, DBC_OPD)) @@ -1035,12 +1030,13 @@ out: *donep = isdone; * __bam_merge -- do actual merging of leaf pages. */ static int -__bam_merge(dbc, ndbc, factor, stop, c_data, donep) +__bam_merge(dbc, ndbc, factor, stop, c_data, isdonep, pgs_donep) DBC *dbc, *ndbc; u_int32_t factor; DBT *stop; DB_COMPACT *c_data; - int *donep; + int *isdonep; + int *pgs_donep; { BTREE_CURSOR *cp, *ncp; DB *dbp; @@ -1064,9 +1060,9 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep) /* Find if the stopping point is on this page. */ if (stop != NULL && stop->size != 0) { - if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0) + if ((ret = __bam_compact_isdone(dbc, stop, npg, isdonep)) != 0) return (ret); - if (*donep) + if (*isdonep) return (0); } @@ -1080,20 +1076,23 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep) ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) || (int)(P_FREESPACE(dbp, pg) - ((dbp->pgsize - P_OVERHEAD(dbp)) - - P_FREESPACE(dbp, npg))) < (int)factor) - ret = __bam_merge_records(dbc, ndbc, factor, c_data); - else + P_FREESPACE(dbp, npg))) < (int)factor) { + ret = __bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep); + } else { /*lint -e{794} */ free_page: ret = __bam_merge_pages(dbc, ndbc, c_data); + (*pgs_donep)++; + } return (ret); } static int -__bam_merge_records(dbc, ndbc, factor, c_data) +__bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep) DBC *dbc, *ndbc; u_int32_t factor; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BKEYDATA *bk, *tmp_bk; @@ -1126,8 +1125,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data) if (c_data->compact_truncate != PGNO_INVALID && PGNO(ncp->csp->page) > c_data->compact_truncate) { /* Get a fresh low numbered page. */ - if ((ret = __db_exchange_page(ndbc, - &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + if ((ret = __db_exchange_page(ndbc, &ncp->csp->page, + pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } @@ -1197,6 +1196,7 @@ __bam_merge_records(dbc, ndbc, factor, c_data) /* If we have hit the first record then there is nothing we can move. */ if (indx == 0) goto done; + (*pgs_donep)++; if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) { if (indx == nent) return (__bam_merge_pages(dbc, ndbc, c_data)); @@ -1237,7 +1237,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data) indx -= adj; } bk = GET_BKEYDATA(dbp, npg, indx); - len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len; + len = (B_TYPE(bk->type) == B_KEYDATA) ? bk->len : + ((B_TYPE(bk->type) == B_BLOB) ? BBLOB_DSIZE : BOVERFLOW_SIZE); if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) { if (F_ISSET(dbc, DBC_OPD)) { if (dbp->dup_compare == __bam_defcmp) @@ -1281,8 +1282,9 @@ noprefix: } while (indx != 0 && ninp[indx] == ninp[indx - adj]); bk = GET_BKEYDATA(dbp, npg, indx); - len = - (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len; + len = (B_TYPE(bk->type) == B_KEYDATA) ? + bk->len : ((B_TYPE(bk->type) == B_BLOB) ? + BBLOB_DSIZE : BOVERFLOW_SIZE); } /* @@ -1346,6 +1348,13 @@ no_check: is_dup = first_dup = next_dup = 0; BOVERFLOW_SIZE, &data, NULL)) != 0) goto err; break; + case B_BLOB: + data.size = BBLOB_SIZE; + data.data = bk; + if ((ret = __db_pitem(dbc, pg, + pind, BBLOB_SIZE, &data, NULL)) != 0) + goto err; + break; default: __db_errx(env, DB_STR_A("1022", "Unknown record format, page %lu, indx 0", @@ -1538,15 +1547,20 @@ err: return (ret); /* * __bam_merge_internal -- * Merge internal nodes of the tree. + * + * The first key of an internal page does not have a guaranteed- + * useful key. */ static int -__bam_merge_internal(dbc, ndbc, level, c_data, merged) +__bam_merge_internal(dbc, ndbc, level, c_data, merged, pgs_donep) DBC *dbc, *ndbc; int level; DB_COMPACT *c_data; int *merged; + int *pgs_donep; { BINTERNAL bi, *bip, *fip; + BOVERFLOW bo; BTREE_CURSOR *cp, *ncp; DB *dbp; DBT data, hdr; @@ -1579,7 +1593,6 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) dbmp = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; ncp = (BTREE_CURSOR *)ndbc->internal; - *merged = 0; ret = 0; /* @@ -1608,11 +1621,11 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) * Check for overflow keys on both pages while we have * them locked. */ - if ((ret = - __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0) + if ((ret = __bam_truncate_internal_overflow(dbc, + pg, c_data, pgs_donep)) != 0) goto err; - if ((ret = - __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0) + if ((ret = __bam_truncate_internal_overflow(dbc, + npg, c_data, pgs_donep)) != 0) goto err; } @@ -1624,7 +1637,12 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged) */ fip = NULL; if (TYPE(pg) == P_IBTREE) { - /* See where we run out of space. */ + /* See where we run out of space. This does not yet include + * whatever extra pages are needed if an overflow key is + * going to be added to one or more parent pages. It would be + * better to use as little of the key that as necessary, though + * the effort of determining that might not be worthwhile. + */ freespace = P_FREESPACE(dbp, pg); /* * The leftmost key of an internal page is not accurate. @@ -1704,12 +1722,37 @@ fits: memset(&bi, 0, sizeof(bi)); if (fip == NULL) { data.size = bip->len; data.data = bip->data; + } else if (fip->type == B_OVERFLOW) { + DB_ASSERT(dbc->env, + fip->len == sizeof(BOVERFLOW)); + /* Cast to "BOVERFLOW *" to calm down lint. */ + memmove(&bo, + (BOVERFLOW *)fip->data, sizeof(BOVERFLOW)); + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, bo.tlen, + bo.pgno, &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + data.size = sizeof(bo); + data.data = &bo; + } else if (fip->type == B_BLOB) { + /* Blobs should never appear as keys. */ + DB_ASSERT(dbc->env, + !(fip->type == B_BLOB && + TYPE(pg) == P_IBTREE)); } else { data.size = fip->len; data.data = fip->data; } bi.len = data.size; - B_TSET(bi.type, bip->type); + /* + * Set bi.type according to the data's type, to ensure + * that it is B_OVERLOW iff the data is BOVERFLOW. + */ + B_TSET(bi.type, fip == NULL ? bip->type : fip->type); bi.pgno = bip->pgno; bi.nrecs = bip->nrecs; hdr.data = &bi; @@ -1750,7 +1793,12 @@ fits: memset(&bi, 0, sizeof(bi)); if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0) goto err; pind++; - if (fip != NULL) { + /* add bip test so fortify does not complain */ + if (fip != NULL && bip != NULL) { + if (B_TYPE(bip->type) == B_OVERFLOW && + (ret = __db_doff(dbc, + ((BOVERFLOW *)bip->data)->pgno)) != 0) + goto err; /* reset size to be for the record being deleted. */ size = BINTERNAL_SIZE(bip->len); fip = NULL; @@ -1848,14 +1896,14 @@ fits: memset(&bi, 0, sizeof(bi)); PGNO(npg) > c_data->compact_truncate && ncp->csp != ncp->sp) { if ((ret = __db_exchange_page(ndbc, &ncp->csp->page, - pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } if (c_data->compact_truncate != PGNO_INVALID && PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) { if ((ret = __db_exchange_page(dbc, &cp->csp->page, ncp->csp->page, - PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } } @@ -1875,13 +1923,13 @@ err: cp->csp = save_csp; * We may or may not have a write lock on this page. */ static int -__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) +__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, pgs_donep) DBC *dbc; PAGE **ppg; u_int32_t factor; int have_lock; DB_COMPACT *c_data; - int *donep; + int *pgs_donep; { BOVERFLOW *bo; BTREE_CURSOR *cp; @@ -1896,15 +1944,19 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) DB_ASSERT(NULL, dbc != NULL); dbp = dbc->dbp; dbmp = dbp->mpf; + /* XXX Don't reserve any free bytes (Force 100% fillfactor) in OPD trees + * to ensure forward progress. + */ + factor = 0; cp = (BTREE_CURSOR *)dbc->internal; for (i = 0; i < NUM_ENT(*ppg); i++) { bo = GET_BOVERFLOW(dbp, *ppg, i); - if (B_TYPE(bo->type) == B_KEYDATA) + if (B_TYPE(bo->type) == B_KEYDATA || + B_TYPE(bo->type) == B_BLOB) continue; c_data->compact_pages_examine++; if (bo->pgno > c_data->compact_truncate) { - (*donep)++; if (!have_lock) { /* * The caller should have the page at @@ -1925,8 +1977,9 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0) goto err; } + pgno = bo->pgno; if ((ret = __bam_truncate_root_page(dbc, - *ppg, i, c_data)) != 0) + *ppg, i, c_data, pgs_donep)) != 0) goto err; /* Just in case it should move. Could it? */ bo = GET_BOVERFLOW(dbp, *ppg, i); @@ -1934,13 +1987,13 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep) if (B_TYPE(bo->type) == B_OVERFLOW) { if ((ret = __db_truncate_overflow(dbc, - bo->pgno, have_lock ? NULL : ppg, c_data)) != 0) + bo->pgno, have_lock ? NULL : ppg, + c_data, pgs_donep)) != 0) goto err; - (*donep)++; continue; } if ((ret = __bam_compact_opd(dbc, bo->pgno, - have_lock ? NULL : ppg, factor, c_data, donep)) != 0) + have_lock ? NULL : ppg, factor, c_data, pgs_donep)) != 0) goto err; } @@ -1955,13 +2008,13 @@ err: * PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *)); */ int -__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep) +__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, pgs_donep) DBC *dbc; db_pgno_t root_pgno; PAGE **ppg; u_int32_t factor; DB_COMPACT *c_data; - int *donep; + int *pgs_donep; { BTREE_CURSOR *cp; DBC *opd; @@ -2021,7 +2074,7 @@ __bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep) NULL, factor, &span, c_data, &isdone)) != 0) break; /* For OPD the number of pages dirtied is returned in span. */ - *donep += span; + *pgs_donep += span; } while (!isdone); if (start.data != NULL) @@ -2041,11 +2094,12 @@ done: * The page is reference by the pg/indx passed in. */ static int -__bam_truncate_root_page(dbc, pg, indx, c_data) +__bam_truncate_root_page(dbc, pg, indx, c_data, pgs_donep) DBC *dbc; PAGE *pg; u_int32_t indx; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BOVERFLOW *bo; @@ -2053,8 +2107,8 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) db_pgno_t *pgnop; u_int32_t tlen; - COMPQUIET(c_data, NULL); COMPQUIET(bo, NULL); + COMPQUIET(c_data, NULL); dbp = dbc->dbp; if (TYPE(pg) == P_IBTREE) { bi = GET_BINTERNAL(dbp, pg, indx); @@ -2075,7 +2129,7 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) DB_ASSERT(dbp->env, IS_DIRTY(pg)); - return (__db_truncate_root(dbc, pg, indx, pgnop, tlen)); + return (__db_truncate_root(dbc, pg, indx, pgnop, tlen, pgs_donep)); } /* @@ -2086,10 +2140,11 @@ __bam_truncate_root_page(dbc, pg, indx, c_data) * nodes they will get copied adding pages to the database. */ static int -__bam_truncate_internal_overflow(dbc, page, c_data) +__bam_truncate_internal_overflow(dbc, page, c_data, pgs_donep) DBC *dbc; PAGE *page; DB_COMPACT *c_data; + int *pgs_donep; { BINTERNAL *bi; BOVERFLOW *bo; @@ -2104,10 +2159,11 @@ __bam_truncate_internal_overflow(dbc, page, c_data) continue; bo = (BOVERFLOW *)(bi->data); if (bo->pgno > c_data->compact_truncate && (ret = - __bam_truncate_root_page(dbc, page, indx, c_data)) != 0) + __bam_truncate_root_page(dbc, page, + indx, c_data, pgs_donep)) != 0) break; - if ((ret = __db_truncate_overflow( - dbc, bo->pgno, NULL, c_data)) != 0) + if ((ret = __db_truncate_overflow(dbc, + bo->pgno, NULL, c_data, pgs_donep)) != 0) break; } return (ret); @@ -2142,7 +2198,7 @@ __bam_compact_isdone(dbc, stop, pg, isdone) } else { DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE); if ((ret = __bam_cmp(dbc, stop, pg, 0, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) return (ret); *isdone = cmp <= 0; @@ -2328,7 +2384,7 @@ __bam_savekey(dbc, next, start) if (len == 0) { no_key: __db_errx(env, DB_STR("1023", "Compact cannot handle zero length key")); - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } } else { @@ -2360,14 +2416,15 @@ retry: return (DB_LOCK_NOTGRANTED); * Find high numbered pages in the internal nodes of a tree and * swap them for lower numbered pages. * PUBLIC: int __bam_truncate_ipages __P((DB *, - * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *)); + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *)); */ int -__bam_truncate_ipages(dbp, ip, txn, c_data) +__bam_truncate_ipages(dbp, ip, txn, c_data, pgs_donep) DB *dbp; DB_THREAD_INFO *ip; DB_TXN *txn; DB_COMPACT *c_data; + int *pgs_donep; { BTMETA *meta; BTREE *bt; @@ -2480,8 +2537,9 @@ new_txn: pgno = PGNO(cp->csp->page); if (pgno > c_data->compact_truncate) { - if ((ret = __db_exchange_page(dbc, &cp->csp->page, - NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0) + if ((ret = __db_exchange_page(dbc, + &cp->csp->page, NULL, PGNO_INVALID, + DB_EXCH_DEFAULT, pgs_donep)) != 0) goto err; } @@ -2561,7 +2619,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) && } if (PGNO(meta) > c_data->compact_truncate) { dbmeta = (DBMETA *)meta; - ret = __db_move_metadata(dbc, &dbmeta, c_data); + ret = __db_move_metadata(dbc, + &dbmeta, c_data, pgs_donep); meta = (BTMETA *)dbmeta; if (ret != 0) goto err; @@ -2583,8 +2642,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) && * page latch is released. */ ++dbp->mpf->mfp->revision; - if ((ret = __db_exchange_page(dbc, - &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0) + if ((ret = __db_exchange_page(dbc, &root, NULL, + PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0) goto err; if (PGNO(root) == bt->bt_root) goto err; diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c index 5c009071..8923c5fa 100644 --- a/src/btree/bt_compare.c +++ b/src/btree/bt_compare.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -49,27 +49,39 @@ /* * __bam_cmp -- - * Compare a key to a given record. + * Compare a key to a given record. We always start the comparison + * at an offset and update the offset with longest matching count + * after the comparison. * * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, - * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *), + * PUBLIC: int *, size_t *)); */ int -__bam_cmp(dbc, dbt, h, indx, func, cmpp) +__bam_cmp(dbc, dbt, h, indx, func, cmpp, locp) DBC *dbc; const DBT *dbt; PAGE *h; u_int32_t indx; - int (*func)__P((DB *, const DBT *, const DBT *)); + int (*func)__P((DB *, const DBT *, const DBT *, size_t *)); int *cmpp; + size_t *locp; { + BBLOB bl; BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; DB *dbp; DBT pg_dbt; + off_t blob_size; + int ret; + db_seq_t blob_id; dbp = dbc->dbp; + ret = 0; + + /* Assert that the func is non-Null. */ + DB_ASSERT(dbp->env, func != NULL); /* * Returns: @@ -91,11 +103,49 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) bo = (BOVERFLOW *)bk; - else { + else if (B_TYPE(bk->type) == B_BLOB) { + /* + * This is very slow, but since blobs cannot be + * in databases with duplicates or be keys, it should + * only happen when using DB_GET_BOTH or DB_SET. + */ + memcpy(&bl, bk, BBLOB_SIZE); + memset(&pg_dbt, 0, sizeof(DBT)); + GET_BLOB_SIZE(dbc->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) + pg_dbt.size = UINT32_MAX; + else + pg_dbt.size = (u_int32_t)blob_size; + blob_id = (db_seq_t)bl.id; + pg_dbt.flags = DB_DBT_USERMEM; + if ((ret = __os_malloc( + dbc->env, pg_dbt.size, &pg_dbt.data)) != 0) + return (ret); + pg_dbt.ulen = pg_dbt.size; + if ((ret = __blob_get(dbc, + &pg_dbt, blob_id, blob_size, NULL, NULL)) != 0) { + __os_free(dbc->env, pg_dbt.data); + return (ret); + } + *cmpp = func(dbp, dbt, &pg_dbt, locp); + /* + * There is no way to directly compare a blob file that + * is greater in size than UINT32_MAX, so instead we + * compare the data up to UINT32_MAX, and if they are + * equal return that the blob is larger, since it is + * longer than the input data. + */ + if (*cmpp == 0 && (blob_size > UINT32_MAX)) + *cmpp = -1; + __os_free(dbc->env, pg_dbt.data); + return (0); + } else { pg_dbt.app_data = NULL; pg_dbt.data = bk->data; pg_dbt.size = bk->len; - *cmpp = func(dbp, dbt, &pg_dbt); + *cmpp = func(dbp, dbt, &pg_dbt, locp); return (0); } break; @@ -123,13 +173,14 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) } bi = GET_BINTERNAL(dbp, h, indx); - if (B_TYPE(bi->type) == B_OVERFLOW) + if (B_TYPE(bi->type) == B_OVERFLOW) { + DB_ASSERT(dbp->env, bi->len == BOVERFLOW_SIZE); bo = (BOVERFLOW *)(bi->data); - else { + } else { pg_dbt.app_data = NULL; pg_dbt.data = bi->data; pg_dbt.size = bi->len; - *cmpp = func(dbp, dbt, &pg_dbt); + *cmpp = func(dbp, dbt, &pg_dbt, locp); return (0); } break; @@ -141,42 +192,56 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp) * Overflow. */ return (__db_moff(dbc, dbt, bo->pgno, bo->tlen, - func == __bam_defcmp ? NULL : func, cmpp)); + func == __bam_defcmp ? NULL : func, cmpp, locp)); } /* * __bam_defcmp -- - * Default comparison routine. + * Keep track of how far along in the two keys we find matching + * characters, and use that as an offset into the keys to begin + * future comparisons. This will save us the overhead of always + * starting the comparisons on the first character. * - * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *)); + * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *)); */ int -__bam_defcmp(dbp, a, b) +__bam_defcmp(dbp, a, b, locp) DB *dbp; const DBT *a, *b; + size_t *locp; { - size_t len; + size_t len, i, start; u_int8_t *p1, *p2; COMPQUIET(dbp, NULL); - + start = (locp == NULL ? 0 : *locp); /* * Returns: * < 0 if a is < b * = 0 if a is = b * > 0 if a is > b * + * We start the comparison from 'locp' and store the last match + * location in 'locp'. + * * XXX * If a size_t doesn't fit into a long, or if the difference between * any two characters doesn't fit into an int, this routine can lose. * What we need is a signed integral type that's guaranteed to be at * least as large as a size_t, and there is no such thing. */ + p1 = (u_int8_t *)a->data + start; + p2 = (u_int8_t *)b->data + start; len = a->size > b->size ? b->size : a->size; - for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) - if (*p1 != *p2) - return ((long)*p1 - (long)*p2); - return ((long)a->size - (long)b->size); + for (i = start; i < len; ++p1, ++p2, ++i) + if (*p1 != *p2) { + if (locp != NULL) + *locp = i; + return (*p1 < *p2 ? -1 : 1); + } + if (locp != NULL) + *locp = len; + return (a->size == b->size ? 0 : (a->size < b->size ? -1 : 1)); } /* diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c index 3f293461..479e7248 100644 --- a/src/btree/bt_compress.c +++ b/src/btree/bt_compress.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ #include "db_config.h" @@ -352,16 +352,20 @@ __bam_compress_marshal_data(dbp, data, destbuf) * __bam_compress_dupcmp -- * Duplicate comparison function for compressed BTrees. * - * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *)); + * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *, + * PUBLIC: size_t *)); */ int -__bam_compress_dupcmp(db, a, b) +__bam_compress_dupcmp(db, a, b, locp) DB *db; const DBT *a; const DBT *b; + size_t *locp; { DBT dcmp_a, dcmp_b; + COMPQUIET(locp, NULL); + /* Decompress the initial data in a */ CMP_UNMARSHAL_DATA(a, &dcmp_a); dcmp_a.ulen = 0; @@ -380,7 +384,7 @@ __bam_compress_dupcmp(db, a, b) /* Call the user's duplicate compare function */ return ((BTREE *)db->bt_internal)-> - compress_dup_compare(db, &dcmp_a, &dcmp_b); + compress_dup_compare(db, &dcmp_a, &dcmp_b, NULL); } /* @@ -636,7 +640,7 @@ __bamc_next_decompress(dbc) db = dbc->dbp; if (cp->compcursor >= cp->compend) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); cp->prevKey = cp->currentKey; cp->prevData = cp->currentData; @@ -1251,7 +1255,7 @@ __bamc_compress_merge_delete(dbc, stream, countp) * chunk, but don't delete any more * entries. */ - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); moreStream = 0; iSmallEnough = 0; } else @@ -1318,7 +1322,7 @@ __bamc_compress_merge_delete(dbc, stream, countp) CMP_FREE_DBT(env, &nextk); CMP_FREE_DBT(env, &nextc); - return (ret != 0 ? ret : bulk_ret); + return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret)); } /* @@ -1389,7 +1393,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) * in the database */ if (ifound == 0) { - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); } else ++chunk_count; break; @@ -1463,7 +1467,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) * current chunk, but don't delete * any more entries. */ - bulk_ret = DB_NOTFOUND; + bulk_ret = DBC_ERR(dbc, DB_NOTFOUND); moreStream = 0; iSmallEnough = 0; } else @@ -1541,7 +1545,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp) CMP_FREE_DBT(env, &pdestdata); CMP_FREE_DBT(env, &nextk); - return (ret != 0 ? ret : bulk_ret); + return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret)); } /******************************************************************************/ @@ -1641,8 +1645,8 @@ __bamc_compress_get_prev_dup(dbc, flags) if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0) return (ret); - if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0) - return (DB_NOTFOUND); + if (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); return (0); } @@ -1684,7 +1688,7 @@ __bamc_compress_get_prev_nodup(dbc, flags) do if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0) return (ret); - while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0); + while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0); return (0); } @@ -1702,7 +1706,7 @@ __bamc_compress_get_next(dbc, flags) if (F_ISSET(cp, C_COMPRESS_DELETED)) { if (cp->currentKey == 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); F_CLR(cp, C_COMPRESS_DELETED); return (0); } else if (cp->currentKey) { @@ -1722,7 +1726,7 @@ __bamc_compress_get_next(dbc, flags) * to the right place */ __bamc_compress_reset(dbc); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } else if (ret != 0) return (ret); @@ -1753,17 +1757,18 @@ __bamc_compress_get_next_dup(dbc, key, flags) * deleted entry. */ if (cp->currentKey == 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); F_CLR(cp, C_COMPRESS_DELETED); - return (t->bt_compare(dbp, - cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND); + return (t->bt_compare(dbp, cp->currentKey, + &cp->del_key, NULL) == 0 ? 0 : DB_NOTFOUND); } else if (cp->currentKey == 0) return (EINVAL); /* Check that the next entry has the same key as the previous entry */ ret = __bamc_next_decompress(dbc); - if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0) - return (DB_NOTFOUND); + if (ret == 0 && t->bt_compare(dbp, + cp->currentKey, cp->prevKey, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); if (ret != DB_NOTFOUND) return (ret); @@ -1783,7 +1788,7 @@ __bamc_compress_get_next_dup(dbc, key, flags) * will end up pointing to the right place */ __bamc_compress_reset(dbc); - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); } else if (ret != 0) return (ret); @@ -1791,8 +1796,8 @@ __bamc_compress_get_next_dup(dbc, key, flags) return (ret); /* Check the keys are the same */ - if (t->bt_compare(dbp, cp->currentKey, key) != 0) - return (DB_NOTFOUND); + if (t->bt_compare(dbp, cp->currentKey, key, NULL) != 0) + return (DBC_ERR(dbc, DB_NOTFOUND)); return (0); } @@ -1828,7 +1833,7 @@ __bamc_compress_get_next_nodup(dbc, flags) do if ((ret = __bamc_compress_get_next(dbc, flags)) != 0) return (ret); - while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0); + while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0); return (ret); } @@ -1888,14 +1893,14 @@ __bamc_compress_get_set(dbc, key, data, method, flags) if (ret == 0 && __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) { /* We didn't find the key */ - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } break; case DB_GET_BOTH: if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) && - __bam_defcmp(dbp, cp->currentData, data) != 0))) { + __bam_defcmp(dbp, cp->currentData, data, NULL) != 0))) { /* We didn't find the key/data pair */ - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } break; default: @@ -1923,7 +1928,7 @@ __bamc_compress_get_bothc(dbc, data, flags) position */ if (__db_compare_both(dbp, cp->currentKey, cp->currentData, cp->currentKey, data) >= 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); cmp = 0; /* Perform a linear search for the data in the current chunk */ @@ -1933,7 +1938,7 @@ __bamc_compress_get_bothc(dbc, data, flags) continue; if (ret == 0) - return (cmp == 0 ? 0 : DB_NOTFOUND); + return (cmp == 0 ? 0 : DBC_ERR(dbc, DB_NOTFOUND)); if (ret != DB_NOTFOUND) return (ret); @@ -2277,7 +2282,7 @@ __bamc_compress_iput(dbc, key, data, flags) switch (flags) { case DB_CURRENT: if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto end; } @@ -2290,7 +2295,7 @@ __bamc_compress_iput(dbc, key, data, flags) if (F_ISSET(dbp, DB_AM_DUPSORT) && ((BTREE *)dbp->bt_internal)->compress_dup_compare( - dbp, cp->currentData, data) != 0) { + dbp, cp->currentData, data, NULL) != 0) { __db_errx(env, DB_STR("1032", "Existing data sorts differently from put data")); ret = EINVAL; @@ -2464,7 +2469,7 @@ __bamc_compress_idel(dbc, flags) if (F_ISSET(cp, C_COMPRESS_DELETED)) return DB_KEYEMPTY; if (cp->currentKey == 0) - return DB_NOTFOUND; + return (DBC_ERR(dbc, DB_NOTFOUND)); if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, cp->currentKey->data, cp->currentKey->size)) != 0) @@ -3015,7 +3020,8 @@ __bam_compress_count(dbc, nkeysp, ndatap) if (ret != 0) goto err; - if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0) + if (t->bt_compare(dbp, + cp_n->currentKey, cp_n->prevKey, NULL) != 0) nkeys += 1; } diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c index 348ce5c2..85baeed8 100644 --- a/src/btree/bt_conv.c +++ b/src/btree/bt_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -88,7 +88,12 @@ __bam_mswap(env, pg) SWAP32(p); /* re_len */ SWAP32(p); /* re_pad */ SWAP32(p); /* root */ - p += 92 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* threshold */ + SWAP32(p); /* file id lo */ + SWAP32(p); /* file id hi */ + SWAP32(p); /* sdb id lo */ + SWAP32(p); /* sdb id hi */ + p += 87 * sizeof(u_int32_t); /* unused */ SWAP32(p); /* crypto_magic */ return (0); diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c index 78606009..d3398ee8 100644 --- a/src/btree/bt_curadj.c +++ b/src/btree/bt_curadj.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 860c31ce..d63b7373 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -938,7 +938,7 @@ __bamc_get(dbc, key, data, flags, pgnop) case DB_CURRENT: /* It's not possible to return a deleted record. */ if (F_ISSET(cp, C_DELETED)) { - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } @@ -979,7 +979,7 @@ __bamc_get(dbc, key, data, flags, pgnop) goto err; if (flags == DB_GET_BOTH) { if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1000,7 +1000,7 @@ __bamc_get(dbc, key, data, flags, pgnop) dbc, PGNO_INVALID, key, flags, &exact)) != 0) return (ret); if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -1047,7 +1047,7 @@ __bamc_get(dbc, key, data, flags, pgnop) if ((ret = __bamc_next(dbc, 1, 0)) != 0) goto err; if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1077,7 +1077,7 @@ __bamc_get(dbc, key, data, flags, pgnop) if ((ret = __bamc_prev(dbc)) != 0) goto err; if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -1173,12 +1173,15 @@ __bam_bulk(dbc, data, flags) DBT *data; u_int32_t flags; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; BTREE_CURSOR *cp; PAGE *pg; db_indx_t *inp, indx, pg_keyoff; int32_t *endp, key_off, *offp, *saveoffp; + off_t blob_size; + db_seq_t blob_id; u_int8_t *dbuf, *dp, *np; u_int32_t key_size, pagesize, size, space; int adj, is_key, need_pg, next_key, no_dup, rec_key, ret; @@ -1279,6 +1282,7 @@ next_pg: */ if (is_key && pg_keyoff != inp[indx]) { bk = GET_BKEYDATA(dbc->dbp, pg, indx); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; size = key_size = bo->tlen; @@ -1403,6 +1407,31 @@ get_key_space: *offp-- = (int32_t)(np - dbuf); np += size; *offp-- = (int32_t)size; + } else if (B_TYPE(bk->type) == B_BLOB) { + blob_size = 0; + blob_id = 0; + memcpy(&bl, bk, BBLOB_SIZE); + GET_BLOB_SIZE(dbc->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + if (blob_size > UINT32_MAX) { + size = UINT32_MAX; + goto back_up; + } + size = (u_int32_t)blob_size; + if (size > space) + goto back_up; + blob_id = (db_seq_t)bl.id; + if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0) + return (ret); + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } + space -= size; + *offp-- = (int32_t)(np - dbuf); + np += size; + *offp-- = (int32_t)size; } else { if (need_pg) { dp = np; @@ -1764,11 +1793,11 @@ __bam_getbothc(dbc, data) */ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare, - &cmp)) != 0) + &cmp, NULL)) != 0) return (ret); if (cmp <= 0) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* Discard the current page, we're going to do a full search. */ if ((ret = __memp_fput(mpf, @@ -1791,7 +1820,7 @@ __bam_getbothc(dbc, data) */ if (cp->indx + P_INDX >= NUM_ENT(cp->page) || !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); cp->indx += P_INDX; return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH)); @@ -1842,7 +1871,7 @@ __bam_getlte(dbc, key, data) /* Check if we're still on the correct key */ if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx, - ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0) + ((BTREE*)dbp->bt_internal)->bt_compare, &exact, NULL)) != 0) goto end; exact = (exact == 0); } @@ -1884,8 +1913,8 @@ __bam_getlte(dbc, key, data) if (data != NULL) { /* Check if we're still on the correct data */ if ((ret = __bam_cmp( - dbc, data, ocp->page, ocp->indx, - dbp->dup_compare, &exact)) != 0) + dbc, data, ocp->page, ocp->indx, + dbp->dup_compare, &exact, NULL)) != 0) goto end; exact = (exact == 0); } else @@ -1915,7 +1944,8 @@ __bam_getlte(dbc, key, data) else { /* Check if we're still on the correct data */ if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0) + cp->indx + O_INDX, dbp->dup_compare, + &exact, NULL)) != 0) goto end; exact = (exact == 0); } @@ -1982,7 +2012,7 @@ __bam_getboth_finddatum(dbc, data, flags) if (!IS_CUR_DELETED(dbc)) { if ((ret = __bam_cmp( dbc, data, cp->page, cp->indx + O_INDX, - __bam_defcmp, &cmp)) != 0) + __bam_defcmp, &cmp, NULL)) != 0) return (ret); if (cmp == 0) return (0); @@ -1992,7 +2022,8 @@ __bam_getboth_finddatum(dbc, data, flags) !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) break; } - return (DB_NOTFOUND); + + return (DBC_ERR(dbc, DB_NOTFOUND)); } /* @@ -2008,18 +2039,18 @@ __bam_getboth_finddatum(dbc, data, flags) break; if (base == (top - P_INDX)) { if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE)) return (0); cp->indx = top; - return DB_NOTFOUND; + return (DBC_ERR(dbc, DB_NOTFOUND)); } for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { cp->indx = base + ((lim >> 1) * P_INDX); if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp == 0) { /* @@ -2039,7 +2070,7 @@ __bam_getboth_finddatum(dbc, data, flags) /* No match found; if we're looking for an exact match, we're done. */ if (flags == DB_GET_BOTH) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* * Base is the smallest index greater than the data item, may be zero @@ -2049,7 +2080,7 @@ __bam_getboth_finddatum(dbc, data, flags) cp->indx = base; while (cp->indx < top && IS_CUR_DELETED(dbc)) cp->indx += P_INDX; - return (cp->indx < top ? 0 : DB_NOTFOUND); + return (cp->indx < top ? 0 : DBC_ERR(dbc, DB_NOTFOUND)); } /* @@ -2082,7 +2113,7 @@ split: ret = stack = 0; switch (flags) { case DB_CURRENT: if (F_ISSET(cp, C_DELETED)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_AFTER: case DB_BEFORE: @@ -2206,7 +2237,8 @@ split: ret = stack = 0; */ for (;; cp->indx += P_INDX) { if ((ret = __bam_cmp(dbc, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + cp->indx + O_INDX, dbp->dup_compare, + &cmp, NULL)) != 0) goto err; if (cmp < 0) { iiop = DB_BEFORE; @@ -2479,7 +2511,7 @@ __bamc_next(dbc, initial_move, deleted_okay) */ if (cp->indx >= NUM_ENT(cp->page)) { if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); if (ret != 0) @@ -2539,7 +2571,7 @@ __bamc_prev(dbc) if (cp->indx == 0) { if ((pgno = PREV_PGNO(cp->page)) == PGNO_INVALID) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); if (ret != 0) @@ -2711,11 +2743,11 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) if (h->next_pgno == PGNO_INVALID) { indx = NUM_ENT(h) - P_INDX; if ((ret = __bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp > 0) { if (FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); else indx += P_INDX; } @@ -2725,10 +2757,10 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) if (h->prev_pgno == PGNO_INVALID) { indx = 0; if ((ret = __bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); if (cmp <= 0) goto fast_hit; } @@ -2736,7 +2768,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) { DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX); if ((ret = __bam_cmp(dbc, key, h, indx, - t->bt_compare, &cmp)) != 0) + t->bt_compare, &cmp, NULL)) != 0) goto fast_miss; if (cmp == 0) @@ -2752,7 +2784,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp) indx = base; if (indx > 0 && indx < NUM_ENT(h)) { if (FLD_ISSET(sflags, SR_EXACT)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); goto fast_hit; } } @@ -3068,7 +3100,7 @@ __bam_opd_exists(dbc, pgno) if (NUM_ENT(h) == 0) ret = 0; else - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority); diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 37496b3f..a1ccef71 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -61,15 +61,18 @@ __bam_ditem(dbc, h, indx) PAGE *h; u_int32_t indx; { + BBLOB bl; BINTERNAL *bi; BKEYDATA *bk; DB *dbp; + db_seq_t blob_id; u_int32_t nbytes; int ret; db_indx_t *inp; dbp = dbc->dbp; inp = P_INP(dbp, h); + ret = 0; /* The page should already have been dirtied by our caller. */ DB_ASSERT(dbp->env, IS_DIRTY(h)); @@ -139,6 +142,13 @@ __bam_ditem(dbc, h, indx) dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0) return (ret); break; + case B_BLOB: + nbytes = BBLOB_SIZE; + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + if ((ret = __blob_del(dbc, blob_id)) != 0) + return (ret); + break; case B_KEYDATA: nbytes = BKEYDATA_SIZE(bk->len); break; @@ -241,7 +251,7 @@ __bam_dpages(dbc, use_top, flags) * single item deleted, and the rest of the pages are to be removed. * * Recno always has a stack to the root and __bam_merge operations - * may have unneeded items in the sack. We find the lowest page + * may have unneeded items in the stack. We find the lowest page * in the stack that has more than one record in it and start there. */ ret = 0; @@ -493,7 +503,9 @@ stop: done = 1; /* * __bam_pupdate -- - * Update parent key pointers up the tree. + * Update parent key pointers up the tree after putting a new key + * at the start of a leaf page. + * * * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *)); */ diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c index 5cf93d2e..2fb33be2 100644 --- a/src/btree/bt_method.c +++ b/src/btree/bt_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,7 +15,7 @@ static int __bam_set_bt_minkey __P((DB *, u_int32_t)); static int __bam_get_bt_compare - __P((DB *, int (**)(DB *, const DBT *, const DBT *))); + __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *))); static int __bam_get_bt_prefix __P((DB *, size_t(**)(DB *, const DBT *, const DBT *))); static int __bam_set_bt_prefix @@ -233,7 +233,7 @@ incompat: static int __bam_get_bt_compare(dbp, funcp) DB *dbp; - int (**funcp) __P((DB *, const DBT *, const DBT *)); + int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *)); { BTREE *t; @@ -251,13 +251,13 @@ __bam_get_bt_compare(dbp, funcp) * __bam_set_bt_compare -- * Set the comparison function. * - * PUBLIC: int __bam_set_bt_compare - * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + * PUBLIC: int __bam_set_bt_compare __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *))); */ int __bam_set_bt_compare(dbp, func) DB *dbp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); { BTREE *t; @@ -351,6 +351,13 @@ __bam_set_bt_compress(dbp, compress, decompress) return (EINVAL); } + /* Compression is incompatible with blob storage. */ + if (dbp->blob_threshold > 0) { + __db_errx(dbp->env, DB_STR("1198", + "compression cannot be used with blobs enabled.")); + return (EINVAL); + } + if (compress != 0 && decompress != 0) { t->bt_compress = compress; t->bt_decompress = decompress; diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c index 7be141c1..46a866d0 100644 --- a/src/btree/bt_open.c +++ b/src/btree/bt_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -44,6 +44,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/crypto.h" #include "dbinc/db_page.h" #include "dbinc/db_swap.h" @@ -119,6 +120,7 @@ __bam_metachk(dbp, name, btm) int ret; env = dbp->env; + ret = 0; /* * At this point, all we know is that the magic number is for a Btree. @@ -136,6 +138,7 @@ __bam_metachk(dbp, name, btm) return (DB_OLD_VERSION); case 8: case 9: + case 10: break; default: __db_errx(env, DB_STR_A("1009", @@ -269,6 +272,29 @@ __bam_metachk(dbp, name, btm) /* Set the page size. */ dbp->pgsize = btm->dbmeta.pagesize; + dbp->blob_threshold = btm->blob_threshold; + GET_BLOB_FILE_ID(env, btm, dbp->blob_file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB_SDB_ID(env, btm, dbp->blob_sdb_id, ret); + if (ret != 0) + return (ret); + /* Blob databases must be upgraded. */ + if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) { + __db_errx(env, DB_STR_A("1207", +"%s: databases that support blobs must be upgraded.", "%s"), + name); + return (EINVAL); + } +#ifndef HAVE_64BIT_TYPES + if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) { + __db_errx(env, DB_STR_A("1199", + "%s: blobs require 64 integer compiler support.", "%s"), + name); + return (DB_OPNOTSUP); + } +#endif + /* Copy the file's ID. */ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN); @@ -442,6 +468,9 @@ __bam_init_meta(dbp, meta, pgno, lsnp) meta->minkey = t->bt_minkey; meta->re_len = t->re_len; meta->re_pad = (u_int32_t)t->re_pad; + meta->blob_threshold = dbp->blob_threshold; + SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, BTMETA); + SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, BTMETA); #ifdef HAVE_PARTITION if ((part = dbp->p_internal) != NULL) { @@ -535,6 +564,12 @@ __bam_new_file(dbp, ip, txn, fhp, name) pginfo.type = dbp->type; pdbt.data = &pginfo; pdbt.size = sizeof(pginfo); + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids(dbp, txn, + &dbp->blob_file_id)) != 0) + return (ret); + + } if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0) return (ret); meta = (BTMETA *)buf; @@ -613,6 +648,12 @@ __bam_new_subdb(mdbp, dbp, ip, txn) meta = NULL; root = NULL; + if (dbp->blob_threshold) { + if ((ret = __blob_generate_dir_ids(dbp, txn, + &dbp->blob_sdb_id)) != 0) + return (ret); + } + if ((ret = __db_cursor(mdbp, ip, txn, &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0) return (ret); diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c index 13316181..5cd0ac12 100644 --- a/src/btree/bt_put.c +++ b/src/btree/bt_put.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -56,8 +56,8 @@ static int __bam_dup_check __P((DBC *, u_int32_t, static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t)); static int __bam_ovput __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *)); -static u_int32_t - __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t)); +static int __bam_partsize + __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t *)); /* * __bam_iitem -- @@ -71,18 +71,22 @@ __bam_iitem(dbc, key, data, op, flags) DBT *key, *data; u_int32_t op, flags; { + BBLOB bl, blob_buf; BKEYDATA *bk, bk_tmp; BTREE *t; BTREE_CURSOR *cp; DB *dbp; - DBT bk_hdr, tdbt; + DBT bk_hdr, blob_dbt, tdbt; DB_MPOOLFILE *mpf; ENV *env; + DB_LSN lsn; PAGE *h; db_indx_t cnt, indx; + off_t blob_size; + db_seq_t blob_id, new_blob_id; u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace; char tmp_ch; - int cmp, bigkey, bigdata, del, dupadjust; + int cmp, bigkey, bigdata, blobdata, del, dupadjust; int padrec, replace, ret, t_ret, was_deleted; COMPQUIET(cnt, 0); @@ -95,6 +99,7 @@ __bam_iitem(dbc, key, data, op, flags) h = cp->page; indx = cp->indx; del = dupadjust = replace = was_deleted = 0; + blobdata = 0; /* * Fixed-length records with partial puts: it's an error to specify @@ -112,8 +117,12 @@ __bam_iitem(dbc, key, data, op, flags) * longer than the fixed-length, and we never require less than * the fixed-length record size. */ - data_size = F_ISSET(data, DB_DBT_PARTIAL) ? - __bam_partsize(dbp, op, data, h, indx) : data->size; + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __bam_partsize( + dbp, op, data, h, indx, &data_size)) != 0) + return (ret); + } else + data_size = data->size; padrec = 0; if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { if (data_size > t->re_len) @@ -190,6 +199,13 @@ __bam_iitem(dbc, key, data, op, flags) } if (!F_ISSET(data, DB_DBT_STREAMING) && (padrec || F_ISSET(data, DB_DBT_PARTIAL))) { + /* Partial puts need to be handled in the blob functions. */ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? + O_INDX : 0)); + if (B_TYPE(bk->type) == B_BLOB) + goto dup_cmp; + } tdbt = *data; if ((ret = __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0) @@ -204,10 +220,10 @@ __bam_iitem(dbc, key, data, op, flags) * screwing up the duplicate sort order. We have to do this after * we build the real record so that we're comparing the real items. */ - if (op == DB_CURRENT && dbp->dup_compare != NULL) { +dup_cmp:if (op == DB_CURRENT && dbp->dup_compare != NULL) { if ((ret = __bam_cmp(dbc, data, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0), - dbp->dup_compare, &cmp)) != 0) + dbp->dup_compare, &cmp, NULL)) != 0) return (ret); if (cmp != 0) { __db_errx(env, DB_STR("1004", @@ -218,10 +234,30 @@ __bam_iitem(dbc, key, data, op, flags) /* * If the key or data item won't fit on a page, we'll have to store - * them on overflow pages. + * them on overflow pages. The exception is if we are inserting + * into an existing blob file, in that case it remains a blob + * file regardless of its new size. */ + if (op == DB_CURRENT) { + bk = GET_BKEYDATA( + dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (B_TYPE(bk->type) == B_BLOB) { + blobdata = 1; + bigdata = 0; + } else + bigdata = data_size > cp->ovflsize; + } else { + if (dbp->blob_threshold && + (dbp->blob_threshold <= data_size || + F_ISSET(data, DB_DBT_BLOB))) { + blobdata = 1; + bigdata = 0; + } else { + blobdata = 0; + bigdata = data_size > cp->ovflsize; + } + } needed = 0; - bigdata = data_size > cp->ovflsize; switch (op) { case DB_KEYFIRST: /* We're adding a new key and data pair. */ @@ -232,6 +268,8 @@ __bam_iitem(dbc, key, data, op, flags) needed += BKEYDATA_PSIZE(key->size); if (bigdata) needed += BOVERFLOW_PSIZE; + else if (blobdata) + needed += BBLOB_PSIZE; else needed += BKEYDATA_PSIZE(data_size); break; @@ -254,6 +292,8 @@ __bam_iitem(dbc, key, data, op, flags) indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); if (B_TYPE(bk->type) == B_KEYDATA) have_bytes = BKEYDATA_PSIZE(bk->len); + else if (B_TYPE(bk->type) == B_BLOB) + have_bytes = BBLOB_PSIZE; else have_bytes = BOVERFLOW_PSIZE; need_bytes = 0; @@ -263,6 +303,8 @@ __bam_iitem(dbc, key, data, op, flags) } if (bigdata) need_bytes += BOVERFLOW_PSIZE; + else if (blobdata) + need_bytes += BBLOB_PSIZE; else need_bytes += BKEYDATA_PSIZE(data_size); @@ -405,7 +447,8 @@ __bam_iitem(dbc, key, data, op, flags) * because we're going to immediately re-add the item into the * same slot. */ - if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { + if (bigdata || (B_TYPE(bk->type) != B_KEYDATA && + B_TYPE(bk->type) != B_BLOB)) { /* * If streaming, don't delete the overflow item, * just delete the item pointing to the overflow item. @@ -448,13 +491,65 @@ __bam_iitem(dbc, key, data, op, flags) bk_hdr.size = SSZA(BKEYDATA, data); ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), &bk_hdr, data); - } else if (replace) - ret = __bam_ritem(dbc, h, indx, data, 0); - else - ret = __db_pitem(dbc, h, indx, - BKEYDATA_SIZE(data->size), NULL, data); + } else if (replace) { + /* + * If updating a blob, replace the blob file with the + * new blob data and updated the blob db record. + */ + if (blobdata) { + memcpy(&bl, + P_ENTRY(dbp, h, indx), BBLOB_SIZE); + memset(&blob_dbt, 0, sizeof(DBT)); + blob_dbt.size = BBLOB_DSIZE; + if (F_ISSET(data, DB_DBT_BLOB_REC)) { + /* + * Replace the blob record with the + * blob record in the data DBT. + */ + blob_dbt.data = BBLOB_DATA(data->data); + } else { + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE( + dbp->env, bl, blob_size, ret); + if (ret != 0) + goto err; + if ((ret = __blob_repl( + dbc, data, blob_id, + &new_blob_id, &blob_size)) != 0) + goto err; + blob_dbt.data = BBLOB_DATA((&bl)); + SET_BLOB_ID(&bl, new_blob_id, BBLOB); + SET_BLOB_SIZE(&bl, blob_size, BBLOB); + } + ret = __bam_ritem( + dbc, h, indx, &blob_dbt, B_BLOB); + } else + ret = __bam_ritem(dbc, h, indx, data, 0); + } else + if (blobdata) { + new_blob_id = 0; + blob_size = 0; + if ((ret = __blob_put(dbc, data, + &new_blob_id, &blob_size, &lsn)) != 0) + goto err; + memset(&blob_buf, 0, BBLOB_SIZE); + blob_buf.type = B_BLOB; + blob_buf.len = BBLOB_DSIZE; + tdbt.data = &blob_buf; + tdbt.size = BBLOB_SIZE; + SET_BLOB_ID(&blob_buf, new_blob_id, BBLOB); + SET_BLOB_SIZE(&blob_buf, blob_size, BBLOB); + SET_BLOB_FILE_ID( + &blob_buf, dbp->blob_file_id, BBLOB); + SET_BLOB_SDB_ID( + &blob_buf, dbp->blob_sdb_id, BBLOB); + ret = __db_pitem(dbc, h, + indx, BBLOB_SIZE, &tdbt, NULL); + } else + ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(data->size), NULL, data); } - if (ret != 0) { +err: if (ret != 0) { if (del == 1 && (t_ret = __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) { __db_err(env, t_ret, DB_STR("1005", @@ -504,32 +599,61 @@ __bam_iitem(dbc, key, data, op, flags) * __bam_partsize -- * Figure out how much space a partial data item is in total. */ -static u_int32_t -__bam_partsize(dbp, op, data, h, indx) +static int +__bam_partsize(dbp, op, data, h, indx, data_size) DB *dbp; u_int32_t op, indx; DBT *data; PAGE *h; + u_int32_t *data_size; { + BBLOB bl; BKEYDATA *bk; + int ret; + off_t blob_size; u_int32_t nbytes; + ret = 0; + /* * If the record doesn't already exist, it's simply the data we're * provided. */ - if (op != DB_CURRENT) - return (data->doff + data->size); + if (op != DB_CURRENT) { + *data_size = data->doff + data->size; + return (0); + } /* * Otherwise, it's the data provided plus any already existing data * that we're not replacing. */ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); - nbytes = - B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len; + switch (B_TYPE(bk->type)) { + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + GET_BLOB_SIZE(dbp->env, bl, blob_size, ret); + if (ret != 0) + return (ret); + /* + * It is not possible to add data past UINT32_MAX in the + * partial API, so this is safe. + */ + if (blob_size > UINT32_MAX) + nbytes = UINT32_MAX; + else + nbytes = (u_int32_t)blob_size; + break; + case B_OVERFLOW: + nbytes = ((BOVERFLOW *)bk)->tlen; + break; + default: + nbytes = bk->len; + } - return (__db_partsize(nbytes, data)); + *data_size = __db_partsize(nbytes, data); + + return (ret); } /* @@ -848,6 +972,7 @@ __bam_irep(dbc, h, indx, hdr, data) bi = GET_BINTERNAL(dbp, h, indx); bn = (BINTERNAL *) hdr->data; + DB_ASSERT(dbc->env, B_TYPE(bi->type) != B_BLOB); if (B_TYPE(bi->type) == B_OVERFLOW && (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) return (ret); @@ -892,6 +1017,7 @@ __bam_dup_check(dbc, op, h, indx, sz, cntp) /* Count the key once. */ bk = GET_BKEYDATA(dbp, h, indx); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); sz += B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; @@ -994,6 +1120,7 @@ __bam_dup_convert(dbc, h, indx, cnt) * overflow, then free up those pages). */ bk = GET_BKEYDATA(dbp, h, dindx + 1); + DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB); hdr.data = bk; hdr.size = B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c index 026564b6..eb44d04b 100644 --- a/src/btree/bt_rec.c +++ b/src/btree/bt_rec.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c index f465cc5a..1203ea35 100644 --- a/src/btree/bt_reclaim.c +++ b/src/btree/bt_reclaim.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c index 9356a742..abbd8efb 100644 --- a/src/btree/bt_recno.c +++ b/src/btree/bt_recno.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -234,7 +234,7 @@ __ramc_del(dbc, flags) retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0) goto err; if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } stack = 1; @@ -256,7 +256,7 @@ retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0) * if the record was "deleted", we could never have found it. */ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) { - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } @@ -391,7 +391,7 @@ retry: switch (flags) { * a dup, so we set flags to DB_NEXT and keep going. */ if (!F_ISSET(dbc, DBC_OPD)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_NEXT_NODUP: /* @@ -431,7 +431,7 @@ retry: switch (flags) { * is a dup, so we set flags to DB_PREV and keep going. */ if (!F_ISSET(dbc, DBC_OPD)) - return (DB_NOTFOUND); + return (DBC_ERR(dbc, DB_NOTFOUND)); /* FALLTHROUGH */ case DB_PREV_NODUP: /* @@ -443,7 +443,7 @@ retry: switch (flags) { flags = DB_PREV; if (cp->recno != RECNO_OOB) { if (cp->recno == 1) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } --cp->recno; @@ -458,7 +458,7 @@ retry: switch (flags) { if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) goto err; if (cp->recno == 0) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } break; @@ -476,7 +476,7 @@ retry: switch (flags) { cp->recno++; break; } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; /* NOTREACHED */ case DB_GET_BOTH: @@ -522,7 +522,7 @@ retry: switch (flags) { 1, &exact)) != 0) goto err; if (!exact) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -561,22 +561,22 @@ retry: switch (flags) { (void)__bam_stkrel(dbc, STK_CLRDBC); continue; } - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; default: - ret = DB_KEYEMPTY; + ret = DBC_ERR(dbc, DB_KEYEMPTY); goto err; } if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) { if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, - __bam_defcmp, &cmp)) != 0) + __bam_defcmp, &cmp, NULL)) != 0) return (ret); if (cmp == 0) break; if (!F_ISSET(dbc, DBC_OPD)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } (void)__bam_stkrel(dbc, STK_CLRDBC); @@ -1331,7 +1331,7 @@ __ram_sread(dbc, top) if (0) { eof: t->re_eof = 1; - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); } err: if (!was_modified) t->re_modified = 0; @@ -1368,7 +1368,7 @@ retry: /* Find the slot for insertion. */ if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) && !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) { - ret = DB_KEYEXIST; + ret = DBC_ERR(dbc, DB_KEYEXIST); goto err; } diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c index 36d1c667..4ada6e2d 100644 --- a/src/btree/bt_rsearch.c +++ b/src/btree/bt_rsearch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -147,7 +147,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) __TLPUT(dbc, lock)) != 0 && ret == 0) ret = t_ret; if (ret == 0) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto done; } } @@ -197,7 +197,8 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) lock)) != 0 && ret == 0) ret = t_ret; if (ret == 0) - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, + DB_NOTFOUND); goto err; } } diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c index e809a852..e3d69d16 100644 --- a/src/btree/bt_search.c +++ b/src/btree/bt_search.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -51,8 +51,9 @@ /* * __bam_get_root -- - * Fetch the root of a tree and see if we want to keep - * it in the stack. + * Try to appropriately lock and fetch the root page of a tree; + * if successful enter it into the cursor's stack; on error, leave the stack + * unchanged. * * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *)); */ @@ -232,9 +233,11 @@ retry: if (lock_mode == DB_LOCK_WRITE) } else if (atomic_read(&mpf->mfp->multiversion) != 0 && lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) { - (void)__memp_fput(mpf, - dbc->thread_info, h, dbc->priority); + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); (void)__LPUT(dbc, lock); + return (ret); } } @@ -272,9 +275,10 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) db_recno_t recno; int adjust, cmp, deloffset, ret, set_stack, stack, t_ret; int getlock, was_next; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); u_int32_t get_mode, wait; u_int8_t level, saved_level; + size_t pos, pos_h, pos_l; if (F_ISSET(dbc, DBC_OPD)) LOCK_CHECK_OFF(dbc->thread_info); @@ -288,6 +292,7 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp) t = dbp->bt_internal; recno = 0; t_ret = 0; + func = NULL; BT_STK_CLR(cp); LOCK_INIT(saved_lock); @@ -339,11 +344,17 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) BT_STK_CLR(cp); - /* Choose a comparison function. */ + /* + * Choose a comparison function. + * We apply the prefix search optimization only when there + * is no user-specific comparsion function set. + */ func = F_ISSET(dbc, DBC_OPD) ? (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : t->bt_compare; + pos_h = 0; + pos_l = 0; for (;;) { if (TYPE(h) == P_LBTREE) adjust = P_INDX; @@ -389,9 +400,11 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) * match on a leaf page, we're done. */ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) { + /* We compare from the common prefix */ + pos = pos_l > pos_h ? pos_h : pos_l; DB_BINARY_SEARCH_INCR(indx, base, lim, adjust); if ((ret = __bam_cmp(dbc, key, h, indx, - func, &cmp)) != 0) + func, &cmp, &pos)) != 0) goto err; if (cmp == 0) { if (LEVEL(h) == LEAFLEVEL || @@ -403,9 +416,19 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) } goto next; } - if (cmp > 0) + /* + * We have to maintain the offset in the keys where + * we begin comparing for both ends of the key range + * in which we are binary searching. So, update either + * the high or low position here, depending on how + * the comparison turned out. + */ + if (cmp > 0) { DB_BINARY_SEARCH_SHIFT_BASE(indx, base, lim, adjust); + pos_l = pos; + } else + pos_h = pos; } /* @@ -421,7 +444,7 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0) *exactp = 0; if (LF_ISSET(SR_EXACT)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } @@ -444,13 +467,13 @@ get_next: /* * at the root if the tree recently collapsed. */ if (PGNO(h) == root_pgno) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } indx = cp->sp->indx + 1; if (indx == NUM_ENT(cp->sp->page)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); cp->csp++; goto err; } @@ -863,7 +886,7 @@ found: *exactp = 1; * DB_NOTFOUND. */ if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) { - ret = DB_NOTFOUND; + ret = DBC_ERR(dbc, DB_NOTFOUND); goto err; } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8299c69a..f7719dc4 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 1990, 1993, 1994, 1995, 1996 @@ -63,7 +63,7 @@ __bam_split(dbc, arg, root_pgnop) db_pgno_t *root_pgnop; { BTREE_CURSOR *cp; - DB_LOCK metalock, next_lock; + DB_LOCK meta_lock, next_lock; enum { UP, DOWN } dir; db_pgno_t pgno, next_pgno, root_pgno; int exact, level, ret; @@ -72,17 +72,16 @@ __bam_split(dbc, arg, root_pgnop) LOCK_CHECK_OFF(dbc->thread_info); cp = (BTREE_CURSOR *)dbc->internal; + LOCK_INIT(meta_lock); LOCK_INIT(next_lock); next_pgno = PGNO_INVALID; /* - * First get a lock on the metadata page, we will have to allocate + * First get a lock on the metadata page; we will have to allocate * pages and cannot get a lock while we have the search tree pinned. */ - pgno = PGNO_BASE_MD; - if ((ret = __db_lget(dbc, - 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0) goto err; root_pgno = BAM_ROOT_PGNO(dbc); @@ -189,7 +188,7 @@ no_split: /* Once we've split the leaf page, we're done. */ if (root_pgnop != NULL) *root_pgnop = BAM_ROOT_PGNO(dbc); err: -done: (void)__LPUT(dbc, metalock); +done: (void)__LPUT(dbc, meta_lock); (void)__TLPUT(dbc, next_lock); if (F_ISSET(dbc, DBC_OPD)) @@ -685,6 +684,7 @@ __bam_broot(dbc, rootp, split, lp, rp) DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -772,7 +772,30 @@ __ram_root(dbc, rootp, lp, rp) /* * __bam_pinsert -- - * Insert a new key into a parent page, completing the split. + * + * Construct a internal index item and place it in the parent page. It is + * primarily used by __bam_page() to add a new page into the tree. The sole + * other use is by __bam_pupdate() after a reverse split or compact has + * removed pages underneath it, in order to replace the parent's key/nrecs + * to match the new subtree. + * + * Parameters: + * parent - the page from the cursor stack to be modifed. The next entry + * in the stack (i.e., the next lower level in the tree) contains + * the key of the new item. The indx field must have been set + * when searching down the tree, to point to the new/replaced + * parent item. + * split - the indx in the cursor stack of the 'source' of the new item. + * lchild - the left child page is used *only* when attempting to use + * prefix key compression on a leaf (data) page. + * rchild - right child page. The source of the pgno of the new item. + * flags - BPI_REPLACE | BPI_NORENCUM + * BPI_NOLOGGING + * + * The pgno of the item always comes from rchild, which often is the same + * as parent[1].page. The key for DB_BTREE comes from the next lower page + * in the stack under parent, not from either lchild or rchild parameter -- + * though often rchild is a copy of parent[1].page. * * PUBLIC: int __bam_pinsert * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int)); @@ -867,12 +890,27 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags) size = BINTERNAL_SIZE(child_bi->len); break; case B_OVERFLOW: - /* Reuse the overflow key. */ + /* Copy the overflow key. */ child_bo = (BOVERFLOW *)child_bi->data; memset(&bo, 0, sizeof(bo)); bo.type = B_OVERFLOW; bo.tlen = child_bo->tlen; - bo.pgno = child_bo->pgno; + if (LF_ISSET(BPI_REPLACE)) { + /* + * Replace (compact or reverse split) needs to + * copy in case the data item gets removed. + */ + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, + child_bo->tlen, child_bo->pgno, + &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + } else + bo.pgno = child_bo->pgno; bi.len = BOVERFLOW_SIZE; B_TSET(bi.type, B_OVERFLOW); bi.pgno = rchild->pgno; @@ -881,6 +919,7 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags) DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); size = BINTERNAL_SIZE(BOVERFLOW_SIZE); break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -982,8 +1021,8 @@ noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); size = BINTERNAL_SIZE(BOVERFLOW_SIZE); - break; + case B_BLOB: case B_DUPLICATE: default: goto pgfmt; @@ -1153,23 +1192,32 @@ __bam_psplit(dbc, cp, lp, rp, splitret) nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); break; case P_LBTREE: - if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == - B_KEYDATA) - nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, off)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) { + case B_KEYDATA: + nbytes += BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, off)->len); + break; + case B_BLOB: + nbytes += BBLOB_SIZE; + break; + default: nbytes += BOVERFLOW_SIZE; - + } ++off; /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == - B_KEYDATA) - nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, off)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) { + case B_KEYDATA: + nbytes += BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, off)->len); + break; + case B_BLOB: + nbytes += BBLOB_SIZE; + break; + default: nbytes += BOVERFLOW_SIZE; + } break; case P_IRECNO: nbytes += RINTERNAL_SIZE; @@ -1269,7 +1317,7 @@ __bam_copy(dbp, pp, cp, nxt, stop) PAGE *pp, *cp; u_int32_t nxt, stop; { - BINTERNAL internal; + BINTERNAL *bi, internal; db_indx_t *cinp, nbytes, off, *pinp; cinp = P_INP(dbp, cp); @@ -1302,12 +1350,17 @@ __bam_copy(dbp, pp, cp, nxt, stop) /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) == - B_KEYDATA) - nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp, - pp, nxt)->len); - else + switch (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type)) { + case B_KEYDATA: + nbytes = BKEYDATA_SIZE( + GET_BKEYDATA(dbp, pp, nxt)->len); + break; + case B_BLOB: + nbytes = BBLOB_SIZE; + break; + default: nbytes = BOVERFLOW_SIZE; + } break; case P_IRECNO: nbytes = RINTERNAL_SIZE; @@ -1316,17 +1369,18 @@ __bam_copy(dbp, pp, cp, nxt, stop) return (__db_pgfmt(dbp->env, pp->pgno)); } cinp[off] = HOFFSET(cp) -= nbytes; + /* Minimize the first key on an IBTREE page; it isn't valid. */ + bi = GET_BINTERNAL(dbp, pp, nxt); if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) { internal.len = 0; UMRW_SET(internal.unused); internal.type = B_KEYDATA; - internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno; - internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs; + internal.pgno = bi->pgno; + internal.nrecs = bi->nrecs; memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes); } else - memcpy(P_ENTRY(dbp, cp, off), - P_ENTRY(dbp, pp, nxt), nbytes); + memcpy(P_ENTRY(dbp, cp, off), bi, nbytes); } return (0); } diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 668c4fdb..04c0fbcb 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -278,6 +278,8 @@ __bam_stat_print(dbc, flags) "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad); } __db_dl(env, + "Number of pages in the database", (u_long)sp->bt_pagecnt); + __db_dl(env, "Underlying database page size", (u_long)sp->bt_pagesize); if (dbp->type == DB_BTREE) __db_dl(env, "Overflow key/data size", @@ -288,6 +290,10 @@ __bam_stat_print(dbc, flags) "Number of records in the tree", (u_long)sp->bt_nkeys); __db_dl(env, "Number of data items in the tree", (u_long)sp->bt_ndata); + if (dbp->type == DB_BTREE) { + __db_dl(env, + "Number of blobs in the tree", (u_long)sp->bt_nblobs); + } __db_dl(env, "Number of tree internal pages", (u_long)sp->bt_int_pg); @@ -372,6 +378,10 @@ __bam_stat_callback(dbc, h, cookie, putp) /* Ignore off-page duplicates. */ if (B_TYPE(type) != B_DUPLICATE) ++sp->bt_ndata; + + /* Count blobs. */ + if (B_TYPE(type) == B_BLOB) + ++sp->bt_nblobs; } ++sp->bt_leaf_pg; diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c index c9123351..66e27d56 100644 --- a/src/btree/bt_upgrade.c +++ b/src/btree/bt_upgrade.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_upgrade.h" #include "dbinc/btree.h" @@ -151,3 +152,94 @@ __bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp) return (ret); } + +/* + * __bam_60_btreemeta-- + * Upgrade the version number. + * + * PUBLIC: int __bam_60_btreemeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_60_btreemeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BTMETA33 *bmeta; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + COMPQUIET(dbp, NULL); + bmeta = (BTMETA33 *)h; + + bmeta->dbmeta.version = 10; + *dirtyp = 1; + + return (0); +} + +/* + * __bam_60_lbtree -- + * Upgrade the blob records on the database btree leaf pages. + * + * PUBLIC: int __bam_60_lbtree + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_60_lbtree(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BBLOB60 bl60; + BBLOB60P1 bl60p1; + BKEYDATA *bk; + db_seq_t blob_id, blob_size, file_id, sdb_id; + db_indx_t indx; + int ret; + + COMPQUIET(flags, 0); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + ret = 0; + + DB_ASSERT(dbp->env, BBLOB60_SIZE == BBLOB_SIZE); + for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_BLOB ) { + memcpy(&bl60, bk, BBLOB60_SIZE); + memset(&bl60p1, 0, BBLOB_SIZE); + bl60p1.type = bl60.type; + bl60p1.len = BBLOB_DSIZE; + bl60p1.encoding = bl60.encoding; + GET_BLOB60_ID(dbp->env, bl60, blob_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SIZE(dbp->env, bl60, blob_size, ret); + if (ret != 0) + return (ret); + GET_BLOB60_FILE_ID(dbp->env, &bl60, file_id, ret); + if (ret != 0) + return (ret); + GET_BLOB60_SDB_ID(dbp->env, &bl60, sdb_id, ret); + if (ret != 0) + return (ret); + SET_BLOB_ID(&bl60p1, blob_id, BBLOB60P1); + SET_BLOB_SIZE(&bl60p1, blob_size, BBLOB60P1); + SET_BLOB_FILE_ID(&bl60p1, file_id, BBLOB60P1); + SET_BLOB_SDB_ID(&bl60p1, sdb_id, BBLOB60P1); + memcpy(bk, &bl60p1, BBLOB_SIZE); + *dirtyp = 1; + } + } + + return (ret); +} diff --git a/src/btree/bt_verify.c b/src/btree/bt_verify.c index 99354a58..8ceb50e6 100644 --- a/src/btree/bt_verify.c +++ b/src/btree/bt_verify.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -9,6 +9,7 @@ #include "db_config.h" #include "db_int.h" +#include "dbinc/blob.h" #include "dbinc/db_page.h" #include "dbinc/db_verify.h" #include "dbinc/btree.h" @@ -20,8 +21,8 @@ static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *, static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, db_indx_t *, u_int32_t)); static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *, - BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), - u_int32_t)); + BINTERNAL *, BINTERNAL *, + int (*)(DB *, const DBT *, const DBT *, size_t *), u_int32_t)); static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, db_indx_t *, u_int32_t)); @@ -44,6 +45,7 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) VRFY_PAGEINFO *pip; int isbad, t_ret, ret; db_indx_t ovflsize; + db_seq_t blob_id; env = dbp->env; isbad = 0; @@ -201,6 +203,56 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) "%lu %lu"), (u_long)pgno, (u_long)pip->re_len)); } +/* + * Where 64-bit integer support is not available, + * return an error if the file has any blobs. + */ + t_ret = 0; +#ifdef HAVE_64BIT_TYPES + GET_BLOB_FILE_ID(env, meta, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1187", + "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + t_ret = 0; + GET_BLOB_SDB_ID(env, meta, blob_id, t_ret); + if (t_ret != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1188", + "Page %lu: blob subdatabase id overflow.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#else /* HAVE_64BIT_TYPES */ + /* + * db_seq_t is an int on systems that do not have 64 integers, so + * this will compile and run. + */ + GET_BLOB_FILE_ID(env, meta, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1200", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } + t_ret = 0; + GET_BLOB_SDB_ID(env, meta, blob_id, t_ret); + if (t_ret != 0 || blob_id != 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1201", + "Page %lu: blobs require 64 integer compiler support.", + "%lu"), (u_long)pgno)); + if (ret == 0) + ret = t_ret; + } +#endif + /* * We do not check that the rest of the page is 0, because it may * not be and may still be correct. @@ -268,8 +320,7 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) if (F_ISSET(pip, VRFY_HAS_DUPS)) { EPRINT((env, DB_STR_A("1043", - "Page %lu: Recno database has dups", - "%lu"), (u_long)pgno)); + "Page %lu: Recno database has dups", "%lu"), (u_long)pgno)); ret = DB_VERIFY_BAD; goto err; } @@ -547,12 +598,15 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) db_indx_t *nentriesp; u_int32_t flags; { + BBLOB bl; BKEYDATA *bk; BOVERFLOW *bo; ENV *env; VRFY_CHILDINFO child; VRFY_ITEM *pagelayout; VRFY_PAGEINFO *pip; + off_t blob_size; + db_seq_t blob_id, file_id, sdb_id; u_int32_t himark, offset; /* * These would be db_indx_ts * but for alignment. @@ -563,6 +617,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) env = dbp->env; isbad = isdupitem = 0; nentries = 0; + file_id = sdb_id = 0; memset(&child, 0, sizeof(VRFY_CHILDINFO)); if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -668,6 +723,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) else endoff = offset + BKEYDATA_SIZE(bk->len) - 1; break; + case B_BLOB: + endoff = offset + BBLOB_SIZE - 1; + break; case B_DUPLICATE: /* * Flag that we have dups; we'll check whether @@ -731,6 +789,52 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * already been done. */ break; + case B_BLOB: + if (TYPE(h) == P_IBTREE) { + isbad = 1; + EPRINT((env, DB_STR_A("1189", + "Page %lu: blob item in internal btree page at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } else if (TYPE(h) == P_LRECNO) { + isbad = 1; + EPRINT((env, DB_STR_A("1190", + "Page %lu: blob item referenced by recno page at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } + /* + * Blob item. Check that the blob file exists and is + * the same file size as is stored in the database + * record. + */ + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0 || blob_size < 0) { + isbad = 1; + EPRINT((env, DB_STR_A("1192", + "Page %lu: blob file size value has overflowed at item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); + break; + } + file_id = (db_seq_t)bl.file_id; + sdb_id = (db_seq_t)bl.sdb_id; + if (file_id == 0 && sdb_id == 0) { + isbad = 1; + EPRINT((dbp->env, DB_STR_A("1195", + "Page %lu: invalid blob dir ids %llu %llu at item %lu", + "%lu %ll %ll %lu"), (u_long)pip->pgno, + (long long)file_id, + (long long)sdb_id, (u_long)i)); + break; + } + if ((ret = __blob_vrfy(env, blob_id, + blob_size, file_id, sdb_id, pgno, flags)) != 0) { + isbad = 1; + break; + } + break; case B_DUPLICATE: if (TYPE(h) == P_IBTREE) { isbad = 1; @@ -751,9 +855,17 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) (BOVERFLOW *)(((BINTERNAL *)bk)->data) : (BOVERFLOW *)bk; - if (B_TYPE(bk->type) == B_OVERFLOW) + if (B_TYPE(bk->type) == B_OVERFLOW) { + if (TYPE(h) == P_IBTREE && + bk->len != BOVERFLOW_SIZE) { + EPRINT((env, DB_STR_A("1196", + "Page %lu: bad length %u in B_OVERFLOW item %lu", + "%lu %u %lu"), + (u_long)pgno, bk->len, (u_long)i)); + isbad = 1; + } /* Make sure tlen is reasonable. */ - if (bo->tlen > dbp->pgsize * vdp->last_pgno) { + if (bo->tlen >= dbp->pgsize * vdp->last_pgno) { isbad = 1; EPRINT((env, DB_STR_A("1056", "Page %lu: impossible tlen %lu, item %lu", @@ -762,6 +874,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) /* Don't save as a child. */ break; } + } if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || bo->pgno == PGNO_INVALID) { @@ -918,8 +1031,8 @@ __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags) VRFY_PAGEINFO *pip; db_indx_t i, *inp; int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret; - int (*dupfunc) __P((DB *, const DBT *, const DBT *)); - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*dupfunc) __P((DB *, const DBT *, const DBT *, size_t *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); void *buf1, *buf2, *tmpbuf; /* @@ -1066,6 +1179,11 @@ retry: p1 = &dbta; if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; goto overflow; + } else if (B_TYPE(bk->type) == B_BLOB) { + isbad = 1; + EPRINT((env, DB_STR_A("1197", + "Page %lu: Blob found in key item %lu", + "%lu %lu"), (u_long)pgno, (u_long)i)); } else { p2->data = bk->data; p2->size = bk->len; @@ -1124,7 +1242,8 @@ overflow: if (!ovflok) { /* Compare with the last key. */ if (p1->data != NULL && p2->data != NULL) { - cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2); + cmp = inp[i] == inp[i - adj] ? 0 : + func(dbp, p1, p2, NULL); /* comparison succeeded */ if (cmp > 0) { @@ -1236,8 +1355,8 @@ overflow: if (!ovflok) { * until we do the structure check * and see whether DUPSORT is set. */ - if (dupfunc(dbp, &dup_1, &dup_2) > 0 && - pip != NULL) + if (dupfunc(dbp, &dup_1, &dup_2, + NULL) > 0 && pip != NULL) F_SET(pip, VRFY_DUPS_UNSORTED); if (freedup_1) @@ -1409,7 +1528,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) db_recno_t child_nrecs, nrecs; u_int32_t child_level, child_relen, j, level, relen, stflags; u_int8_t leaf_type; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); int isbad, p, ret, t_ret, toplevel; if (levelp != NULL) /* Don't leave uninitialized on error. */ @@ -1524,7 +1643,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) * Don't do the prev/next_pgno checks if we've lost * leaf pages due to another corruption. */ - if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) { + if (!F_ISSET(vdp, SALVAGE_LEAFCHAIN_BROKEN)) { if (pip->pgno != vdp->next_pgno) { isbad = 1; EPRINT((env, DB_STR_A("1075", @@ -1547,7 +1666,7 @@ bad_prev: isbad = 1; } vdp->prev_pgno = pip->pgno; vdp->next_pgno = pip->next_pgno; - F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN); + F_CLR(vdp, SALVAGE_LEAFCHAIN_BROKEN); /* * Overflow pages are common to all three leaf types; @@ -1694,7 +1813,7 @@ bad_prev: isbad = 1; * spew error messages about erroneous prev/next_pgnos, * since that's probably not the real problem. */ - F_SET(vdp, VRFY_LEAFCHAIN_BROKEN); + F_SET(vdp, SALVAGE_LEAFCHAIN_BROKEN); ret = DB_VERIFY_BAD; goto err; @@ -2042,7 +2161,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) DB_THREAD_INFO *ip; PAGE *h; BINTERNAL *lp, *rp; - int (*func) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *, size_t *)); u_int32_t flags; { BOVERFLOW *bo; @@ -2050,7 +2169,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) DBT dbt; ENV *env; db_indx_t last; - int ret, cmp; + int cmp, ret, t_ret; env = dbp->env; memset(&dbt, 0, sizeof(DBT)); @@ -2077,7 +2196,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) return (__db_unknown_path(env, "__bam_vrfy_treeorder")); } - /* Populate a dummy cursor. */ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) return (ret); @@ -2095,9 +2213,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) * parent and falsely report a failure.) */ if (lp != NULL && TYPE(h) != P_IBTREE) { - if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, - PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) - return (ret); if (lp->type == B_KEYDATA) { dbt.data = lp->data; dbt.size = lp->len; @@ -2105,13 +2220,13 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) bo = (BOVERFLOW *)lp->data; if ((ret = __db_goff(dbc, &dbt, bo->tlen, bo->pgno, NULL, NULL)) != 0) - return (ret); - } else - return ( - __db_unknown_path(env, "__bam_vrfy_treeorder")); + goto err; + } else { + ret = __db_unknown_path(env, "__bam_vrfy_treeorder"); + goto err; + } - /* On error, fall through, free if needed, and return. */ - if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) { + if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp, NULL)) == 0) { if (cmp > 0) { EPRINT((env, DB_STR_A("1092", "Page %lu: first item on page sorted greater than parent entry", @@ -2126,7 +2241,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) if (dbt.data != lp->data) __os_ufree(env, dbt.data); if (ret != 0) - return (ret); + goto err; } if (rp != NULL) { @@ -2137,13 +2252,14 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) bo = (BOVERFLOW *)rp->data; if ((ret = __db_goff(dbc, &dbt, bo->tlen, bo->pgno, NULL, NULL)) != 0) - return (ret); - } else - return ( - __db_unknown_path(env, "__bam_vrfy_treeorder")); + goto err; + } else { + ret = __db_unknown_path(env, "__bam_vrfy_treeorder"); + goto err; + } - /* On error, fall through, free if needed, and return. */ - if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) { + if ((ret = __bam_cmp(dbc, + &dbt, h, last, func, &cmp, NULL)) == 0) { if (cmp < 0) { EPRINT((env, DB_STR_A("1094", "Page %lu: last item on page sorted greater than parent entry", @@ -2158,6 +2274,9 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) if (dbt.data != rp->data) __os_ufree(env, dbt.data); } +err: + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -2186,14 +2305,20 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) { BKEYDATA *bk; BOVERFLOW *bo; + BBLOB bl; DBT dbt, repldbt, unknown_key, unknown_data; ENV *env; VRFY_ITEM *pgmap; db_indx_t i, last, beg, end, *inp; db_pgno_t ovflpg; + off_t blob_size, blob_offset, remaining; + u_int32_t blob_buf_size; + u_int8_t *blob_buf; u_int32_t himark, ovfl_bufsz; + db_seq_t blob_id, file_id, sdb_id; void *ovflbuf; int adj, ret, t_ret, t2_ret; + char *prefix; #ifdef HAVE_COMPRESSION DBT kcpy, *last_key; int unknown_dup_key; @@ -2202,6 +2327,8 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) env = dbp->env; ovflbuf = pgmap = NULL; inp = P_INP(dbp, h); + blob_buf_size = 0; + blob_buf = NULL; memset(&dbt, 0, sizeof(DBT)); dbt.flags = DB_DBT_REALLOC; @@ -2543,6 +2670,68 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) } #endif break; + case B_BLOB: + memcpy(&bl, bk, BBLOB_SIZE); + blob_id = (db_seq_t)bl.id; + GET_BLOB_SIZE(env, bl, blob_size, ret); + if (ret != 0 || blob_size < 0) + goto err; + file_id = (db_seq_t)bl.file_id; + sdb_id = (db_seq_t)bl.sdb_id; + + /* Read the blob, in pieces if it is too large.*/ + blob_offset = 0; + if (blob_size > MEGABYTE) { + if (blob_buf_size < MEGABYTE) { + if ((ret = __os_realloc( + env, MEGABYTE, &blob_buf)) != 0) + goto err; + blob_buf_size = MEGABYTE; + } + } else if (blob_buf_size < blob_size) { + blob_buf_size = (u_int32_t)blob_size; + if ((ret = __os_realloc(env, + blob_buf_size, &blob_buf)) != 0) + goto err; + } + dbt.data = blob_buf; + dbt.ulen = blob_buf_size; + remaining = blob_size; + prefix = " "; + do { + if ((ret = __blob_salvage(env, blob_id, + blob_offset, + ((remaining < blob_buf_size) ? + (size_t)remaining : blob_buf_size), + file_id, sdb_id, &dbt)) != 0) { + if (LF_ISSET(DB_AGGRESSIVE)) { + ret = DB_VERIFY_BAD; + break; + } + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + if (remaining > blob_buf_size) + F_SET(vdp, SALVAGE_STREAM_BLOB); + else + F_CLR(vdp, SALVAGE_STREAM_BLOB); + if ((t_ret = __db_vrfy_prdbt( + &dbt, 0, prefix, + handle, callback, 0, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + F_CLR(vdp, SALVAGE_STREAM_BLOB); + goto err; + } + prefix = NULL; + blob_offset += dbt.size; + if (remaining < blob_buf_size) + remaining = 0; + else + remaining -= blob_buf_size; + } while (remaining > 0); + F_CLR(vdp, SALVAGE_STREAM_BLOB); + break; default: /* * We should never get here; __db_vrfy_inpitem should @@ -2572,6 +2761,8 @@ err: if (pgmap != NULL) __os_free(env, ovflbuf); if (repldbt.data != NULL) __os_free(env, repldbt.data); + if (blob_buf != NULL) + __os_free(env, blob_buf); #ifdef HAVE_COMPRESSION if (kcpy.data != NULL) __os_free(env, kcpy.data); diff --git a/src/btree/btree.src b/src/btree/btree.src index 08e5a206..02088b88 100644 --- a/src/btree/btree.src +++ b/src/btree/btree.src @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ |