summary refs log tree commit diff
path: root/src/btree
diff options
context:
space:
mode:
author Lorry Tar Creator <lorry-tar-importer@baserock.org> 2015-02-17 17:25:57 +0000
committer <> 2015-03-17 16:26:24 +0000
commit 780b92ada9afcf1d58085a83a0b9e6bc982203d1 (patch)
tree 598f8b9fa431b228d29897e798de4ac0c1d3d970 /src/btree
parent 7a2660ba9cc2dc03a69ddfcfd95369395cc87444 (diff)
download berkeleydb-master.tar.gz
Imported from /home/lorry/working-area/delta_berkeleydb/db-6.1.23.tar.gz. HEAD db-6.1.23 master
Diffstat (limited to 'src/btree')
-rw-r--r-- src/btree/bt_compact.c 239
-rw-r--r-- src/btree/bt_compare.c 105
-rw-r--r-- src/btree/bt_compress.c 72
-rw-r--r-- src/btree/bt_conv.c 9
-rw-r--r-- src/btree/bt_curadj.c 2
-rw-r--r-- src/btree/bt_cursor.c 94
-rw-r--r-- src/btree/bt_delete.c 18
-rw-r--r-- src/btree/bt_method.c 19
-rw-r--r-- src/btree/bt_open.c 43
-rw-r--r-- src/btree/bt_put.c 177
-rw-r--r-- src/btree/bt_rec.c 2
-rw-r--r-- src/btree/bt_reclaim.c 2
-rw-r--r-- src/btree/bt_recno.c 30
-rw-r--r-- src/btree/bt_rsearch.c 7
-rw-r--r-- src/btree/bt_search.c 49
-rw-r--r-- src/btree/bt_split.c 118
-rw-r--r-- src/btree/bt_stat.c 12
-rw-r--r-- src/btree/bt_upgrade.c 94
-rw-r--r-- src/btree/bt_verify.c 261
-rw-r--r-- src/btree/btree.src 2
20 files changed, 1040 insertions, 315 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index b455ff23..be4c6b01 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,13 +22,16 @@ static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
static int __bam_merge __P((DBC *,
- DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
-static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
+ DBC *, u_int32_t, DBT *, DB_COMPACT *, int *, int *));
+static int __bam_merge_internal __P((DBC *,
+ DBC *, int, DB_COMPACT *, int *, int *));
static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
-static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
-static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
+static int __bam_merge_records __P((DBC *,
+ DBC *, u_int32_t, DB_COMPACT *, int *));
+static int __bam_truncate_internal_overflow __P((DBC *,
+ PAGE *, DB_COMPACT *, int *));
static int __bam_truncate_root_page __P((DBC *,
- PAGE *, u_int32_t, DB_COMPACT *));
+ PAGE *, u_int32_t, DB_COMPACT *, int *));
#ifdef HAVE_FTRUNCATE
static int __bam_savekey __P((DBC *, int, DBT *));
@@ -145,13 +148,13 @@ __bam_csearch(dbc, start, sflag, level)
* PUBLIC: DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
*/
int
-__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
+__bam_compact_int(dbc, start, stop, factor, spanp, c_data, isdonep)
DBC *dbc;
DBT *start, *stop;
u_int32_t factor;
int *spanp;
DB_COMPACT *c_data;
- int *donep;
+ int *isdonep;
{
BTREE_CURSOR *cp, *ncp;
DB *dbp;
@@ -168,7 +171,7 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
int check_dups, check_trunc, clear_root, do_commit, isdone;
int merged, next_p, pgs_done, ret, t_ret, tdone;
-#ifdef DEBUG
+#ifdef DEBUG_WOP
#define CTRACE(dbc, location, t, start, f) do { \
DBT __trace; \
DB_SET_DBT(__trace, t, strlen(t)); \
@@ -182,8 +185,8 @@ __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
CTRACE(dbc, location, __buf, start, f); \
} while (0)
#else
-#define CTRACE(dbc, location, t, start, f)
-#define PTRACE(dbc, location, p, start, f)
+#define CTRACE(dbc, location, t, start, f) NOP_STATEMENT
+#define PTRACE(dbc, location, p, start, f) NOP_STATEMENT
#endif
ndbc = NULL;
@@ -551,11 +554,10 @@ retry: pg = NULL;
if (ret != 0)
goto err1;
}
- pgs_done++;
- /* Get a fresh low numbered page. */
+ /* Try to swap to a lower numbered page. */
if ((ret = __db_exchange_page(dbc,
&cp->csp->page, ncp->csp->page,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ PGNO_INVALID, DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
@@ -598,8 +600,8 @@ retry: pg = NULL;
merged = 0;
for (epg = cp->sp; epg != cp->csp; epg++) {
PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
- if ((ret = __bam_merge_internal(dbc,
- ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
+ if ((ret = __bam_merge_internal(dbc, ndbc,
+ LEVEL(epg->page), c_data, &merged, &pgs_done)) != 0)
break;
if (merged)
break;
@@ -627,7 +629,7 @@ retry: pg = NULL;
}
PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
- /* if we remove the next page, then we need its next locked */
+ /* If we remove the next page, then we need its next locked. */
npgno = NEXT_PGNO(ncp->csp->page);
if (npgno != PGNO_INVALID) {
TRY_LOCK2(dbc, ndbc, npgno,
@@ -637,9 +639,8 @@ retry: pg = NULL;
}
/*lint -e{794} */
if ((ret = __bam_merge(dbc,
- ndbc, factor, stop, c_data, &isdone)) != 0)
+ ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0)
goto err1;
- pgs_done++;
/*
* __bam_merge could have freed our stack if it
* deleted a page possibly collapsing the tree.
@@ -722,8 +723,8 @@ retry: pg = NULL;
/* Get a fresh low numbered page. */
pgno = PGNO(pg);
if ((ret = __db_exchange_page(dbc,
- &cp->csp->page, NULL,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ &cp->csp->page, NULL, PGNO_INVALID,
+ DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
@@ -734,10 +735,7 @@ retry: pg = NULL;
LOCK_INIT(next_lock);
saved_pgno = PGNO_INVALID;
pg = cp->csp->page;
- if (pgno != PGNO(pg)) {
- pgs_done++;
- pgno = PGNO(pg);
- }
+ pgno = PGNO(pg);
}
/*
* If we are going to leave this parent commit
@@ -752,7 +750,7 @@ retry: pg = NULL;
goto next_page;
}
- /* If they have the same parent, just dup the cursor */
+ /* If they have the same parent, just dup the cursor. */
if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
goto err1;
if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
@@ -842,17 +840,15 @@ retry: pg = NULL;
pgno = PGNO(pg);
/* Get a fresh low numbered page. */
if ((ret = __db_exchange_page(dbc, &cp->csp->page,
- npg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ npg, PGNO_INVALID,
+ DB_EXCH_DEFAULT, &pgs_done)) != 0)
goto err1;
if ((ret = __TLPUT(dbc, prev_lock)) != 0)
goto err1;
LOCK_INIT(prev_lock);
prev_pgno = PGNO_INVALID;
pg = cp->csp->page;
- if (pgno != PGNO(pg)) {
- pgs_done++;
- pgno = PGNO(pg);
- }
+ pgno = PGNO(pg);
}
c_data->compact_pages_examine++;
@@ -887,11 +883,9 @@ retry: pg = NULL;
*/
PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
if ((ret = __bam_merge(dbc,
- ndbc, factor, stop, c_data, &isdone)) != 0)
+ ndbc, factor, stop, c_data, &isdone, &pgs_done)) != 0)
goto err1;
- pgs_done++;
-
if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
goto err1;
LOCK_INIT(nnext_lock);
@@ -932,7 +926,7 @@ next_page:
pg = NULL;
if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
goto err;
- if (npgno != PGNO_INVALID &&
+ if (npgno != PGNO_INVALID && !do_commit &&
(ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
goto err;
if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
@@ -1010,9 +1004,6 @@ err: /*
if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
ret = t_ret;
- if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
- ret = t_ret;
-
if (pg != NULL && (t_ret =
__memp_fput(dbmp,
dbc->thread_info, pg, dbc->priority) != 0) && ret == 0)
@@ -1022,7 +1013,11 @@ err: /*
dbc->thread_info, npg, dbc->priority) != 0) && ret == 0)
ret = t_ret;
-out: *donep = isdone;
+out:
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ *isdonep = isdone;
/* For OPD trees return if we did anything in the span variable. */
if (F_ISSET(dbc, DBC_OPD))
@@ -1035,12 +1030,13 @@ out: *donep = isdone;
* __bam_merge -- do actual merging of leaf pages.
*/
static int
-__bam_merge(dbc, ndbc, factor, stop, c_data, donep)
+__bam_merge(dbc, ndbc, factor, stop, c_data, isdonep, pgs_donep)
DBC *dbc, *ndbc;
u_int32_t factor;
DBT *stop;
DB_COMPACT *c_data;
- int *donep;
+ int *isdonep;
+ int *pgs_donep;
{
BTREE_CURSOR *cp, *ncp;
DB *dbp;
@@ -1064,9 +1060,9 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
/* Find if the stopping point is on this page. */
if (stop != NULL && stop->size != 0) {
- if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
+ if ((ret = __bam_compact_isdone(dbc, stop, npg, isdonep)) != 0)
return (ret);
- if (*donep)
+ if (*isdonep)
return (0);
}
@@ -1080,20 +1076,23 @@ __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
(int)(P_FREESPACE(dbp, pg) -
((dbp->pgsize - P_OVERHEAD(dbp)) -
- P_FREESPACE(dbp, npg))) < (int)factor)
- ret = __bam_merge_records(dbc, ndbc, factor, c_data);
- else
+ P_FREESPACE(dbp, npg))) < (int)factor) {
+ ret = __bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep);
+ } else {
/*lint -e{794} */
free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
+ (*pgs_donep)++;
+ }
return (ret);
}
static int
-__bam_merge_records(dbc, ndbc, factor, c_data)
+__bam_merge_records(dbc, ndbc, factor, c_data, pgs_donep)
DBC *dbc, *ndbc;
u_int32_t factor;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BKEYDATA *bk, *tmp_bk;
@@ -1126,8 +1125,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
if (c_data->compact_truncate != PGNO_INVALID &&
PGNO(ncp->csp->page) > c_data->compact_truncate) {
/* Get a fresh low numbered page. */
- if ((ret = __db_exchange_page(ndbc,
- &ncp->csp->page, pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
@@ -1197,6 +1196,7 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
/* If we have hit the first record then there is nothing we can move. */
if (indx == 0)
goto done;
+ (*pgs_donep)++;
if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
if (indx == nent)
return (__bam_merge_pages(dbc, ndbc, c_data));
@@ -1237,7 +1237,8 @@ __bam_merge_records(dbc, ndbc, factor, c_data)
indx -= adj;
}
bk = GET_BKEYDATA(dbp, npg, indx);
- len = (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ len = (B_TYPE(bk->type) == B_KEYDATA) ? bk->len :
+ ((B_TYPE(bk->type) == B_BLOB) ? BBLOB_DSIZE : BOVERFLOW_SIZE);
if (indx != 0 && BINTERNAL_SIZE(len) >= pfree) {
if (F_ISSET(dbc, DBC_OPD)) {
if (dbp->dup_compare == __bam_defcmp)
@@ -1281,8 +1282,9 @@ noprefix:
} while (indx != 0 && ninp[indx] == ninp[indx - adj]);
bk = GET_BKEYDATA(dbp, npg, indx);
- len =
- (B_TYPE(bk->type) != B_KEYDATA) ? BOVERFLOW_SIZE : bk->len;
+ len = (B_TYPE(bk->type) == B_KEYDATA) ?
+ bk->len : ((B_TYPE(bk->type) == B_BLOB) ?
+ BBLOB_DSIZE : BOVERFLOW_SIZE);
}
/*
@@ -1346,6 +1348,13 @@ no_check: is_dup = first_dup = next_dup = 0;
BOVERFLOW_SIZE, &data, NULL)) != 0)
goto err;
break;
+ case B_BLOB:
+ data.size = BBLOB_SIZE;
+ data.data = bk;
+ if ((ret = __db_pitem(dbc, pg,
+ pind, BBLOB_SIZE, &data, NULL)) != 0)
+ goto err;
+ break;
default:
__db_errx(env, DB_STR_A("1022",
"Unknown record format, page %lu, indx 0",
@@ -1538,15 +1547,20 @@ err: return (ret);
/*
* __bam_merge_internal --
* Merge internal nodes of the tree.
+ *
+ * The first key of an internal page does not have a guaranteed-
+ * useful key.
*/
static int
-__bam_merge_internal(dbc, ndbc, level, c_data, merged)
+__bam_merge_internal(dbc, ndbc, level, c_data, merged, pgs_donep)
DBC *dbc, *ndbc;
int level;
DB_COMPACT *c_data;
int *merged;
+ int *pgs_donep;
{
BINTERNAL bi, *bip, *fip;
+ BOVERFLOW bo;
BTREE_CURSOR *cp, *ncp;
DB *dbp;
DBT data, hdr;
@@ -1579,7 +1593,6 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
dbmp = dbp->mpf;
cp = (BTREE_CURSOR *)dbc->internal;
ncp = (BTREE_CURSOR *)ndbc->internal;
- *merged = 0;
ret = 0;
/*
@@ -1608,11 +1621,11 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
* Check for overflow keys on both pages while we have
* them locked.
*/
- if ((ret =
- __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
+ if ((ret = __bam_truncate_internal_overflow(dbc,
+ pg, c_data, pgs_donep)) != 0)
goto err;
- if ((ret =
- __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
+ if ((ret = __bam_truncate_internal_overflow(dbc,
+ npg, c_data, pgs_donep)) != 0)
goto err;
}
@@ -1624,7 +1637,12 @@ __bam_merge_internal(dbc, ndbc, level, c_data, merged)
*/
fip = NULL;
if (TYPE(pg) == P_IBTREE) {
- /* See where we run out of space. */
+ /* See where we run out of space. This does not yet include
+ * whatever extra pages are needed if an overflow key is
+ * going to be added to one or more parent pages. It would be
+ * better to use as little of the key as necessary, though
+ * the effort of determining that might not be worthwhile.
+ */
freespace = P_FREESPACE(dbp, pg);
/*
* The leftmost key of an internal page is not accurate.
@@ -1704,12 +1722,37 @@ fits: memset(&bi, 0, sizeof(bi));
if (fip == NULL) {
data.size = bip->len;
data.data = bip->data;
+ } else if (fip->type == B_OVERFLOW) {
+ DB_ASSERT(dbc->env,
+ fip->len == sizeof(BOVERFLOW));
+ /* Cast to "BOVERFLOW *" to calm down lint. */
+ memmove(&bo,
+ (BOVERFLOW *)fip->data, sizeof(BOVERFLOW));
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, bo.tlen,
+ bo.pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+ data.size = sizeof(bo);
+ data.data = &bo;
+ } else if (fip->type == B_BLOB) {
+ /* Blobs should never appear as keys. */
+ DB_ASSERT(dbc->env,
+ !(fip->type == B_BLOB &&
+ TYPE(pg) == P_IBTREE));
} else {
data.size = fip->len;
data.data = fip->data;
}
bi.len = data.size;
- B_TSET(bi.type, bip->type);
+ /*
+ * Set bi.type according to the data's type, to ensure
+ * that it is B_OVERFLOW iff the data is BOVERFLOW.
+ */
+ B_TSET(bi.type, fip == NULL ? bip->type : fip->type);
bi.pgno = bip->pgno;
bi.nrecs = bip->nrecs;
hdr.data = &bi;
@@ -1750,7 +1793,12 @@ fits: memset(&bi, 0, sizeof(bi));
if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
goto err;
pind++;
- if (fip != NULL) {
+ /* add bip test so fortify does not complain */
+ if (fip != NULL && bip != NULL) {
+ if (B_TYPE(bip->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ ((BOVERFLOW *)bip->data)->pgno)) != 0)
+ goto err;
/* reset size to be for the record being deleted. */
size = BINTERNAL_SIZE(bip->len);
fip = NULL;
@@ -1848,14 +1896,14 @@ fits: memset(&bi, 0, sizeof(bi));
PGNO(npg) > c_data->compact_truncate &&
ncp->csp != ncp->sp) {
if ((ret = __db_exchange_page(ndbc, &ncp->csp->page,
- pg, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ pg, PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
if (c_data->compact_truncate != PGNO_INVALID &&
PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
if ((ret = __db_exchange_page(dbc, &cp->csp->page,
ncp->csp->page,
- PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ PGNO_INVALID, DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
}
@@ -1875,13 +1923,13 @@ err: cp->csp = save_csp;
* We may or may not have a write lock on this page.
*/
static int
-__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, pgs_donep)
DBC *dbc;
PAGE **ppg;
u_int32_t factor;
int have_lock;
DB_COMPACT *c_data;
- int *donep;
+ int *pgs_donep;
{
BOVERFLOW *bo;
BTREE_CURSOR *cp;
@@ -1896,15 +1944,19 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
DB_ASSERT(NULL, dbc != NULL);
dbp = dbc->dbp;
dbmp = dbp->mpf;
+ /* XXX Don't reserve any free bytes (Force 100% fillfactor) in OPD trees
+ * to ensure forward progress.
+ */
+ factor = 0;
cp = (BTREE_CURSOR *)dbc->internal;
for (i = 0; i < NUM_ENT(*ppg); i++) {
bo = GET_BOVERFLOW(dbp, *ppg, i);
- if (B_TYPE(bo->type) == B_KEYDATA)
+ if (B_TYPE(bo->type) == B_KEYDATA ||
+ B_TYPE(bo->type) == B_BLOB)
continue;
c_data->compact_pages_examine++;
if (bo->pgno > c_data->compact_truncate) {
- (*donep)++;
if (!have_lock) {
/*
* The caller should have the page at
@@ -1925,8 +1977,9 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
goto err;
}
+ pgno = bo->pgno;
if ((ret = __bam_truncate_root_page(dbc,
- *ppg, i, c_data)) != 0)
+ *ppg, i, c_data, pgs_donep)) != 0)
goto err;
/* Just in case it should move. Could it? */
bo = GET_BOVERFLOW(dbp, *ppg, i);
@@ -1934,13 +1987,13 @@ __bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
if (B_TYPE(bo->type) == B_OVERFLOW) {
if ((ret = __db_truncate_overflow(dbc,
- bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+ bo->pgno, have_lock ? NULL : ppg,
+ c_data, pgs_donep)) != 0)
goto err;
- (*donep)++;
continue;
}
if ((ret = __bam_compact_opd(dbc, bo->pgno,
- have_lock ? NULL : ppg, factor, c_data, donep)) != 0)
+ have_lock ? NULL : ppg, factor, c_data, pgs_donep)) != 0)
goto err;
}
@@ -1955,13 +2008,13 @@ err:
* PUBLIC: db_pgno_t, PAGE **, u_int32_t, DB_COMPACT *, int *));
*/
int
-__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
+__bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, pgs_donep)
DBC *dbc;
db_pgno_t root_pgno;
PAGE **ppg;
u_int32_t factor;
DB_COMPACT *c_data;
- int *donep;
+ int *pgs_donep;
{
BTREE_CURSOR *cp;
DBC *opd;
@@ -2021,7 +2074,7 @@ __bam_compact_opd(dbc, root_pgno, ppg, factor, c_data, donep)
NULL, factor, &span, c_data, &isdone)) != 0)
break;
/* For OPD the number of pages dirtied is returned in span. */
- *donep += span;
+ *pgs_donep += span;
} while (!isdone);
if (start.data != NULL)
@@ -2041,11 +2094,12 @@ done:
* The page is reference by the pg/indx passed in.
*/
static int
-__bam_truncate_root_page(dbc, pg, indx, c_data)
+__bam_truncate_root_page(dbc, pg, indx, c_data, pgs_donep)
DBC *dbc;
PAGE *pg;
u_int32_t indx;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BOVERFLOW *bo;
@@ -2053,8 +2107,8 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
db_pgno_t *pgnop;
u_int32_t tlen;
- COMPQUIET(c_data, NULL);
COMPQUIET(bo, NULL);
+ COMPQUIET(c_data, NULL);
dbp = dbc->dbp;
if (TYPE(pg) == P_IBTREE) {
bi = GET_BINTERNAL(dbp, pg, indx);
@@ -2075,7 +2129,7 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
DB_ASSERT(dbp->env, IS_DIRTY(pg));
- return (__db_truncate_root(dbc, pg, indx, pgnop, tlen));
+ return (__db_truncate_root(dbc, pg, indx, pgnop, tlen, pgs_donep));
}
/*
@@ -2086,10 +2140,11 @@ __bam_truncate_root_page(dbc, pg, indx, c_data)
* nodes they will get copied adding pages to the database.
*/
static int
-__bam_truncate_internal_overflow(dbc, page, c_data)
+__bam_truncate_internal_overflow(dbc, page, c_data, pgs_donep)
DBC *dbc;
PAGE *page;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BINTERNAL *bi;
BOVERFLOW *bo;
@@ -2104,10 +2159,11 @@ __bam_truncate_internal_overflow(dbc, page, c_data)
continue;
bo = (BOVERFLOW *)(bi->data);
if (bo->pgno > c_data->compact_truncate && (ret =
- __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+ __bam_truncate_root_page(dbc, page,
+ indx, c_data, pgs_donep)) != 0)
break;
- if ((ret = __db_truncate_overflow(
- dbc, bo->pgno, NULL, c_data)) != 0)
+ if ((ret = __db_truncate_overflow(dbc,
+ bo->pgno, NULL, c_data, pgs_donep)) != 0)
break;
}
return (ret);
@@ -2142,7 +2198,7 @@ __bam_compact_isdone(dbc, stop, pg, isdone)
} else {
DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
if ((ret = __bam_cmp(dbc, stop, pg, 0,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
return (ret);
*isdone = cmp <= 0;
@@ -2328,7 +2384,7 @@ __bam_savekey(dbc, next, start)
if (len == 0) {
no_key: __db_errx(env, DB_STR("1023",
"Compact cannot handle zero length key"));
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
} else {
@@ -2360,14 +2416,15 @@ retry: return (DB_LOCK_NOTGRANTED);
* Find high numbered pages in the internal nodes of a tree and
* swap them for lower numbered pages.
* PUBLIC: int __bam_truncate_ipages __P((DB *,
- * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *, int *));
*/
int
-__bam_truncate_ipages(dbp, ip, txn, c_data)
+__bam_truncate_ipages(dbp, ip, txn, c_data, pgs_donep)
DB *dbp;
DB_THREAD_INFO *ip;
DB_TXN *txn;
DB_COMPACT *c_data;
+ int *pgs_donep;
{
BTMETA *meta;
BTREE *bt;
@@ -2480,8 +2537,9 @@ new_txn:
pgno = PGNO(cp->csp->page);
if (pgno > c_data->compact_truncate) {
- if ((ret = __db_exchange_page(dbc, &cp->csp->page,
- NULL, PGNO_INVALID, DB_EXCH_DEFAULT)) != 0)
+ if ((ret = __db_exchange_page(dbc,
+ &cp->csp->page, NULL, PGNO_INVALID,
+ DB_EXCH_DEFAULT, pgs_donep)) != 0)
goto err;
}
@@ -2561,7 +2619,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
}
if (PGNO(meta) > c_data->compact_truncate) {
dbmeta = (DBMETA *)meta;
- ret = __db_move_metadata(dbc, &dbmeta, c_data);
+ ret = __db_move_metadata(dbc,
+ &dbmeta, c_data, pgs_donep);
meta = (BTMETA *)dbmeta;
if (ret != 0)
goto err;
@@ -2583,8 +2642,8 @@ again: if (F_ISSET(dbp, DB_AM_SUBDB) &&
* page latch is released.
*/
++dbp->mpf->mfp->revision;
- if ((ret = __db_exchange_page(dbc,
- &root, NULL, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+ if ((ret = __db_exchange_page(dbc, &root, NULL,
+ PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
goto err;
if (PGNO(root) == bt->bt_root)
goto err;
diff --git a/src/btree/bt_compare.c b/src/btree/bt_compare.c
index 5c009071..8923c5fa 100644
--- a/src/btree/bt_compare.c
+++ b/src/btree/bt_compare.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -49,27 +49,39 @@
/*
* __bam_cmp --
- * Compare a key to a given record.
+ * Compare a key to a given record. We always start the comparison
+ * at an offset and update the offset with the longest matching count
+ * after the comparison.
*
* PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t,
- * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *),
+ * PUBLIC: int *, size_t *));
*/
int
-__bam_cmp(dbc, dbt, h, indx, func, cmpp)
+__bam_cmp(dbc, dbt, h, indx, func, cmpp, locp)
DBC *dbc;
const DBT *dbt;
PAGE *h;
u_int32_t indx;
- int (*func)__P((DB *, const DBT *, const DBT *));
+ int (*func)__P((DB *, const DBT *, const DBT *, size_t *));
int *cmpp;
+ size_t *locp;
{
+ BBLOB bl;
BINTERNAL *bi;
BKEYDATA *bk;
BOVERFLOW *bo;
DB *dbp;
DBT pg_dbt;
+ off_t blob_size;
+ int ret;
+ db_seq_t blob_id;
dbp = dbc->dbp;
+ ret = 0;
+
+ /* Assert that func is non-NULL. */
+ DB_ASSERT(dbp->env, func != NULL);
/*
* Returns:
@@ -91,11 +103,49 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
bk = GET_BKEYDATA(dbp, h, indx);
if (B_TYPE(bk->type) == B_OVERFLOW)
bo = (BOVERFLOW *)bk;
- else {
+ else if (B_TYPE(bk->type) == B_BLOB) {
+ /*
+ * This is very slow, but since blobs cannot be
+ * in databases with duplicates or be keys, it should
+ * only happen when using DB_GET_BOTH or DB_SET.
+ */
+ memcpy(&bl, bk, BBLOB_SIZE);
+ memset(&pg_dbt, 0, sizeof(DBT));
+ GET_BLOB_SIZE(dbc->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX)
+ pg_dbt.size = UINT32_MAX;
+ else
+ pg_dbt.size = (u_int32_t)blob_size;
+ blob_id = (db_seq_t)bl.id;
+ pg_dbt.flags = DB_DBT_USERMEM;
+ if ((ret = __os_malloc(
+ dbc->env, pg_dbt.size, &pg_dbt.data)) != 0)
+ return (ret);
+ pg_dbt.ulen = pg_dbt.size;
+ if ((ret = __blob_get(dbc,
+ &pg_dbt, blob_id, blob_size, NULL, NULL)) != 0) {
+ __os_free(dbc->env, pg_dbt.data);
+ return (ret);
+ }
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
+ /*
+ * There is no way to directly compare a blob file that
+ * is greater in size than UINT32_MAX, so instead we
+ * compare the data up to UINT32_MAX, and if they are
+ * equal return that the blob is larger, since it is
+ * longer than the input data.
+ */
+ if (*cmpp == 0 && (blob_size > UINT32_MAX))
+ *cmpp = -1;
+ __os_free(dbc->env, pg_dbt.data);
+ return (0);
+ } else {
pg_dbt.app_data = NULL;
pg_dbt.data = bk->data;
pg_dbt.size = bk->len;
- *cmpp = func(dbp, dbt, &pg_dbt);
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
return (0);
}
break;
@@ -123,13 +173,14 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
}
bi = GET_BINTERNAL(dbp, h, indx);
- if (B_TYPE(bi->type) == B_OVERFLOW)
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ DB_ASSERT(dbp->env, bi->len == BOVERFLOW_SIZE);
bo = (BOVERFLOW *)(bi->data);
- else {
+ } else {
pg_dbt.app_data = NULL;
pg_dbt.data = bi->data;
pg_dbt.size = bi->len;
- *cmpp = func(dbp, dbt, &pg_dbt);
+ *cmpp = func(dbp, dbt, &pg_dbt, locp);
return (0);
}
break;
@@ -141,42 +192,56 @@ __bam_cmp(dbc, dbt, h, indx, func, cmpp)
* Overflow.
*/
return (__db_moff(dbc, dbt, bo->pgno, bo->tlen,
- func == __bam_defcmp ? NULL : func, cmpp));
+ func == __bam_defcmp ? NULL : func, cmpp, locp));
}
/*
* __bam_defcmp --
- * Default comparison routine.
+ * Keep track of how far along in the two keys we find matching
+ * characters, and use that as an offset into the keys to begin
+ * future comparisons. This will save us the overhead of always
+ * starting the comparisons on the first character.
*
- * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *, size_t *));
*/
int
-__bam_defcmp(dbp, a, b)
+__bam_defcmp(dbp, a, b, locp)
DB *dbp;
const DBT *a, *b;
+ size_t *locp;
{
- size_t len;
+ size_t len, i, start;
u_int8_t *p1, *p2;
COMPQUIET(dbp, NULL);
-
+ start = (locp == NULL ? 0 : *locp);
/*
* Returns:
* < 0 if a is < b
* = 0 if a is = b
* > 0 if a is > b
*
+ * We start the comparison from 'locp' and store the last match
+ * location in 'locp'.
+ *
* XXX
* If a size_t doesn't fit into a long, or if the difference between
* any two characters doesn't fit into an int, this routine can lose.
* What we need is a signed integral type that's guaranteed to be at
* least as large as a size_t, and there is no such thing.
*/
+ p1 = (u_int8_t *)a->data + start;
+ p2 = (u_int8_t *)b->data + start;
len = a->size > b->size ? b->size : a->size;
- for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
- if (*p1 != *p2)
- return ((long)*p1 - (long)*p2);
- return ((long)a->size - (long)b->size);
+ for (i = start; i < len; ++p1, ++p2, ++i)
+ if (*p1 != *p2) {
+ if (locp != NULL)
+ *locp = i;
+ return (*p1 < *p2 ? -1 : 1);
+ }
+ if (locp != NULL)
+ *locp = len;
+ return (a->size == b->size ? 0 : (a->size < b->size ? -1 : 1));
}
/*
diff --git a/src/btree/bt_compress.c b/src/btree/bt_compress.c
index 3f293461..479e7248 100644
--- a/src/btree/bt_compress.c
+++ b/src/btree/bt_compress.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
#include "db_config.h"
@@ -352,16 +352,20 @@ __bam_compress_marshal_data(dbp, data, destbuf)
* __bam_compress_dupcmp --
* Duplicate comparison function for compressed BTrees.
*
- * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+ * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: size_t *));
*/
int
-__bam_compress_dupcmp(db, a, b)
+__bam_compress_dupcmp(db, a, b, locp)
DB *db;
const DBT *a;
const DBT *b;
+ size_t *locp;
{
DBT dcmp_a, dcmp_b;
+ COMPQUIET(locp, NULL);
+
/* Decompress the initial data in a */
CMP_UNMARSHAL_DATA(a, &dcmp_a);
dcmp_a.ulen = 0;
@@ -380,7 +384,7 @@ __bam_compress_dupcmp(db, a, b)
/* Call the user's duplicate compare function */
return ((BTREE *)db->bt_internal)->
- compress_dup_compare(db, &dcmp_a, &dcmp_b);
+ compress_dup_compare(db, &dcmp_a, &dcmp_b, NULL);
}
/*
@@ -636,7 +640,7 @@ __bamc_next_decompress(dbc)
db = dbc->dbp;
if (cp->compcursor >= cp->compend)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cp->prevKey = cp->currentKey;
cp->prevData = cp->currentData;
@@ -1251,7 +1255,7 @@ __bamc_compress_merge_delete(dbc, stream, countp)
* chunk, but don't delete any more
* entries.
*/
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
moreStream = 0;
iSmallEnough = 0;
} else
@@ -1318,7 +1322,7 @@ __bamc_compress_merge_delete(dbc, stream, countp)
CMP_FREE_DBT(env, &nextk);
CMP_FREE_DBT(env, &nextc);
- return (ret != 0 ? ret : bulk_ret);
+ return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret));
}
/*
@@ -1389,7 +1393,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
* in the database
*/
if (ifound == 0) {
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
} else
++chunk_count;
break;
@@ -1463,7 +1467,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
* current chunk, but don't delete
* any more entries.
*/
- bulk_ret = DB_NOTFOUND;
+ bulk_ret = DBC_ERR(dbc, DB_NOTFOUND);
moreStream = 0;
iSmallEnough = 0;
} else
@@ -1541,7 +1545,7 @@ __bamc_compress_merge_delete_dups(dbc, stream, countp)
CMP_FREE_DBT(env, &pdestdata);
CMP_FREE_DBT(env, &nextk);
- return (ret != 0 ? ret : bulk_ret);
+ return (ret != 0 ? ret : DBC_ERR(dbc, bulk_ret));
}
/******************************************************************************/
@@ -1641,8 +1645,8 @@ __bamc_compress_get_prev_dup(dbc, flags)
if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
return (ret);
- if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
- return (DB_NOTFOUND);
+ if (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
return (0);
}
@@ -1684,7 +1688,7 @@ __bamc_compress_get_prev_nodup(dbc, flags)
do
if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
return (ret);
- while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0);
return (0);
}
@@ -1702,7 +1706,7 @@ __bamc_compress_get_next(dbc, flags)
if (F_ISSET(cp, C_COMPRESS_DELETED)) {
if (cp->currentKey == 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
F_CLR(cp, C_COMPRESS_DELETED);
return (0);
} else if (cp->currentKey) {
@@ -1722,7 +1726,7 @@ __bamc_compress_get_next(dbc, flags)
* to the right place
*/
__bamc_compress_reset(dbc);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
} else if (ret != 0)
return (ret);
@@ -1753,17 +1757,18 @@ __bamc_compress_get_next_dup(dbc, key, flags)
* deleted entry.
*/
if (cp->currentKey == 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
F_CLR(cp, C_COMPRESS_DELETED);
- return (t->bt_compare(dbp,
- cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND);
+ return (t->bt_compare(dbp, cp->currentKey,
+ &cp->del_key, NULL) == 0 ? 0 : DB_NOTFOUND);
} else if (cp->currentKey == 0)
return (EINVAL);
/* Check that the next entry has the same key as the previous entry */
ret = __bamc_next_decompress(dbc);
- if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0)
- return (DB_NOTFOUND);
+ if (ret == 0 && t->bt_compare(dbp,
+ cp->currentKey, cp->prevKey, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if (ret != DB_NOTFOUND)
return (ret);
@@ -1783,7 +1788,7 @@ __bamc_compress_get_next_dup(dbc, key, flags)
* will end up pointing to the right place
*/
__bamc_compress_reset(dbc);
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
} else if (ret != 0)
return (ret);
@@ -1791,8 +1796,8 @@ __bamc_compress_get_next_dup(dbc, key, flags)
return (ret);
/* Check the keys are the same */
- if (t->bt_compare(dbp, cp->currentKey, key) != 0)
- return (DB_NOTFOUND);
+ if (t->bt_compare(dbp, cp->currentKey, key, NULL) != 0)
+ return (DBC_ERR(dbc, DB_NOTFOUND));
return (0);
}
@@ -1828,7 +1833,7 @@ __bamc_compress_get_next_nodup(dbc, flags)
do
if ((ret = __bamc_compress_get_next(dbc, flags)) != 0)
return (ret);
- while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key, NULL) == 0);
return (ret);
}
@@ -1888,14 +1893,14 @@ __bamc_compress_get_set(dbc, key, data, method, flags)
if (ret == 0 &&
__db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) {
/* We didn't find the key */
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
break;
case DB_GET_BOTH:
if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) &&
- __bam_defcmp(dbp, cp->currentData, data) != 0))) {
+ __bam_defcmp(dbp, cp->currentData, data, NULL) != 0))) {
/* We didn't find the key/data pair */
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
break;
default:
@@ -1923,7 +1928,7 @@ __bamc_compress_get_bothc(dbc, data, flags)
position */
if (__db_compare_both(dbp, cp->currentKey,
cp->currentData, cp->currentKey, data) >= 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cmp = 0;
/* Perform a linear search for the data in the current chunk */
@@ -1933,7 +1938,7 @@ __bamc_compress_get_bothc(dbc, data, flags)
continue;
if (ret == 0)
- return (cmp == 0 ? 0 : DB_NOTFOUND);
+ return (cmp == 0 ? 0 : DBC_ERR(dbc, DB_NOTFOUND));
if (ret != DB_NOTFOUND)
return (ret);
@@ -2277,7 +2282,7 @@ __bamc_compress_iput(dbc, key, data, flags)
switch (flags) {
case DB_CURRENT:
if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto end;
}
@@ -2290,7 +2295,7 @@ __bamc_compress_iput(dbc, key, data, flags)
if (F_ISSET(dbp, DB_AM_DUPSORT) &&
((BTREE *)dbp->bt_internal)->compress_dup_compare(
- dbp, cp->currentData, data) != 0) {
+ dbp, cp->currentData, data, NULL) != 0) {
__db_errx(env, DB_STR("1032",
"Existing data sorts differently from put data"));
ret = EINVAL;
@@ -2464,7 +2469,7 @@ __bamc_compress_idel(dbc, flags)
if (F_ISSET(cp, C_COMPRESS_DELETED))
return DB_KEYEMPTY;
if (cp->currentKey == 0)
- return DB_NOTFOUND;
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
cp->currentKey->data, cp->currentKey->size)) != 0)
@@ -3015,7 +3020,8 @@ __bam_compress_count(dbc, nkeysp, ndatap)
if (ret != 0)
goto err;
- if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0)
+ if (t->bt_compare(dbp,
+ cp_n->currentKey, cp_n->prevKey, NULL) != 0)
nkeys += 1;
}
diff --git a/src/btree/bt_conv.c b/src/btree/bt_conv.c
index 348ce5c2..85baeed8 100644
--- a/src/btree/bt_conv.c
+++ b/src/btree/bt_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -88,7 +88,12 @@ __bam_mswap(env, pg)
SWAP32(p); /* re_len */
SWAP32(p); /* re_pad */
SWAP32(p); /* root */
- p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* threshold */
+ SWAP32(p); /* file id lo */
+ SWAP32(p); /* file id hi */
+ SWAP32(p); /* sdb id lo */
+ SWAP32(p); /* sdb id hi */
+ p += 87 * sizeof(u_int32_t); /* unused */
SWAP32(p); /* crypto_magic */
return (0);
diff --git a/src/btree/bt_curadj.c b/src/btree/bt_curadj.c
index 78606009..d3398ee8 100644
--- a/src/btree/bt_curadj.c
+++ b/src/btree/bt_curadj.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 860c31ce..d63b7373 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -938,7 +938,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
case DB_CURRENT:
/* It's not possible to return a deleted record. */
if (F_ISSET(cp, C_DELETED)) {
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
@@ -979,7 +979,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
goto err;
if (flags == DB_GET_BOTH) {
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1000,7 +1000,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
dbc, PGNO_INVALID, key, flags, &exact)) != 0)
return (ret);
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -1047,7 +1047,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
if ((ret = __bamc_next(dbc, 1, 0)) != 0)
goto err;
if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1077,7 +1077,7 @@ __bamc_get(dbc, key, data, flags, pgnop)
if ((ret = __bamc_prev(dbc)) != 0)
goto err;
if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -1173,12 +1173,15 @@ __bam_bulk(dbc, data, flags)
DBT *data;
u_int32_t flags;
{
+ BBLOB bl;
BKEYDATA *bk;
BOVERFLOW *bo;
BTREE_CURSOR *cp;
PAGE *pg;
db_indx_t *inp, indx, pg_keyoff;
int32_t *endp, key_off, *offp, *saveoffp;
+ off_t blob_size;
+ db_seq_t blob_id;
u_int8_t *dbuf, *dp, *np;
u_int32_t key_size, pagesize, size, space;
int adj, is_key, need_pg, next_key, no_dup, rec_key, ret;
@@ -1279,6 +1282,7 @@ next_pg:
*/
if (is_key && pg_keyoff != inp[indx]) {
bk = GET_BKEYDATA(dbc->dbp, pg, indx);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
if (B_TYPE(bk->type) == B_OVERFLOW) {
bo = (BOVERFLOW *)bk;
size = key_size = bo->tlen;
@@ -1403,6 +1407,31 @@ get_key_space:
*offp-- = (int32_t)(np - dbuf);
np += size;
*offp-- = (int32_t)size;
+ } else if (B_TYPE(bk->type) == B_BLOB) {
+ blob_size = 0;
+ blob_id = 0;
+ memcpy(&bl, bk, BBLOB_SIZE);
+ GET_BLOB_SIZE(dbc->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ if (blob_size > UINT32_MAX) {
+ size = UINT32_MAX;
+ goto back_up;
+ }
+ size = (u_int32_t)blob_size;
+ if (size > space)
+ goto back_up;
+ blob_id = (db_seq_t)bl.id;
+ if ((ret = __blob_bulk(dbc, size, blob_id, np)) != 0)
+ return (ret);
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ space -= size;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ *offp-- = (int32_t)size;
} else {
if (need_pg) {
dp = np;
@@ -1764,11 +1793,11 @@ __bam_getbothc(dbc, data)
*/
if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
- &cmp)) != 0)
+ &cmp, NULL)) != 0)
return (ret);
if (cmp <= 0)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* Discard the current page, we're going to do a full search. */
if ((ret = __memp_fput(mpf,
@@ -1791,7 +1820,7 @@ __bam_getbothc(dbc, data)
*/
if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
!IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
cp->indx += P_INDX;
return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH));
@@ -1842,7 +1871,7 @@ __bam_getlte(dbc, key, data)
/* Check if we're still on the correct key */
if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx,
- ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0)
+ ((BTREE*)dbp->bt_internal)->bt_compare, &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
}
@@ -1884,8 +1913,8 @@ __bam_getlte(dbc, key, data)
if (data != NULL) {
/* Check if we're still on the correct data */
if ((ret = __bam_cmp(
- dbc, data, ocp->page, ocp->indx,
- dbp->dup_compare, &exact)) != 0)
+ dbc, data, ocp->page, ocp->indx,
+ dbp->dup_compare, &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
} else
@@ -1915,7 +1944,8 @@ __bam_getlte(dbc, key, data)
else {
/* Check if we're still on the correct data */
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare,
+ &exact, NULL)) != 0)
goto end;
exact = (exact == 0);
}
@@ -1982,7 +2012,7 @@ __bam_getboth_finddatum(dbc, data, flags)
if (!IS_CUR_DELETED(dbc)) {
if ((ret = __bam_cmp(
dbc, data, cp->page, cp->indx + O_INDX,
- __bam_defcmp, &cmp)) != 0)
+ __bam_defcmp, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0)
return (0);
@@ -1992,7 +2022,8 @@ __bam_getboth_finddatum(dbc, data, flags)
!IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
break;
}
- return (DB_NOTFOUND);
+
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
/*
@@ -2008,18 +2039,18 @@ __bam_getboth_finddatum(dbc, data, flags)
break;
if (base == (top - P_INDX)) {
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE))
return (0);
cp->indx = top;
- return DB_NOTFOUND;
+ return (DBC_ERR(dbc, DB_NOTFOUND));
}
for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
cp->indx = base + ((lim >> 1) * P_INDX);
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0) {
/*
@@ -2039,7 +2070,7 @@ __bam_getboth_finddatum(dbc, data, flags)
/* No match found; if we're looking for an exact match, we're done. */
if (flags == DB_GET_BOTH)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/*
* Base is the smallest index greater than the data item, may be zero
@@ -2049,7 +2080,7 @@ __bam_getboth_finddatum(dbc, data, flags)
cp->indx = base;
while (cp->indx < top && IS_CUR_DELETED(dbc))
cp->indx += P_INDX;
- return (cp->indx < top ? 0 : DB_NOTFOUND);
+ return (cp->indx < top ? 0 : DBC_ERR(dbc, DB_NOTFOUND));
}
/*
@@ -2082,7 +2113,7 @@ split: ret = stack = 0;
switch (flags) {
case DB_CURRENT:
if (F_ISSET(cp, C_DELETED))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_AFTER:
case DB_BEFORE:
@@ -2206,7 +2237,8 @@ split: ret = stack = 0;
*/
for (;; cp->indx += P_INDX) {
if ((ret = __bam_cmp(dbc, data, cp->page,
- cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ cp->indx + O_INDX, dbp->dup_compare,
+ &cmp, NULL)) != 0)
goto err;
if (cmp < 0) {
iiop = DB_BEFORE;
@@ -2479,7 +2511,7 @@ __bamc_next(dbc, initial_move, deleted_okay)
*/
if (cp->indx >= NUM_ENT(cp->page)) {
if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
if (ret != 0)
@@ -2539,7 +2571,7 @@ __bamc_prev(dbc)
if (cp->indx == 0) {
if ((pgno =
PREV_PGNO(cp->page)) == PGNO_INVALID)
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
if (ret != 0)
@@ -2711,11 +2743,11 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
if (h->next_pgno == PGNO_INVALID) {
indx = NUM_ENT(h) - P_INDX;
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp > 0) {
if (FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
else
indx += P_INDX;
}
@@ -2725,10 +2757,10 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
if (h->prev_pgno == PGNO_INVALID) {
indx = 0;
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp < 0 && FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
if (cmp <= 0)
goto fast_hit;
}
@@ -2736,7 +2768,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) {
DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX);
if ((ret = __bam_cmp(dbc, key, h, indx,
- t->bt_compare, &cmp)) != 0)
+ t->bt_compare, &cmp, NULL)) != 0)
goto fast_miss;
if (cmp == 0)
@@ -2752,7 +2784,7 @@ __bamc_search(dbc, root_pgno, key, flags, exactp)
indx = base;
if (indx > 0 && indx < NUM_ENT(h)) {
if (FLD_ISSET(sflags, SR_EXACT))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
goto fast_hit;
}
}
@@ -3068,7 +3100,7 @@ __bam_opd_exists(dbc, pgno)
if (NUM_ENT(h) == 0)
ret = 0;
else
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
(void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority);
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 37496b3f..a1ccef71 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -61,15 +61,18 @@ __bam_ditem(dbc, h, indx)
PAGE *h;
u_int32_t indx;
{
+ BBLOB bl;
BINTERNAL *bi;
BKEYDATA *bk;
DB *dbp;
+ db_seq_t blob_id;
u_int32_t nbytes;
int ret;
db_indx_t *inp;
dbp = dbc->dbp;
inp = P_INP(dbp, h);
+ ret = 0;
/* The page should already have been dirtied by our caller. */
DB_ASSERT(dbp->env, IS_DIRTY(h));
@@ -139,6 +142,13 @@ __bam_ditem(dbc, h, indx)
dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
return (ret);
break;
+ case B_BLOB:
+ nbytes = BBLOB_SIZE;
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ if ((ret = __blob_del(dbc, blob_id)) != 0)
+ return (ret);
+ break;
case B_KEYDATA:
nbytes = BKEYDATA_SIZE(bk->len);
break;
@@ -241,7 +251,7 @@ __bam_dpages(dbc, use_top, flags)
* single item deleted, and the rest of the pages are to be removed.
*
* Recno always has a stack to the root and __bam_merge operations
- * may have unneeded items in the sack. We find the lowest page
+ * may have unneeded items in the stack. We find the lowest page
* in the stack that has more than one record in it and start there.
*/
ret = 0;
@@ -493,7 +503,9 @@ stop: done = 1;
/*
* __bam_pupdate --
- * Update parent key pointers up the tree.
+ * Update parent key pointers up the tree after putting a new key
+ * at the start of a leaf page.
+ *
*
* PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
*/
diff --git a/src/btree/bt_method.c b/src/btree/bt_method.c
index 5cf93d2e..2fb33be2 100644
--- a/src/btree/bt_method.c
+++ b/src/btree/bt_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -15,7 +15,7 @@
static int __bam_set_bt_minkey __P((DB *, u_int32_t));
static int __bam_get_bt_compare
- __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *, size_t *)));
static int __bam_get_bt_prefix
__P((DB *, size_t(**)(DB *, const DBT *, const DBT *)));
static int __bam_set_bt_prefix
@@ -233,7 +233,7 @@ incompat:
static int
__bam_get_bt_compare(dbp, funcp)
DB *dbp;
- int (**funcp) __P((DB *, const DBT *, const DBT *));
+ int (**funcp) __P((DB *, const DBT *, const DBT *, size_t *));
{
BTREE *t;
@@ -251,13 +251,13 @@ __bam_get_bt_compare(dbp, funcp)
* __bam_set_bt_compare --
* Set the comparison function.
*
- * PUBLIC: int __bam_set_bt_compare
- * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ * PUBLIC: int __bam_set_bt_compare __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, size_t *)));
*/
int
__bam_set_bt_compare(dbp, func)
DB *dbp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
{
BTREE *t;
@@ -351,6 +351,13 @@ __bam_set_bt_compress(dbp, compress, decompress)
return (EINVAL);
}
+ /* Compression is incompatible with blob storage. */
+ if (dbp->blob_threshold > 0) {
+ __db_errx(dbp->env, DB_STR("1198",
+ "compression cannot be used with blobs enabled."));
+ return (EINVAL);
+ }
+
if (compress != 0 && decompress != 0) {
t->bt_compress = compress;
t->bt_decompress = decompress;
diff --git a/src/btree/bt_open.c b/src/btree/bt_open.c
index 7be141c1..46a866d0 100644
--- a/src/btree/bt_open.c
+++ b/src/btree/bt_open.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -44,6 +44,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/db_swap.h"
@@ -119,6 +120,7 @@ __bam_metachk(dbp, name, btm)
int ret;
env = dbp->env;
+ ret = 0;
/*
* At this point, all we know is that the magic number is for a Btree.
@@ -136,6 +138,7 @@ __bam_metachk(dbp, name, btm)
return (DB_OLD_VERSION);
case 8:
case 9:
+ case 10:
break;
default:
__db_errx(env, DB_STR_A("1009",
@@ -269,6 +272,29 @@ __bam_metachk(dbp, name, btm)
/* Set the page size. */
dbp->pgsize = btm->dbmeta.pagesize;
+ dbp->blob_threshold = btm->blob_threshold;
+ GET_BLOB_FILE_ID(env, btm, dbp->blob_file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB_SDB_ID(env, btm, dbp->blob_sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ /* Blob databases must be upgraded. */
+ if (vers == 9 && (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0)) {
+ __db_errx(env, DB_STR_A("1207",
+"%s: databases that support blobs must be upgraded.", "%s"),
+ name);
+ return (EINVAL);
+ }
+#ifndef HAVE_64BIT_TYPES
+ if (dbp->blob_file_id != 0 || dbp->blob_sdb_id != 0) {
+ __db_errx(env, DB_STR_A("1199",
+ "%s: blobs require 64 integer compiler support.", "%s"),
+ name);
+ return (DB_OPNOTSUP);
+ }
+#endif
+
/* Copy the file's ID. */
memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
@@ -442,6 +468,9 @@ __bam_init_meta(dbp, meta, pgno, lsnp)
meta->minkey = t->bt_minkey;
meta->re_len = t->re_len;
meta->re_pad = (u_int32_t)t->re_pad;
+ meta->blob_threshold = dbp->blob_threshold;
+ SET_BLOB_META_FILE_ID(meta, dbp->blob_file_id, BTMETA);
+ SET_BLOB_META_SDB_ID(meta, dbp->blob_sdb_id, BTMETA);
#ifdef HAVE_PARTITION
if ((part = dbp->p_internal) != NULL) {
@@ -535,6 +564,12 @@ __bam_new_file(dbp, ip, txn, fhp, name)
pginfo.type = dbp->type;
pdbt.data = &pginfo;
pdbt.size = sizeof(pginfo);
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(dbp, txn,
+ &dbp->blob_file_id)) != 0)
+ return (ret);
+
+ }
if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
return (ret);
meta = (BTMETA *)buf;
@@ -613,6 +648,12 @@ __bam_new_subdb(mdbp, dbp, ip, txn)
meta = NULL;
root = NULL;
+ if (dbp->blob_threshold) {
+ if ((ret = __blob_generate_dir_ids(dbp, txn,
+ &dbp->blob_sdb_id)) != 0)
+ return (ret);
+ }
+
if ((ret = __db_cursor(mdbp, ip, txn,
&dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
return (ret);
diff --git a/src/btree/bt_put.c b/src/btree/bt_put.c
index 13316181..5cd0ac12 100644
--- a/src/btree/bt_put.c
+++ b/src/btree/bt_put.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -56,8 +56,8 @@ static int __bam_dup_check __P((DBC *, u_int32_t,
static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
static int __bam_ovput
__P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
-static u_int32_t
- __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
+static int __bam_partsize
+ __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t *));
/*
* __bam_iitem --
@@ -71,18 +71,22 @@ __bam_iitem(dbc, key, data, op, flags)
DBT *key, *data;
u_int32_t op, flags;
{
+ BBLOB bl, blob_buf;
BKEYDATA *bk, bk_tmp;
BTREE *t;
BTREE_CURSOR *cp;
DB *dbp;
- DBT bk_hdr, tdbt;
+ DBT bk_hdr, blob_dbt, tdbt;
DB_MPOOLFILE *mpf;
ENV *env;
+ DB_LSN lsn;
PAGE *h;
db_indx_t cnt, indx;
+ off_t blob_size;
+ db_seq_t blob_id, new_blob_id;
u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
char tmp_ch;
- int cmp, bigkey, bigdata, del, dupadjust;
+ int cmp, bigkey, bigdata, blobdata, del, dupadjust;
int padrec, replace, ret, t_ret, was_deleted;
COMPQUIET(cnt, 0);
@@ -95,6 +99,7 @@ __bam_iitem(dbc, key, data, op, flags)
h = cp->page;
indx = cp->indx;
del = dupadjust = replace = was_deleted = 0;
+ blobdata = 0;
/*
* Fixed-length records with partial puts: it's an error to specify
@@ -112,8 +117,12 @@ __bam_iitem(dbc, key, data, op, flags)
* longer than the fixed-length, and we never require less than
* the fixed-length record size.
*/
- data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
- __bam_partsize(dbp, op, data, h, indx) : data->size;
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __bam_partsize(
+ dbp, op, data, h, indx, &data_size)) != 0)
+ return (ret);
+ } else
+ data_size = data->size;
padrec = 0;
if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
if (data_size > t->re_len)
@@ -190,6 +199,13 @@ __bam_iitem(dbc, key, data, op, flags)
}
if (!F_ISSET(data, DB_DBT_STREAMING) &&
(padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+ /* Partial puts need to be handled in the blob functions. */
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
+ O_INDX : 0));
+ if (B_TYPE(bk->type) == B_BLOB)
+ goto dup_cmp;
+ }
tdbt = *data;
if ((ret =
__bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
@@ -204,10 +220,10 @@ __bam_iitem(dbc, key, data, op, flags)
* screwing up the duplicate sort order. We have to do this after
* we build the real record so that we're comparing the real items.
*/
- if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+dup_cmp:if (op == DB_CURRENT && dbp->dup_compare != NULL) {
if ((ret = __bam_cmp(dbc, data, h,
indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
- dbp->dup_compare, &cmp)) != 0)
+ dbp->dup_compare, &cmp, NULL)) != 0)
return (ret);
if (cmp != 0) {
__db_errx(env, DB_STR("1004",
@@ -218,10 +234,30 @@ __bam_iitem(dbc, key, data, op, flags)
/*
* If the key or data item won't fit on a page, we'll have to store
- * them on overflow pages.
+ * them on overflow pages. The exception is if we are inserting
+ * into an existing blob file, in that case it remains a blob
+ * file regardless of its new size.
*/
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(
+ dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_BLOB) {
+ blobdata = 1;
+ bigdata = 0;
+ } else
+ bigdata = data_size > cp->ovflsize;
+ } else {
+ if (dbp->blob_threshold &&
+ (dbp->blob_threshold <= data_size ||
+ F_ISSET(data, DB_DBT_BLOB))) {
+ blobdata = 1;
+ bigdata = 0;
+ } else {
+ blobdata = 0;
+ bigdata = data_size > cp->ovflsize;
+ }
+ }
needed = 0;
- bigdata = data_size > cp->ovflsize;
switch (op) {
case DB_KEYFIRST:
/* We're adding a new key and data pair. */
@@ -232,6 +268,8 @@ __bam_iitem(dbc, key, data, op, flags)
needed += BKEYDATA_PSIZE(key->size);
if (bigdata)
needed += BOVERFLOW_PSIZE;
+ else if (blobdata)
+ needed += BBLOB_PSIZE;
else
needed += BKEYDATA_PSIZE(data_size);
break;
@@ -254,6 +292,8 @@ __bam_iitem(dbc, key, data, op, flags)
indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
if (B_TYPE(bk->type) == B_KEYDATA)
have_bytes = BKEYDATA_PSIZE(bk->len);
+ else if (B_TYPE(bk->type) == B_BLOB)
+ have_bytes = BBLOB_PSIZE;
else
have_bytes = BOVERFLOW_PSIZE;
need_bytes = 0;
@@ -263,6 +303,8 @@ __bam_iitem(dbc, key, data, op, flags)
}
if (bigdata)
need_bytes += BOVERFLOW_PSIZE;
+ else if (blobdata)
+ need_bytes += BBLOB_PSIZE;
else
need_bytes += BKEYDATA_PSIZE(data_size);
@@ -405,7 +447,8 @@ __bam_iitem(dbc, key, data, op, flags)
* because we're going to immediately re-add the item into the
* same slot.
*/
- if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ if (bigdata || (B_TYPE(bk->type) != B_KEYDATA &&
+ B_TYPE(bk->type) != B_BLOB)) {
/*
* If streaming, don't delete the overflow item,
* just delete the item pointing to the overflow item.
@@ -448,13 +491,65 @@ __bam_iitem(dbc, key, data, op, flags)
bk_hdr.size = SSZA(BKEYDATA, data);
ret = __db_pitem(dbc, h, indx,
BKEYDATA_SIZE(data->size), &bk_hdr, data);
- } else if (replace)
- ret = __bam_ritem(dbc, h, indx, data, 0);
- else
- ret = __db_pitem(dbc, h, indx,
- BKEYDATA_SIZE(data->size), NULL, data);
+ } else if (replace) {
+ /*
+ * If updating a blob, replace the blob file with the
+ * new blob data and update the blob db record.
+ */
+ if (blobdata) {
+ memcpy(&bl,
+ P_ENTRY(dbp, h, indx), BBLOB_SIZE);
+ memset(&blob_dbt, 0, sizeof(DBT));
+ blob_dbt.size = BBLOB_DSIZE;
+ if (F_ISSET(data, DB_DBT_BLOB_REC)) {
+ /*
+ * Replace the blob record with the
+ * blob record in the data DBT.
+ */
+ blob_dbt.data = BBLOB_DATA(data->data);
+ } else {
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(
+ dbp->env, bl, blob_size, ret);
+ if (ret != 0)
+ goto err;
+ if ((ret = __blob_repl(
+ dbc, data, blob_id,
+ &new_blob_id, &blob_size)) != 0)
+ goto err;
+ blob_dbt.data = BBLOB_DATA((&bl));
+ SET_BLOB_ID(&bl, new_blob_id, BBLOB);
+ SET_BLOB_SIZE(&bl, blob_size, BBLOB);
+ }
+ ret = __bam_ritem(
+ dbc, h, indx, &blob_dbt, B_BLOB);
+ } else
+ ret = __bam_ritem(dbc, h, indx, data, 0);
+ } else
+ if (blobdata) {
+ new_blob_id = 0;
+ blob_size = 0;
+ if ((ret = __blob_put(dbc, data,
+ &new_blob_id, &blob_size, &lsn)) != 0)
+ goto err;
+ memset(&blob_buf, 0, BBLOB_SIZE);
+ blob_buf.type = B_BLOB;
+ blob_buf.len = BBLOB_DSIZE;
+ tdbt.data = &blob_buf;
+ tdbt.size = BBLOB_SIZE;
+ SET_BLOB_ID(&blob_buf, new_blob_id, BBLOB);
+ SET_BLOB_SIZE(&blob_buf, blob_size, BBLOB);
+ SET_BLOB_FILE_ID(
+ &blob_buf, dbp->blob_file_id, BBLOB);
+ SET_BLOB_SDB_ID(
+ &blob_buf, dbp->blob_sdb_id, BBLOB);
+ ret = __db_pitem(dbc, h,
+ indx, BBLOB_SIZE, &tdbt, NULL);
+ } else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
}
- if (ret != 0) {
+err: if (ret != 0) {
if (del == 1 && (t_ret =
__bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) {
__db_err(env, t_ret, DB_STR("1005",
@@ -504,32 +599,61 @@ __bam_iitem(dbc, key, data, op, flags)
* __bam_partsize --
* Figure out how much space a partial data item is in total.
*/
-static u_int32_t
-__bam_partsize(dbp, op, data, h, indx)
+static int
+__bam_partsize(dbp, op, data, h, indx, data_size)
DB *dbp;
u_int32_t op, indx;
DBT *data;
PAGE *h;
+ u_int32_t *data_size;
{
+ BBLOB bl;
BKEYDATA *bk;
+ int ret;
+ off_t blob_size;
u_int32_t nbytes;
+ ret = 0;
+
/*
* If the record doesn't already exist, it's simply the data we're
* provided.
*/
- if (op != DB_CURRENT)
- return (data->doff + data->size);
+ if (op != DB_CURRENT) {
+ *data_size = data->doff + data->size;
+ return (0);
+ }
/*
* Otherwise, it's the data provided plus any already existing data
* that we're not replacing.
*/
bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
- nbytes =
- B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+ switch (B_TYPE(bk->type)) {
+ case B_BLOB:
+ memcpy(&bl, bk, BBLOB_SIZE);
+ GET_BLOB_SIZE(dbp->env, bl, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ /*
+ * It is not possible to add data past UINT32_MAX in the
+ * partial API, so this is safe.
+ */
+ if (blob_size > UINT32_MAX)
+ nbytes = UINT32_MAX;
+ else
+ nbytes = (u_int32_t)blob_size;
+ break;
+ case B_OVERFLOW:
+ nbytes = ((BOVERFLOW *)bk)->tlen;
+ break;
+ default:
+ nbytes = bk->len;
+ }
- return (__db_partsize(nbytes, data));
+ *data_size = __db_partsize(nbytes, data);
+
+ return (ret);
}
/*
@@ -848,6 +972,7 @@ __bam_irep(dbc, h, indx, hdr, data)
bi = GET_BINTERNAL(dbp, h, indx);
bn = (BINTERNAL *) hdr->data;
+ DB_ASSERT(dbc->env, B_TYPE(bi->type) != B_BLOB);
if (B_TYPE(bi->type) == B_OVERFLOW &&
(ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
return (ret);
@@ -892,6 +1017,7 @@ __bam_dup_check(dbc, op, h, indx, sz, cntp)
/* Count the key once. */
bk = GET_BKEYDATA(dbp, h, indx);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
sz += B_TYPE(bk->type) == B_KEYDATA ?
BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
@@ -994,6 +1120,7 @@ __bam_dup_convert(dbc, h, indx, cnt)
* overflow, then free up those pages).
*/
bk = GET_BKEYDATA(dbp, h, dindx + 1);
+ DB_ASSERT(dbc->env, B_TYPE(bk->type) != B_BLOB);
hdr.data = bk;
hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
diff --git a/src/btree/bt_rec.c b/src/btree/bt_rec.c
index 026564b6..eb44d04b 100644
--- a/src/btree/bt_rec.c
+++ b/src/btree/bt_rec.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_reclaim.c b/src/btree/bt_reclaim.c
index f465cc5a..1203ea35 100644
--- a/src/btree/bt_reclaim.c
+++ b/src/btree/bt_reclaim.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/btree/bt_recno.c b/src/btree/bt_recno.c
index 9356a742..abbd8efb 100644
--- a/src/btree/bt_recno.c
+++ b/src/btree/bt_recno.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -234,7 +234,7 @@ __ramc_del(dbc, flags)
retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
goto err;
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
stack = 1;
@@ -256,7 +256,7 @@ retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
* if the record was "deleted", we could never have found it.
*/
if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
@@ -391,7 +391,7 @@ retry: switch (flags) {
* a dup, so we set flags to DB_NEXT and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_NEXT_NODUP:
/*
@@ -431,7 +431,7 @@ retry: switch (flags) {
* is a dup, so we set flags to DB_PREV and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
- return (DB_NOTFOUND);
+ return (DBC_ERR(dbc, DB_NOTFOUND));
/* FALLTHROUGH */
case DB_PREV_NODUP:
/*
@@ -443,7 +443,7 @@ retry: switch (flags) {
flags = DB_PREV;
if (cp->recno != RECNO_OOB) {
if (cp->recno == 1) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
--cp->recno;
@@ -458,7 +458,7 @@ retry: switch (flags) {
if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
goto err;
if (cp->recno == 0) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
break;
@@ -476,7 +476,7 @@ retry: switch (flags) {
cp->recno++;
break;
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
/* NOTREACHED */
case DB_GET_BOTH:
@@ -522,7 +522,7 @@ retry: switch (flags) {
1, &exact)) != 0)
goto err;
if (!exact) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -561,22 +561,22 @@ retry: switch (flags) {
(void)__bam_stkrel(dbc, STK_CLRDBC);
continue;
}
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
default:
- ret = DB_KEYEMPTY;
+ ret = DBC_ERR(dbc, DB_KEYEMPTY);
goto err;
}
if (flags == DB_GET_BOTH ||
flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
- __bam_defcmp, &cmp)) != 0)
+ __bam_defcmp, &cmp, NULL)) != 0)
return (ret);
if (cmp == 0)
break;
if (!F_ISSET(dbc, DBC_OPD)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
(void)__bam_stkrel(dbc, STK_CLRDBC);
@@ -1331,7 +1331,7 @@ __ram_sread(dbc, top)
if (0) {
eof: t->re_eof = 1;
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
}
err: if (!was_modified)
t->re_modified = 0;
@@ -1368,7 +1368,7 @@ retry: /* Find the slot for insertion. */
if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
!B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
- ret = DB_KEYEXIST;
+ ret = DBC_ERR(dbc, DB_KEYEXIST);
goto err;
}
diff --git a/src/btree/bt_rsearch.c b/src/btree/bt_rsearch.c
index 36d1c667..4ada6e2d 100644
--- a/src/btree/bt_rsearch.c
+++ b/src/btree/bt_rsearch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -147,7 +147,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp)
__TLPUT(dbc, lock)) != 0 && ret == 0)
ret = t_ret;
if (ret == 0)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto done;
}
}
@@ -197,7 +197,8 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp)
lock)) != 0 && ret == 0)
ret = t_ret;
if (ret == 0)
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc,
+ DB_NOTFOUND);
goto err;
}
}
diff --git a/src/btree/bt_search.c b/src/btree/bt_search.c
index e809a852..e3d69d16 100644
--- a/src/btree/bt_search.c
+++ b/src/btree/bt_search.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -51,8 +51,9 @@
/*
* __bam_get_root --
- * Fetch the root of a tree and see if we want to keep
- * it in the stack.
+ * Try to appropriately lock and fetch the root page of a tree;
+ * if successful enter it into the cursor's stack; on error, leave the stack
+ * unchanged.
*
* PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
*/
@@ -232,9 +233,11 @@ retry: if (lock_mode == DB_LOCK_WRITE)
} else if (atomic_read(&mpf->mfp->multiversion) != 0 &&
lock_mode == DB_LOCK_WRITE && (ret = __memp_dirty(mpf, &h,
dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
- (void)__memp_fput(mpf,
- dbc->thread_info, h, dbc->priority);
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
(void)__LPUT(dbc, lock);
+ return (ret);
}
}
@@ -272,9 +275,10 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
db_recno_t recno;
int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
int getlock, was_next;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
u_int32_t get_mode, wait;
u_int8_t level, saved_level;
+ size_t pos, pos_h, pos_l;
if (F_ISSET(dbc, DBC_OPD))
LOCK_CHECK_OFF(dbc->thread_info);
@@ -288,6 +292,7 @@ __bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
t = dbp->bt_internal;
recno = 0;
t_ret = 0;
+ func = NULL;
BT_STK_CLR(cp);
LOCK_INIT(saved_lock);
@@ -339,11 +344,17 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
BT_STK_CLR(cp);
- /* Choose a comparison function. */
+ /*
+ * Choose a comparison function.
+ * We apply the prefix search optimization only when there
+ * is no user-specified comparison function set.
+ */
func = F_ISSET(dbc, DBC_OPD) ?
(dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
t->bt_compare;
+ pos_h = 0;
+ pos_l = 0;
for (;;) {
if (TYPE(h) == P_LBTREE)
adjust = P_INDX;
@@ -389,9 +400,11 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
* match on a leaf page, we're done.
*/
DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
+ /* We compare from the common prefix */
+ pos = pos_l > pos_h ? pos_h : pos_l;
DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
if ((ret = __bam_cmp(dbc, key, h, indx,
- func, &cmp)) != 0)
+ func, &cmp, &pos)) != 0)
goto err;
if (cmp == 0) {
if (LEVEL(h) == LEAFLEVEL ||
@@ -403,9 +416,19 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
}
goto next;
}
- if (cmp > 0)
+ /*
+ * We have to maintain the offset in the keys where
+ * we begin comparing for both ends of the key range
+ * in which we are binary searching. So, update either
+ * the high or low position here, depending on how
+ * the comparison turned out.
+ */
+ if (cmp > 0) {
DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
lim, adjust);
+ pos_l = pos;
+ } else
+ pos_h = pos;
}
/*
@@ -421,7 +444,7 @@ retry: if ((ret = __bam_get_root(dbc, start_pgno, slevel, flags, &stack)) != 0)
*exactp = 0;
if (LF_ISSET(SR_EXACT)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
@@ -444,13 +467,13 @@ get_next: /*
* at the root if the tree recently collapsed.
*/
if (PGNO(h) == root_pgno) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
indx = cp->sp->indx + 1;
if (indx == NUM_ENT(cp->sp->page)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
cp->csp++;
goto err;
}
@@ -863,7 +886,7 @@ found: *exactp = 1;
* DB_NOTFOUND.
*/
if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
- ret = DB_NOTFOUND;
+ ret = DBC_ERR(dbc, DB_NOTFOUND);
goto err;
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 8299c69a..f7719dc4 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 1990, 1993, 1994, 1995, 1996
@@ -63,7 +63,7 @@ __bam_split(dbc, arg, root_pgnop)
db_pgno_t *root_pgnop;
{
BTREE_CURSOR *cp;
- DB_LOCK metalock, next_lock;
+ DB_LOCK meta_lock, next_lock;
enum { UP, DOWN } dir;
db_pgno_t pgno, next_pgno, root_pgno;
int exact, level, ret;
@@ -72,17 +72,16 @@ __bam_split(dbc, arg, root_pgnop)
LOCK_CHECK_OFF(dbc->thread_info);
cp = (BTREE_CURSOR *)dbc->internal;
+ LOCK_INIT(meta_lock);
LOCK_INIT(next_lock);
next_pgno = PGNO_INVALID;
/*
- * First get a lock on the metadata page, we will have to allocate
+ * First get a lock on the metadata page; we will have to allocate
* pages and cannot get a lock while we have the search tree pinned.
*/
-
pgno = PGNO_BASE_MD;
- if ((ret = __db_lget(dbc,
- 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
goto err;
root_pgno = BAM_ROOT_PGNO(dbc);
@@ -189,7 +188,7 @@ no_split: /* Once we've split the leaf page, we're done. */
if (root_pgnop != NULL)
*root_pgnop = BAM_ROOT_PGNO(dbc);
err:
-done: (void)__LPUT(dbc, metalock);
+done: (void)__LPUT(dbc, meta_lock);
(void)__TLPUT(dbc, next_lock);
if (F_ISSET(dbc, DBC_OPD))
@@ -685,6 +684,7 @@ __bam_broot(dbc, rootp, split, lp, rp)
DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -772,7 +772,30 @@ __ram_root(dbc, rootp, lp, rp)
/*
* __bam_pinsert --
- * Insert a new key into a parent page, completing the split.
+ *
+ * Construct an internal index item and place it in the parent page. It is
+ * primarily used by __bam_page() to add a new page into the tree. The sole
+ * other use is by __bam_pupdate() after a reverse split or compact has
+ * removed pages underneath it, in order to replace the parent's key/nrecs
+ * to match the new subtree.
+ *
+ * Parameters:
+ * parent - the page from the cursor stack to be modifed. The next entry
+ * in the stack (i.e., the next lower level in the tree) contains
+ * the key of the new item. The indx field must have been set
+ * when searching down the tree, to point to the new/replaced
+ * parent item.
+ * split - the indx in the cursor stack of the 'source' of the new item.
+ * lchild - the left child page is used *only* when attempting to use
+ * prefix key compression on a leaf (data) page.
+ * rchild - right child page. The source of the pgno of the new item.
+ * flags - BPI_REPLACE | BPI_NORECNUM
+ * BPI_NOLOGGING
+ *
+ * The pgno of the item always comes from rchild, which often is the same
+ * as parent[1].page. The key for DB_BTREE comes from the next lower page
+ * in the stack under parent, not from either lchild or rchild parameter --
+ * though often rchild is a copy of parent[1].page.
*
* PUBLIC: int __bam_pinsert
* PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
@@ -867,12 +890,27 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags)
size = BINTERNAL_SIZE(child_bi->len);
break;
case B_OVERFLOW:
- /* Reuse the overflow key. */
+ /* Copy the overflow key. */
child_bo = (BOVERFLOW *)child_bi->data;
memset(&bo, 0, sizeof(bo));
bo.type = B_OVERFLOW;
bo.tlen = child_bo->tlen;
- bo.pgno = child_bo->pgno;
+ if (LF_ISSET(BPI_REPLACE)) {
+ /*
+ * Replace (compact or reverse split) needs to
+ * copy in case the data item gets removed.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr,
+ child_bo->tlen, child_bo->pgno,
+ &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+ } else
+ bo.pgno = child_bo->pgno;
bi.len = BOVERFLOW_SIZE;
B_TSET(bi.type, B_OVERFLOW);
bi.pgno = rchild->pgno;
@@ -881,6 +919,7 @@ __bam_pinsert(dbc, parent, split, lchild, rchild, flags)
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -982,8 +1021,8 @@ noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
-
break;
+ case B_BLOB:
case B_DUPLICATE:
default:
goto pgfmt;
@@ -1153,23 +1192,32 @@ __bam_psplit(dbc, cp, lp, rp, splitret)
nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
break;
case P_LBTREE:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
- B_KEYDATA)
- nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, off)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) {
+ case B_KEYDATA:
+ nbytes += BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, off)->len);
+ break;
+ case B_BLOB:
+ nbytes += BBLOB_SIZE;
+ break;
+ default:
nbytes += BOVERFLOW_SIZE;
-
+ }
++off;
/* FALLTHROUGH */
case P_LDUP:
case P_LRECNO:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
- B_KEYDATA)
- nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, off)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type)) {
+ case B_KEYDATA:
+ nbytes += BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, off)->len);
+ break;
+ case B_BLOB:
+ nbytes += BBLOB_SIZE;
+ break;
+ default:
nbytes += BOVERFLOW_SIZE;
+ }
break;
case P_IRECNO:
nbytes += RINTERNAL_SIZE;
@@ -1269,7 +1317,7 @@ __bam_copy(dbp, pp, cp, nxt, stop)
PAGE *pp, *cp;
u_int32_t nxt, stop;
{
- BINTERNAL internal;
+ BINTERNAL *bi, internal;
db_indx_t *cinp, nbytes, off, *pinp;
cinp = P_INP(dbp, cp);
@@ -1302,12 +1350,17 @@ __bam_copy(dbp, pp, cp, nxt, stop)
/* FALLTHROUGH */
case P_LDUP:
case P_LRECNO:
- if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
- B_KEYDATA)
- nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
- pp, nxt)->len);
- else
+ switch (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type)) {
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(
+ GET_BKEYDATA(dbp, pp, nxt)->len);
+ break;
+ case B_BLOB:
+ nbytes = BBLOB_SIZE;
+ break;
+ default:
nbytes = BOVERFLOW_SIZE;
+ }
break;
case P_IRECNO:
nbytes = RINTERNAL_SIZE;
@@ -1316,17 +1369,18 @@ __bam_copy(dbp, pp, cp, nxt, stop)
return (__db_pgfmt(dbp->env, pp->pgno));
}
cinp[off] = HOFFSET(cp) -= nbytes;
+ /* Minimize the first key on an IBTREE page; it isn't valid. */
+ bi = GET_BINTERNAL(dbp, pp, nxt);
if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
internal.len = 0;
UMRW_SET(internal.unused);
internal.type = B_KEYDATA;
- internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
- internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ internal.pgno = bi->pgno;
+ internal.nrecs = bi->nrecs;
memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
}
else
- memcpy(P_ENTRY(dbp, cp, off),
- P_ENTRY(dbp, pp, nxt), nbytes);
+ memcpy(P_ENTRY(dbp, cp, off), bi, nbytes);
}
return (0);
}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 668c4fdb..04c0fbcb 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -278,6 +278,8 @@ __bam_stat_print(dbc, flags)
"%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
}
__db_dl(env,
+ "Number of pages in the database", (u_long)sp->bt_pagecnt);
+ __db_dl(env,
"Underlying database page size", (u_long)sp->bt_pagesize);
if (dbp->type == DB_BTREE)
__db_dl(env, "Overflow key/data size",
@@ -288,6 +290,10 @@ __bam_stat_print(dbc, flags)
"Number of records in the tree", (u_long)sp->bt_nkeys);
__db_dl(env,
"Number of data items in the tree", (u_long)sp->bt_ndata);
+ if (dbp->type == DB_BTREE) {
+ __db_dl(env,
+ "Number of blobs in the tree", (u_long)sp->bt_nblobs);
+ }
__db_dl(env,
"Number of tree internal pages", (u_long)sp->bt_int_pg);
@@ -372,6 +378,10 @@ __bam_stat_callback(dbc, h, cookie, putp)
/* Ignore off-page duplicates. */
if (B_TYPE(type) != B_DUPLICATE)
++sp->bt_ndata;
+
+ /* Count blobs. */
+ if (B_TYPE(type) == B_BLOB)
+ ++sp->bt_nblobs;
}
++sp->bt_leaf_pg;
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
index c9123351..66e27d56 100644
--- a/src/btree/bt_upgrade.c
+++ b/src/btree/bt_upgrade.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_upgrade.h"
#include "dbinc/btree.h"
@@ -151,3 +152,94 @@ __bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
return (ret);
}
+
+/*
+ * __bam_60_btreemeta --
+ * Upgrade the version number.
+ *
+ * PUBLIC: int __bam_60_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_60_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA33 *bmeta;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ COMPQUIET(dbp, NULL);
+ bmeta = (BTMETA33 *)h;
+
+ bmeta->dbmeta.version = 10;
+ *dirtyp = 1;
+
+ return (0);
+}
+
+/*
+ * __bam_60_lbtree --
+ * Upgrade the blob records on the database btree leaf pages.
+ *
+ * PUBLIC: int __bam_60_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_60_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BBLOB60 bl60;
+ BBLOB60P1 bl60p1;
+ BKEYDATA *bk;
+ db_seq_t blob_id, blob_size, file_id, sdb_id;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+ ret = 0;
+
+ DB_ASSERT(dbp->env, BBLOB60_SIZE == BBLOB_SIZE);
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_BLOB ) {
+ memcpy(&bl60, bk, BBLOB60_SIZE);
+ memset(&bl60p1, 0, BBLOB_SIZE);
+ bl60p1.type = bl60.type;
+ bl60p1.len = BBLOB_DSIZE;
+ bl60p1.encoding = bl60.encoding;
+ GET_BLOB60_ID(dbp->env, bl60, blob_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SIZE(dbp->env, bl60, blob_size, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_FILE_ID(dbp->env, &bl60, file_id, ret);
+ if (ret != 0)
+ return (ret);
+ GET_BLOB60_SDB_ID(dbp->env, &bl60, sdb_id, ret);
+ if (ret != 0)
+ return (ret);
+ SET_BLOB_ID(&bl60p1, blob_id, BBLOB60P1);
+ SET_BLOB_SIZE(&bl60p1, blob_size, BBLOB60P1);
+ SET_BLOB_FILE_ID(&bl60p1, file_id, BBLOB60P1);
+ SET_BLOB_SDB_ID(&bl60p1, sdb_id, BBLOB60P1);
+ memcpy(bk, &bl60p1, BBLOB_SIZE);
+ *dirtyp = 1;
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/btree/bt_verify.c b/src/btree/bt_verify.c
index 99354a58..8ceb50e6 100644
--- a/src/btree/bt_verify.c
+++ b/src/btree/bt_verify.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,6 +9,7 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/blob.h"
#include "dbinc/db_page.h"
#include "dbinc/db_verify.h"
#include "dbinc/btree.h"
@@ -20,8 +21,8 @@ static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
db_indx_t *, u_int32_t));
static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
- BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
- u_int32_t));
+ BINTERNAL *, BINTERNAL *,
+ int (*)(DB *, const DBT *, const DBT *, size_t *), u_int32_t));
static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
db_indx_t *, u_int32_t));
@@ -44,6 +45,7 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
VRFY_PAGEINFO *pip;
int isbad, t_ret, ret;
db_indx_t ovflsize;
+ db_seq_t blob_id;
env = dbp->env;
isbad = 0;
@@ -201,6 +203,56 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
"%lu %lu"), (u_long)pgno, (u_long)pip->re_len));
}
+/*
+ * Where 64-bit integer support is not available,
+ * return an error if the file has any blobs.
+ */
+ t_ret = 0;
+#ifdef HAVE_64BIT_TYPES
+ GET_BLOB_FILE_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1187",
+ "Page %lu: blob file id overflow.", "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ t_ret = 0;
+ GET_BLOB_SDB_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1188",
+ "Page %lu: blob subdatabase id overflow.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#else /* HAVE_64BIT_TYPES */
+ /*
+ * db_seq_t is an int on systems that do not have 64-bit integers, so
+ * this will compile and run.
+ */
+ GET_BLOB_FILE_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1200",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+ t_ret = 0;
+ GET_BLOB_SDB_ID(env, meta, blob_id, t_ret);
+ if (t_ret != 0 || blob_id != 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1201",
+ "Page %lu: blobs require 64 integer compiler support.",
+ "%lu"), (u_long)pgno));
+ if (ret == 0)
+ ret = t_ret;
+ }
+#endif
+
/*
* We do not check that the rest of the page is 0, because it may
* not be and may still be correct.
@@ -268,8 +320,7 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
if (F_ISSET(pip, VRFY_HAS_DUPS)) {
EPRINT((env, DB_STR_A("1043",
- "Page %lu: Recno database has dups",
- "%lu"), (u_long)pgno));
+ "Page %lu: Recno database has dups", "%lu"), (u_long)pgno));
ret = DB_VERIFY_BAD;
goto err;
}
@@ -547,12 +598,15 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
db_indx_t *nentriesp;
u_int32_t flags;
{
+ BBLOB bl;
BKEYDATA *bk;
BOVERFLOW *bo;
ENV *env;
VRFY_CHILDINFO child;
VRFY_ITEM *pagelayout;
VRFY_PAGEINFO *pip;
+ off_t blob_size;
+ db_seq_t blob_id, file_id, sdb_id;
u_int32_t himark, offset; /*
* These would be db_indx_ts
* but for alignment.
@@ -563,6 +617,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
env = dbp->env;
isbad = isdupitem = 0;
nentries = 0;
+ file_id = sdb_id = 0;
memset(&child, 0, sizeof(VRFY_CHILDINFO));
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
@@ -668,6 +723,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
else
endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
break;
+ case B_BLOB:
+ endoff = offset + BBLOB_SIZE - 1;
+ break;
case B_DUPLICATE:
/*
* Flag that we have dups; we'll check whether
@@ -731,6 +789,52 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
* already been done.
*/
break;
+ case B_BLOB:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1189",
+ "Page %lu: blob item in internal btree page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1190",
+ "Page %lu: blob item referenced by recno page at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ /*
+ * Blob item. Check that the blob file exists and is
+ * the same file size as is stored in the database
+ * record.
+ */
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0 || blob_size < 0) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1192",
+ "Page %lu: blob file size value has overflowed at item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
+ break;
+ }
+ file_id = (db_seq_t)bl.file_id;
+ sdb_id = (db_seq_t)bl.sdb_id;
+ if (file_id == 0 && sdb_id == 0) {
+ isbad = 1;
+ EPRINT((dbp->env, DB_STR_A("1195",
+ "Page %lu: invalid blob dir ids %llu %llu at item %lu",
+ "%lu %ll %ll %lu"), (u_long)pip->pgno,
+ (long long)file_id,
+ (long long)sdb_id, (u_long)i));
+ break;
+ }
+ if ((ret = __blob_vrfy(env, blob_id,
+ blob_size, file_id, sdb_id, pgno, flags)) != 0) {
+ isbad = 1;
+ break;
+ }
+ break;
case B_DUPLICATE:
if (TYPE(h) == P_IBTREE) {
isbad = 1;
@@ -751,9 +855,17 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
(BOVERFLOW *)(((BINTERNAL *)bk)->data) :
(BOVERFLOW *)bk;
- if (B_TYPE(bk->type) == B_OVERFLOW)
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (TYPE(h) == P_IBTREE &&
+ bk->len != BOVERFLOW_SIZE) {
+ EPRINT((env, DB_STR_A("1196",
+ "Page %lu: bad length %u in B_OVERFLOW item %lu",
+ "%lu %u %lu"),
+ (u_long)pgno, bk->len, (u_long)i));
+ isbad = 1;
+ }
/* Make sure tlen is reasonable. */
- if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ if (bo->tlen >= dbp->pgsize * vdp->last_pgno) {
isbad = 1;
EPRINT((env, DB_STR_A("1056",
"Page %lu: impossible tlen %lu, item %lu",
@@ -762,6 +874,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
/* Don't save as a child. */
break;
}
+ }
if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
bo->pgno == PGNO_INVALID) {
@@ -918,8 +1031,8 @@ __bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
VRFY_PAGEINFO *pip;
db_indx_t i, *inp;
int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
- int (*dupfunc) __P((DB *, const DBT *, const DBT *));
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *, size_t *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
void *buf1, *buf2, *tmpbuf;
/*
@@ -1066,6 +1179,11 @@ retry: p1 = &dbta;
if (B_TYPE(bk->type) == B_OVERFLOW) {
bo = (BOVERFLOW *)bk;
goto overflow;
+ } else if (B_TYPE(bk->type) == B_BLOB) {
+ isbad = 1;
+ EPRINT((env, DB_STR_A("1197",
+ "Page %lu: Blob found in key item %lu",
+ "%lu %lu"), (u_long)pgno, (u_long)i));
} else {
p2->data = bk->data;
p2->size = bk->len;
@@ -1124,7 +1242,8 @@ overflow: if (!ovflok) {
/* Compare with the last key. */
if (p1->data != NULL && p2->data != NULL) {
- cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
+ cmp = inp[i] == inp[i - adj] ? 0 :
+ func(dbp, p1, p2, NULL);
/* comparison succeeded */
if (cmp > 0) {
@@ -1236,8 +1355,8 @@ overflow: if (!ovflok) {
* until we do the structure check
* and see whether DUPSORT is set.
*/
- if (dupfunc(dbp, &dup_1, &dup_2) > 0 &&
- pip != NULL)
+ if (dupfunc(dbp, &dup_1, &dup_2,
+ NULL) > 0 && pip != NULL)
F_SET(pip, VRFY_DUPS_UNSORTED);
if (freedup_1)
@@ -1409,7 +1528,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
db_recno_t child_nrecs, nrecs;
u_int32_t child_level, child_relen, j, level, relen, stflags;
u_int8_t leaf_type;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
int isbad, p, ret, t_ret, toplevel;
if (levelp != NULL) /* Don't leave uninitialized on error. */
@@ -1524,7 +1643,7 @@ __bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
* Don't do the prev/next_pgno checks if we've lost
* leaf pages due to another corruption.
*/
- if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
+ if (!F_ISSET(vdp, SALVAGE_LEAFCHAIN_BROKEN)) {
if (pip->pgno != vdp->next_pgno) {
isbad = 1;
EPRINT((env, DB_STR_A("1075",
@@ -1547,7 +1666,7 @@ bad_prev: isbad = 1;
}
vdp->prev_pgno = pip->pgno;
vdp->next_pgno = pip->next_pgno;
- F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
+ F_CLR(vdp, SALVAGE_LEAFCHAIN_BROKEN);
/*
* Overflow pages are common to all three leaf types;
@@ -1694,7 +1813,7 @@ bad_prev: isbad = 1;
* spew error messages about erroneous prev/next_pgnos,
* since that's probably not the real problem.
*/
- F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
+ F_SET(vdp, SALVAGE_LEAFCHAIN_BROKEN);
ret = DB_VERIFY_BAD;
goto err;
@@ -2042,7 +2161,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
DB_THREAD_INFO *ip;
PAGE *h;
BINTERNAL *lp, *rp;
- int (*func) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
u_int32_t flags;
{
BOVERFLOW *bo;
@@ -2050,7 +2169,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
DBT dbt;
ENV *env;
db_indx_t last;
- int ret, cmp;
+ int cmp, ret, t_ret;
env = dbp->env;
memset(&dbt, 0, sizeof(DBT));
@@ -2077,7 +2196,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
}
- /* Populate a dummy cursor. */
if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
return (ret);
@@ -2095,9 +2213,6 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
* parent and falsely report a failure.)
*/
if (lp != NULL && TYPE(h) != P_IBTREE) {
- if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
- PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
- return (ret);
if (lp->type == B_KEYDATA) {
dbt.data = lp->data;
dbt.size = lp->len;
@@ -2105,13 +2220,13 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
bo = (BOVERFLOW *)lp->data;
if ((ret = __db_goff(dbc, &dbt,
bo->tlen, bo->pgno, NULL, NULL)) != 0)
- return (ret);
- } else
- return (
- __db_unknown_path(env, "__bam_vrfy_treeorder"));
+ goto err;
+ } else {
+ ret = __db_unknown_path(env, "__bam_vrfy_treeorder");
+ goto err;
+ }
- /* On error, fall through, free if needed, and return. */
- if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
+ if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp, NULL)) == 0) {
if (cmp > 0) {
EPRINT((env, DB_STR_A("1092",
"Page %lu: first item on page sorted greater than parent entry",
@@ -2126,7 +2241,7 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
if (dbt.data != lp->data)
__os_ufree(env, dbt.data);
if (ret != 0)
- return (ret);
+ goto err;
}
if (rp != NULL) {
@@ -2137,13 +2252,14 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
bo = (BOVERFLOW *)rp->data;
if ((ret = __db_goff(dbc, &dbt,
bo->tlen, bo->pgno, NULL, NULL)) != 0)
- return (ret);
- } else
- return (
- __db_unknown_path(env, "__bam_vrfy_treeorder"));
+ goto err;
+ } else {
+ ret = __db_unknown_path(env, "__bam_vrfy_treeorder");
+ goto err;
+ }
- /* On error, fall through, free if needed, and return. */
- if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
+ if ((ret = __bam_cmp(dbc,
+ &dbt, h, last, func, &cmp, NULL)) == 0) {
if (cmp < 0) {
EPRINT((env, DB_STR_A("1094",
"Page %lu: last item on page sorted greater than parent entry",
@@ -2158,6 +2274,9 @@ __bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
if (dbt.data != rp->data)
__os_ufree(env, dbt.data);
}
+err:
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
return (ret);
}
@@ -2186,14 +2305,20 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
{
BKEYDATA *bk;
BOVERFLOW *bo;
+ BBLOB bl;
DBT dbt, repldbt, unknown_key, unknown_data;
ENV *env;
VRFY_ITEM *pgmap;
db_indx_t i, last, beg, end, *inp;
db_pgno_t ovflpg;
+ off_t blob_size, blob_offset, remaining;
+ u_int32_t blob_buf_size;
+ u_int8_t *blob_buf;
u_int32_t himark, ovfl_bufsz;
+ db_seq_t blob_id, file_id, sdb_id;
void *ovflbuf;
int adj, ret, t_ret, t2_ret;
+ char *prefix;
#ifdef HAVE_COMPRESSION
DBT kcpy, *last_key;
int unknown_dup_key;
@@ -2202,6 +2327,8 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
env = dbp->env;
ovflbuf = pgmap = NULL;
inp = P_INP(dbp, h);
+ blob_buf_size = 0;
+ blob_buf = NULL;
memset(&dbt, 0, sizeof(DBT));
dbt.flags = DB_DBT_REALLOC;
@@ -2543,6 +2670,68 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
}
#endif
break;
+ case B_BLOB:
+ memcpy(&bl, bk, BBLOB_SIZE);
+ blob_id = (db_seq_t)bl.id;
+ GET_BLOB_SIZE(env, bl, blob_size, ret);
+ if (ret != 0 || blob_size < 0)
+ goto err;
+ file_id = (db_seq_t)bl.file_id;
+ sdb_id = (db_seq_t)bl.sdb_id;
+
+ /* Read the blob, in pieces if it is too large. */
+ blob_offset = 0;
+ if (blob_size > MEGABYTE) {
+ if (blob_buf_size < MEGABYTE) {
+ if ((ret = __os_realloc(
+ env, MEGABYTE, &blob_buf)) != 0)
+ goto err;
+ blob_buf_size = MEGABYTE;
+ }
+ } else if (blob_buf_size < blob_size) {
+ blob_buf_size = (u_int32_t)blob_size;
+ if ((ret = __os_realloc(env,
+ blob_buf_size, &blob_buf)) != 0)
+ goto err;
+ }
+ dbt.data = blob_buf;
+ dbt.ulen = blob_buf_size;
+ remaining = blob_size;
+ prefix = " ";
+ do {
+ if ((ret = __blob_salvage(env, blob_id,
+ blob_offset,
+ ((remaining < blob_buf_size) ?
+ (size_t)remaining : blob_buf_size),
+ file_id, sdb_id, &dbt)) != 0) {
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ if (remaining > blob_buf_size)
+ F_SET(vdp, SALVAGE_STREAM_BLOB);
+ else
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ if ((t_ret = __db_vrfy_prdbt(
+ &dbt, 0, prefix,
+ handle, callback, 0, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ goto err;
+ }
+ prefix = NULL;
+ blob_offset += dbt.size;
+ if (remaining < blob_buf_size)
+ remaining = 0;
+ else
+ remaining -= blob_buf_size;
+ } while (remaining > 0);
+ F_CLR(vdp, SALVAGE_STREAM_BLOB);
+ break;
default:
/*
* We should never get here; __db_vrfy_inpitem should
@@ -2572,6 +2761,8 @@ err: if (pgmap != NULL)
__os_free(env, ovflbuf);
if (repldbt.data != NULL)
__os_free(env, repldbt.data);
+ if (blob_buf != NULL)
+ __os_free(env, blob_buf);
#ifdef HAVE_COMPRESSION
if (kcpy.data != NULL)
__os_free(env, kcpy.data);
diff --git a/src/btree/btree.src b/src/btree/btree.src
index 08e5a206..02088b88 100644
--- a/src/btree/btree.src
+++ b/src/btree/btree.src
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/