summaryrefslogtreecommitdiff
path: root/bdb/db
diff options
context:
space:
mode:
authorunknown <ram@mysql.r18.ru>2002-10-30 15:57:05 +0400
committerunknown <ram@mysql.r18.ru>2002-10-30 15:57:05 +0400
commit155e78f014de1a2e259ae5119f4621fbb210a784 (patch)
tree6881a3cca88bea0bb9eeffd5aae34be437152786 /bdb/db
parentb8798d25ab71436bf690ee8ae48285a655c5487e (diff)
downloadmariadb-git-155e78f014de1a2e259ae5119f4621fbb210a784.tar.gz
BDB 4.1.24
BitKeeper/deleted/.del-ex_access.wpj~3df6ae8c99bf7c5f: Delete: bdb/build_vxworks/ex_access/ex_access.wpj BitKeeper/deleted/.del-ex_btrec.wpj~a7622f1c6f432dc6: Delete: bdb/build_vxworks/ex_btrec/ex_btrec.wpj BitKeeper/deleted/.del-ex_dbclient.wpj~7345440f3b204cdd: Delete: bdb/build_vxworks/ex_dbclient/ex_dbclient.wpj BitKeeper/deleted/.del-ex_env.wpj~fbe1ab10b04e8b74: Delete: bdb/build_vxworks/ex_env/ex_env.wpj BitKeeper/deleted/.del-ex_mpool.wpj~4479cfd5c45f327d: Delete: bdb/build_vxworks/ex_mpool/ex_mpool.wpj BitKeeper/deleted/.del-ex_tpcb.wpj~f78093006e14bf41: Delete: bdb/build_vxworks/ex_tpcb/ex_tpcb.wpj BitKeeper/deleted/.del-db_buildall.dsp~bd749ff6da11682: Delete: bdb/build_win32/db_buildall.dsp BitKeeper/deleted/.del-cxx_app.cpp~ad8df8e0791011ed: Delete: bdb/cxx/cxx_app.cpp BitKeeper/deleted/.del-cxx_log.cpp~a50ff3118fe06952: Delete: bdb/cxx/cxx_log.cpp BitKeeper/deleted/.del-cxx_table.cpp~ecd751e79b055556: Delete: bdb/cxx/cxx_table.cpp BitKeeper/deleted/.del-namemap.txt~796a3acd3885d8fd: Delete: bdb/cxx/namemap.txt BitKeeper/deleted/.del-Design.fileop~3ca4da68f1727373: Delete: bdb/db/Design.fileop BitKeeper/deleted/.del-db185_int.h~61bee3736e7959ef: Delete: bdb/db185/db185_int.h BitKeeper/deleted/.del-acconfig.h~411e8854d67ad8b5: Delete: bdb/dist/acconfig.h BitKeeper/deleted/.del-mutex.m4~a13383cde18a64e1: Delete: bdb/dist/aclocal/mutex.m4 BitKeeper/deleted/.del-options.m4~b9d0ca637213750a: Delete: bdb/dist/aclocal/options.m4 BitKeeper/deleted/.del-programs.m4~3ce7890b47732b30: Delete: bdb/dist/aclocal/programs.m4 BitKeeper/deleted/.del-tcl.m4~f944e2db93c3b6db: Delete: bdb/dist/aclocal/tcl.m4 BitKeeper/deleted/.del-types.m4~59cae158c9a32cff: Delete: bdb/dist/aclocal/types.m4 BitKeeper/deleted/.del-script~d38f6d3a4f159cb4: Delete: bdb/dist/build/script BitKeeper/deleted/.del-configure.in~ac795a92c8fe049c: Delete: bdb/dist/configure.in BitKeeper/deleted/.del-ltconfig~66bbd007d8024af: Delete: bdb/dist/ltconfig BitKeeper/deleted/.del-rec_ctemp~a28554362534f00a: 
Delete: bdb/dist/rec_ctemp BitKeeper/deleted/.del-s_tcl~2ffe4326459fcd9f: Delete: bdb/dist/s_tcl BitKeeper/deleted/.del-.IGNORE_ME~d8148b08fa7d5d15: Delete: bdb/dist/template/.IGNORE_ME BitKeeper/deleted/.del-btree.h~179f2aefec1753d: Delete: bdb/include/btree.h BitKeeper/deleted/.del-cxx_int.h~6b649c04766508f8: Delete: bdb/include/cxx_int.h BitKeeper/deleted/.del-db.src~6b433ae615b16a8d: Delete: bdb/include/db.src BitKeeper/deleted/.del-db_185.h~ad8b373d9391d35c: Delete: bdb/include/db_185.h BitKeeper/deleted/.del-db_am.h~a714912b6b75932f: Delete: bdb/include/db_am.h BitKeeper/deleted/.del-db_cxx.h~fcafadf45f5d19e9: Delete: bdb/include/db_cxx.h BitKeeper/deleted/.del-db_dispatch.h~6844f20f7eb46904: Delete: bdb/include/db_dispatch.h BitKeeper/deleted/.del-db_int.src~419a3f48b6a01da7: Delete: bdb/include/db_int.src BitKeeper/deleted/.del-db_join.h~76f9747a42c3399a: Delete: bdb/include/db_join.h BitKeeper/deleted/.del-db_page.h~e302ca3a4db3abdc: Delete: bdb/include/db_page.h BitKeeper/deleted/.del-db_server_int.h~e1d20b6ba3bca1ab: Delete: bdb/include/db_server_int.h BitKeeper/deleted/.del-db_shash.h~5fbf2d696fac90f3: Delete: bdb/include/db_shash.h BitKeeper/deleted/.del-db_swap.h~1e60887550864a59: Delete: bdb/include/db_swap.h BitKeeper/deleted/.del-db_upgrade.h~c644eee73701fc8d: Delete: bdb/include/db_upgrade.h BitKeeper/deleted/.del-db_verify.h~b8d6c297c61f342e: Delete: bdb/include/db_verify.h BitKeeper/deleted/.del-debug.h~dc2b4f2cf27ccebc: Delete: bdb/include/debug.h BitKeeper/deleted/.del-hash.h~2aaa548b28882dfb: Delete: bdb/include/hash.h BitKeeper/deleted/.del-lock.h~a761c1b7de57b77f: Delete: bdb/include/lock.h BitKeeper/deleted/.del-log.h~ff20184238e35e4d: Delete: bdb/include/log.h BitKeeper/deleted/.del-mp.h~7e317597622f3411: Delete: bdb/include/mp.h BitKeeper/deleted/.del-mutex.h~d3ae7a2977a68137: Delete: bdb/include/mutex.h BitKeeper/deleted/.del-os.h~91867cc8757cd0e3: Delete: bdb/include/os.h BitKeeper/deleted/.del-os_jump.h~e1b939fa5151d4be: Delete: 
bdb/include/os_jump.h BitKeeper/deleted/.del-qam.h~6fad0c1b5723d597: Delete: bdb/include/qam.h BitKeeper/deleted/.del-queue.h~4c72c0826c123d5: Delete: bdb/include/queue.h BitKeeper/deleted/.del-region.h~513fe04d977ca0fc: Delete: bdb/include/region.h BitKeeper/deleted/.del-shqueue.h~525fc3e6c2025c36: Delete: bdb/include/shqueue.h BitKeeper/deleted/.del-tcl_db.h~c536fd61a844f23f: Delete: bdb/include/tcl_db.h BitKeeper/deleted/.del-txn.h~c8d94b221ec147e4: Delete: bdb/include/txn.h BitKeeper/deleted/.del-xa.h~ecc466493aae9d9a: Delete: bdb/include/xa.h BitKeeper/deleted/.del-DbRecoveryInit.java~756b52601a0b9023: Delete: bdb/java/src/com/sleepycat/db/DbRecoveryInit.java BitKeeper/deleted/.del-DbTxnRecover.java~74607cba7ab89d6d: Delete: bdb/java/src/com/sleepycat/db/DbTxnRecover.java BitKeeper/deleted/.del-lock_conflict.c~fc5e0f14cf597a2b: Delete: bdb/lock/lock_conflict.c BitKeeper/deleted/.del-log.src~53ac9e7b5cb023f2: Delete: bdb/log/log.src BitKeeper/deleted/.del-log_findckp.c~24287f008916e81f: Delete: bdb/log/log_findckp.c BitKeeper/deleted/.del-log_rec.c~d51711f2cac09297: Delete: bdb/log/log_rec.c BitKeeper/deleted/.del-log_register.c~b40bb4efac75ca15: Delete: bdb/log/log_register.c BitKeeper/deleted/.del-Design~b3d0f179f2767b: Delete: bdb/mp/Design BitKeeper/deleted/.del-os_finit.c~95dbefc6fe79b26c: Delete: bdb/os/os_finit.c BitKeeper/deleted/.del-os_abs.c~df95d1e7db81924: Delete: bdb/os_vxworks/os_abs.c BitKeeper/deleted/.del-os_finit.c~803b484bdb9d0122: Delete: bdb/os_vxworks/os_finit.c BitKeeper/deleted/.del-os_map.c~3a6d7926398b76d3: Delete: bdb/os_vxworks/os_map.c BitKeeper/deleted/.del-os_finit.c~19a227c6d3c78ad: Delete: bdb/os_win32/os_finit.c BitKeeper/deleted/.del-log-corruption.patch~1cf2ecc7c6408d5d: Delete: bdb/patches/log-corruption.patch BitKeeper/deleted/.del-Btree.pm~af6d0c5eaed4a98e: Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Btree.pm BitKeeper/deleted/.del-BerkeleyDB.pm~7244036d4482643: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pm 
BitKeeper/deleted/.del-BerkeleyDB.pod~e7b18fd6132448e3: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod BitKeeper/deleted/.del-Hash.pm~10292a26c06a5c95: Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Hash.pm BitKeeper/deleted/.del-BerkeleyDB.pod.P~79f76a1495eda203: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod.P BitKeeper/deleted/.del-BerkeleyDB.xs~80c99afbd98e392c: Delete: bdb/perl.BerkeleyDB/BerkeleyDB.xs BitKeeper/deleted/.del-Changes~729c1891efa60de9: Delete: bdb/perl.BerkeleyDB/Changes BitKeeper/deleted/.del-MANIFEST~63a1e34aecf157a0: Delete: bdb/perl.BerkeleyDB/MANIFEST BitKeeper/deleted/.del-Makefile.PL~c68797707d8df87a: Delete: bdb/perl.BerkeleyDB/Makefile.PL BitKeeper/deleted/.del-README~5f2f579b1a241407: Delete: bdb/perl.BerkeleyDB/README BitKeeper/deleted/.del-Todo~dca3c66c193adda9: Delete: bdb/perl.BerkeleyDB/Todo BitKeeper/deleted/.del-config.in~ae81681e450e0999: Delete: bdb/perl.BerkeleyDB/config.in BitKeeper/deleted/.del-dbinfo~28ad67d83be4f68e: Delete: bdb/perl.BerkeleyDB/dbinfo BitKeeper/deleted/.del-mkconsts~543ab60669c7a04e: Delete: bdb/perl.BerkeleyDB/mkconsts BitKeeper/deleted/.del-mkpod~182c0ca54e439afb: Delete: bdb/perl.BerkeleyDB/mkpod BitKeeper/deleted/.del-5.004~e008cb5a48805543: Delete: bdb/perl.BerkeleyDB/patches/5.004 BitKeeper/deleted/.del-irix_6_5.pl~61662bb08afcdec8: Delete: bdb/perl.BerkeleyDB/hints/irix_6_5.pl BitKeeper/deleted/.del-solaris.pl~6771e7182394e152: Delete: bdb/perl.BerkeleyDB/hints/solaris.pl BitKeeper/deleted/.del-typemap~783b8f5295b05f3d: Delete: bdb/perl.BerkeleyDB/typemap BitKeeper/deleted/.del-5.004_01~6081ce2fff7b0bc: Delete: bdb/perl.BerkeleyDB/patches/5.004_01 BitKeeper/deleted/.del-5.004_02~87214eac35ad9e6: Delete: bdb/perl.BerkeleyDB/patches/5.004_02 BitKeeper/deleted/.del-5.004_03~9a672becec7cb40f: Delete: bdb/perl.BerkeleyDB/patches/5.004_03 BitKeeper/deleted/.del-5.004_04~e326cb51af09d154: Delete: bdb/perl.BerkeleyDB/patches/5.004_04 BitKeeper/deleted/.del-5.004_05~7ab457a1e41a92fe: Delete: 
bdb/perl.BerkeleyDB/patches/5.004_05 BitKeeper/deleted/.del-5.005~f9e2d59b5964cd4b: Delete: bdb/perl.BerkeleyDB/patches/5.005 BitKeeper/deleted/.del-5.005_01~3eb9fb7b5842ea8e: Delete: bdb/perl.BerkeleyDB/patches/5.005_01 BitKeeper/deleted/.del-5.005_02~67477ce0bef717cb: Delete: bdb/perl.BerkeleyDB/patches/5.005_02 BitKeeper/deleted/.del-5.005_03~c4c29a1fb21e290a: Delete: bdb/perl.BerkeleyDB/patches/5.005_03 BitKeeper/deleted/.del-5.6.0~e1fb9897d124ee22: Delete: bdb/perl.BerkeleyDB/patches/5.6.0 BitKeeper/deleted/.del-btree.t~e4a1a3c675ddc406: Delete: bdb/perl.BerkeleyDB/t/btree.t BitKeeper/deleted/.del-db-3.0.t~d2c60991d84558f2: Delete: bdb/perl.BerkeleyDB/t/db-3.0.t BitKeeper/deleted/.del-db-3.1.t~6ee88cd13f55e018: Delete: bdb/perl.BerkeleyDB/t/db-3.1.t BitKeeper/deleted/.del-db-3.2.t~f73b6461f98fd1cf: Delete: bdb/perl.BerkeleyDB/t/db-3.2.t BitKeeper/deleted/.del-destroy.t~cc6a2ae1980a2ecd: Delete: bdb/perl.BerkeleyDB/t/destroy.t BitKeeper/deleted/.del-env.t~a8604a4499c4bd07: Delete: bdb/perl.BerkeleyDB/t/env.t BitKeeper/deleted/.del-examples.t~2571b77c3cc75574: Delete: bdb/perl.BerkeleyDB/t/examples.t BitKeeper/deleted/.del-examples.t.T~8228bdd75ac78b88: Delete: bdb/perl.BerkeleyDB/t/examples.t.T BitKeeper/deleted/.del-examples3.t.T~66a186897a87026d: Delete: bdb/perl.BerkeleyDB/t/examples3.t.T BitKeeper/deleted/.del-examples3.t~fe3822ba2f2d7f83: Delete: bdb/perl.BerkeleyDB/t/examples3.t BitKeeper/deleted/.del-filter.t~f87b045c1b708637: Delete: bdb/perl.BerkeleyDB/t/filter.t BitKeeper/deleted/.del-hash.t~616bfb4d644de3a3: Delete: bdb/perl.BerkeleyDB/t/hash.t BitKeeper/deleted/.del-join.t~29fc39f74a83ca22: Delete: bdb/perl.BerkeleyDB/t/join.t BitKeeper/deleted/.del-mldbm.t~31f5015341eea040: Delete: bdb/perl.BerkeleyDB/t/mldbm.t BitKeeper/deleted/.del-queue.t~8f338034ce44a641: Delete: bdb/perl.BerkeleyDB/t/queue.t BitKeeper/deleted/.del-recno.t~d4ddbd3743add63e: Delete: bdb/perl.BerkeleyDB/t/recno.t BitKeeper/deleted/.del-strict.t~6885cdd2ea71ca2d: Delete: 
bdb/perl.BerkeleyDB/t/strict.t BitKeeper/deleted/.del-subdb.t~aab62a5d5864c603: Delete: bdb/perl.BerkeleyDB/t/subdb.t BitKeeper/deleted/.del-txn.t~65033b8558ae1216: Delete: bdb/perl.BerkeleyDB/t/txn.t BitKeeper/deleted/.del-unknown.t~f3710458682665e1: Delete: bdb/perl.BerkeleyDB/t/unknown.t BitKeeper/deleted/.del-Changes~436f74a5c414c65b: Delete: bdb/perl.DB_File/Changes BitKeeper/deleted/.del-DB_File.pm~ae0951c6c7665a82: Delete: bdb/perl.DB_File/DB_File.pm BitKeeper/deleted/.del-DB_File.xs~89e49a0b5556f1d8: Delete: bdb/perl.DB_File/DB_File.xs BitKeeper/deleted/.del-DB_File_BS~290fad5dbbb87069: Delete: bdb/perl.DB_File/DB_File_BS BitKeeper/deleted/.del-MANIFEST~90ee581572bdd4ac: Delete: bdb/perl.DB_File/MANIFEST BitKeeper/deleted/.del-Makefile.PL~ac0567bb5a377e38: Delete: bdb/perl.DB_File/Makefile.PL BitKeeper/deleted/.del-README~77e924a5a9bae6b3: Delete: bdb/perl.DB_File/README BitKeeper/deleted/.del-config.in~ab4c2792b86a810b: Delete: bdb/perl.DB_File/config.in BitKeeper/deleted/.del-dbinfo~461c43b30fab2cb: Delete: bdb/perl.DB_File/dbinfo BitKeeper/deleted/.del-dynixptx.pl~50dcddfae25d17e9: Delete: bdb/perl.DB_File/hints/dynixptx.pl BitKeeper/deleted/.del-typemap~55cffb3288a9e587: Delete: bdb/perl.DB_File/typemap BitKeeper/deleted/.del-version.c~a4df0e646f8b3975: Delete: bdb/perl.DB_File/version.c BitKeeper/deleted/.del-5.004_01~d6830d0082702af7: Delete: bdb/perl.DB_File/patches/5.004_01 BitKeeper/deleted/.del-5.004_02~78b082dc80c91031: Delete: bdb/perl.DB_File/patches/5.004_02 BitKeeper/deleted/.del-5.004~4411ec2e3c9e008b: Delete: bdb/perl.DB_File/patches/5.004 BitKeeper/deleted/.del-sco.pl~1e795fe14fe4dcfe: Delete: bdb/perl.DB_File/hints/sco.pl BitKeeper/deleted/.del-5.004_03~33f274648b160d95: Delete: bdb/perl.DB_File/patches/5.004_03 BitKeeper/deleted/.del-5.004_04~8f3d1b3cf18bb20a: Delete: bdb/perl.DB_File/patches/5.004_04 BitKeeper/deleted/.del-5.004_05~9c0f02e7331e142: Delete: bdb/perl.DB_File/patches/5.004_05 BitKeeper/deleted/.del-5.005~c2108cb2e3c8d951: 
Delete: bdb/perl.DB_File/patches/5.005 BitKeeper/deleted/.del-5.005_01~3b45e9673afc4cfa: Delete: bdb/perl.DB_File/patches/5.005_01 BitKeeper/deleted/.del-5.005_02~9fe5766bb02a4522: Delete: bdb/perl.DB_File/patches/5.005_02 BitKeeper/deleted/.del-5.005_03~ffa1c38c19ae72ea: Delete: bdb/perl.DB_File/patches/5.005_03 BitKeeper/deleted/.del-5.6.0~373be3a5ce47be85: Delete: bdb/perl.DB_File/patches/5.6.0 BitKeeper/deleted/.del-db-btree.t~3231595a1c241eb3: Delete: bdb/perl.DB_File/t/db-btree.t BitKeeper/deleted/.del-db-hash.t~7c4ad0c795c7fad2: Delete: bdb/perl.DB_File/t/db-hash.t BitKeeper/deleted/.del-db-recno.t~6c2d3d80b9ba4a50: Delete: bdb/perl.DB_File/t/db-recno.t BitKeeper/deleted/.del-db_server.sed~cdb00ebcd48a64e2: Delete: bdb/rpc_server/db_server.sed BitKeeper/deleted/.del-db_server_proc.c~d46c8f409c3747f4: Delete: bdb/rpc_server/db_server_proc.c BitKeeper/deleted/.del-db_server_svc.sed~3f5e59f334fa4607: Delete: bdb/rpc_server/db_server_svc.sed BitKeeper/deleted/.del-db_server_util.c~a809f3a4629acda: Delete: bdb/rpc_server/db_server_util.c BitKeeper/deleted/.del-log.tcl~ff1b41f1355b97d7: Delete: bdb/test/log.tcl BitKeeper/deleted/.del-mpool.tcl~b0df4dc1b04db26c: Delete: bdb/test/mpool.tcl BitKeeper/deleted/.del-mutex.tcl~52fd5c73a150565: Delete: bdb/test/mutex.tcl BitKeeper/deleted/.del-txn.tcl~c4ff071550b5446e: Delete: bdb/test/txn.tcl BitKeeper/deleted/.del-README~e800a12a5392010a: Delete: bdb/test/upgrade/README BitKeeper/deleted/.del-pack-2.6.6.pl~89d5076d758d3e98: Delete: bdb/test/upgrade/generate-2.X/pack-2.6.6.pl BitKeeper/deleted/.del-test-2.6.patch~4a52dc83d447547b: Delete: bdb/test/upgrade/generate-2.X/test-2.6.patch
Diffstat (limited to 'bdb/db')
-rw-r--r--bdb/db/Design.fileop452
-rw-r--r--bdb/db/crdel.src85
-rw-r--r--bdb/db/crdel_rec.c577
-rw-r--r--bdb/db/db.c2087
-rw-r--r--bdb/db/db.src133
-rw-r--r--bdb/db/db_am.c926
-rw-r--r--bdb/db/db_cam.c1538
-rw-r--r--bdb/db/db_conv.c290
-rw-r--r--bdb/db/db_dispatch.c1305
-rw-r--r--bdb/db/db_dup.c118
-rw-r--r--bdb/db/db_iface.c504
-rw-r--r--bdb/db/db_join.c250
-rw-r--r--bdb/db/db_meta.c287
-rw-r--r--bdb/db/db_method.c288
-rw-r--r--bdb/db/db_open.c705
-rw-r--r--bdb/db/db_overflow.c213
-rw-r--r--bdb/db/db_pr.c444
-rw-r--r--bdb/db/db_rec.c456
-rw-r--r--bdb/db/db_reclaim.c228
-rw-r--r--bdb/db/db_remove.c318
-rw-r--r--bdb/db/db_rename.c297
-rw-r--r--bdb/db/db_ret.c36
-rw-r--r--bdb/db/db_truncate.c95
-rw-r--r--bdb/db/db_upg.c27
-rw-r--r--bdb/db/db_upg_opd.c79
-rw-r--r--bdb/db/db_vrfy.c704
-rw-r--r--bdb/db/db_vrfyutil.c118
27 files changed, 7954 insertions, 4606 deletions
diff --git a/bdb/db/Design.fileop b/bdb/db/Design.fileop
deleted file mode 100644
index 187f1ffaf22..00000000000
--- a/bdb/db/Design.fileop
+++ /dev/null
@@ -1,452 +0,0 @@
-# $Id: Design.fileop,v 11.4 2000/02/19 20:57:54 bostic Exp $
-
-The design of file operation recovery.
-
-Keith has asked me to write up notes on our current status of database
-create and delete and recovery, why it's so hard, and how we've violated
-all the cornerstone assumptions on which our recovery framework is based.
-
-I am including two documents at the end of this one. The first is the
-initial design of the recoverability of file create and delete (there is
-no talk of subdatabases there, because we didn't think we'd have to do
-anything special there). I will annotate this document on where things
-changed.
-
-The second is the design of recd007 which is supposed to test our ability
-to recover these operations regardless of where one crashes. This test
-is fundamentally different from our other recovery tests in the following
-manner. Normally, the application controls transaction boundaries.
-Therefore, we can perform an operation and then decide whether to commit
-or abort it. In the normal recovery tests, we force the database into
-each of the four possible states from a recovery perspective:
-
- database is pre-op, undo (do nothing)
- database is pre-op, redo
- database is post-op, undo
- database is post-op, redo (do nothing)
-
-By copying databases at various points and initiating txn_commit and abort
-appropriately, we can make all these things happen. Notice that the one
-case we don't handle is where page A is in one state (e.g., pre-op) and
-page B is in another state (e.g., post-op). I will argue that these don't
-matter because each page is recovered independently. If anyone can poke
-holes in this, I'm interested.
-
-The problem with create/delete recovery testing is that the transaction
-is begun and ended all inside the library. Therefore, there is never any
-point (outside the library) where we can copy files and/or initiate
-abort/commit. In order to still put the recovery code through its paces,
-Sue designed an infrastructure that lets you tell the library where to
-make copies of things and where to suddenly inject errors so that the
-transaction gets aborted. This level of detail allows us to push the
-create/delete recovery code through just about every recovery path
-possible (although I'm sure Mike will tell me I'm wrong when he starts to
-run code coverage tools).
-
-OK, so that's all preamble and a brief discussion of the documents I'm
-enclosing.
-
-Why was this so hard and painful and why is the code so Q@#$!% complicated?
-The following is a discussion/explanation, but to the best of my knowledge,
-the structure we have in place now works. The key question we need to be
-asking is, "Does this need to be so complex or should we redesign
-portions to simplify it?" At this point, there is no obvious way to simplify
-it in my book, but I may be having difficulty seeing this because my mind is
-too polluted at this point.
-
-Our overall strategy for recovery is that we do write-ahead logging,
-that is we log an operation and make sure it is on disk before any
-data corresponding to the data that log record describes is on disk.
-Typically we use log sequence numbers (LSNs) to mark the data so that
-during recovery, we can look at the data and determine if it is in a
-state before a particular log record or after a particular log record.
-
-In the good old days, opens were not transaction protected, so we could
-do regular old opens during recovery and if the file existed, we opened
-it and if it didn't (or appeared corrupt), we didn't and treated it like
-a missing file. As will be discussed below in detail, our states are much
-more complicated and recovery can't make such simplistic assumptions.
-
-Also, since we are now dealing with file system operations, we have less
-control about when they actually happen and what the state of the system
-can be. That is, we have to write create log records synchronously, because
-the create/open system call may force a newly created (0-length) file to
-disk. This file has to now be identified as being in the "being-created"
-state.
-
-A. We used to make a number of assumptions during recovery:
-
-1. We could call db_open at any time and one of three things would happen:
- a) the file would be opened cleanly
- b) the file would not exist
- c) we would encounter an error while opening the file
-
-Case a posed no difficulty.
-In Case b, we simply spit out a warning that a file was missing and then
- ignored all subsequent operations to that file.
-In Case c, we reported a fatal error.
-
-2. We can always generate a warning if a file is missing.
-
-3. We never encounter NULL file names in the log.
-
-B. We also made some assumptions in the main-line library:
-
-1. If you try to open a file and it exists but is 0-length, then
-someone else is trying to open it.
-
-2. You can write pages anywhere in a file and any non-existent pages
-are 0-filled. [This breaks on Windows.]
-
-3. If you have proper permissions then you can always evict pages from
-the buffer pool.
-
-4. During open, we can close the master database handle as soon as
-we're done with it since all the rest of the activity will take place
-on the subdatabase handle.
-
-In our brave new world, most of these assumptions are no longer valid.
-Let's address them one at a time.
-
-A.1 We could call db_open at any time and one of three things would happen:
- a) the file would be opened cleanly
- b) the file would not exist
- c) we would encounter an error while opening the file
-There are now additional states. Since we are trying to make file
-operations recoverable, you can now die in the middle of such an
-operation and we have to be able to pick up the pieces. What this
-now means is that:
-
- * a 0-length file can be an indication of a create in-progress
- * you can have a meta-data page but no root page (of a btree)
- * if a file doesn't exist, it could mean that it was just about
- to be created and needs to be rolled forward.
- * if you encounter an error in a file (e.g., the meta-data page
- is all 0's) you could still be in mid-open.
-
-I have now made this all work, but it required significant changes to the
-db_open code and error handling and this is the sort of change that makes
-everyone nervous.
-
-A.2. We can always generate a warning if a file is missing.
-
-Now that we have a delete file method in the API, we need to make sure
-that we do not generate warning messages for files that don't exist if
-we see that they were explicitly deleted.
-
-This means that we need to save state during recovery, determine which
-files were missing and were not being recreated and were not deleted and
-only complain about those.
-
-A.3. We never encounter NULL file names in the log.
-
-Now that we allow transaction protection on memory-resident files, we write
-log messages for files with NULL file names. This means that our assumption
-of always being able to call "db_open" on any log_register OPEN message found
-in the log is no longer valid.
-
-B.1. If you try to open a file and it exists but is 0-length, then
-someone else is trying to open it.
-
-As discussed for A.1, this is no longer true. It may be instead that you
-are in the process of recovering a create.
-
-B.2. You can write pages anywhere in a file and any non-existent pages
-are 0-filled.
-
-It turns out that this is not true on Windows. This means that places
-we do group allocation (hash) must explicitly allocate each page, because
-we can't count on recognizing the uninitialized pages later.
-
-B.3. If you have proper permissions then you can always evict pages from
-the buffer pool.
-
-In the brave new world though, files can be deleted and they may
-have pages in the mpool. If you later try to evict these, you
-discover that the file doesn't exist. We'd get here when we had
-to dirty pages during a remove operation.
-
-B.4. You can close files any time you want.
-
-However, if the file takes part in the open/remove transaction,
-then we had better not close it until after the transaction
-commits/aborts, because we need to be able to get our hands on the
-dbp and the open happened in a different transaction.
-
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-Design for recovering file create and delete in the presence of subdatabases.
-
-Assumptions:
- Remove the O_TRUNCATE flag.
- Single-thread all open/create/delete operations.
- (Well, almost all; we'll optimize opens without DB_CREATE set.)
- The reasoning for this is that with two simultaneous
- open/creators, during recovery, we cannot identify which
- transaction successfully created files and therefore cannot
- recover correctly.
- File system creates/deletes are synchronous
- Once the file is open, subdatabase creates look like regular
- get/put operations and a metadata page creation.
-
-There are 4 cases to deal with:
- 1. Open/create file
- 2. Open/create subdatabase
- 3. Delete
- 4. Recovery records
-
- __db_fileopen_recover
- __db_metapage_recover
- __db_delete_recover
- existing c_put and c_get routines for subdatabase creation
-
- Note that the open/create of the file and the open/create of the
- subdatabase need to be in the same transaction.
-
-1. Open/create (full file and subdb version)
-
-If create
- LOCK_FILEOP
- txn_begin
- log create message (open message below)
- do file system open/create
- if we did not create
- abort transaction (before going to open_only)
- if (!subdb)
- set dbp->open_txn = NULL
- else
- txn_begin a new transaction for the subdb open
-
- construct meta-data page
- log meta-data page (see metapage)
- write the meta-data page
- * It may be the case that btrees need to log both meta-data pages
- and root pages. If that is the case, I believe that we can use
- this same record and recovery routines for both
-
- txn_commit
- UNLOCK_FILEOP
-
-2. Delete
- LOCK_FILEOP
- txn_begin
- log delete message (delete message below)
- mv file __db.file.lsn
- txn_commit
- unlink __db.file.lsn
- UNLOCK_FILEOP
-
-3. Recovery Routines
-
-__db_fileopen_recover
- if (argp->name.size == 0)
- done;
-
- if (redo) /* Commit */
- __os_open(argp->name, DB_OSO_CREATE, argp->mode, &fh)
- __os_closehandle(fh)
- if (undo) /* Abort */
- if (argp->name exists)
- unlink(argp->name);
-
-__db_metapage_recover
- if (redo)
- __os_open(argp->name, 0, 0, &fh)
- __os_lseek(meta data page)
- __os_write(meta data page)
- __os_closehandle(fh);
- if (undo)
- done = 0;
- if (argp->name exists)
- if (length of argp->name != 0)
- __os_open(argp->name, 0, 0, &fh)
- __os_lseek(meta data page)
- __os_read(meta data page)
- if (read succeeds && page lsn != current_lsn)
- done = 1
- __os_closehandle(fh);
- if (!done)
- unlink(argp->name)
-
-__db_delete_recover
- if (redo)
- Check if the backup file still exists and if so, delete it.
-
- if (undo)
- if (__db_appname(__db.file.lsn exists))
- mv __db_appname(__db.file.lsn) __db_appname(file)
-
-__db_metasub_recover
- /* This is like a normal recovery routine */
- Get the metadata page
- if (cmp_n && redo)
- copy the log page onto the page
- update the lsn
- make sure page gets put dirty
- else if (cmp_p && undo)
- update the lsn to the lsn in the log record
- make sure page gets put dirty
-
- if the page was modified, put it back dirty
-
-In db.src
-
-# name: filename (before call to __db_appname)
-# mode: file system mode
-BEGIN open
-DBT name DBT s
-ARG mode u_int32_t o
-END
-
-# opcode: indicate if it is a create/delete and if it is a subdatabase
-# pgsize: page size on which we're going to write the meta-data page
-# pgno: page number on which to write this meta-data page
-# page: the actual meta-data page
-# lsn: LSN of the meta-data page -- 0 for new databases, may be non-0
-# for subdatabases.
-
-BEGIN metapage
-ARG opcode u_int32_t x
-DBT name DBT s
-ARG pgno db_pgno_t d
-DBT page DBT s
-POINTER lsn DB_LSN * lu
-END
-
-# We do not need a subdatabase name here because removing a subdatabase
-# name is simply a regular bt_delete operation from the master database.
-# It will get logged normally.
-# name: filename
-BEGIN delete
-DBT name DBT s
-END
-
-# We also need to reclaim pages, but we can use the existing
-# bt_pg_alloc routines.
-
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-Testing recoverability of create/delete.
-
-These tests are unlike other tests in that they are going to
-require hooks in the library. The reason is that the create
-and delete calls are internally wrapped in a transaction, so
-that if the call returns, the transaction has already either
-committed or aborted. Using only that interface limits what
-kind of testing we can do. To match our other recovery testing
-efforts, we need to add hooks to trigger aborts at particular
-times in the create/delete path.
-
-The general recovery testing strategy is that we wish to
-execute every path through every recovery routine. That
-means that we try to:
- catch each operation in its pre-operation state
- call the recovery function with redo
- call the recovery function with undo
- catch each operation in its post-operation state
- call the recovery function with redo
- call the recovery function with undo
-
-In addition, there are a few critical points in the create and
-delete path that we want to make sure we capture.
-
-1. Test Structure
-
-The test structure should be similar to the existing recovery
-tests. We will want to have a structure in place where we
-can execute different commands:
- create a file/database
- create a file that will contain subdatabases.
- create a subdatabase
- remove a subdatabase (that contains valid data)
- remove a subdatabase (that does not contain any data)
- remove a file that used to contain subdatabases
- remove a file that contains a database
-
-The tricky part is capturing the state of the world at the
-various points in the create/delete process.
-
-The critical points in the create process are:
-
- 1. After we've logged the create, but before we've done anything.
- in db/db.c
- after the open_retry
- after the __crdel_fileopen_log call (and before we've
- called __os_open).
-
- 2. Immediately after the __os_open
-
- 3. Immediately after each __db_log_page call
- in bt_open.c
- log meta-data page
- log root page
- in hash.c
- log meta-data page
-
- 4. With respect to the log records above, shortly after each
- log write is a memp_fput. We need to do a sync after
- each memp_fput and trigger a point after that sync.
-
-The critical points in the remove process are:
-
- 1. Right after the crdel_delete_log in db/db.c
-
- 2. Right after the __os_rename call (below the crdel_delete_log)
-
- 3. After the __db_remove_callback call.
-
-I believe that these are the places where we'll need some sort of hook.
-
-2. Adding hooks to the library.
-
-The hooks need two components. One component is to capture the state of
-the database at the hook point and the other is to trigger a txn_abort at
-the hook point. The second part is fairly trivial.
-
-The first part requires more thought. Let me explain what we do in a
-"normal" recovery test. In a normal recovery test, we save an initial
-copy of the database (this copy is called init). Then we execute one
-or more operations. Then, right before the commit/abort, we sync the
-file, and save another copy (the afterop copy). Finally, we call txn_commit
-or txn_abort, sync the file again, and save the database one last time (the
-final copy).
-
-Then we run recovery. The first time, this should be a no-op, because
-we've either committed the transaction and are checking to redo it or
-we aborted the transaction, undid it on the abort and are checking to
-undo it again.
-
-We then run recovery again on whatever database will force us through
-the path that requires work. In the commit case, this means we start
-with the init copy of the database and run recovery. This pushes us
-through all the redo paths. In the abort case, we start with the afterop
-copy which pushes us through all the undo cases.
-
-In some sense, we're asking the create/delete test to be more exhaustive
-by defining all the trigger points, but I think that's the correct thing
-to do, since the create/delete is not initiated by a user transaction.
-
-So, what do we have to do at the hook points?
- 1. sync the file to disk.
- 2. save the file itself
- 3. save any files named __db_backup_name(name, &backup_name, lsn)
- Since we may not know the right lsns, I think we should save
- every file of the form __db.name.0xNNNNNNNN.0xNNNNNNNN into
- some temporary files from which we can restore it to run
- recovery.
-
-3. Putting it all together
-
-So, the three pieces are writing the test structure, putting in the hooks
-and then writing the recovery portions so that we restore the right thing
-that the hooks saved in order to initiate recovery.
-
-Some of the technical issues that need to be solved are:
- How does the hook code become active (i.e., we don't
- want it in there normally, but it's got to be
- there when you configure for testing)?
- How do you (the test) tell the library that you want a
- particular hook to abort?
- How do you (the test) tell the library that you want the
- hook code doing its copies (do we really want
- *every* test doing these copies during testing?
- Maybe it's not a big deal, but maybe it is; we
- should at least think about it).
diff --git a/bdb/db/crdel.src b/bdb/db/crdel.src
index 17c061d6887..d89fa7a0382 100644
--- a/bdb/db/crdel.src
+++ b/bdb/db/crdel.src
@@ -1,13 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*
- * $Id: crdel.src,v 11.12 2000/12/12 17:41:48 bostic Exp $
+ * $Id: crdel.src,v 11.24 2002/04/17 19:02:57 krinsky Exp $
*/
-PREFIX crdel
+PREFIX __crdel
+DBPRIVATE
INCLUDE #include "db_config.h"
INCLUDE
@@ -15,30 +16,20 @@ INCLUDE #ifndef NO_SYSTEM_INCLUDES
INCLUDE #include <sys/types.h>
INCLUDE
INCLUDE #include <ctype.h>
-INCLUDE #include <errno.h>
INCLUDE #include <string.h>
INCLUDE #endif
INCLUDE
INCLUDE #include "db_int.h"
-INCLUDE #include "db_page.h"
-INCLUDE #include "db_dispatch.h"
-INCLUDE #include "db_am.h"
-INCLUDE #include "txn.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/rep.h"
+INCLUDE #include "dbinc/txn.h"
INCLUDE
/*
- * Fileopen -- log a potential file create operation
- *
- * name: filename
- * subname: sub database name
- * mode: file system mode
- */
-BEGIN fileopen 141
-DBT name DBT s
-ARG mode u_int32_t o
-END
-
-/*
* Metasub: log the creation of a subdatabase meta data page.
*
* fileid: identifies the file being acted upon.
@@ -47,57 +38,9 @@ END
* lsn: lsn of the page.
*/
BEGIN metasub 142
-ARG fileid int32_t ld
-ARG pgno db_pgno_t d
-DBT page DBT s
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
+PGDBT page DBT s
POINTER lsn DB_LSN * lu
END
-/*
- * Metapage: log the creation of a meta data page for a new file.
- *
- * fileid: identifies the file being acted upon.
- * name: file containing the page.
- * pgno: page number on which to write this meta-data page
- * page: the actual meta-data page
- */
-BEGIN metapage 143
-ARG fileid int32_t ld
-DBT name DBT s
-ARG pgno db_pgno_t d
-DBT page DBT s
-END
-
-/*
- * Delete: remove a file.
- * Note that we don't need a special log record for subdatabase
- * removes, because we use normal btree operations to remove them.
- *
- * name: name of the file being removed (relative to DBHOME).
- */
-DEPRECATED old_delete 144
-DBT name DBT s
-END
-
-/*
- * Rename: rename a file
- * We do not need this for subdatabases
- *
- * name: name of the file being removed (relative to DBHOME).
- */
-BEGIN rename 145
-ARG fileid int32_t ld
-DBT name DBT s
-DBT newname DBT s
-END
-/*
- * Delete: remove a file.
- * Note that we don't need a special log record for subdatabase
- * removes, because we use normal btree operations to remove them.
- *
- * name: name of the file being removed (relative to DBHOME).
- */
-BEGIN delete 146
-ARG fileid int32_t ld
-DBT name DBT s
-END
diff --git a/bdb/db/crdel_rec.c b/bdb/db/crdel_rec.c
index 495b92a0ad7..542a0c358dd 100644
--- a/bdb/db/crdel_rec.c
+++ b/bdb/db/crdel_rec.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: crdel_rec.c,v 11.43 2000/12/13 08:06:34 krinsky Exp $";
+static const char revid[] = "$Id: crdel_rec.c,v 11.64 2002/08/14 20:27:34 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,112 +18,9 @@ static const char revid[] = "$Id: crdel_rec.c,v 11.43 2000/12/13 08:06:34 krinsk
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
-#include "mp.h"
-#include "db_dispatch.h"
-
-/*
- * __crdel_fileopen_recover --
- * Recovery function for fileopen.
- *
- * PUBLIC: int __crdel_fileopen_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__crdel_fileopen_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- __crdel_fileopen_args *argp;
- DBMETA ondisk;
- DB_FH fh;
- size_t nr;
- int do_unlink, ret;
- u_int32_t b, mb, io;
- char *real_name;
-
- COMPQUIET(info, NULL);
-
- real_name = NULL;
- REC_PRINT(__crdel_fileopen_print);
-
- if ((ret = __crdel_fileopen_read(dbenv, dbtp->data, &argp)) != 0)
- goto out;
- /*
- * If this is an in-memory database, then the name is going to
- * be NULL, which looks like a 0-length name in recovery.
- */
- if (argp->name.size == 0)
- goto done;
-
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->name.data, 0, NULL, &real_name)) != 0)
- goto out;
- if (DB_REDO(op)) {
- /*
- * The create committed, so we need to make sure that the file
- * exists. A simple open should suffice.
- */
- if ((ret = __os_open(dbenv, real_name,
- DB_OSO_CREATE, argp->mode, &fh)) != 0)
- goto out;
- if ((ret = __os_closehandle(&fh)) != 0)
- goto out;
- } else if (DB_UNDO(op)) {
- /*
- * If the file is 0-length then it was in the process of being
- * created, so we should unlink it. If it is non-0 length, then
- * either someone else created it and we need to leave it
- * untouched or we were in the process of creating it, allocated
- * the first page on a system that requires you to actually
- * write pages as you allocate them, but never got any data
- * on it.
- * If the file doesn't exist, we never got around to creating
- * it, so that's fine.
- */
- if (__os_exists(real_name, NULL) != 0)
- goto done;
-
- if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0)
- goto out;
- if ((ret = __os_ioinfo(dbenv,
- real_name, &fh, &mb, &b, &io)) != 0)
- goto out;
- do_unlink = 0;
- if (mb != 0 || b != 0) {
- /*
- * We need to read the first page
- * to see if it's got valid data on it.
- */
- if ((ret = __os_read(dbenv, &fh,
- &ondisk, sizeof(ondisk), &nr)) != 0 ||
- nr != sizeof(ondisk))
- goto out;
- if (ondisk.magic == 0)
- do_unlink = 1;
- }
- if ((ret = __os_closehandle(&fh)) != 0)
- goto out;
- /* Check for 0-length and if it is, delete it. */
- if (do_unlink || (mb == 0 && b == 0))
- if ((ret = __os_unlink(dbenv, real_name)) != 0)
- goto out;
- }
-
-done: *lsnp = argp->prev_lsn;
- ret = 0;
-
-out: if (argp != NULL)
- __os_free(argp, 0);
- if (real_name != NULL)
- __os_freestr(real_name);
- return (ret);
-}
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
/*
* __crdel_metasub_recover --
@@ -145,16 +42,16 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info)
DBC *dbc;
DB_MPOOLFILE *mpf;
PAGE *pagep;
- u_int8_t *file_uid, ptype;
- int cmp_p, modified, reopen, ret;
+ int cmp_p, modified, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__crdel_metasub_print);
REC_INTRO(__crdel_metasub_read, 0);
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
if (DB_REDO(op)) {
- if ((ret = memp_fget(mpf,
+ if ((ret = mpf->get(mpf,
&argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
goto out;
} else {
@@ -165,7 +62,6 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info)
}
modified = 0;
- reopen = 0;
cmp_p = log_compare(&LSN(pagep), &argp->lsn);
CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
@@ -173,14 +69,6 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info)
memcpy(pagep, argp->page.data, argp->page.size);
LSN(pagep) = *lsnp;
modified = 1;
- /*
- * If this is a meta-data page, then we must reopen;
- * if it was a root page, then we do not.
- */
- ptype = ((DBMETA *)argp->page.data)->type;
- if (ptype == P_HASHMETA || ptype == P_BTREEMETA ||
- ptype == P_QAMMETA)
- reopen = 1;
} else if (DB_UNDO(op)) {
/*
* We want to undo this page creation. The page creation
@@ -196,451 +84,14 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info)
LSN(pagep) = argp->lsn;
modified = 1;
}
- if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
goto out;
-
- /*
- * If we are redoing a subdatabase create, we must close and reopen the
- * file to be sure that we have the proper meta information in the
- * in-memory structures
- */
- if (reopen) {
- /* Close cursor if it's open. */
- if (dbc != NULL) {
- dbc->c_close(dbc);
- dbc = NULL;
- }
-
- if ((ret = __os_malloc(dbenv,
- DB_FILE_ID_LEN, NULL, &file_uid)) != 0)
- goto out;
- memcpy(file_uid, &file_dbp->fileid[0], DB_FILE_ID_LEN);
- ret = __log_reopen_file(dbenv,
- NULL, argp->fileid, file_uid, argp->pgno);
- (void)__os_free(file_uid, DB_FILE_ID_LEN);
- if (ret != 0)
- goto out;
- }
-
-done: *lsnp = argp->prev_lsn;
- ret = 0;
-
-out: REC_CLOSE;
-}
-
-/*
- * __crdel_metapage_recover --
- * Recovery function for metapage.
- *
- * PUBLIC: int __crdel_metapage_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__crdel_metapage_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- __crdel_metapage_args *argp;
- DB *dbp;
- DBMETA *meta, ondisk;
- DB_FH fh;
- size_t nr;
- u_int32_t b, io, mb, pagesize;
- int is_done, ret;
- char *real_name;
-
- COMPQUIET(info, NULL);
-
- real_name = NULL;
- memset(&fh, 0, sizeof(fh));
- REC_PRINT(__crdel_metapage_print);
-
- if ((ret = __crdel_metapage_read(dbenv, dbtp->data, &argp)) != 0)
- goto out;
-
- /*
- * If this is an in-memory database, then the name is going to
- * be NULL, which looks like a 0-length name in recovery.
- */
- if (argp->name.size == 0)
- goto done;
-
- meta = (DBMETA *)argp->page.data;
- __ua_memcpy(&pagesize, &meta->pagesize, sizeof(pagesize));
-
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->name.data, 0, NULL, &real_name)) != 0)
- goto out;
- if (DB_REDO(op)) {
- if ((ret = __db_fileid_to_db(dbenv,
- &dbp, argp->fileid, 0)) != 0) {
- if (ret == DB_DELETED)
- goto done;
- else
- goto out;
- }
-
- /*
- * We simply read the first page and if the LSN is 0, we
- * write the meta-data page.
- */
- if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0)
- goto out;
- if ((ret = __os_seek(dbenv, &fh,
- pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
- goto out;
- /*
- * If the read succeeds then the page exists, then we need
- * to verify that the page has actually been written, because
- * on some systems (e.g., Windows) we preallocate pages because
- * files aren't allowed to have holes in them. If the page
- * looks good then we're done.
- */
- if ((ret = __os_read(dbenv, &fh, &ondisk,
- sizeof(ondisk), &nr)) == 0 && nr == sizeof(ondisk)) {
- if (ondisk.magic != 0)
- goto done;
- if ((ret = __os_seek(dbenv, &fh,
- pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
- goto out;
- }
-
- /*
- * Page didn't exist, update the LSN and write a new one.
- * (seek pointer shouldn't have moved)
- */
- __ua_memcpy(&meta->lsn, lsnp, sizeof(DB_LSN));
- if ((ret = __os_write(dbp->dbenv, &fh,
- argp->page.data, argp->page.size, &nr)) != 0)
- goto out;
- if (nr != (size_t)argp->page.size) {
- __db_err(dbenv, "Write failed during recovery");
- ret = EIO;
- goto out;
- }
-
- /*
- * We must close and reopen the file to be sure
- * that we have the proper meta information
- * in the in memory structures
- */
-
- if ((ret = __log_reopen_file(dbenv,
- argp->name.data, argp->fileid,
- meta->uid, argp->pgno)) != 0)
- goto out;
-
- /* Handle will be closed on exit. */
- } else if (DB_UNDO(op)) {
- is_done = 0;
-
- /* If file does not exist, there is nothing to undo. */
- if (__os_exists(real_name, NULL) != 0)
- goto done;
-
- /*
- * Before we can look at anything on disk, we have to check
- * if there is a valid dbp for this, and if there is, we'd
- * better flush it.
- */
- dbp = NULL;
- if ((ret =
- __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0)) == 0)
- (void)dbp->sync(dbp, 0);
-
- /*
- * We need to make sure that we do not remove a file that
- * someone else created. If the file is 0-length, then we
- * can assume that we created it and remove it. If it is
- * not 0-length, then we need to check the LSN and make
- * sure that it's the file we created.
- */
- if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0)
- goto out;
- if ((ret = __os_ioinfo(dbenv,
- real_name, &fh, &mb, &b, &io)) != 0)
- goto out;
- if (mb != 0 || b != 0) {
- /* The file has something in it. */
- if ((ret = __os_seek(dbenv, &fh,
- pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
- goto out;
- if ((ret = __os_read(dbenv, &fh,
- &ondisk, sizeof(ondisk), &nr)) != 0)
- goto out;
- if (log_compare(&ondisk.lsn, lsnp) != 0)
- is_done = 1;
- }
-
- /*
- * Must close here, because unlink with the file open fails
- * on some systems.
- */
- if ((ret = __os_closehandle(&fh)) != 0)
- goto out;
-
- if (!is_done) {
- /*
- * On some systems, you cannot unlink an open file so
- * we close the fd in the dbp here and make sure we
- * don't try to close it again. First, check for a
- * saved_open_fhp, then close down the mpool.
- */
- if (dbp != NULL && dbp->saved_open_fhp != NULL &&
- F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) &&
- (ret = __os_closehandle(dbp->saved_open_fhp)) != 0)
- goto out;
- if (dbp != NULL && dbp->mpf != NULL) {
- (void)__memp_fremove(dbp->mpf);
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto out;
- F_SET(dbp, DB_AM_DISCARD);
- dbp->mpf = NULL;
- }
- if ((ret = __os_unlink(dbenv, real_name)) != 0)
- goto out;
- }
- }
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
ret = 0;
-out: if (argp != NULL)
- __os_free(argp, 0);
- if (real_name != NULL)
- __os_freestr(real_name);
- if (F_ISSET(&fh, DB_FH_VALID))
- (void)__os_closehandle(&fh);
- return (ret);
-}
-
-/*
- * __crdel_delete_recover --
- * Recovery function for delete.
- *
- * PUBLIC: int __crdel_delete_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__crdel_delete_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- DB *dbp;
- __crdel_delete_args *argp;
- int ret;
- char *backup, *real_back, *real_name;
-
- REC_PRINT(__crdel_delete_print);
-
- backup = real_back = real_name = NULL;
- if ((ret = __crdel_delete_read(dbenv, dbtp->data, &argp)) != 0)
- goto out;
-
- if (DB_REDO(op)) {
- /*
- * On a recovery, as we recreate what was going on, we
- * recreate the creation of the file. And so, even though
- * it committed, we need to delete it. Try to delete it,
- * but it is not an error if that delete fails.
- */
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->name.data, 0, NULL, &real_name)) != 0)
- goto out;
- if (__os_exists(real_name, NULL) == 0) {
- /*
- * If a file is deleted and then recreated, it's
- * possible for the __os_exists call above to
- * return success and for us to get here, but for
- * the fileid we're looking for to be marked
- * deleted. In that case, we needn't redo the
- * unlink even though the file exists, and it's
- * not an error.
- */
- ret = __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0);
- if (ret == 0) {
- /*
- * On Windows, the underlying file must be
- * closed to perform a remove.
- */
- (void)__memp_fremove(dbp->mpf);
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto out;
- dbp->mpf = NULL;
- if ((ret = __os_unlink(dbenv, real_name)) != 0)
- goto out;
- } else if (ret != DB_DELETED)
- goto out;
- }
- /*
- * The transaction committed, so the only thing that might
- * be true is that the backup file is still around. Try
- * to delete it, but it's not an error if that delete fails.
- */
- if ((ret = __db_backup_name(dbenv, argp->name.data,
- &backup, lsnp)) != 0)
- goto out;
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0)
- goto out;
- if (__os_exists(real_back, NULL) == 0)
- if ((ret = __os_unlink(dbenv, real_back)) != 0)
- goto out;
- if ((ret = __db_txnlist_delete(dbenv, info,
- argp->name.data, TXNLIST_INVALID_ID, 1)) != 0)
- goto out;
- } else if (DB_UNDO(op)) {
- /*
- * Trying to undo. File may or may not have been deleted.
- * Try to move the backup to the original. If the backup
- * exists, then this is right. If it doesn't exist, then
- * nothing will happen and that's OK.
- */
- if ((ret = __db_backup_name(dbenv, argp->name.data,
- &backup, lsnp)) != 0)
- goto out;
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0)
- goto out;
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->name.data, 0, NULL, &real_name)) != 0)
- goto out;
- if (__os_exists(real_back, NULL) == 0)
- if ((ret =
- __os_rename(dbenv, real_back, real_name)) != 0)
- goto out;
- }
-
- *lsnp = argp->prev_lsn;
- ret = 0;
-
-out: if (argp != NULL)
- __os_free(argp, 0);
- if (backup != NULL)
- __os_freestr(backup);
- if (real_back != NULL)
- __os_freestr(real_back);
- if (real_name != NULL)
- __os_freestr(real_name);
- return (ret);
-}
-/*
- * __crdel_rename_recover --
- * Recovery function for rename.
- *
- * PUBLIC: int __crdel_rename_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__crdel_rename_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- DB *dbp;
- __crdel_rename_args *argp;
- char *new_name, *real_name;
- int ret, set;
-
- COMPQUIET(info, NULL);
-
- REC_PRINT(__crdel_rename_print);
-
- new_name = real_name = NULL;
-
- if ((ret = __crdel_rename_read(dbenv, dbtp->data, &argp)) != 0)
- goto out;
-
- if ((ret = __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0)) != 0)
- goto out;
- if (DB_REDO(op)) {
- /*
- * We don't use the dbp parameter to __log_filelist_update
- * in the rename case, so passing NULL for it is OK.
- */
- if ((ret = __log_filelist_update(dbenv, NULL,
- argp->fileid, argp->newname.data, &set)) != 0)
- goto out;
- if (set != 0) {
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->name.data, 0, NULL, &real_name)) != 0)
- goto out;
- if (__os_exists(real_name, NULL) == 0) {
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, argp->newname.data,
- 0, NULL, &new_name)) != 0)
- goto out;
- /*
- * On Windows, the underlying file
- * must be closed to perform a remove.
- * The db will be closed by a
- * log_register record. Rename
- * has exclusive access to the db.
- */
- (void)__memp_fremove(dbp->mpf);
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto out;
- dbp->mpf = NULL;
- if ((ret = __os_rename(dbenv,
- real_name, new_name)) != 0)
- goto out;
- }
- }
- } else {
- /*
- * We don't use the dbp parameter to __log_filelist_update
- * in the rename case, so passing NULL for it is OK.
- */
- if ((ret = __log_filelist_update(dbenv, NULL,
- argp->fileid, argp->name.data, &set)) != 0)
- goto out;
- if (set != 0) {
- if ((ret = __db_appname(dbenv, DB_APP_DATA,
- NULL, argp->newname.data, 0, NULL, &new_name)) != 0)
- goto out;
- if (__os_exists(new_name, NULL) == 0) {
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, argp->name.data,
- 0, NULL, &real_name)) != 0)
- goto out;
- /*
- * On Windows, the underlying file
- * must be closed to perform a remove.
- * The file may have already been closed
- * if we are aborting the transaction.
- */
- if (dbp->mpf != NULL) {
- (void)__memp_fremove(dbp->mpf);
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto out;
- dbp->mpf = NULL;
- }
- if ((ret = __os_rename(dbenv,
- new_name, real_name)) != 0)
- goto out;
- }
- }
- }
-
- *lsnp = argp->prev_lsn;
- ret = 0;
-
-out: if (argp != NULL)
- __os_free(argp, 0);
-
- if (new_name != NULL)
- __os_free(new_name, 0);
-
- if (real_name != NULL)
- __os_free(real_name, 0);
-
- return (ret);
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
}
diff --git a/bdb/db/db.c b/bdb/db/db.c
index 6e74b4b21bd..986167d5ade 100644
--- a/bdb/db/db.c
+++ b/bdb/db/db.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -40,7 +40,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db.c,v 11.117 2001/01/11 18:19:50 bostic Exp $";
+static const char revid[] = "$Id: db.c,v 11.246 2002/08/20 14:40:00 margo Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -52,352 +52,41 @@ static const char revid[] = "$Id: db.c,v 11.117 2001/01/11 18:19:50 bostic Exp $
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "db_swap.h"
-#include "btree.h"
-#include "db_am.h"
-#include "hash.h"
-#include "lock.h"
-#include "log.h"
-#include "mp.h"
-#include "qam.h"
-#include "common_ext.h"
-
-/* Actions that __db_master_update can take. */
-typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN } mu_action;
-
-/* Flag values that __db_file_setup can return. */
-#define DB_FILE_SETUP_CREATE 0x01
-#define DB_FILE_SETUP_ZERO 0x02
-
-static int __db_file_setup __P((DB *,
- const char *, u_int32_t, int, db_pgno_t, int *));
-static int __db_master_update __P((DB *,
- const char *, u_int32_t,
- db_pgno_t *, mu_action, const char *, u_int32_t));
-static int __db_refresh __P((DB *));
-static int __db_remove_callback __P((DB *, void *));
-static int __db_set_pgsize __P((DB *, DB_FH *, char *));
-static int __db_subdb_remove __P((DB *, const char *, const char *));
-static int __db_subdb_rename __P(( DB *,
- const char *, const char *, const char *));
-#if CONFIG_TEST
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_disassociate __P((DB *));
+#if CONFIG_TEST
static void __db_makecopy __P((const char *, const char *));
-static int __db_testdocopy __P((DB *, const char *));
-static int __qam_testdocopy __P((DB *, const char *));
+static int __db_testdocopy __P((DB_ENV *, const char *));
+static int __qam_testdocopy __P((DB *, const char *));
#endif
/*
- * __db_open --
- * Main library interface to the DB access methods.
- *
- * PUBLIC: int __db_open __P((DB *,
- * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int));
+ * DB.C --
+ * This file contains the utility functions for the DBP layer.
*/
-int
-__db_open(dbp, name, subdb, type, flags, mode)
- DB *dbp;
- const char *name, *subdb;
- DBTYPE type;
- u_int32_t flags;
- int mode;
-{
- DB_ENV *dbenv;
- DB_LOCK open_lock;
- DB *mdbp;
- db_pgno_t meta_pgno;
- u_int32_t ok_flags;
- int ret, t_ret;
-
- dbenv = dbp->dbenv;
- mdbp = NULL;
-
- /* Validate arguments. */
-#define OKFLAGS \
- (DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \
- DB_NOMMAP | DB_RDONLY | DB_RDWRMASTER | DB_THREAD | DB_TRUNCATE)
- if ((ret = __db_fchk(dbenv, "DB->open", flags, OKFLAGS)) != 0)
- return (ret);
- if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
- return (__db_ferr(dbenv, "DB->open", 1));
- if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
- return (__db_ferr(dbenv, "DB->open", 1));
-#ifdef HAVE_VXWORKS
- if (LF_ISSET(DB_TRUNCATE)) {
- __db_err(dbenv, "DB_TRUNCATE unsupported in VxWorks");
- return (__db_eopnotsup(dbenv));
- }
-#endif
- switch (type) {
- case DB_UNKNOWN:
- if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
- __db_err(dbenv,
- "%s: DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE",
- name);
- return (EINVAL);
- }
- ok_flags = 0;
- break;
- case DB_BTREE:
- ok_flags = DB_OK_BTREE;
- break;
- case DB_HASH:
- ok_flags = DB_OK_HASH;
- break;
- case DB_QUEUE:
- ok_flags = DB_OK_QUEUE;
- break;
- case DB_RECNO:
- ok_flags = DB_OK_RECNO;
- break;
- default:
- __db_err(dbenv, "unknown type: %lu", (u_long)type);
- return (EINVAL);
- }
- if (ok_flags)
- DB_ILLEGAL_METHOD(dbp, ok_flags);
-
- /* The environment may have been created, but never opened. */
- if (!F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_OPEN_CALLED)) {
- __db_err(dbenv, "environment not yet opened");
- return (EINVAL);
- }
-
- /*
- * Historically, you could pass in an environment that didn't have a
- * mpool, and DB would create a private one behind the scenes. This
- * no longer works.
- */
- if (!F_ISSET(dbenv, DB_ENV_DBLOCAL) && !MPOOL_ON(dbenv)) {
- __db_err(dbenv, "environment did not include a memory pool.");
- return (EINVAL);
- }
-
- /*
- * You can't specify threads during DB->open if subsystems in the
- * environment weren't configured with them.
- */
- if (LF_ISSET(DB_THREAD) &&
- !F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_THREAD)) {
- __db_err(dbenv, "environment not created using DB_THREAD");
- return (EINVAL);
- }
-
- /*
- * If the environment was configured with threads, the DB handle
- * must also be free-threaded, so we force the DB_THREAD flag on.
- * (See SR #2033 for why this is a requirement--recovery needs
- * to be able to grab a dbp using __db_fileid_to_dbp, and it has
- * no way of knowing which dbp goes with which thread, so whichever
- * one it finds has to be usable in any of them.)
- */
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- LF_SET(DB_THREAD);
-
- /* DB_TRUNCATE is not transaction recoverable. */
- if (LF_ISSET(DB_TRUNCATE) && TXN_ON(dbenv)) {
- __db_err(dbenv,
- "DB_TRUNCATE illegal in a transaction protected environment");
- return (EINVAL);
- }
-
- /* Subdatabase checks. */
- if (subdb != NULL) {
- /* Subdatabases must be created in named files. */
- if (name == NULL) {
- __db_err(dbenv,
- "multiple databases cannot be created in temporary files");
- return (EINVAL);
- }
-
- /* QAM can't be done as a subdatabase. */
- if (type == DB_QUEUE) {
- __db_err(dbenv, "Queue databases must be one-per-file");
- return (EINVAL);
- }
- }
-
- /* Convert any DB->open flags. */
- if (LF_ISSET(DB_RDONLY))
- F_SET(dbp, DB_AM_RDONLY);
-
- /* Fill in the type. */
- dbp->type = type;
-
- /*
- * If we're potentially creating a database, wrap the open inside of
- * a transaction.
- */
- if (TXN_ON(dbenv) && LF_ISSET(DB_CREATE))
- if ((ret = __db_metabegin(dbp, &open_lock)) != 0)
- return (ret);
-
- /*
- * If we're opening a subdatabase, we have to open (and potentially
- * create) the main database, and then get (and potentially store)
- * our base page number in that database. Then, we can finally open
- * the subdatabase.
- */
- if (subdb == NULL)
- meta_pgno = PGNO_BASE_MD;
- else {
- /*
- * Open the master database, optionally creating or updating
- * it, and retrieve the metadata page number.
- */
- if ((ret =
- __db_master_open(dbp, name, flags, mode, &mdbp)) != 0)
- goto err;
-
- /* Copy the page size and file id from the master. */
- dbp->pgsize = mdbp->pgsize;
- F_SET(dbp, DB_AM_SUBDB);
- memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
-
- if ((ret = __db_master_update(mdbp,
- subdb, type, &meta_pgno, MU_OPEN, NULL, flags)) != 0)
- goto err;
-
- /*
- * Clear the exclusive open and truncation flags, they only
- * apply to the open of the master database.
- */
- LF_CLR(DB_EXCL | DB_TRUNCATE);
- }
-
- ret = __db_dbopen(dbp, name, flags, mode, meta_pgno);
-
- /*
- * You can open the database that describes the subdatabases in the
- * rest of the file read-only. The content of each key's data is
- * unspecified and applications should never be adding new records
- * or updating existing records. However, during recovery, we need
- * to open these databases R/W so we can redo/undo changes in them.
- * Likewise, we need to open master databases read/write during
- * rename and remove so we can be sure they're fully sync'ed, so
- * we provide an override flag for the purpose.
- */
- if (subdb == NULL && !IS_RECOVERING(dbenv) && !LF_ISSET(DB_RDONLY) &&
- !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
- __db_err(dbenv,
- "files containing multiple databases may only be opened read-only");
- ret = EINVAL;
- goto err;
- }
-
-err: /*
- * End any transaction, committing if we were successful, aborting
- * otherwise.
- */
- if (TXN_ON(dbenv) && LF_ISSET(DB_CREATE))
- if ((t_ret = __db_metaend(dbp,
- &open_lock, ret == 0, NULL, NULL)) != 0 && ret == 0)
- ret = t_ret;
-
- /* If we were successful, don't discard the file on close. */
- if (ret == 0)
- F_CLR(dbp, DB_AM_DISCARD);
-
- /* If we were unsuccessful, destroy the DB handle. */
- if (ret != 0) {
- /* In recovery we set log_fileid early. */
- if (IS_RECOVERING(dbenv))
- dbp->log_fileid = DB_LOGFILEID_INVALID;
- __db_refresh(dbp);
- }
-
- if (mdbp != NULL) {
- /* If we were successful, don't discard the file on close. */
- if (ret == 0)
- F_CLR(mdbp, DB_AM_DISCARD);
- if ((t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0)
- ret = t_ret;
- }
-
- return (ret);
-}
-
-/*
- * __db_dbopen --
- * Open a database.
- * PUBLIC: int __db_dbopen __P((DB *, const char *, u_int32_t, int, db_pgno_t));
- */
-int
-__db_dbopen(dbp, name, flags, mode, meta_pgno)
- DB *dbp;
- const char *name;
- u_int32_t flags;
- int mode;
- db_pgno_t meta_pgno;
-{
- DB_ENV *dbenv;
- int ret, retinfo;
-
- dbenv = dbp->dbenv;
-
- /* Set up the underlying file. */
- if ((ret = __db_file_setup(dbp,
- name, flags, mode, meta_pgno, &retinfo)) != 0)
- return (ret);
-
- /*
- * If we created the file, set the truncate flag for the mpool. This
- * isn't for anything we've done, it's protection against stupid user
- * tricks: if the user deleted a file behind Berkeley DB's back, we
- * may still have pages in the mpool that match the file's "unique" ID.
- */
- if (retinfo & DB_FILE_SETUP_CREATE)
- flags |= DB_TRUNCATE;
-
- /* Set up the underlying environment. */
- if ((ret = __db_dbenv_setup(dbp, name, flags)) != 0)
- return (ret);
-
- /*
- * Do access method specific initialization.
- *
- * !!!
- * Set the open flag. (The underlying access method open functions
- * may want to do things like acquire cursors, so the open flag has
- * to be set before calling them.)
- */
- F_SET(dbp, DB_OPEN_CALLED);
-
- if (retinfo & DB_FILE_SETUP_ZERO)
- return (0);
-
- switch (dbp->type) {
- case DB_BTREE:
- ret = __bam_open(dbp, name, meta_pgno, flags);
- break;
- case DB_HASH:
- ret = __ham_open(dbp, name, meta_pgno, flags);
- break;
- case DB_RECNO:
- ret = __ram_open(dbp, name, meta_pgno, flags);
- break;
- case DB_QUEUE:
- ret = __qam_open(dbp, name, meta_pgno, mode, flags);
- break;
- case DB_UNKNOWN:
- return (__db_unknown_type(dbp->dbenv,
- "__db_dbopen", dbp->type));
- break;
- }
- return (ret);
-}
/*
* __db_master_open --
* Open up a handle on a master database.
*
* PUBLIC: int __db_master_open __P((DB *,
- * PUBLIC: const char *, u_int32_t, int, DB **));
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **));
*/
int
-__db_master_open(subdbp, name, flags, mode, dbpp)
+__db_master_open(subdbp, txn, name, flags, mode, dbpp)
DB *subdbp;
+ DB_TXN *txn;
const char *name;
u_int32_t flags;
int mode;
@@ -417,30 +106,62 @@ __db_master_open(subdbp, name, flags, mode, dbpp)
* Flag that we're creating a database with subdatabases.
*/
dbp->type = DB_BTREE;
- dbp->open_txn = subdbp->open_txn;
dbp->pgsize = subdbp->pgsize;
F_SET(dbp, DB_AM_SUBDB);
+ F_SET(dbp, F_ISSET(subdbp,
+ DB_AM_RECOVER | DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM));
- if ((ret = __db_dbopen(dbp, name, flags, mode, PGNO_BASE_MD)) != 0) {
- if (!F_ISSET(dbp, DB_AM_DISCARD))
- dbp->close(dbp, 0);
- return (ret);
- }
+ /*
+ * If there was a subdb specified, then we only want to apply
+ * DB_EXCL to the subdb, not the actual file. We only got here
+ * because there was a subdb specified.
+ */
+ LF_CLR(DB_EXCL);
+ LF_SET(DB_RDWRMASTER);
+ if ((ret = __db_dbopen(dbp, txn, name, NULL, flags, mode, PGNO_BASE_MD))
+ != 0)
+ goto err;
- *dbpp = dbp;
- return (0);
+ /*
+ * Verify that pagesize is the same on both.
+ * The items in dbp were now initialized from the meta
+ * page. The items in dbp were set in __db_dbopen
+ * when we either read or created the master file.
+ * Other items such as checksum and encryption are
+ * checked when we read the meta-page. So we do not
+ * check those here. However, if the meta-page caused
+ * chksumming to be turned on and it wasn't already, set
+ * it here.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ F_SET(subdbp, DB_AM_CHKSUM);
+ if (subdbp->pgsize != 0 && dbp->pgsize != subdbp->pgsize) {
+ ret = EINVAL;
+ __db_err(dbp->dbenv,
+ "Different pagesize specified on existent file");
+ goto err;
+ }
+err:
+ if (ret != 0 && !F_ISSET(dbp, DB_AM_DISCARD))
+ __db_close_i(dbp, txn, 0);
+ else
+ *dbpp = dbp;
+ return (ret);
}
/*
* __db_master_update --
- * Add/Remove a subdatabase from a master database.
+ * Add/Open/Remove a subdatabase from a master database.
+ *
+ * PUBLIC: int __db_master_update __P((DB *, DB *, DB_TXN *, const char *,
+ * PUBLIC: DBTYPE, mu_action, const char *, u_int32_t));
*/
-static int
-__db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
- DB *mdbp;
+int
+__db_master_update(mdbp, sdbp, txn, subdb, type, action, newname, flags)
+ DB *mdbp, *sdbp;
+ DB_TXN *txn;
const char *subdb;
- u_int32_t type;
- db_pgno_t *meta_pgnop; /* may be NULL on MU_RENAME */
+ DBTYPE type;
mu_action action;
const char *newname;
u_int32_t flags;
@@ -456,33 +177,37 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
dbc = ndbc = NULL;
p = NULL;
- /* Might we modify the master database? If so, we'll need to lock. */
- modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0;
-
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
+ /* Might we modify the master database? If so, we'll need to lock. */
+ modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0;
+
/*
* Open up a cursor. If this is CDB and we're creating the database,
* make it an update cursor.
*/
- if ((ret = mdbp->cursor(mdbp, mdbp->open_txn, &dbc,
+ if ((ret = mdbp->cursor(mdbp, txn, &dbc,
(CDB_LOCKING(dbenv) && modify) ? DB_WRITECURSOR : 0)) != 0)
goto err;
/*
- * Try to point the cursor at the record.
+ * Point the cursor at the record.
*
* If we're removing or potentially creating an entry, lock the page
* with DB_RMW.
*
+ * We do multiple cursor operations with the cursor in some cases and
+ * subsequently access the data DBT information. Set DB_DBT_MALLOC so
+ * we don't risk modification of the data between our uses of it.
+ *
* !!!
* We don't include the name's nul termination in the database.
*/
- key.data = (char *)subdb;
- key.size = strlen(subdb);
- /* In the rename case, we do multiple cursor ops, so MALLOC is safer. */
+ key.data = (void *)subdb;
+ key.size = (u_int32_t)strlen(subdb);
F_SET(&data, DB_DBT_MALLOC);
+
ret = dbc->c_get(dbc, &key, &data,
DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
@@ -514,9 +239,10 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
* so it hasn't been converted to/from opposite
* endian architectures. Do it explicitly, now.
*/
- memcpy(meta_pgnop, data.data, sizeof(db_pgno_t));
- DB_NTOHL(meta_pgnop);
- if ((ret = memp_fget(mdbp->mpf, meta_pgnop, 0, &p)) != 0)
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL(&sdbp->meta_pgno);
+ if ((ret =
+ mdbp->mpf->get(mdbp->mpf, &sdbp->meta_pgno, 0, &p)) != 0)
goto err;
/* Free and put the page. */
@@ -538,11 +264,11 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
* for the existence of newname; it shouldn't appear under
* us since we hold the metadata lock.
*/
- if ((ret = mdbp->cursor(mdbp, mdbp->open_txn, &ndbc, 0)) != 0)
+ if ((ret = mdbp->cursor(mdbp, txn, &ndbc, 0)) != 0)
goto err;
DB_ASSERT(newname != NULL);
- key.data = (void *) newname;
- key.size = strlen(newname);
+ key.data = (void *)newname;
+ key.size = (u_int32_t)strlen(newname);
/*
* We don't actually care what the meta page of the potentially-
@@ -583,8 +309,12 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
*/
switch (ret) {
case 0:
- memcpy(meta_pgnop, data.data, sizeof(db_pgno_t));
- DB_NTOHL(meta_pgnop);
+ if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+ ret = EEXIST;
+ goto err;
+ }
+ memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+ DB_NTOHL(&sdbp->meta_pgno);
goto done;
case DB_NOTFOUND:
if (LF_ISSET(DB_CREATE))
@@ -599,10 +329,22 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
goto err;
}
+ /*
+ * We need to check against the master lorder here because
+ * we only want to check this if we are creating. In the
+ * case where we don't create we just want to inherit.
+ */
+ if (F_ISSET(mdbp, DB_AM_SWAP) != F_ISSET(sdbp, DB_AM_SWAP)) {
+ ret = EINVAL;
+ __db_err(mdbp->dbenv,
+ "Different lorder specified on existent file");
+ goto err;
+ }
+ /* Create a subdatabase. */
if ((ret = __db_new(dbc,
type == DB_HASH ? P_HASHMETA : P_BTREEMETA, &p)) != 0)
goto err;
- *meta_pgnop = PGNO(p);
+ sdbp->meta_pgno = PGNO(p);
/*
* XXX
@@ -617,6 +359,7 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags)
ndata.size = sizeof(db_pgno_t);
if ((ret = dbc->c_put(dbc, &key, &ndata, DB_KEYLAST)) != 0)
goto err;
+ F_SET(sdbp, DB_AM_CREATED);
break;
}
@@ -628,7 +371,7 @@ done: /*
if (p != NULL) {
if (ret == 0) {
if ((t_ret =
- memp_fput(mdbp->mpf, p, DB_MPOOL_DIRTY)) != 0)
+ mdbp->mpf->put(mdbp->mpf, p, DB_MPOOL_DIRTY)) != 0)
ret = t_ret;
/*
* Since we cannot close this file until after
@@ -639,12 +382,12 @@ done: /*
if ((t_ret = mdbp->sync(mdbp, 0)) != 0 && ret == 0)
ret = t_ret;
} else
- (void)__db_free(dbc, p);
+ (void)mdbp->mpf->put(mdbp->mpf, p, 0);
}
/* Discard the cursor(s) and data. */
if (data.data != NULL)
- __os_free(data.data, data.size);
+ __os_ufree(dbenv, data.data);
if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
if (ndbc != NULL && (t_ret = ndbc->c_close(ndbc)) != 0 && ret == 0)
@@ -657,21 +400,25 @@ done: /*
* __db_dbenv_setup --
* Set up the underlying environment during a db_open.
*
- * PUBLIC: int __db_dbenv_setup __P((DB *, const char *, u_int32_t));
+ * PUBLIC: int __db_dbenv_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, u_int32_t));
*/
int
-__db_dbenv_setup(dbp, name, flags)
+__db_dbenv_setup(dbp, txn, name, id, flags)
DB *dbp;
+ DB_TXN *txn;
const char *name;
+ u_int32_t id;
u_int32_t flags;
{
DB *ldbp;
- DB_ENV *dbenv;
DBT pgcookie;
- DB_MPOOL_FINFO finfo;
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *mpf;
DB_PGINFO pginfo;
- int ret;
u_int32_t maxid;
+ int ftype, ret;
dbenv = dbp->dbenv;
@@ -690,8 +437,18 @@ __db_dbenv_setup(dbp, name, flags)
}
/* Register DB's pgin/pgout functions. */
- if ((ret =
- memp_register(dbenv, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0)
+ if ((ret = dbenv->memp_register(
+ dbenv, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0)
+ return (ret);
+
+ /* Create the DB_MPOOLFILE structure. */
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbp->mpf, 0)) != 0)
+ return (ret);
+ mpf = dbp->mpf;
+
+ /* Set the database's cache priority if we've been given one. */
+ if (dbp->priority != 0 &&
+ (ret = mpf->set_priority(mpf, dbp->priority)) != 0)
return (ret);
/*
@@ -704,22 +461,26 @@ __db_dbenv_setup(dbp, name, flags)
* need to page the file in and out. This has to be right -- we can't
* mmap files that are being paged in and out.
*/
- memset(&finfo, 0, sizeof(finfo));
switch (dbp->type) {
case DB_BTREE:
case DB_RECNO:
- finfo.ftype =
- F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
- finfo.clear_len = DB_PAGE_DB_LEN;
+ ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+ ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+ (void)mpf->set_ftype(mpf, ftype);
+ (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ?
+ dbp->pgsize : DB_PAGE_DB_LEN));
break;
case DB_HASH:
- finfo.ftype = DB_FTYPE_SET;
- finfo.clear_len = DB_PAGE_DB_LEN;
+ (void)mpf->set_ftype(mpf, DB_FTYPE_SET);
+ (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ?
+ dbp->pgsize : DB_PAGE_DB_LEN));
break;
case DB_QUEUE:
- finfo.ftype =
- F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
- finfo.clear_len = DB_PAGE_QUEUE_LEN;
+ ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+ ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+ (void)mpf->set_ftype(mpf, ftype);
+ (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ?
+ dbp->pgsize : DB_PAGE_QUEUE_LEN));
break;
case DB_UNKNOWN:
/*
@@ -735,48 +496,63 @@ __db_dbenv_setup(dbp, name, flags)
* to salvage some data even with no metadata page.
*/
if (F_ISSET(dbp, DB_AM_VERIFYING)) {
- finfo.ftype = DB_FTYPE_NOTSET;
- finfo.clear_len = DB_PAGE_DB_LEN;
+ (void)mpf->set_ftype(mpf, DB_FTYPE_NOTSET);
+ (void)mpf->set_clear_len(mpf, DB_PAGE_DB_LEN);
break;
}
- return (__db_unknown_type(dbp->dbenv,
- "__db_dbenv_setup", dbp->type));
+ /* FALLTHROUGH */
+ default:
+ return (
+ __db_unknown_type(dbenv, "__db_dbenv_setup", dbp->type));
}
- finfo.pgcookie = &pgcookie;
- finfo.fileid = dbp->fileid;
- finfo.lsn_offset = 0;
+
+ (void)mpf->set_fileid(mpf, dbp->fileid);
+ (void)mpf->set_lsn_offset(mpf, 0);
pginfo.db_pagesize = dbp->pgsize;
- pginfo.needswap = F_ISSET(dbp, DB_AM_SWAP);
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
pgcookie.data = &pginfo;
pgcookie.size = sizeof(DB_PGINFO);
+ (void)mpf->set_pgcookie(mpf, &pgcookie);
- if ((ret = memp_fopen(dbenv, name,
- LF_ISSET(DB_RDONLY | DB_NOMMAP | DB_ODDFILESIZE | DB_TRUNCATE),
- 0, dbp->pgsize, &finfo, &dbp->mpf)) != 0)
+ if ((ret = mpf->open(mpf, name,
+ LF_ISSET(DB_RDONLY | DB_NOMMAP | DB_ODDFILESIZE | DB_TRUNCATE) |
+ (F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0),
+ 0, dbp->pgsize)) != 0)
return (ret);
/*
- * We may need a per-thread mutex. Allocate it from the environment
+ * We may need a per-thread mutex. Allocate it from the mpool
* region, there's supposed to be extra space there for that purpose.
*/
if (LF_ISSET(DB_THREAD)) {
- if ((ret = __db_mutex_alloc(
- dbenv, dbenv->reginfo, (MUTEX **)&dbp->mutexp)) != 0)
+ dbmp = dbenv->mp_handle;
+ if ((ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbp->mutexp,
+ MUTEX_ALLOC | MUTEX_THREAD)) != 0)
return (ret);
- if ((ret = __db_mutex_init(
- dbenv, dbp->mutexp, 0, MUTEX_THREAD)) != 0) {
- __db_mutex_free(dbenv, dbenv->reginfo, dbp->mutexp);
- return (ret);
- }
}
- /* Get a log file id. */
- if (LOGGING_ON(dbenv) && !IS_RECOVERING(dbenv) &&
+ /*
+ * Set up a bookkeeping entry for this database in the log region,
+ * if such a region exists. Note that even if we're in recovery
+ * or a replication client, where we won't log registries, we'll
+ * still need an FNAME struct, so LOGGING_ON is the correct macro.
+ */
+ if (LOGGING_ON(dbenv) &&
+ (ret = __dbreg_setup(dbp, name, id)) != 0)
+ return (ret);
+
+ /*
+ * If we're actively logging and our caller isn't a recovery function
+ * that already did so, assign this dbp a log fileid.
+ */
+ if (DBENV_LOGGING(dbenv) && !F_ISSET(dbp, DB_AM_RECOVER) &&
#if !defined(DEBUG_ROP)
!F_ISSET(dbp, DB_AM_RDONLY) &&
#endif
- (ret = log_register(dbenv, dbp, name)) != 0)
+ (ret = __dbreg_new_id(dbp, txn)) != 0)
return (ret);
/*
@@ -822,541 +598,69 @@ __db_dbenv_setup(dbp, name, flags)
}
/*
- * __db_file_setup --
- * Setup the file or in-memory data.
- * Read the database metadata and resolve it with our arguments.
+ * __db_close --
+ * DB destructor.
+ *
+ * PUBLIC: int __db_close __P((DB *, u_int32_t));
*/
-static int
-__db_file_setup(dbp, name, flags, mode, meta_pgno, retflags)
+int
+__db_close(dbp, flags)
DB *dbp;
- const char *name;
u_int32_t flags;
- int mode;
- db_pgno_t meta_pgno;
- int *retflags;
-{
- DB *mdb;
- DBT namedbt;
- DB_ENV *dbenv;
- DB_FH *fhp, fh;
- DB_LSN lsn;
- DB_TXN *txn;
- size_t nr;
- u_int32_t magic, oflags;
- int ret, retry_cnt, t_ret;
- char *real_name, mbuf[DBMETASIZE];
-
-#define IS_SUBDB_SETUP (meta_pgno != PGNO_BASE_MD)
-
- dbenv = dbp->dbenv;
- dbp->meta_pgno = meta_pgno;
- txn = NULL;
- *retflags = 0;
-
- /*
- * If we open a file handle and our caller is doing fcntl(2) locking,
- * we can't close it because that would discard the caller's lock.
- * Save it until we close the DB handle.
- */
- if (LF_ISSET(DB_FCNTL_LOCKING)) {
- if ((ret = __os_malloc(dbenv, sizeof(*fhp), NULL, &fhp)) != 0)
- return (ret);
- } else
- fhp = &fh;
- memset(fhp, 0, sizeof(*fhp));
-
- /*
- * If the file is in-memory, set up is simple. Otherwise, do the
- * hard work of opening and reading the file.
- *
- * If we have a file name, try and read the first page, figure out
- * what type of file it is, and initialize everything we can based
- * on that file's meta-data page.
- *
- * !!!
- * There's a reason we don't push this code down into the buffer cache.
- * The problem is that there's no information external to the file that
- * we can use as a unique ID. UNIX has dev/inode pairs, but they are
- * not necessarily unique after reboot, if the file was mounted via NFS.
- * Windows has similar problems, as the FAT filesystem doesn't maintain
- * dev/inode numbers across reboot. So, we must get something from the
- * file we can use to ensure that, even after a reboot, the file we're
- * joining in the cache is the right file for us to join. The solution
- * we use is to maintain a file ID that's stored in the database, and
- * that's why we have to open and read the file before calling into the
- * buffer cache.
- *
- * The secondary reason is that there's additional information that
- * we want to have before instantiating a file in the buffer cache:
- * the page size, file type (btree/hash), if swapping is required,
- * and flags (DB_RDONLY, DB_CREATE, DB_TRUNCATE). We could handle
- * needing this information by allowing it to be set for a file in
- * the buffer cache even after the file has been opened, and, of
- * course, supporting the ability to flush a file from the cache as
- * necessary, e.g., if we guessed wrongly about the page size. Given
- * that we have to read the file anyway to get the file ID, we might
- * as well get the rest, too.
- *
- * Get the real file name.
- */
- if (name == NULL) {
- F_SET(dbp, DB_AM_INMEM);
-
- if (dbp->type == DB_UNKNOWN) {
- __db_err(dbenv,
- "DBTYPE of unknown without existing file");
- return (EINVAL);
- }
- real_name = NULL;
-
- /* Set the page size if we don't have one yet. */
- if (dbp->pgsize == 0)
- dbp->pgsize = DB_DEF_IOSIZE;
-
- /*
- * If the file is a temporary file and we're doing locking,
- * then we have to create a unique file ID. We can't use our
- * normal dev/inode pair (or whatever this OS uses in place of
- * dev/inode pairs) because no backing file will be created
- * until the mpool cache is filled forcing the buffers to disk.
- * Grab a random locker ID to use as a file ID. The created
- * ID must never match a potential real file ID -- we know it
- * won't because real file IDs contain a time stamp after the
- * dev/inode pair, and we're simply storing a 4-byte value.
- *
- * !!!
- * Store the locker in the file id structure -- we can get it
- * from there as necessary, and it saves having two copies.
- */
- if (LOCKING_ON(dbenv) &&
- (ret = lock_id(dbenv, (u_int32_t *)dbp->fileid)) != 0)
- return (ret);
-
- return (0);
- }
-
- /* Get the real backing file name. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0)
- return (ret);
-
- /*
- * Open the backing file. We need to make sure that multiple processes
- * attempting to create the file at the same time are properly ordered
- * so that only one of them creates the "unique" file ID, so we open it
- * O_EXCL and O_CREAT so two simultaneous attempts to create the region
- * will return failure in one of the attempts. If we're the one that
- * fails, simply retry without the O_CREAT flag, which will require the
- * meta-data page exist.
- */
-
- /* Fill in the default file mode. */
- if (mode == 0)
- mode = __db_omode("rwrw--");
-
- oflags = 0;
- if (LF_ISSET(DB_RDONLY))
- oflags |= DB_OSO_RDONLY;
- if (LF_ISSET(DB_TRUNCATE))
- oflags |= DB_OSO_TRUNC;
-
- retry_cnt = 0;
-open_retry:
- *retflags = 0;
- ret = 0;
- if (!IS_SUBDB_SETUP && LF_ISSET(DB_CREATE)) {
- if (dbp->open_txn != NULL) {
- /*
- * Start a child transaction to wrap this individual
- * create.
- */
- if ((ret =
- txn_begin(dbenv, dbp->open_txn, &txn, 0)) != 0)
- goto err_msg;
-
- memset(&namedbt, 0, sizeof(namedbt));
- namedbt.data = (char *)name;
- namedbt.size = strlen(name) + 1;
- if ((ret = __crdel_fileopen_log(dbenv, txn,
- &lsn, DB_FLUSH, &namedbt, mode)) != 0)
- goto err_msg;
- }
- DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, name);
- if ((ret = __os_open(dbenv, real_name,
- oflags | DB_OSO_CREATE | DB_OSO_EXCL, mode, fhp)) == 0) {
- DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name);
-
- /* Commit the file create. */
- if (dbp->open_txn != NULL) {
- if ((ret = txn_commit(txn, DB_TXN_SYNC)) != 0)
- goto err_msg;
- txn = NULL;
- }
-
- /*
- * We created the file. This means that if we later
- * fail, we need to delete the file and if we're going
- * to do that, we need to trash any pages in the
- * memory pool. Since we only know here that we
- * created the file, we're going to set the flag here
- * and clear it later if we commit successfully.
- */
- F_SET(dbp, DB_AM_DISCARD);
- *retflags |= DB_FILE_SETUP_CREATE;
- } else {
- /*
- * Abort the file create. If the abort fails, report
- * the error returned by txn_abort(), rather than the
- * open error, for no particular reason.
- */
- if (dbp->open_txn != NULL) {
- if ((t_ret = txn_abort(txn)) != 0) {
- ret = t_ret;
- goto err_msg;
- }
- txn = NULL;
- }
-
- /*
- * If we were not doing an exclusive open, try again
- * without the create flag.
- */
- if (ret == EEXIST && !LF_ISSET(DB_EXCL)) {
- LF_CLR(DB_CREATE);
- DB_TEST_RECOVERY(dbp,
- DB_TEST_POSTOPEN, ret, name);
- goto open_retry;
- }
- }
- } else
- ret = __os_open(dbenv, real_name, oflags, mode, fhp);
-
- /*
- * Be quiet if we couldn't open the file because it didn't exist
- * or we did not have permission,
- * the customers don't like those messages appearing in the logs.
- * Otherwise, complain loudly.
- */
- if (ret != 0) {
- if (ret == EACCES || ret == ENOENT)
- goto err;
- goto err_msg;
- }
-
- /* Set the page size if we don't have one yet. */
- if (dbp->pgsize == 0) {
- if (IS_SUBDB_SETUP) {
- if ((ret = __db_master_open(dbp,
- name, flags, mode, &mdb)) != 0)
- goto err;
- dbp->pgsize = mdb->pgsize;
- (void)mdb->close(mdb, 0);
- } else if ((ret = __db_set_pgsize(dbp, fhp, real_name)) != 0)
- goto err;
- }
-
- /*
- * Seek to the metadata offset; if it's a master database open or a
- * database without subdatabases, we're seeking to 0, but that's OK.
- */
- if ((ret = __os_seek(dbenv, fhp,
- dbp->pgsize, meta_pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
- goto err_msg;
-
- /*
- * Read the metadata page. We read DBMETASIZE bytes, which is larger
- * than any access method's metadata page and smaller than any disk
- * sector.
- */
- if ((ret = __os_read(dbenv, fhp, mbuf, sizeof(mbuf), &nr)) != 0)
- goto err_msg;
-
- if (nr == sizeof(mbuf)) {
- /*
- * Figure out what access method we're dealing with, and then
- * call access method specific code to check error conditions
- * based on conflicts between the found file and application
- * arguments. A found file overrides some user information --
- * we don't consider it an error, for example, if the user set
- * an expected byte order and the found file doesn't match it.
- */
- F_CLR(dbp, DB_AM_SWAP);
- magic = ((DBMETA *)mbuf)->magic;
-
-swap_retry: switch (magic) {
- case DB_BTREEMAGIC:
- if ((ret =
- __bam_metachk(dbp, name, (BTMETA *)mbuf)) != 0)
- goto err;
- break;
- case DB_HASHMAGIC:
- if ((ret =
- __ham_metachk(dbp, name, (HMETA *)mbuf)) != 0)
- goto err;
- break;
- case DB_QAMMAGIC:
- if ((ret =
- __qam_metachk(dbp, name, (QMETA *)mbuf)) != 0)
- goto err;
- break;
- case 0:
- /*
- * There are two ways we can get a 0 magic number.
- * If we're creating a subdatabase, then the magic
- * number will be 0. We allocate a page as part of
- * finding out what the base page number will be for
- * the new subdatabase, but it's not initialized in
- * any way.
- *
- * The second case happens if we are in recovery
- * and we are going to recreate a database, it's
- * possible that its page was created (on systems
- * where pages must be created explicitly to avoid
- * holes in files) but is still 0.
- */
- if (IS_SUBDB_SETUP) { /* Case 1 */
- if ((IS_RECOVERING(dbenv)
- && F_ISSET((DB_LOG *)
- dbenv->lg_handle, DBLOG_FORCE_OPEN))
- || ((DBMETA *)mbuf)->pgno != PGNO_INVALID)
- goto empty;
-
- ret = EINVAL;
- goto err;
- }
- /* Case 2 */
- if (IS_RECOVERING(dbenv)) {
- *retflags |= DB_FILE_SETUP_ZERO;
- goto empty;
- }
- goto bad_format;
- default:
- if (F_ISSET(dbp, DB_AM_SWAP))
- goto bad_format;
-
- M_32_SWAP(magic);
- F_SET(dbp, DB_AM_SWAP);
- goto swap_retry;
- }
- } else {
- /*
- * Only newly created files are permitted to fail magic
- * number tests.
- */
- if (nr != 0 || (!IS_RECOVERING(dbenv) && IS_SUBDB_SETUP))
- goto bad_format;
-
- /* Let the caller know that we had a 0-length file. */
- if (!LF_ISSET(DB_CREATE | DB_TRUNCATE))
- *retflags |= DB_FILE_SETUP_ZERO;
-
- /*
- * The only way we can reach here with the DB_CREATE flag set
- * is if we created the file. If that's not the case, then
- * either (a) someone else created the file but has not yet
- * written out the metadata page, or (b) we truncated the file
- * (DB_TRUNCATE) leaving it zero-length. In the case of (a),
- * we want to sleep and give the file creator time to write
- * the metadata page. In the case of (b), we want to continue.
- *
- * !!!
- * There's a race in the case of two processes opening the file
- * with the DB_TRUNCATE flag set at roughly the same time, and
- * they could theoretically hurt each other. Sure hope that's
- * unlikely.
- */
- if (!LF_ISSET(DB_CREATE | DB_TRUNCATE) &&
- !IS_RECOVERING(dbenv)) {
- if (retry_cnt++ < 3) {
- __os_sleep(dbenv, 1, 0);
- goto open_retry;
- }
-bad_format: if (!IS_RECOVERING(dbenv))
- __db_err(dbenv,
- "%s: unexpected file type or format", name);
- ret = EINVAL;
- goto err;
- }
-
- DB_ASSERT (dbp->type != DB_UNKNOWN);
-
-empty: /*
- * The file is empty, and that's OK. If it's not a subdatabase,
- * though, we do need to generate a unique file ID for it. The
- * unique file ID includes a timestamp so that we can't collide
- * with any other files, even when the file IDs (dev/inode pair)
- * are reused.
- */
- if (!IS_SUBDB_SETUP) {
- if (*retflags & DB_FILE_SETUP_ZERO)
- memset(dbp->fileid, 0, DB_FILE_ID_LEN);
- else if ((ret = __os_fileid(dbenv,
- real_name, 1, dbp->fileid)) != 0)
- goto err_msg;
- }
- }
-
- if (0) {
-err_msg: __db_err(dbenv, "%s: %s", name, db_strerror(ret));
- }
-
- /*
- * Abort any running transaction -- it can only exist if something
- * went wrong.
- */
-err:
-DB_TEST_RECOVERY_LABEL
-
- /*
- * If we opened a file handle and our caller is doing fcntl(2) locking,
- * then we can't close it because that would discard the caller's lock.
- * Otherwise, close the handle.
- */
- if (F_ISSET(fhp, DB_FH_VALID)) {
- if (ret == 0 && LF_ISSET(DB_FCNTL_LOCKING))
- dbp->saved_open_fhp = fhp;
- else
- if ((t_ret = __os_closehandle(fhp)) != 0 && ret == 0)
- ret = t_ret;
- }
-
- /*
- * This must be done after the file is closed, since
- * txn_abort() may remove the file, and an open file
- * cannot be removed on Windows platforms.
- */
- if (txn != NULL)
- (void)txn_abort(txn);
-
- if (real_name != NULL)
- __os_freestr(real_name);
-
- return (ret);
-}
-
-/*
- * __db_set_pgsize --
- * Set the page size based on file information.
- */
-static int
-__db_set_pgsize(dbp, fhp, name)
- DB *dbp;
- DB_FH *fhp;
- char *name;
{
DB_ENV *dbenv;
- u_int32_t iopsize;
- int ret;
dbenv = dbp->dbenv;
- /*
- * Use the filesystem's optimum I/O size as the pagesize if a pagesize
- * not specified. Some filesystems have 64K as their optimum I/O size,
- * but as that results in fairly large default caches, we limit the
- * default pagesize to 16K.
- */
- if ((ret = __os_ioinfo(dbenv, name, fhp, NULL, NULL, &iopsize)) != 0) {
- __db_err(dbenv, "%s: %s", name, db_strerror(ret));
- return (ret);
- }
- if (iopsize < 512)
- iopsize = 512;
- if (iopsize > 16 * 1024)
- iopsize = 16 * 1024;
-
- /*
- * Sheer paranoia, but we don't want anything that's not a power-of-2
- * (we rely on that for alignment of various types on the pages), and
- * we want a multiple of the sector size as well.
- */
- OS_ROUNDOFF(iopsize, 512);
+ PANIC_CHECK(dbenv);
- dbp->pgsize = iopsize;
- F_SET(dbp, DB_AM_PGDEF);
+ /* Validate arguments, but as a DB handle destructor, we can't fail. */
+ if (flags != 0 && flags != DB_NOSYNC)
+ (void)__db_ferr(dbenv, "DB->close", 0);
- return (0);
+ return (__db_close_i(dbp, NULL, flags));
}
/*
- * __db_close --
- * DB destructor.
+ * __db_close_i --
+ * Internal DB destructor.
*
- * PUBLIC: int __db_close __P((DB *, u_int32_t));
+ * PUBLIC: int __db_close_i __P((DB *, DB_TXN *, u_int32_t));
*/
int
-__db_close(dbp, flags)
+__db_close_i(dbp, txn, flags)
DB *dbp;
+ DB_TXN *txn;
u_int32_t flags;
{
DB_ENV *dbenv;
- DBC *dbc;
int ret, t_ret;
- ret = 0;
-
dbenv = dbp->dbenv;
- PANIC_CHECK(dbenv);
-
- /* Validate arguments. */
- if ((ret = __db_closechk(dbp, flags)) != 0)
- goto err;
-
- /* If never opened, or not currently open, it's easy. */
- if (!F_ISSET(dbp, DB_OPEN_CALLED))
- goto never_opened;
-
- /* Sync the underlying access method. */
- if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) &&
- (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0)
- ret = t_ret;
-
- /*
- * Go through the active cursors and call the cursor recycle routine,
- * which resolves pending operations and moves the cursors onto the
- * free list. Then, walk the free list and call the cursor destroy
- * routine.
- */
- while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
- if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
- ret = t_ret;
- while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
- if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0)
- ret = t_ret;
+ ret = 0;
/*
- * Close any outstanding join cursors. Join cursors destroy
- * themselves on close and have no separate destroy routine.
+ * Validate arguments, but as a DB handle destructor, we can't fail.
+ *
+ * Check for consistent transaction usage -- ignore errors. Only
+ * internal callers specify transactions, so it's a serious problem
+ * if we get error messages.
*/
- while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
- if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
- ret = t_ret;
-
- /* Remove this DB handle from the DB_ENV's dblist. */
- MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
- LIST_REMOVE(dbp, dblistlinks);
- MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
-
- /* Sync the memory pool. */
- if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) &&
- (t_ret = memp_fsync(dbp->mpf)) != 0 &&
- t_ret != DB_INCOMPLETE && ret == 0)
- ret = t_ret;
+ if (txn != NULL)
+ (void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0);
- /* Close any handle we've been holding since the open. */
- if (dbp->saved_open_fhp != NULL &&
- F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) &&
- (t_ret = __os_closehandle(dbp->saved_open_fhp)) != 0 && ret == 0)
+ /* Refresh the structure and close any local environment. */
+ if ((t_ret = __db_refresh(dbp, txn, flags)) != 0 && ret == 0)
ret = t_ret;
-never_opened:
/*
* Call the access specific close function.
*
* !!!
- * Because of where the function is called in the close process,
- * these routines can't do anything that would dirty pages or
- * otherwise affect closing down the database.
+ * Because of where these functions are called in the DB handle close
+ * process, these routines can't do anything that would dirty pages or
+ * otherwise affect closing down the database. Specifically, we can't
+ * abort and recover any of the information they control.
*/
if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
ret = t_ret;
@@ -1365,17 +669,14 @@ never_opened:
if ((t_ret = __qam_db_close(dbp)) != 0 && ret == 0)
ret = t_ret;
-err:
- /* Refresh the structure and close any local environment. */
- if ((t_ret = __db_refresh(dbp)) != 0 && ret == 0)
- ret = t_ret;
- if (F_ISSET(dbenv, DB_ENV_DBLOCAL) &&
- --dbenv->dblocal_ref == 0 &&
+ --dbenv->db_ref;
+ if (F_ISSET(dbenv, DB_ENV_DBLOCAL) && dbenv->db_ref == 0 &&
(t_ret = dbenv->close(dbenv, 0)) != 0 && ret == 0)
ret = t_ret;
+ /* Free the database handle. */
memset(dbp, CLEAR_BYTE, sizeof(*dbp));
- __os_free(dbp, sizeof(*dbp));
+ __os_free(dbenv, dbp);
return (ret);
}
@@ -1383,653 +684,257 @@ err:
/*
* __db_refresh --
* Refresh the DB structure, releasing any allocated resources.
+ * This does most of the work of closing files now because refresh
+ * is what is used during abort processing (since we can't destroy
+ * the actual handle) and during abort processing, we may have a
+ * fully opened handle.
+ *
+ * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t));
*/
-static int
-__db_refresh(dbp)
+int
+__db_refresh(dbp, txn, flags)
DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
{
- DB_ENV *dbenv;
+ DB *sdbp;
DBC *dbc;
+ DB_ENV *dbenv;
+ DB_LOCKREQ lreq;
+ DB_MPOOL *dbmp;
int ret, t_ret;
ret = 0;
dbenv = dbp->dbenv;
+ /* If never opened, or not currently open, it's easy. */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ goto never_opened;
+
/*
- * Go through the active cursors and call the cursor recycle routine,
- * which resolves pending operations and moves the cursors onto the
- * free list. Then, walk the free list and call the cursor destroy
- * routine.
+ * If we have any secondary indices, disassociate them from us.
+ * We don't bother with the mutex here; it only protects some
+ * of the ops that will make us core-dump mid-close anyway, and
+ * if you're trying to do something with a secondary *while* you're
+ * closing the primary, you deserve what you get. The disassociation
+ * is mostly done just so we can close primaries and secondaries in
+ * any order--but within one thread of control.
*/
- while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
- if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
- ret = t_ret;
- while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
- if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0)
+ for (sdbp = LIST_FIRST(&dbp->s_secondaries);
+ sdbp != NULL; sdbp = LIST_NEXT(sdbp, s_links)) {
+ LIST_REMOVE(sdbp, s_links);
+ if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
ret = t_ret;
-
- dbp->type = 0;
-
- /* Close the memory pool file handle. */
- if (dbp->mpf != NULL) {
- if (F_ISSET(dbp, DB_AM_DISCARD))
- (void)__memp_fremove(dbp->mpf);
- if ((t_ret = memp_fclose(dbp->mpf)) != 0 && ret == 0)
- ret = t_ret;
- dbp->mpf = NULL;
}
- /* Discard the thread mutex. */
- if (dbp->mutexp != NULL) {
- __db_mutex_free(dbenv, dbenv->reginfo, dbp->mutexp);
- dbp->mutexp = NULL;
- }
-
- /* Discard the log file id. */
- if (!IS_RECOVERING(dbenv)
- && dbp->log_fileid != DB_LOGFILEID_INVALID)
- (void)log_unregister(dbenv, dbp);
-
- F_CLR(dbp, DB_AM_DISCARD);
- F_CLR(dbp, DB_AM_INMEM);
- F_CLR(dbp, DB_AM_RDONLY);
- F_CLR(dbp, DB_AM_SWAP);
- F_CLR(dbp, DB_DBM_ERROR);
- F_CLR(dbp, DB_OPEN_CALLED);
-
- return (ret);
-}
-
-/*
- * __db_remove
- * Remove method for DB.
- *
- * PUBLIC: int __db_remove __P((DB *, const char *, const char *, u_int32_t));
- */
-int
-__db_remove(dbp, name, subdb, flags)
- DB *dbp;
- const char *name, *subdb;
- u_int32_t flags;
-{
- DBT namedbt;
- DB_ENV *dbenv;
- DB_LOCK remove_lock;
- DB_LSN newlsn;
- int ret, t_ret, (*callback_func) __P((DB *, void *));
- char *backup, *real_back, *real_name;
- void *cookie;
-
- dbenv = dbp->dbenv;
- ret = 0;
- backup = real_back = real_name = NULL;
-
- PANIC_CHECK(dbenv);
/*
- * Cannot use DB_ILLEGAL_AFTER_OPEN here because that returns
- * and we cannot return, but must deal with the error and destroy
- * the handle anyway.
+ * Sync the underlying access method. Do before closing the cursors
+ * because DB->sync allocates cursors in order to write Recno backing
+ * source text files.
*/
- if (F_ISSET(dbp, DB_OPEN_CALLED)) {
- ret = __db_mi_open(dbp->dbenv, "remove", 1);
- goto err_close;
- }
-
- /* Validate arguments. */
- if ((ret = __db_removechk(dbp, flags)) != 0)
- goto err_close;
+ if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) &&
+ (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0)
+ ret = t_ret;
/*
- * Subdatabases.
+ * Go through the active cursors and call the cursor recycle routine,
+ * which resolves pending operations and moves the cursors onto the
+ * free list. Then, walk the free list and call the cursor destroy
+ * routine. Note that any failure on a close is considered "really
+ * bad" and we just break out of the loop and force forward.
*/
- if (subdb != NULL) {
- /* Subdatabases must be created in named files. */
- if (name == NULL) {
- __db_err(dbenv,
- "multiple databases cannot be created in temporary files");
- goto err_close;
+ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+ if ((t_ret = dbc->c_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
}
- return (__db_subdb_remove(dbp, name, subdb));
- }
-
- if ((ret = dbp->open(dbp,
- name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0)) != 0)
- goto err_close;
-
- if (LOGGING_ON(dbenv) && (ret = __log_file_lock(dbp)) != 0)
- goto err_close;
- if ((ret = dbp->sync(dbp, 0)) != 0)
- goto err_close;
-
- /* Start the transaction and log the delete. */
- if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0)
- goto err_close;
-
- if (LOGGING_ON(dbenv)) {
- memset(&namedbt, 0, sizeof(namedbt));
- namedbt.data = (char *)name;
- namedbt.size = strlen(name) + 1;
-
- if ((ret = __crdel_delete_log(dbenv,
- dbp->open_txn, &newlsn, DB_FLUSH,
- dbp->log_fileid, &namedbt)) != 0) {
- __db_err(dbenv,
- "%s: %s", name, db_strerror(ret));
- goto err;
+ while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((t_ret = __db_c_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
}
- }
-
- /* Find the real name of the file. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0)
- goto err;
/*
- * XXX
- * We don't bother to open the file and call __memp_fremove on the mpf.
- * There is a potential race here. It is at least possible that, if
- * the unique filesystem ID (dev/inode pair on UNIX) is reallocated
- * within a second (the granularity of the fileID timestamp), a new
- * file open will get the same fileID as the file being "removed".
- * We may actually want to open the file and call __memp_fremove on
- * the mpf to get around this.
- */
-
- /* Create name for backup file. */
- if (TXN_ON(dbenv)) {
- if ((ret =
- __db_backup_name(dbenv, name, &backup, &newlsn)) != 0)
- goto err;
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0)
- goto err;
- }
-
- callback_func = __db_remove_callback;
- cookie = real_back;
- DB_TEST_RECOVERY(dbp, DB_TEST_PRERENAME, ret, name);
- if (dbp->db_am_remove != NULL &&
- (ret = dbp->db_am_remove(dbp,
- name, subdb, &newlsn, &callback_func, &cookie)) != 0)
- goto err;
- /*
- * On Windows, the underlying file must be closed to perform a remove.
- * Nothing later in __db_remove requires that it be open, and the
- * dbp->close closes it anyway, so we just close it early.
+ * Close any outstanding join cursors. Join cursors destroy
+ * themselves on close and have no separate destroy routine.
*/
- (void)__memp_fremove(dbp->mpf);
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto err;
- dbp->mpf = NULL;
-
- if (TXN_ON(dbenv))
- ret = __os_rename(dbenv, real_name, real_back);
- else
- ret = __os_unlink(dbenv, real_name);
-
- DB_TEST_RECOVERY(dbp, DB_TEST_POSTRENAME, ret, name);
+ while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
+ if ((t_ret = dbc->c_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
-err:
-DB_TEST_RECOVERY_LABEL
/*
- * End the transaction, committing the transaction if we were
- * successful, aborting otherwise.
+ * Sync the memory pool, even though we've already called DB->sync,
+ * because closing cursors can dirty pages by deleting items they
+ * referenced.
*/
- if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp, &remove_lock,
- ret == 0, callback_func, cookie)) != 0 && ret == 0)
+ if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) &&
+ (t_ret = dbp->mpf->sync(dbp->mpf)) != 0 && ret == 0)
ret = t_ret;
- /* FALLTHROUGH */
-
-err_close:
- if (real_back != NULL)
- __os_freestr(real_back);
- if (real_name != NULL)
- __os_freestr(real_name);
- if (backup != NULL)
- __os_freestr(backup);
-
- /* We no longer have an mpool, so syncing would be disastrous. */
- if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0)
+ /* Close any handle we've been holding since the open. */
+ if (dbp->saved_open_fhp != NULL &&
+ F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) &&
+ (t_ret = __os_closehandle(dbenv, dbp->saved_open_fhp)) != 0 &&
+ ret == 0)
ret = t_ret;
- return (ret);
-}
-
-/*
- * __db_subdb_remove --
- * Remove a subdatabase.
- */
-static int
-__db_subdb_remove(dbp, name, subdb)
- DB *dbp;
- const char *name, *subdb;
-{
- DB *mdbp;
- DBC *dbc;
- DB_ENV *dbenv;
- DB_LOCK remove_lock;
- db_pgno_t meta_pgno;
- int ret, t_ret;
-
- mdbp = NULL;
- dbc = NULL;
- dbenv = dbp->dbenv;
-
- /* Start the transaction. */
- if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0)
- goto err_close;
-
+never_opened:
/*
- * Open the subdatabase. We can use the user's DB handle for this
- * purpose, I think.
+ * We are not releasing the handle lock here because we're about
+ * to release all locks held by dbp->lid below. There are two
+ * ways that we can get in here with a handle_lock, but not a
+ * dbp->lid. The first is when our lid has been hijacked by a
+ * subdb. The second is when we are a Queue database in the midst
+ * of a rename. If the queue file hasn't actually been opened, we
+ * hijack the main dbp's locker id to do the open so we can get the
+ * extent files. In both cases, we needn't free the handle lock
+ * because it will be freed when the hijacked locker-id is freed.
*/
- if ((ret = __db_open(dbp, name, subdb, DB_UNKNOWN, 0, 0)) != 0)
- goto err;
+ DB_ASSERT(!LOCK_ISSET(dbp->handle_lock) ||
+ dbp->lid != DB_LOCK_INVALIDID ||
+ dbp->type == DB_QUEUE ||
+ F_ISSET(dbp, DB_AM_SUBDB));
+
+ if (dbp->lid != DB_LOCK_INVALIDID) {
+ /* We may have pending trade operations on this dbp. */
+ if (txn != NULL)
+ __txn_remlock(dbenv, txn, &dbp->handle_lock, dbp->lid);
+
+ /* We may be holding the handle lock; release it. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ if ((t_ret = __lock_vec(dbenv,
+ dbp->lid, 0, &lreq, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
- /* Free up the pages in the subdatabase. */
- switch (dbp->type) {
- case DB_BTREE:
- case DB_RECNO:
- if ((ret = __bam_reclaim(dbp, dbp->open_txn)) != 0)
- goto err;
- break;
- case DB_HASH:
- if ((ret = __ham_reclaim(dbp, dbp->open_txn)) != 0)
- goto err;
- break;
- default:
- ret = __db_unknown_type(dbp->dbenv,
- "__db_subdb_remove", dbp->type);
- goto err;
+ if ((t_ret =
+ dbenv->lock_id_free(dbenv, dbp->lid)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->lid = DB_LOCK_INVALIDID;
+ LOCK_INIT(dbp->handle_lock);
}
- /*
- * Remove the entry from the main database and free the subdatabase
- * metadata page.
- */
- if ((ret = __db_master_open(dbp, name, 0, 0, &mdbp)) != 0)
- goto err;
-
- if ((ret = __db_master_update(mdbp,
- subdb, dbp->type, &meta_pgno, MU_REMOVE, NULL, 0)) != 0)
- goto err;
-
-err: /*
- * End the transaction, committing the transaction if we were
- * successful, aborting otherwise.
- */
- if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp,
- &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0)
+ /* Discard the locker ID allocated as the fileid. */
+ if (F_ISSET(dbp, DB_AM_INMEM) &&
+ LOCKING_ON(dbenv) && (t_ret = dbenv->lock_id_free(
+ dbenv, *(u_int32_t *)dbp->fileid)) != 0 && ret == 0)
ret = t_ret;
-err_close:
- /*
- * Close the user's DB handle -- do this LAST to avoid smashing the
- * the transaction information.
- */
- if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
- ret = t_ret;
-
- if (mdbp != NULL && (t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0)
- ret = t_ret;
+ dbp->type = DB_UNKNOWN;
- return (ret);
-}
-
-/*
- * __db_rename
- * Rename method for DB.
- *
- * PUBLIC: int __db_rename __P((DB *,
- * PUBLIC: const char *, const char *, const char *, u_int32_t));
- */
-int
-__db_rename(dbp, filename, subdb, newname, flags)
- DB *dbp;
- const char *filename, *subdb, *newname;
- u_int32_t flags;
-{
- DBT namedbt, newnamedbt;
- DB_ENV *dbenv;
- DB_LOCK remove_lock;
- DB_LSN newlsn;
- char *real_name, *real_newname;
- int ret, t_ret;
-
- dbenv = dbp->dbenv;
- ret = 0;
- real_name = real_newname = NULL;
-
- PANIC_CHECK(dbenv);
- /*
- * Cannot use DB_ILLEGAL_AFTER_OPEN here because that returns
- * and we cannot return, but must deal with the error and destroy
- * the handle anyway.
- */
- if (F_ISSET(dbp, DB_OPEN_CALLED)) {
- ret = __db_mi_open(dbp->dbenv, "rename", 1);
- goto err_close;
+ /* Discard the thread mutex. */
+ if (dbp->mutexp != NULL) {
+ dbmp = dbenv->mp_handle;
+ __db_mutex_free(dbenv, dbmp->reginfo, dbp->mutexp);
+ dbp->mutexp = NULL;
}
- /* Validate arguments -- has same rules as remove. */
- if ((ret = __db_removechk(dbp, flags)) != 0)
- goto err_close;
+ /* Discard any memory used to store returned data. */
+ if (dbp->my_rskey.data != NULL)
+ __os_free(dbp->dbenv, dbp->my_rskey.data);
+ if (dbp->my_rkey.data != NULL)
+ __os_free(dbp->dbenv, dbp->my_rkey.data);
+ if (dbp->my_rdata.data != NULL)
+ __os_free(dbp->dbenv, dbp->my_rdata.data);
+
+ /* For safety's sake; we may refresh twice. */
+ memset(&dbp->my_rskey, 0, sizeof(DBT));
+ memset(&dbp->my_rkey, 0, sizeof(DBT));
+ memset(&dbp->my_rdata, 0, sizeof(DBT));
/*
- * Subdatabases.
+ * Remove this DB handle from the DB_ENV's dblist, if it's been added.
*/
- if (subdb != NULL) {
- if (filename == NULL) {
- __db_err(dbenv,
- "multiple databases cannot be created in temporary files");
- goto err_close;
- }
- return (__db_subdb_rename(dbp, filename, subdb, newname));
- }
-
- if ((ret = dbp->open(dbp,
- filename, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0)) != 0)
- goto err_close;
-
- if (LOGGING_ON(dbenv) && (ret = __log_file_lock(dbp)) != 0)
- goto err_close;
-
- if ((ret = dbp->sync(dbp, 0)) != 0)
- goto err_close;
-
- /* Start the transaction and log the rename. */
- if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0)
- goto err_close;
-
- if (LOGGING_ON(dbenv)) {
- memset(&namedbt, 0, sizeof(namedbt));
- namedbt.data = (char *)filename;
- namedbt.size = strlen(filename) + 1;
-
- memset(&newnamedbt, 0, sizeof(namedbt));
- newnamedbt.data = (char *)newname;
- newnamedbt.size = strlen(newname) + 1;
-
- if ((ret = __crdel_rename_log(dbenv, dbp->open_txn,
- &newlsn, 0, dbp->log_fileid, &namedbt, &newnamedbt)) != 0) {
- __db_err(dbenv, "%s: %s", filename, db_strerror(ret));
- goto err;
- }
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ if (dbp->dblistlinks.le_prev != NULL)
+ LIST_REMOVE(dbp, dblistlinks);
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+ dbp->dblistlinks.le_prev = NULL;
- if ((ret = __log_filelist_update(dbenv, dbp,
- dbp->log_fileid, newname, NULL)) != 0)
- goto err;
+ /* Close the memory pool file handle. */
+ if (dbp->mpf != NULL) {
+ if ((t_ret = dbp->mpf->close(dbp->mpf,
+ F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ dbp->mpf = NULL;
}
- /* Find the real name of the file. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, filename, 0, NULL, &real_name)) != 0)
- goto err;
-
- /* Find the real newname of the file. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, newname, 0, NULL, &real_newname)) != 0)
- goto err;
+ if (LOGGING_ON(dbp->dbenv)) {
+ /*
+ * Discard the log file id, if any. We want to log the close
+ * if and only if this is not a recovery dbp.
+ */
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ (void)__dbreg_revoke_id(dbp, 0);
+ else
+ (void)__dbreg_close_id(dbp, txn);
- /*
- * It is an error to rename a file over one that already exists,
- * as that wouldn't be transaction-safe.
- */
- if (__os_exists(real_newname, NULL) == 0) {
- ret = EEXIST;
- __db_err(dbenv, "rename: file %s exists", real_newname);
- goto err;
+ /* Discard the log FNAME. */
+ (void)__dbreg_teardown(dbp);
}
- DB_TEST_RECOVERY(dbp, DB_TEST_PRERENAME, ret, filename);
- if (dbp->db_am_rename != NULL &&
- (ret = dbp->db_am_rename(dbp, filename, subdb, newname)) != 0)
- goto err;
- /*
- * We have to flush the cache for a couple of reasons. First, the
- * underlying MPOOLFILE maintains a "name" that unrelated processes
- * can use to open the file in order to flush pages, and that name
- * is about to be wrong. Second, on Windows the unique file ID is
- * generated from the file's name, not other file information as is
- * the case on UNIX, and so a subsequent open of the old file name
- * could conceivably result in a matching "unique" file ID.
- */
- if ((ret = __memp_fremove(dbp->mpf)) != 0)
- goto err;
-
- /*
- * On Windows, the underlying file must be closed to perform a rename.
- * Nothing later in __db_rename requires that it be open, and the call
- * to dbp->close closes it anyway, so we just close it early.
- */
- if ((ret = memp_fclose(dbp->mpf)) != 0)
- goto err;
- dbp->mpf = NULL;
-
- ret = __os_rename(dbenv, real_name, real_newname);
- DB_TEST_RECOVERY(dbp, DB_TEST_POSTRENAME, ret, newname);
-
-DB_TEST_RECOVERY_LABEL
-err: if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp,
- &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0)
- ret = t_ret;
-
-err_close:
- /* We no longer have an mpool, so syncing would be disastrous. */
- dbp->close(dbp, DB_NOSYNC);
- if (real_name != NULL)
- __os_freestr(real_name);
- if (real_newname != NULL)
- __os_freestr(real_newname);
-
- return (ret);
-}
-
-/*
- * __db_subdb_rename --
- * Rename a subdatabase.
- */
-static int
-__db_subdb_rename(dbp, name, subdb, newname)
- DB *dbp;
- const char *name, *subdb, *newname;
-{
- DB *mdbp;
- DBC *dbc;
- DB_ENV *dbenv;
- DB_LOCK remove_lock;
- int ret, t_ret;
-
- mdbp = NULL;
- dbc = NULL;
- dbenv = dbp->dbenv;
-
- /* Start the transaction. */
- if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0)
- goto err_close;
-
- /*
- * Open the subdatabase. We can use the user's DB handle for this
- * purpose, I think.
- */
- if ((ret = __db_open(dbp, name, subdb, DB_UNKNOWN, 0, 0)) != 0)
- goto err;
-
- /*
- * Rename the entry in the main database.
- */
- if ((ret = __db_master_open(dbp, name, 0, 0, &mdbp)) != 0)
- goto err;
-
- if ((ret = __db_master_update(mdbp,
- subdb, dbp->type, NULL, MU_RENAME, newname, 0)) != 0)
- goto err;
-
-err: /*
- * End the transaction, committing the transaction if we were
- * successful, aborting otherwise.
- */
- if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp,
- &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0)
- ret = t_ret;
-
-err_close:
- /*
- * Close the user's DB handle -- do this LAST to avoid smashing the
- * the transaction information.
- */
- if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
- ret = t_ret;
-
- if (mdbp != NULL && (t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0)
- ret = t_ret;
-
- return (ret);
-}
-
-/*
- * __db_metabegin --
- *
- * Begin a meta-data operation. This involves doing any required locking,
- * potentially beginning a transaction and then telling the caller if you
- * did or did not begin the transaction.
- *
- * The writing flag indicates if the caller is actually allowing creates
- * or doing deletes (i.e., if the caller is opening and not creating, then
- * we don't need to do any of this).
- * PUBLIC: int __db_metabegin __P((DB *, DB_LOCK *));
- */
-int
-__db_metabegin(dbp, lockp)
- DB *dbp;
- DB_LOCK *lockp;
-{
- DB_ENV *dbenv;
- DBT dbplock;
- u_int32_t locker, lockval;
- int ret;
-
- dbenv = dbp->dbenv;
-
- lockp->off = LOCK_INVALID;
+ /* Clear out fields that normally get set during open. */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ dbp->adj_fileid = 0;
+ dbp->meta_pgno = 0;
+ dbp->cur_lid = DB_LOCK_INVALIDID;
+ dbp->associate_lid = DB_LOCK_INVALIDID;
+ dbp->cl_id = 0;
/*
- * There is no single place where we can know that we are or are not
- * going to be creating any files and/or subdatabases, so we will
- * always begin a tranasaction when we start creating one. If we later
- * discover that this was unnecessary, we will abort the transaction.
- * Recovery is written so that if we log a file create, but then
- * discover that we didn't have to do it, we recover correctly. The
- * file recovery design document has details.
- *
- * We need to single thread all create and delete operations, so if we
- * are running with locking, we must obtain a lock. We use lock_id to
- * generate a unique locker id and use a handcrafted DBT as the object
- * on which we are locking.
+ * If we are being refreshed with a txn specified, then we need
+ * to make sure that we clear out the lock handle field, because
+ * releasing all the locks for this transaction will release this
+ * lock and we don't want close to stumble upon this handle and
+ * try to close it.
*/
- if (LOCKING_ON(dbenv)) {
- if ((ret = lock_id(dbenv, &locker)) != 0)
- return (ret);
- lockval = 0;
- dbplock.data = &lockval;
- dbplock.size = sizeof(lockval);
- if ((ret = lock_get(dbenv,
- locker, 0, &dbplock, DB_LOCK_WRITE, lockp)) != 0)
- return (ret);
- }
-
- return (txn_begin(dbenv, NULL, &dbp->open_txn, 0));
-}
-
-/*
- * __db_metaend --
- * End a meta-data operation.
- * PUBLIC: int __db_metaend __P((DB *,
- * PUBLIC: DB_LOCK *, int, int (*)(DB *, void *), void *));
- */
-int
-__db_metaend(dbp, lockp, commit, callback, cookie)
- DB *dbp;
- DB_LOCK *lockp;
- int commit, (*callback) __P((DB *, void *));
- void *cookie;
-{
- DB_ENV *dbenv;
- int ret, t_ret;
-
- ret = 0;
- dbenv = dbp->dbenv;
-
- /* End the transaction. */
- if (commit) {
- if ((ret = txn_commit(dbp->open_txn, DB_TXN_SYNC)) == 0) {
- /*
- * Unlink any underlying file, we've committed the
- * transaction.
- */
- if (callback != NULL)
- ret = callback(dbp, cookie);
- }
- } else if ((t_ret = txn_abort(dbp->open_txn)) && ret == 0)
- ret = t_ret;
+ if (txn != NULL)
+ LOCK_INIT(dbp->handle_lock);
- /* Release our lock. */
- if (lockp->off != LOCK_INVALID &&
- (t_ret = lock_put(dbenv, lockp)) != 0 && ret == 0)
- ret = t_ret;
+ F_CLR(dbp, DB_AM_DBM_ERROR);
+ F_CLR(dbp, DB_AM_DISCARD);
+ F_CLR(dbp, DB_AM_INMEM);
+ F_CLR(dbp, DB_AM_RECOVER);
+ F_CLR(dbp, DB_AM_OPEN_CALLED);
+ F_CLR(dbp, DB_AM_RDONLY);
+ F_CLR(dbp, DB_AM_SWAP);
return (ret);
}
/*
* __db_log_page
- * Log a meta-data or root page during a create operation.
+ * Log a meta-data or root page during a subdatabase create operation.
*
- * PUBLIC: int __db_log_page __P((DB *,
- * PUBLIC: const char *, DB_LSN *, db_pgno_t, PAGE *));
+ * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
*/
int
-__db_log_page(dbp, name, lsn, pgno, page)
+__db_log_page(dbp, txn, lsn, pgno, page)
DB *dbp;
- const char *name;
+ DB_TXN *txn;
DB_LSN *lsn;
db_pgno_t pgno;
PAGE *page;
{
- DBT name_dbt, page_dbt;
+ DBT page_dbt;
DB_LSN new_lsn;
int ret;
- if (dbp->open_txn == NULL)
+ if (!LOGGING_ON(dbp->dbenv) || txn == NULL)
return (0);
memset(&page_dbt, 0, sizeof(page_dbt));
page_dbt.size = dbp->pgsize;
page_dbt.data = page;
- if (pgno == PGNO_BASE_MD) {
- /*
- * !!!
- * Make sure that we properly handle a null name. The old
- * Tcl sent us pathnames of the form ""; it may be the case
- * that the new Tcl doesn't do that, so we can get rid of
- * the second check here.
- */
- memset(&name_dbt, 0, sizeof(name_dbt));
- name_dbt.data = (char *)name;
- if (name == NULL || *name == '\0')
- name_dbt.size = 0;
- else
- name_dbt.size = strlen(name) + 1;
- ret = __crdel_metapage_log(dbp->dbenv,
- dbp->open_txn, &new_lsn, DB_FLUSH,
- dbp->log_fileid, &name_dbt, pgno, &page_dbt);
- } else
- ret = __crdel_metasub_log(dbp->dbenv, dbp->open_txn,
- &new_lsn, 0, dbp->log_fileid, pgno, &page_dbt, lsn);
+ ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn);
if (ret == 0)
page->lsn = new_lsn;
@@ -2041,50 +946,89 @@ __db_log_page(dbp, name, lsn, pgno, page)
* Create the backup file name for a given file.
*
* PUBLIC: int __db_backup_name __P((DB_ENV *,
- * PUBLIC: const char *, char **, DB_LSN *));
+ * PUBLIC: const char *, DB_TXN *, char **));
*/
#undef BACKUP_PREFIX
#define BACKUP_PREFIX "__db."
#undef MAX_LSN_TO_TEXT
-#define MAX_LSN_TO_TEXT 21
+#define MAX_LSN_TO_TEXT 17
+
int
-__db_backup_name(dbenv, name, backup, lsn)
+__db_backup_name(dbenv, name, txn, backup)
DB_ENV *dbenv;
const char *name;
+ DB_TXN *txn;
char **backup;
- DB_LSN *lsn;
{
+ DB_LSN lsn;
size_t len;
int plen, ret;
char *p, *retp;
- len = strlen(name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 1;
-
- if ((ret = __os_malloc(dbenv, len, NULL, &retp)) != 0)
- return (ret);
-
/*
- * Create the name. Backup file names are of the form:
+ * Create the name. Backup file names are in one of two forms:
*
- * __db.name.0x[lsn-file].0x[lsn-offset]
+ * In a transactional env: __db.LSN(8).LSN(8)
+ * and
+ * in a non-transactional env: __db.FILENAME.
*
- * which guarantees uniqueness.
+ * If the transaction doesn't have a current LSN, we write
+ * a dummy log record to force it, so that we ensure that
+ * all tmp names are unique.
*
- * However, name may contain an env-relative path in it.
- * In that case, put the __db. after the last portion of
- * the pathname.
+ * In addition, the name passed may contain an env-relative path.
+ * In that case, put the __db. in the right place (in the last
+ * component of the pathname).
*/
- if ((p = __db_rpath(name)) == NULL)
- snprintf(retp, len,
- "%s%s.0x%x0x%x", BACKUP_PREFIX, name,
- lsn->file, lsn->offset);
- else {
- plen = p - name + 1;
+ if (txn != NULL) {
+ if (IS_ZERO_LSN(txn->last_lsn)) {
+ /*
+ * Write dummy log record. The two choices for
+ * dummy log records are __db_noop_log and
+ * __db_debug_log; unfortunately __db_noop_log requires
+ * a valid dbp, and we aren't guaranteed to be able
+ * to pass one in here.
+ */
+ if ((ret = __db_debug_log(dbenv, txn, &lsn, 0,
+ NULL, 0, NULL, NULL, 0)) != 0)
+ return (ret);
+ } else
+ lsn = txn->last_lsn;
+ }
+
+ /*
+ * Part of the name may be a full path, so we need to make sure that
+ * we allocate enough space for it, even in the case where we don't
+ * use the entire filename for the backup name.
+ */
+ len = strlen(name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT;
+
+ if ((ret = __os_malloc(dbenv, len, &retp)) != 0)
+ return (ret);
+
+ /*
+ * There are four cases here:
+ * 1. simple path w/out transaction
+ * 2. simple path + transaction
+ * 3. multi-component path w/out transaction
+ * 4. multi-component path + transaction
+ */
+ if ((p = __db_rpath(name)) == NULL) {
+ if (txn == NULL) /* case 1 */
+ snprintf(retp, len, "%s%s.", BACKUP_PREFIX, name);
+ else /* case 2 */
+ snprintf(retp, len,
+ "%s%x.%x", BACKUP_PREFIX, lsn.file, lsn.offset);
+ } else {
+ plen = (int)(p - name) + 1;
p++;
- snprintf(retp, len,
- "%.*s%s%s.0x%x0x%x", plen, name, BACKUP_PREFIX, p,
- lsn->file, lsn->offset);
+ if (txn == NULL) /* case 3 */
+ snprintf(retp, len,
+ "%.*s%s%s.", plen, name, BACKUP_PREFIX, p);
+ else /* case 4 */
+ snprintf(retp, len,
+ "%.*s%x.%x.", plen, name, lsn.file, lsn.offset);
}
*backup = retp;
@@ -2092,19 +1036,6 @@ __db_backup_name(dbenv, name, backup, lsn)
}
/*
- * __db_remove_callback --
- * Callback function -- on file remove commit, it unlinks the backing
- * file.
- */
-static int
-__db_remove_callback(dbp, cookie)
- DB *dbp;
- void *cookie;
-{
- return (__os_unlink(dbp->dbenv, cookie));
-}
-
-/*
* __dblist_get --
* Get the first element of dbenv->dblist with
* dbp->adj_fileid matching adjid.
@@ -2126,22 +1057,73 @@ __dblist_get(dbenv, adjid)
return (dbp);
}
-#if CONFIG_TEST
+/*
+ * __db_disassociate --
+ * Destroy the association between a given secondary and its primary.
+ */
+static int
+__db_disassociate(sdbp)
+ DB *sdbp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+
+ sdbp->s_callback = NULL;
+ sdbp->s_primary = NULL;
+ sdbp->get = sdbp->stored_get;
+ sdbp->close = sdbp->stored_close;
+
+ /*
+ * Complain, but proceed, if we have any active cursors. (We're in
+ * the middle of a close, so there's really no turning back.)
+ */
+ if (sdbp->s_refcnt != 1 ||
+ TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_err(sdbp->dbenv,
+ "Closing a primary DB while a secondary DB has active cursors is unsafe");
+ ret = EINVAL;
+ }
+ sdbp->s_refcnt = 0;
+
+ while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(sdbp, DB_AM_SECONDARY);
+ return (ret);
+}
+
+#if CONFIG_TEST
/*
* __db_testcopy
* Create a copy of all backup files and our "main" DB.
*
- * PUBLIC: int __db_testcopy __P((DB *, const char *));
+ * PUBLIC: #if CONFIG_TEST
+ * PUBLIC: int __db_testcopy __P((DB_ENV *, DB *, const char *));
+ * PUBLIC: #endif
*/
int
-__db_testcopy(dbp, name)
+__db_testcopy(dbenv, dbp, name)
+ DB_ENV *dbenv;
DB *dbp;
const char *name;
{
- if (dbp->type == DB_QUEUE)
+ DB_MPOOLFILE *mpf;
+
+ DB_ASSERT(dbp != NULL || name != NULL);
+
+ if (name == NULL) {
+ mpf = dbp->mpf;
+ name = R_ADDR(mpf->dbmp->reginfo, mpf->mfp->path_off);
+ }
+
+ if (dbp != NULL && dbp->type == DB_QUEUE)
return (__qam_testdocopy(dbp, name));
else
- return (__db_testdocopy(dbp, name));
+ return (__db_testdocopy(dbenv, name));
}
static int
@@ -2154,7 +1136,7 @@ __qam_testdocopy(dbp, name)
int ret;
filelist = NULL;
- if ((ret = __db_testdocopy(dbp, name)) != 0)
+ if ((ret = __db_testdocopy(dbp->dbenv, name)) != 0)
return (ret);
if (dbp->mpf != NULL &&
(ret = __qam_gen_filelist(dbp, &filelist)) != 0)
@@ -2164,12 +1146,13 @@ __qam_testdocopy(dbp, name)
return (0);
dir = ((QUEUE *)dbp->q_internal)->dir;
for (fp = filelist; fp->mpf != NULL; fp++) {
- snprintf(buf, sizeof(buf), QUEUE_EXTENT, dir, name, fp->id);
- if ((ret = __db_testdocopy(dbp, buf)) != 0)
+ snprintf(buf, sizeof(buf),
+ QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
+ if ((ret = __db_testdocopy(dbp->dbenv, buf)) != 0)
return (ret);
}
- __os_free(filelist, 0);
+ __os_free(dbp->dbenv, filelist);
return (0);
}
@@ -2179,8 +1162,8 @@ __qam_testdocopy(dbp, name)
*
*/
static int
-__db_testdocopy(dbp, name)
- DB *dbp;
+__db_testdocopy(dbenv, name)
+ DB_ENV *dbenv;
const char *name;
{
size_t len;
@@ -2188,8 +1171,8 @@ __db_testdocopy(dbp, name)
char **namesp, *backup, *copy, *dir, *p, *real_name;
real_name = NULL;
/* Get the real backing file name. */
- if ((ret = __db_appname(dbp->dbenv,
- DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0)
+ if ((ret = __db_appname(dbenv,
+ DB_APP_DATA, name, 0, NULL, &real_name)) != 0)
return (ret);
copy = backup = NULL;
@@ -2200,10 +1183,10 @@ __db_testdocopy(dbp, name)
*/
len = strlen(real_name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 9;
- if ((ret = __os_malloc(dbp->dbenv, len, NULL, &copy)) != 0)
+ if ((ret = __os_malloc(dbenv, len, &copy)) != 0)
goto out;
- if ((ret = __os_malloc(dbp->dbenv, len, NULL, &backup)) != 0)
+ if ((ret = __os_malloc(dbenv, len, &backup)) != 0)
goto out;
/*
@@ -2212,9 +1195,9 @@ __db_testdocopy(dbp, name)
snprintf(copy, len, "%s.afterop", real_name);
__db_makecopy(real_name, copy);
- if ((ret = __os_strdup(dbp->dbenv, real_name, &dir)) != 0)
+ if ((ret = __os_strdup(dbenv, real_name, &dir)) != 0)
goto out;
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
real_name = NULL;
/*
* Create the name. Backup file names are of the form:
@@ -2234,7 +1217,7 @@ __db_testdocopy(dbp, name)
p = __db_rpath(dir);
if (p != NULL)
*p = '\0';
- ret = __os_dirlist(dbp->dbenv, dir, &namesp, &dircnt);
+ ret = __os_dirlist(dbenv, dir, &namesp, &dircnt);
#if DIAGNOSTIC
/*
* XXX
@@ -2245,7 +1228,7 @@ __db_testdocopy(dbp, name)
*/
*p = '/';
#endif
- __os_freestr(dir);
+ __os_free(dbenv, dir);
if (ret != 0)
goto out;
for (i = 0; i < dircnt; i++) {
@@ -2258,8 +1241,8 @@ __db_testdocopy(dbp, name)
* know its LSN's.
*/
if (strncmp(namesp[i], backup, strlen(backup)) == 0) {
- if ((ret = __db_appname(dbp->dbenv, DB_APP_DATA,
- NULL, namesp[i], 0, NULL, &real_name)) != 0)
+ if ((ret = __db_appname(dbenv, DB_APP_DATA,
+ namesp[i], 0, NULL, &real_name)) != 0)
goto out;
/*
@@ -2268,25 +1251,25 @@ __db_testdocopy(dbp, name)
* If so, just move on.
*/
if (strstr(real_name, ".afterop") != NULL) {
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
real_name = NULL;
continue;
}
snprintf(copy, len, "%s.afterop", real_name);
__db_makecopy(real_name, copy);
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
real_name = NULL;
}
}
out:
if (backup != NULL)
- __os_freestr(backup);
+ __os_free(dbenv, backup);
if (copy != NULL)
- __os_freestr(copy);
+ __os_free(dbenv, copy);
if (namesp != NULL)
- __os_dirfree(namesp, dircnt);
+ __os_dirfree(dbenv, namesp, dircnt);
if (real_name != NULL)
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
return (ret);
}
@@ -2301,7 +1284,7 @@ __db_makecopy(src, dest)
memset(&rfh, 0, sizeof(rfh));
memset(&wfh, 0, sizeof(wfh));
- if (__os_malloc(NULL, 1024, NULL, &buf) != 0)
+ if (__os_malloc(NULL, 1024, &buf) != 0)
return;
if (__os_open(NULL,
@@ -2313,13 +1296,13 @@ __db_makecopy(src, dest)
for (;;)
if (__os_read(NULL, &rfh, buf, 1024, &rcnt) < 0 || rcnt == 0 ||
- __os_write(NULL, &wfh, buf, rcnt, &wcnt) < 0 || wcnt != rcnt)
+ __os_write(NULL, &wfh, buf, rcnt, &wcnt) < 0)
break;
-err: __os_free(buf, 1024);
+err: __os_free(NULL, buf);
if (F_ISSET(&rfh, DB_FH_VALID))
- __os_closehandle(&rfh);
+ __os_closehandle(NULL, &rfh);
if (F_ISSET(&wfh, DB_FH_VALID))
- __os_closehandle(&wfh);
+ __os_closehandle(NULL, &wfh);
}
#endif
diff --git a/bdb/db/db.src b/bdb/db/db.src
index b695e1360c5..414321fcbbd 100644
--- a/bdb/db/db.src
+++ b/bdb/db/db.src
@@ -1,13 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*
- * $Id: db.src,v 11.8 2000/02/17 20:24:07 bostic Exp $
+ * $Id: db.src,v 11.18 2002/04/17 19:02:58 krinsky Exp $
*/
-PREFIX db
+PREFIX __db
+DBPRIVATE
INCLUDE #include "db_config.h"
INCLUDE
@@ -15,15 +16,17 @@ INCLUDE #ifndef NO_SYSTEM_INCLUDES
INCLUDE #include <sys/types.h>
INCLUDE
INCLUDE #include <ctype.h>
-INCLUDE #include <errno.h>
INCLUDE #include <string.h>
INCLUDE #endif
INCLUDE
INCLUDE #include "db_int.h"
-INCLUDE #include "db_page.h"
-INCLUDE #include "db_dispatch.h"
-INCLUDE #include "db_am.h"
-INCLUDE #include "txn.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/rep.h"
+INCLUDE #include "dbinc/txn.h"
INCLUDE
/*
@@ -44,33 +47,16 @@ INCLUDE
*/
BEGIN addrem 41
ARG opcode u_int32_t lu
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
ARG indx u_int32_t lu
-ARG nbytes size_t lu
-DBT hdr DBT s
+ARG nbytes u_int32_t lu
+PGDBT hdr DBT s
DBT dbt DBT s
POINTER pagelsn DB_LSN * lu
END
/*
- * split -- Handles the split of a duplicate page.
- *
- * opcode: defines whether we are splitting from or splitting onto
- * fileid: file identifier of the file being modified.
- * pgno: page number being split.
- * pageimage: entire page contents.
- * pagelsn: former lsn of the page.
- */
-DEPRECATED split 42
-ARG opcode u_int32_t lu
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
-DBT pageimage DBT s
-POINTER pagelsn DB_LSN * lu
-END
-
-/*
* big -- Handles addition and deletion of big key/data items.
*
* opcode: identifies get/put.
@@ -87,10 +73,10 @@ END
*/
BEGIN big 43
ARG opcode u_int32_t lu
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
-ARG prev_pgno db_pgno_t lu
-ARG next_pgno db_pgno_t lu
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
+WRLOCKNZ prev_pgno db_pgno_t lu
+WRLOCKNZ next_pgno db_pgno_t lu
DBT dbt DBT s
POINTER pagelsn DB_LSN * lu
POINTER prevlsn DB_LSN * lu
@@ -106,8 +92,8 @@ END
* lsn: the page's original lsn.
*/
BEGIN ovref 44
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
ARG adjust int32_t ld
POINTER lsn DB_LSN * lu
END
@@ -125,33 +111,16 @@ END
*/
BEGIN relink 45
ARG opcode u_int32_t lu
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
POINTER lsn DB_LSN * lu
-ARG prev db_pgno_t lu
+WRLOCKNZ prev db_pgno_t lu
POINTER lsn_prev DB_LSN * lu
-ARG next db_pgno_t lu
+WRLOCKNZ next db_pgno_t lu
POINTER lsn_next DB_LSN * lu
END
/*
- * Addpage -- Handles adding a new duplicate page onto the end of
- * an existing duplicate page.
- * fileid: identifies the file being changed.
- * pgno: page number to which a new page is being added.
- * lsn: lsn of pgno
- * nextpgno: new page number being added.
- * nextlsn: lsn of nextpgno;
- */
-DEPRECATED addpage 46
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
-POINTER lsn DB_LSN * lu
-ARG nextpgno db_pgno_t lu
-POINTER nextlsn DB_LSN * lu
-END
-
-/*
* Debug -- log an operation upon entering an access method.
* op: Operation (cursor, c_close, c_get, c_put, c_del,
* get, put, delete).
@@ -172,7 +141,55 @@ END
* noop -- do nothing, but get an LSN.
*/
BEGIN noop 48
-ARG fileid int32_t ld
-ARG pgno db_pgno_t lu
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
POINTER prevlsn DB_LSN * lu
END
+
+/*
+ * pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ */
+BEGIN pg_alloc 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+WRLOCK meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+WRLOCK pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+/*
+ * pg_free: used to record freeing a page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the freed page.
+ * next: the previous next pointer on the metadata page.
+ */
+BEGIN pg_free 50
+DB fileid int32_t ld
+WRLOCK pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+WRLOCK meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+END
+
+/*
+ * cksum --
+ * This log record is written when we're unable to checksum a page,
+ * before returning DB_RUNRECOVERY. This log record causes normal
+ * recovery to itself return DB_RUNRECOVERY, as only catastrophic
+ * recovery can fix things.
+ */
+BEGIN cksum 51
+END
diff --git a/bdb/db/db_am.c b/bdb/db/db_am.c
index 2d224566904..cf6ef18549b 100644
--- a/bdb/db/db_am.c
+++ b/bdb/db/db_am.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 1999, 2000
+ * Copyright (c) 1998-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp $";
+static const char revid[] = "$Id: db_am.c,v 11.96 2002/08/27 15:17:32 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,16 +18,22 @@ static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
-#include "lock.h"
-#include "mp.h"
-#include "txn.h"
-#include "db_am.h"
-#include "db_ext.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+static int __db_append_primary __P((DBC *, DBT *, DBT *));
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __db_secondary_close __P((DB *, u_int32_t));
+
+#ifdef DEBUG
+static int __db_cprint_item __P((DBC *));
+#endif
/*
* __db_cursor --
@@ -53,12 +59,22 @@ __db_cursor(dbp, txn, dbcp, flags)
PANIC_CHECK(dbenv);
DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");
- /* Check for invalid flags. */
- if ((ret = __db_cursorchk(dbp, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+ /* Validate arguments. */
+ if ((ret = __db_cursorchk(dbp, flags)) != 0)
return (ret);
- if ((ret =
- __db_icursor(dbp, txn, dbp->type, PGNO_INVALID, 0, dbcp)) != 0)
+ /*
+ * Check for consistent transaction usage. For now, assume that
+ * this cursor might be used for read operations only (in which
+ * case it may not require a txn). We'll check more stringently
+ * in c_del and c_put. (Note that this all means that the
+ * read-op txn tests have to be a subset of the write-op ones.)
+ */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ return (ret);
+
+ if ((ret = __db_icursor(dbp,
+ txn, dbp->type, PGNO_INVALID, 0, DB_LOCK_INVALIDID, dbcp)) != 0)
return (ret);
dbc = *dbcp;
@@ -70,7 +86,7 @@ __db_cursor(dbp, txn, dbcp, flags)
op = LF_ISSET(DB_OPFLAGS_MASK);
mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE :
((op == DB_WRITECURSOR) ? DB_LOCK_IWRITE : DB_LOCK_READ);
- if ((ret = lock_get(dbenv, dbc->locker, 0,
+ if ((ret = dbenv->lock_get(dbenv, dbc->locker, 0,
&dbc->lock_dbt, mode, &dbc->mylock)) != 0) {
(void)__db_c_close(dbc);
return (ret);
@@ -81,6 +97,9 @@ __db_cursor(dbp, txn, dbcp, flags)
F_SET(dbc, DBC_WRITER);
}
+ if (LF_ISSET(DB_DIRTY_READ) ||
+ (txn != NULL && F_ISSET(txn, TXN_DIRTY_READ)))
+ F_SET(dbc, DBC_DIRTY_READ);
return (0);
}
@@ -91,15 +110,16 @@ __db_cursor(dbp, txn, dbcp, flags)
* initialize as a cursor.
*
* PUBLIC: int __db_icursor
- * PUBLIC: __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, DBC **));
+ * PUBLIC: __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, u_int32_t, DBC **));
*/
int
-__db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
+__db_icursor(dbp, txn, dbtype, root, is_opd, lockerid, dbcp)
DB *dbp;
DB_TXN *txn;
DBTYPE dbtype;
db_pgno_t root;
int is_opd;
+ u_int32_t lockerid;
DBC **dbcp;
{
DBC *dbc, *adbc;
@@ -120,7 +140,7 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
if (dbtype == dbc->dbtype) {
TAILQ_REMOVE(&dbp->free_queue, dbc, links);
- dbc->flags = 0;
+ F_CLR(dbc, ~DBC_OWN_LID);
break;
}
MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
@@ -144,11 +164,35 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
if (!DB_IS_THREADED(dbp) &&
(adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
dbc->lid = adbc->lid;
- else
- if ((ret = lock_id(dbenv, &dbc->lid)) != 0)
+ else {
+ if ((ret =
+ dbenv->lock_id(dbenv, &dbc->lid)) != 0)
goto err;
+ F_SET(dbc, DBC_OWN_LID);
+ }
+
+ /*
+ * In CDB, secondary indices should share a lock file
+ * ID with the primary; otherwise we're susceptible to
+ * deadlocks. We also use __db_icursor rather
+ * than sdbp->cursor to create secondary update
+ * cursors in c_put and c_del; these won't
+ * acquire a new lock.
+ *
+ * !!!
+ * Since this is in the one-time cursor allocation
+ * code, we need to be sure to destroy, not just
+ * close, all cursors in the secondary when we
+ * associate.
+ */
+ if (CDB_LOCKING(dbp->dbenv) &&
+ F_ISSET(dbp, DB_AM_SECONDARY))
+ memcpy(dbc->lock.fileid,
+ dbp->s_primary->fileid, DB_FILE_ID_LEN);
+ else
+ memcpy(dbc->lock.fileid,
+ dbp->fileid, DB_FILE_ID_LEN);
- memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN);
if (CDB_LOCKING(dbenv)) {
if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB)) {
/*
@@ -198,18 +242,55 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
/* Refresh the DBC structure. */
dbc->dbtype = dbtype;
+ RESET_RET_MEM(dbc);
- if ((dbc->txn = txn) == NULL)
- dbc->locker = dbc->lid;
- else {
+ if ((dbc->txn = txn) == NULL) {
+ /*
+ * There are certain cases in which we want to create a
+ * new cursor with a particular locker ID that is known
+ * to be the same as (and thus not conflict with) an
+ * open cursor.
+ *
+ * The most obvious case is cursor duplication; when we
+ * call DBC->c_dup or __db_c_idup, we want to use the original
+ * cursor's locker ID.
+ *
+ * Another case is when updating secondary indices. Standard
+ * CDB locking would mean that we might block ourselves: we need
+ * to open an update cursor in the secondary while an update
+ * cursor in the primary is open, and when the secondary and
+ * primary are subdatabases or we're using env-wide locking,
+ * this is disastrous.
+ *
+ * In these cases, our caller will pass a nonzero locker ID
+ * into this function. Use this locker ID instead of dbc->lid
+ * as the locker ID for our new cursor.
+ */
+ if (lockerid != DB_LOCK_INVALIDID)
+ dbc->locker = lockerid;
+ else
+ dbc->locker = dbc->lid;
+ } else {
dbc->locker = txn->txnid;
txn->cursors++;
}
+ /*
+ * These fields change when we are used as a secondary index, so
+ * if the DB is a secondary, make sure they're set properly just
+ * in case we opened some cursors before we were associated.
+ *
+ * __db_c_get is used by all access methods, so this should be safe.
+ */
+ if (F_ISSET(dbp, DB_AM_SECONDARY))
+ dbc->c_get = __db_c_secondary_get;
+
if (is_opd)
F_SET(dbc, DBC_OPD);
if (F_ISSET(dbp, DB_AM_RECOVER))
F_SET(dbc, DBC_RECOVER);
+ if (F_ISSET(dbp, DB_AM_COMPENSATE))
+ F_SET(dbc, DBC_COMPENSATE);
/* Refresh the DBC internal structure. */
cp = dbc->internal;
@@ -243,14 +324,14 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
return (0);
err: if (allocated)
- __os_free(dbc, sizeof(*dbc));
+ __os_free(dbp->dbenv, dbc);
return (ret);
}
#ifdef DEBUG
/*
* __db_cprint --
- * Display the current cursor list.
+ * Display the cursor active and free queues.
*
* PUBLIC: int __db_cprint __P((DB *));
*/
@@ -258,60 +339,76 @@ int
__db_cprint(dbp)
DB *dbp;
{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+ MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+ fprintf(stderr, "Active queue:\n");
+ for (dbc = TAILQ_FIRST(&dbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
+ if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ fprintf(stderr, "Free queue:\n");
+ for (dbc = TAILQ_FIRST(&dbp->free_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
+ if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+
+ return (ret);
+}
+
+static
+int __db_cprint_item(dbc)
+ DBC *dbc;
+{
static const FN fn[] = {
{ DBC_ACTIVE, "active" },
+ { DBC_COMPENSATE, "compensate" },
{ DBC_OPD, "off-page-dup" },
{ DBC_RECOVER, "recover" },
{ DBC_RMW, "read-modify-write" },
+ { DBC_TRANSIENT, "transient" },
{ DBC_WRITECURSOR, "write cursor" },
{ DBC_WRITEDUP, "internally dup'ed write cursor" },
{ DBC_WRITER, "short-term write cursor" },
{ 0, NULL }
};
- DBC *dbc;
+ DB *dbp;
DBC_INTERNAL *cp;
- char *s;
+ const char *s;
- MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
- for (dbc = TAILQ_FIRST(&dbp->active_queue);
- dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
- switch (dbc->dbtype) {
- case DB_BTREE:
- s = "btree";
- break;
- case DB_HASH:
- s = "hash";
- break;
- case DB_RECNO:
- s = "recno";
- break;
- case DB_QUEUE:
- s = "queue";
- break;
- default:
- DB_ASSERT(0);
- return (1);
- }
- cp = dbc->internal;
- fprintf(stderr, "%s/%#0lx: opd: %#0lx\n",
- s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd));
- fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n",
- P_TO_ULONG(dbc->txn),
- (u_long)dbc->lid, (u_long)dbc->locker);
- fprintf(stderr, "\troot: %lu page/index: %lu/%lu",
- (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx);
- __db_prflags(dbc->flags, fn, stderr);
- fprintf(stderr, "\n");
-
- if (dbp->type == DB_BTREE)
- __bam_cprint(dbc);
+ dbp = dbc->dbp;
+ cp = dbc->internal;
+
+ s = __db_dbtype_to_string(dbc->dbtype);
+ if (strcmp(s, "UNKNOWN TYPE") == 0) {
+ DB_ASSERT(0);
+ return (1);
}
- for (dbc = TAILQ_FIRST(&dbp->free_queue);
- dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
- fprintf(stderr, "free: %#0lx ", P_TO_ULONG(dbc));
+ fprintf(stderr, "%s/%#0lx: opd: %#0lx\n",
+ s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd));
+
+ fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n",
+ P_TO_ULONG(dbc->txn), (u_long)dbc->lid, (u_long)dbc->locker);
+
+ fprintf(stderr, "\troot: %lu page/index: %lu/%lu",
+ (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx);
+
+ __db_prflags(dbc->flags, fn, stderr);
fprintf(stderr, "\n");
- MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ switch (dbp->type) {
+ case DB_BTREE:
+ __bam_cprint(dbc);
+ break;
+ case DB_HASH:
+ __ham_cprint(dbc);
+ break;
+ default:
+ break;
+ }
return (0);
}
#endif /* DEBUG */
@@ -345,7 +442,7 @@ __db_fd(dbp, fdp)
return (0);
} else {
*fdp = -1;
- __db_err(dbp->dbenv, "DB does not have a valid file handle.");
+ __db_err(dbp->dbenv, "DB does not have a valid file handle");
return (ENOENT);
}
}
@@ -372,8 +469,16 @@ __db_get(dbp, txn, key, data, flags)
if ((ret = __db_getchk(dbp, key, data, flags)) != 0)
return (ret);
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+ return (ret);
+
mode = 0;
- if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ if (LF_ISSET(DB_DIRTY_READ)) {
+ mode = DB_DIRTY_READ;
+ LF_CLR(DB_DIRTY_READ);
+ }
+ else if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
mode = DB_WRITELOCK;
if ((ret = dbp->cursor(dbp, txn, &dbc, mode)) != 0)
return (ret);
@@ -387,11 +492,17 @@ __db_get(dbp, txn, key, data, flags)
* going to close it right away. Thus, we can perform the get
* without duplicating the cursor, saving some cycles in this
* common case.
+ *
+ * SET_RET_MEM indicates that if key and/or data have no DBT
+ * flags set and DB manages the returned-data memory, that memory
+ * will belong to this handle, not to the underlying cursor.
*/
F_SET(dbc, DBC_TRANSIENT);
+ SET_RET_MEM(dbc, dbp);
- ret = dbc->c_get(dbc, key, data,
- flags == 0 || flags == DB_RMW ? flags | DB_SET : flags);
+ if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+ LF_SET(DB_SET);
+ ret = dbc->c_get(dbc, key, data, flags);
if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
@@ -414,20 +525,39 @@ __db_put(dbp, txn, key, data, flags)
{
DBC *dbc;
DBT tdata;
- int ret, t_ret;
+ DB_ENV *dbenv;
+ int ret, t_ret, txn_local;
- PANIC_CHECK(dbp->dbenv);
+ dbc = NULL;
+ dbenv = dbp->dbenv;
+ txn_local = 0;
+
+ PANIC_CHECK(dbenv);
DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
+ /* Validate arguments. */
if ((ret = __db_putchk(dbp, key, data,
- flags, F_ISSET(dbp, DB_AM_RDONLY),
- F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0)
+ flags, F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0)
return (ret);
- DB_CHECK_TXN(dbp, txn);
+ /* Create local transaction as necessary. */
+ if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+ if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+ return (ret);
+ txn_local = 1;
+ LF_CLR(DB_AUTO_COMMIT);
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
- return (ret);
+ goto err;
+
+ DEBUG_LWRITE(dbc, txn, "db_put", key, data, flags);
+
+ SET_RET_MEM(dbc, dbp);
/*
* See the comment in __db_get().
@@ -441,9 +571,58 @@ __db_put(dbp, txn, key, data, flags)
*/
F_SET(dbc, DBC_TRANSIENT);
- DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags);
+ switch (flags) {
+ case DB_APPEND:
+ /*
+ * If there is an append callback, the value stored in
+ * data->data may be replaced and then freed. To avoid
+ * passing a freed pointer back to the user, just operate
+ * on a copy of the data DBT.
+ */
+ tdata = *data;
- if (flags == DB_NOOVERWRITE) {
+ /*
+ * Append isn't a normal put operation; call the appropriate
+ * access method's append function.
+ */
+ switch (dbp->type) {
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+ goto err;
+ break;
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(0);
+ ret = __db_ferr(dbenv, "__db_put", flags);
+ goto err;
+ }
+
+ /*
+ * Secondary indices: since we've returned zero from
+ * an append function, we've just put a record, and done
+ * so outside __db_c_put. We know we're not a secondary--
+ * the interface prevents puts on them--but we may be a
+ * primary. If so, update our secondary indices
+ * appropriately.
+ */
+ DB_ASSERT(!F_ISSET(dbp, DB_AM_SECONDARY));
+
+ if (LIST_FIRST(&dbp->s_secondaries) != NULL)
+ ret = __db_append_primary(dbc, key, &tdata);
+
+ /*
+ * The append callback, if one exists, may have allocated
+ * a new tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(dbp, &tdata);
+
+ /* No need for a cursor put; we're done. */
+ goto err;
+ case DB_NOOVERWRITE:
flags = 0;
/*
* Set DB_DBT_USERMEM, this might be a threaded application and
@@ -460,16 +639,161 @@ __db_put(dbp, txn, key, data, flags)
if ((ret = dbc->c_get(dbc, key, &tdata,
DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0))) == 0)
ret = DB_KEYEXIST;
- else if (ret == DB_NOTFOUND)
+ else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY)
ret = 0;
+ break;
+ default:
+ /* Fall through to normal cursor put. */
+ break;
}
if (ret == 0)
ret = dbc->c_put(dbc,
- key, data, flags == 0 ? DB_KEYLAST : flags);
+ key, data, flags == 0 ? DB_KEYLAST : flags);
- if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+err: /* Close the cursor. */
+ if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Commit for DB_AUTO_COMMIT. */
+ if (txn_local) {
+ if (ret == 0)
+ ret = txn->commit(txn, 0);
+ else
+ if ((t_ret = txn->abort(txn)) != 0)
+ ret = __db_panic(dbenv, t_ret);
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_delete --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __db_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_delete(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBT data, lkey;
+ DB_ENV *dbenv;
+ u_int32_t f_init, f_next;
+ int ret, t_ret, txn_local;
+
+ dbc = NULL;
+ dbenv = dbp->dbenv;
+ txn_local = 0;
+
+ PANIC_CHECK(dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
+
+ /* Check for invalid flags. */
+ if ((ret = __db_delchk(dbp, key, flags)) != 0)
+ return (ret);
+
+ /* Create local transaction as necessary. */
+ if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+ if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+ return (ret);
+ txn_local = 1;
+ LF_CLR(DB_AUTO_COMMIT);
+ }
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ goto err;
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ goto err;
+
+ DEBUG_LWRITE(dbc, txn, "db_delete", key, NULL, flags);
+
+ /*
+ * Walk a cursor through the key/data pairs, deleting as we go. Set
+ * the DB_DBT_USERMEM flag, as this might be a threaded application
+ * and the flags checking will catch us. We don't actually want the
+ * keys or data, so request a partial of length 0.
+ */
+ memset(&lkey, 0, sizeof(lkey));
+ F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ /*
+ * If locking (and we haven't already acquired CDB locks), set the
+ * read-modify-write flag.
+ */
+ f_init = DB_SET;
+ f_next = DB_NEXT_DUP;
+ if (STD_LOCKING(dbc)) {
+ f_init |= DB_RMW;
+ f_next |= DB_RMW;
+ }
+
+ /* Walk through the set of key/data pairs, deleting as we go. */
+ if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0)
+ goto err;
+
+ /*
+ * Hash permits an optimization in DB->del: since on-page
+ * duplicates are stored in a single HKEYDATA structure, it's
+ * possible to delete an entire set of them at once, and as
+ * the HKEYDATA has to be rebuilt and re-put each time it
+ * changes, this is much faster than deleting the duplicates
+ * one by one. Thus, if we're not pointing at an off-page
+ * duplicate set, and we're not using secondary indices (in
+ * which case we'd have to examine the items one by one anyway),
+ * let hash do this "quick delete".
+ *
+ * !!!
+ * Note that this is the only application-executed delete call in
+ * Berkeley DB that does not go through the __db_c_del function.
+ * If anything other than the delete itself (like a secondary index
+ * update) has to happen there in a particular situation, the
+ * conditions here should be modified not to call __ham_quick_delete.
+ * The ordinary AM-independent alternative will work just fine with
+ * a hash; it'll just be slower.
+ */
+ if (dbp->type == DB_HASH) {
+ if (LIST_FIRST(&dbp->s_secondaries) == NULL &&
+ !F_ISSET(dbp, DB_AM_SECONDARY) &&
+ dbc->internal->opd == NULL) {
+ ret = __ham_quick_delete(dbc);
+ goto err;
+ }
+ }
+
+ for (;;) {
+ if ((ret = dbc->c_del(dbc, 0)) != 0)
+ goto err;
+ if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ goto err;
+ }
+ }
+
+err: /* Discard the cursor. */
+ if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
ret = t_ret;
+ /* Commit for DB_AUTO_COMMIT. */
+ if (txn_local) {
+ if (ret == 0)
+ ret = txn->commit(txn, 0);
+ else
+ if ((t_ret = txn->abort(txn)) != 0)
+ ret = __db_panic(dbenv, t_ret);
+ }
+
return (ret);
}
@@ -505,7 +829,443 @@ __db_sync(dbp, flags)
return (0);
/* Flush any dirty pages from the cache to the backing file. */
- if ((t_ret = memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ if ((t_ret = dbp->mpf->sync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_associate --
+ * Associate another database as a secondary index to this one.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, txn, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ DB_TXN *txn;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ DBC *pdbc, *sdbc;
+ DBT skey, key, data;
+ int build, ret, t_ret, txn_local;
+
+ dbenv = dbp->dbenv;
+
+ PANIC_CHECK(dbenv);
+
+ txn_local = 0;
+ pdbc = NULL;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ memset(&skey, 0, sizeof(DBT));
+
+ if ((ret = __db_associatechk(dbp, sdbp, callback, flags)) != 0)
+ return (ret);
+
+ /*
+ * Create a local transaction as necessary. If the caller supplied a
+ * transaction but the environment is not transactional, fail here;
+ * the check for consistent transaction usage follows below.
+ */
+ if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+ if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+ return (ret);
+ txn_local = 1;
+ } else if (txn != NULL && !TXN_ON(dbenv))
+ return (__db_not_txn_env(dbenv));
+
+ /*
+ * If an open transaction is in progress, check that we're in it; also
+ * check for other common transaction errors and concurrent associates.
+ */
+ if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+ return (ret);
+
+ sdbp->s_callback = callback;
+ sdbp->s_primary = dbp;
+
+ sdbp->stored_get = sdbp->get;
+ sdbp->get = __db_secondary_get;
+
+ sdbp->stored_close = sdbp->close;
+ sdbp->close = __db_secondary_close;
+
+ /*
+ * Secondary cursors may have the primary's lock file ID, so we
+ * need to make sure that no older cursors are lying around
+ * when we make the transition.
+ */
+ if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_err(dbenv,
+ "Databases may not become secondary indices while cursors are open");
+ ret = EINVAL;
+ goto err;
+ }
+ while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((ret = __db_c_destroy(sdbc)) != 0)
+ goto err;
+
+ F_SET(sdbp, DB_AM_SECONDARY);
+
+ /*
+ * Check to see if the secondary is empty--and thus if we should
+ * build it--before we link it in and risk making it show up in
+ * other threads.
+ */
+ build = 0;
+ if (LF_ISSET(DB_CREATE)) {
+ if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+ goto err;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+
+ /*
+ * We don't care about key or data; we're just doing
+ * an existence check.
+ */
+ F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = sdbc->c_real_get(sdbc, &key, &data,
+ (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+ DB_FIRST)) == DB_NOTFOUND) {
+ build = 1;
+ ret = 0;
+ }
+
+ /*
+ * Secondary cursors have special refcounting close
+ * methods. Be careful.
+ */
+ if ((t_ret = __db_c_close(sdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * Add the secondary to the list on the primary. Do it here
+ * so that we see any updates that occur while we're walking
+ * the primary.
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+
+ /* See __db_s_next for an explanation of secondary refcounting. */
+ DB_ASSERT(sdbp->s_refcnt == 0);
+ sdbp->s_refcnt = 1;
+ LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+
+ if (build) {
+ /*
+ * We loop through the primary, putting each item we
+ * find into the new secondary.
+ *
+ * If we're using CDB, opening these two cursors puts us
+ * in a bit of a locking tangle: CDB locks are done on the
+ * primary, so that we stay deadlock-free, but that means
+ * that updating the secondary while we have a read cursor
+ * open on the primary will self-block. To get around this,
+ * we force the primary cursor to use the same locker ID
+ * as the secondary, so they won't conflict. This should
+ * be harmless even if we're not using CDB.
+ */
+ if ((ret = sdbp->cursor(sdbp, txn, &sdbc,
+ CDB_LOCKING(sdbp->dbenv) ? DB_WRITECURSOR : 0)) != 0)
+ goto err;
+ if ((ret = __db_icursor(dbp,
+ txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto err;
+
+ /* Lock out other threads, now that we have a locker ID. */
+ dbp->associate_lid = sdbc->locker;
+
+ memset(&key, 0, sizeof(DBT));
+ memset(&data, 0, sizeof(DBT));
+ while ((ret = pdbc->c_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+ memset(&skey, 0, sizeof(DBT));
+ if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ continue;
+ else
+ goto err;
+ }
+ if ((ret = sdbc->c_put(sdbc,
+ &skey, &key, DB_UPDATE_SECONDARY)) != 0) {
+ FREE_IF_NEEDED(sdbp, &skey);
+ goto err;
+ }
+
+ FREE_IF_NEEDED(sdbp, &skey);
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((ret = sdbc->c_close(sdbc)) != 0)
+ goto err;
+ }
+
+err: if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ dbp->associate_lid = DB_LOCK_INVALIDID;
+
+ if (txn_local) {
+ if (ret == 0)
+ ret = txn->commit(txn, 0);
+ else
+ if ((t_ret = txn->abort(txn)) != 0)
+ ret = __db_panic(dbenv, t_ret);
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_pget --
+ * Return a primary key/data pair given a secondary key.
+ *
+ * PUBLIC: int __db_pget __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, txn, skey, pkey, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+ if ((ret = __db_pgetchk(dbp, skey, pkey, data, flags)) != 0)
+ return (ret);
+
+ if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
+ return (ret);
+ SET_RET_MEM(dbc, dbp);
+
+ /*
+ * The underlying cursor pget will fill in a default DBT for null
+ * pkeys, and use the cursor's returned-key memory internally to
+ * store any intermediate primary keys. However, we've just set
+ * the returned-key memory to the DB handle's key memory, which
+ * is unsafe to use if the DB handle is threaded. If the pkey
+ * argument is NULL, use the DBC-owned returned-key memory
+ * instead; it'll go away when we close the cursor before we
+ * return, but in this case that's just fine, as we're not
+ * returning the primary key.
+ */
+ if (pkey == NULL)
+ dbc->rkey = &dbc->my_rkey;
+
+ DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags);
+
+ /*
+ * The cursor is just a perfectly ordinary secondary database
+ * cursor. Call its c_pget() method to do the dirty work.
+ */
+ if (flags == 0 || flags == DB_RMW)
+ flags |= DB_SET;
+ ret = dbc->c_pget(dbc, skey, pkey, data, flags);
+
+ if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_secondary_get --
+ * This wrapper function for DB->pget() is the DB->get() function
+ * on a database which has been made into a secondary index.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+ DB *sdbp;
+ DB_TXN *txn;
+ DBT *skey, *data;
+ u_int32_t flags;
+{
+
+ DB_ASSERT(F_ISSET(sdbp, DB_AM_SECONDARY));
+ return (sdbp->pget(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ * Wrapper function for DB->close() which we use on secondaries to
+ * manage refcounting and make sure we don't close them underneath
+ * a primary that is updating.
+ *
+ * Parameters:
+ *	sdbp	the secondary database handle being closed; its
+ *		s_primary field supplies the primary whose mutex
+ *		guards the reference count.
+ *	flags	close flags, handed to __db_close() if the close
+ *		actually happens here.
+ *
+ * Returns 0 if the close was deferred because other references
+ * remain; otherwise returns the result of __db_close(sdbp, flags).
+ */
+static int
+__db_secondary_close(sdbp, flags)
+ DB *sdbp;
+ u_int32_t flags;
+{
+ DB *primary;
+ int doclose;
+
+ doclose = 0;
+ primary = sdbp->s_primary;
+
+ MUTEX_THREAD_LOCK(primary->dbenv, primary->mutexp);
+ /*
+ * Check the refcount--if it was at 1 when we were called, no
+ * thread is currently updating this secondary through the primary,
+ * so it's safe to close it for real.
+ *
+ * If it's not safe to do the close now, we do nothing; the
+ * database will actually be closed when the refcount is decremented,
+ * which can happen in either __db_s_next or __db_s_done.
+ *
+ * The decrement and the removal from the s_links list both happen
+ * while the primary's mutex is held; only the decision (doclose)
+ * is carried out of the locked region.
+ */
+ DB_ASSERT(sdbp->s_refcnt != 0);
+ if (--sdbp->s_refcnt == 0) {
+ LIST_REMOVE(sdbp, s_links);
+ /* We don't want to call close while the mutex is held. */
+ doclose = 1;
+ }
+ MUTEX_THREAD_UNLOCK(primary->dbenv, primary->mutexp);
+
+ /*
+ * sdbp->close is this function; call the real one explicitly if
+ * need be.
+ */
+ return (doclose ? __db_close(sdbp, flags) : 0);
+}
+
+/*
+ * __db_append_primary --
+ * Perform the secondary index updates necessary to put(DB_APPEND)
+ * a record to a primary database.
+ *
+ * Parameters:
+ *	dbc	cursor whose dbp is the primary database; its txn and
+ *		locker are reused for every secondary cursor opened here.
+ *	key	key of the appended record; stored as the data item of
+ *		each new secondary entry.
+ *	data	data of the appended record; handed to each secondary's
+ *		s_callback to derive the secondary key.
+ *
+ * If either key or data carries DB_DBT_PARTIAL, both are replaced
+ * by the full key/data pair re-read through a duplicate of dbc.
+ *
+ * Returns 0 on success.  Returns EINVAL if a secondary without
+ * DB_AM_DUP already holds the derived secondary key with a different
+ * primary key.  Any other nonzero value is the first error seen from
+ * a callback, cursor operation or cleanup call.  A callback result of
+ * DB_DONOTINDEX is not an error; that secondary is simply skipped.
+ *
+ * Cleanup: "err1" frees the callback-produced skey and closes the
+ * secondary cursor for the current iteration; "err" closes the
+ * duplicated primary cursor (if any) and calls __db_s_done() on the
+ * secondary we stopped at (if any).
+ */
+static int
+__db_append_primary(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ DB *dbp, *sdbp;
+ DBC *sdbc, *pdbc;
+ DBT oldpkey, pkey, pdata, skey;
+ int cmp, ret, t_ret;
+
+ dbp = dbc->dbp;
+ sdbp = NULL;
+ ret = 0;
+
+ /*
+ * Worrying about partial appends seems a little like worrying
+ * about Linear A character encodings. But we support those
+ * too if your application understands them.
+ */
+ pdbc = NULL;
+ if (F_ISSET(data, DB_DBT_PARTIAL) || F_ISSET(key, DB_DBT_PARTIAL)) {
+ /*
+ * The dbc we were passed is all set to pass things
+ * back to the user; we can't safely do a call on it.
+ * Dup the cursor, grab the real data item (we don't
+ * care what the key is--we've been passed it directly),
+ * and use that instead of the data DBT we were passed.
+ *
+ * Note that we can get away with this simple get because
+ * an appended item is by definition new, and the
+ * correctly-constructed full data item from this partial
+ * put is on the page waiting for us.
+ *
+ * pdbc stays open until the "err" label so that the
+ * pkey/pdata it returned remain usable for the whole loop
+ * (presumably they point at cursor-owned memory -- confirm).
+ */
+ if ((ret = __db_c_idup(dbc, &pdbc, DB_POSITIONI)) != 0)
+ return (ret);
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&pdata, 0, sizeof(DBT));
+
+ if ((ret = pdbc->c_get(pdbc, &pkey, &pdata, DB_CURRENT)) != 0)
+ goto err;
+
+ key = &pkey;
+ data = &pdata;
+ }
+
+ /*
+ * Loop through the secondary indices, putting a new item in
+ * each that points to the appended item.
+ *
+ * This is much like the loop in "step 3" in __db_c_put, so
+ * I'm not commenting heavily here; it was unclean to excerpt
+ * just that section into a common function, but the basic
+ * overview is the same here.
+ *
+ * Note that the loop's increment expression assigns ret from
+ * __db_s_next(), so a "continue" taken with ret set to
+ * DB_DONOTINDEX has ret overwritten before the loop condition
+ * is tested again.
+ */
+ for (sdbp = __db_s_first(dbp);
+ sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) {
+ memset(&skey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp, key, data, &skey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ continue;
+ else
+ goto err;
+ }
+
+ if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) {
+ FREE_IF_NEEDED(sdbp, &skey);
+ goto err;
+ }
+ if (CDB_LOCKING(sdbp->dbenv)) {
+ DB_ASSERT(sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Since we know we have a new primary key, it can't be a
+ * duplicate duplicate in the secondary. It can be a
+ * duplicate in a secondary that doesn't support duplicates,
+ * however, so we need to be careful to avoid an overwrite
+ * (which would corrupt our index).
+ *
+ * The check below looks the secondary key up; if it exists,
+ * the stored primary key is compared with ours using
+ * __bam_defcmp, and a mismatch is reported as EINVAL.
+ * DB_NOTFOUND and DB_KEYEMPTY from the lookup mean the key
+ * is free and the put proceeds.
+ */
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = sdbc->c_real_get(sdbc, &skey, &oldpkey,
+ DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0));
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp, &oldpkey, key);
+ /*
+ * XXX
+ * This needs to use the right free function
+ * as soon as this is possible.
+ */
+ __os_ufree(sdbp->dbenv,
+ oldpkey.data);
+ if (cmp != 0) {
+ __db_err(sdbp->dbenv, "%s%s",
+ "Append results in a non-unique secondary key in",
+ " an index not configured to support duplicates");
+ ret = EINVAL;
+ goto err1;
+ }
+ } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ goto err1;
+ }
+
+ ret = sdbc->c_put(sdbc, &skey, key, DB_UPDATE_SECONDARY);
+
+err1: FREE_IF_NEEDED(sdbp, &skey);
+
+ if ((t_ret = sdbc->c_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+ }
+
+err: if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
diff --git a/bdb/db/db_cam.c b/bdb/db/db_cam.c
index 708d4cbda4d..4de3467d4aa 100644
--- a/bdb/db/db_cam.c
+++ b/bdb/db/db_cam.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000
+ * Copyright (c) 2000-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_cam.c,v 11.52 2001/01/18 15:11:16 bostic Exp $";
+static const char revid[] = "$Id: db_cam.c,v 11.114 2002/09/03 15:44:46 krinsky Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,17 +18,18 @@ static const char revid[] = "$Id: db_cam.c,v 11.52 2001/01/18 15:11:16 bostic Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "lock.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
-#include "txn.h"
-#include "db_ext.h"
-
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+
+static int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
static int __db_c_cleanup __P((DBC *, DBC *, int));
-static int __db_c_idup __P((DBC *, DBC **, u_int32_t));
+static int __db_c_del_secondary __P((DBC *));
+static int __db_c_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));
static int __db_wrlock_err __P((DB_ENV *));
#define CDB_LOCKING_INIT(dbp, dbc) \
@@ -43,9 +44,9 @@ static int __db_wrlock_err __P((DB_ENV *));
return (__db_wrlock_err(dbp->dbenv)); \
\
if (F_ISSET(dbc, DBC_WRITECURSOR) && \
- (ret = lock_get((dbp)->dbenv, (dbc)->locker, \
- DB_LOCK_UPGRADE, &(dbc)->lock_dbt, DB_LOCK_WRITE, \
- &(dbc)->mylock)) != 0) \
+ (ret = (dbp)->dbenv->lock_get((dbp)->dbenv, \
+ (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \
+ DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \
return (ret); \
}
#define CDB_LOCKING_DONE(dbp, dbc) \
@@ -63,9 +64,8 @@ static int __db_wrlock_err __P((DB_ENV *));
F_ISSET((dbc_o), DBC_WRITECURSOR | DBC_WRITEDUP)) { \
memcpy(&(dbc_n)->mylock, &(dbc_o)->mylock, \
sizeof((dbc_o)->mylock)); \
- (dbc_n)->locker = (dbc_o)->locker; \
- /* This lock isn't ours to put--just discard it on close. */ \
- F_SET((dbc_n), DBC_WRITEDUP); \
+ /* This lock isn't ours to put--just discard it on close. */ \
+ F_SET((dbc_n), DBC_WRITEDUP); \
}
/*
@@ -81,12 +81,14 @@ __db_c_close(dbc)
DB *dbp;
DBC *opd;
DBC_INTERNAL *cp;
+ DB_ENV *dbenv;
int ret, t_ret;
dbp = dbc->dbp;
+ dbenv = dbp->dbenv;
ret = 0;
- PANIC_CHECK(dbp->dbenv);
+ PANIC_CHECK(dbenv);
/*
* If the cursor is already closed we have a serious problem, and we
@@ -95,7 +97,7 @@ __db_c_close(dbc)
*/
if (!F_ISSET(dbc, DBC_ACTIVE)) {
if (dbp != NULL)
- __db_err(dbp->dbenv, "Closing closed cursor");
+ __db_err(dbenv, "Closing already-closed cursor");
DB_ASSERT(0);
return (EINVAL);
@@ -113,11 +115,9 @@ __db_c_close(dbc)
* !!!
* Cursors must be removed from the active queue before calling the
* access specific cursor close routine, btree depends on having that
- * order of operations. It must also happen before any action that
- * can fail and cause __db_c_close to return an error, or else calls
- * here from __db_close may loop indefinitely.
+ * order of operations.
*/
- MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
if (opd != NULL) {
F_CLR(opd, DBC_ACTIVE);
@@ -126,7 +126,7 @@ __db_c_close(dbc)
F_CLR(dbc, DBC_ACTIVE);
TAILQ_REMOVE(&dbp->active_queue, dbc, links);
- MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
/* Call the access specific cursor close routine. */
if ((t_ret =
@@ -137,17 +137,20 @@ __db_c_close(dbc)
* Release the lock after calling the access method specific close
* routine, a Btree cursor may have had pending deletes.
*/
- if (CDB_LOCKING(dbc->dbp->dbenv)) {
+ if (CDB_LOCKING(dbenv)) {
/*
* If DBC_WRITEDUP is set, the cursor is an internally
* duplicated write cursor and the lock isn't ours to put.
+ *
+ * Also, be sure not to free anything if mylock.off is
+ * INVALID; in some cases, such as idup'ed read cursors
+ * and secondary update cursors, a cursor in a CDB
+ * environment may not have a lock at all.
*/
- if (!F_ISSET(dbc, DBC_WRITEDUP) &&
- dbc->mylock.off != LOCK_INVALID) {
- if ((t_ret = lock_put(dbc->dbp->dbenv,
- &dbc->mylock)) != 0 && ret == 0)
+ if (!F_ISSET(dbc, DBC_WRITEDUP) && LOCK_ISSET(dbc->mylock)) {
+ if ((t_ret = dbenv->lock_put(
+ dbenv, &dbc->mylock)) != 0 && ret == 0)
ret = t_ret;
- dbc->mylock.off = LOCK_INVALID;
}
/* For safety's sake, since this is going on the free queue. */
@@ -159,7 +162,7 @@ __db_c_close(dbc)
dbc->txn->cursors--;
/* Move the cursor(s) to the free queue. */
- MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
if (opd != NULL) {
if (dbc->txn != NULL)
dbc->txn->cursors--;
@@ -167,7 +170,7 @@ __db_c_close(dbc)
opd = NULL;
}
TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
- MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
return (ret);
}
@@ -183,27 +186,37 @@ __db_c_destroy(dbc)
DBC *dbc;
{
DB *dbp;
- DBC_INTERNAL *cp;
- int ret;
+ DB_ENV *dbenv;
+ int ret, t_ret;
dbp = dbc->dbp;
- cp = dbc->internal;
+ dbenv = dbp->dbenv;
/* Remove the cursor from the free queue. */
- MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
TAILQ_REMOVE(&dbp->free_queue, dbc, links);
- MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
/* Free up allocated memory. */
- if (dbc->rkey.data != NULL)
- __os_free(dbc->rkey.data, dbc->rkey.ulen);
- if (dbc->rdata.data != NULL)
- __os_free(dbc->rdata.data, dbc->rdata.ulen);
+ if (dbc->my_rskey.data != NULL)
+ __os_free(dbenv, dbc->my_rskey.data);
+ if (dbc->my_rkey.data != NULL)
+ __os_free(dbenv, dbc->my_rkey.data);
+ if (dbc->my_rdata.data != NULL)
+ __os_free(dbenv, dbc->my_rdata.data);
/* Call the access specific cursor destroy routine. */
ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc);
- __os_free(dbc, sizeof(*dbc));
+ /*
+ * Release the lock id for this cursor.
+ */
+ if (LOCKING_ON(dbenv) &&
+ F_ISSET(dbc, DBC_OWN_LID) &&
+ (t_ret = dbenv->lock_id_free(dbenv, dbc->lid)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(dbenv, dbc);
return (ret);
}
@@ -256,7 +269,7 @@ __db_c_count(dbc, recnop, flags)
break;
default:
return (__db_unknown_type(dbp->dbenv,
- "__db_c_count", dbp->type));
+ "__db_c_count", dbp->type));
}
return (0);
}
@@ -286,11 +299,13 @@ __db_c_del(dbc, flags)
dbp = dbc->dbp;
PANIC_CHECK(dbp->dbenv);
- DB_CHECK_TXN(dbp, dbc->txn);
/* Check for invalid flags. */
- if ((ret = __db_cdelchk(dbp, flags,
- F_ISSET(dbp, DB_AM_RDONLY), IS_INITIALIZED(dbc))) != 0)
+ if ((ret = __db_cdelchk(dbp, flags, IS_INITIALIZED(dbc))) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
return (ret);
DEBUG_LWRITE(dbc, dbc->txn, "db_c_del", NULL, NULL, flags);
@@ -298,6 +313,27 @@ __db_c_del(dbc, flags)
CDB_LOCKING_INIT(dbp, dbc);
/*
+ * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set
+ * (which it only is if we're being called from a primary update),
+ * then we need to call through to the primary and delete the item.
+ *
+ * Note that this will delete the current item; we don't need to
+ * delete it ourselves as well, so we can just goto done.
+ */
+ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) {
+ ret = __db_c_del_secondary(dbc);
+ goto done;
+ }
+
+ /*
+ * If we are a primary and have secondary indices, go through
+ * and delete any secondary keys that point at the current record.
+ */
+ if (LIST_FIRST(&dbp->s_secondaries) != NULL &&
+ (ret = __db_c_del_primary(dbc)) != 0)
+ goto done;
+
+ /*
* Off-page duplicate trees are locked in the primary tree, that is,
* we acquire a write lock in the primary tree and no locks in the
* off-page dup tree. If the del operation is done in an off-page
@@ -310,7 +346,7 @@ __db_c_del(dbc, flags)
if ((ret = dbc->c_am_writelock(dbc)) == 0)
ret = opd->c_am_del(opd);
- CDB_LOCKING_DONE(dbp, dbc);
+done: CDB_LOCKING_DONE(dbp, dbc);
return (ret);
}
@@ -362,7 +398,7 @@ __db_c_dup(dbc_orig, dbcp, flags)
if (CDB_LOCKING(dbenv) && flags != DB_POSITIONI) {
DB_ASSERT(!F_ISSET(dbc_orig, DBC_WRITER | DBC_WRITECURSOR));
- if ((ret = lock_get(dbenv, dbc_n->locker, 0,
+ if ((ret = dbenv->lock_get(dbenv, dbc_n->locker, 0,
&dbc_n->lock_dbt, DB_LOCK_READ, &dbc_n->mylock)) != 0) {
(void)__db_c_close(dbc_n);
return (ret);
@@ -380,6 +416,8 @@ __db_c_dup(dbc_orig, dbcp, flags)
dbc_n->internal->opd = dbc_nopd;
}
+ /* Copy the dirty read flag to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ));
return (0);
err: if (dbc_n != NULL)
@@ -393,8 +431,10 @@ err: if (dbc_n != NULL)
/*
* __db_c_idup --
* Internal version of __db_c_dup.
+ *
+ * PUBLIC: int __db_c_idup __P((DBC *, DBC **, u_int32_t));
*/
-static int
+int
__db_c_idup(dbc_orig, dbcp, flags)
DBC *dbc_orig, **dbcp;
u_int32_t flags;
@@ -408,17 +448,16 @@ __db_c_idup(dbc_orig, dbcp, flags)
dbc_n = *dbcp;
if ((ret = __db_icursor(dbp, dbc_orig->txn, dbc_orig->dbtype,
- dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD), &dbc_n)) != 0)
+ dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD),
+ dbc_orig->locker, &dbc_n)) != 0)
return (ret);
- dbc_n->locker = dbc_orig->locker;
-
/* If the user wants the cursor positioned, do it here. */
if (flags == DB_POSITION || flags == DB_POSITIONI) {
int_n = dbc_n->internal;
int_orig = dbc_orig->internal;
- dbc_n->flags = dbc_orig->flags;
+ dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID;
int_n->indx = int_orig->indx;
int_n->pgno = int_orig->pgno;
@@ -449,6 +488,9 @@ __db_c_idup(dbc_orig, dbcp, flags)
/* Now take care of duping the CDB information. */
CDB_LOCKING_COPY(dbp, dbc_orig, dbc_n);
+ /* Copy the dirty read flag to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ));
+
*dbcp = dbc_n;
return (0);
@@ -460,12 +502,13 @@ err: (void)dbc_n->c_close(dbc_n);
* __db_c_newopd --
* Create a new off-page duplicate cursor.
*
- * PUBLIC: int __db_c_newopd __P((DBC *, db_pgno_t, DBC **));
+ * PUBLIC: int __db_c_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
*/
int
-__db_c_newopd(dbc_parent, root, dbcp)
+__db_c_newopd(dbc_parent, root, oldopd, dbcp)
DBC *dbc_parent;
db_pgno_t root;
+ DBC *oldopd;
DBC **dbcp;
{
DB *dbp;
@@ -476,14 +519,44 @@ __db_c_newopd(dbc_parent, root, dbcp)
dbp = dbc_parent->dbp;
dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE;
+ /*
+ * On failure, we want to default to returning the old off-page dup
+ * cursor, if any; our caller can't be left with a dangling pointer
+ * to a freed cursor. On error the only allowable behavior is to
+ * close the cursor (and the old OPD cursor it in turn points to), so
+ * this should be safe.
+ */
+ *dbcp = oldopd;
+
if ((ret = __db_icursor(dbp,
- dbc_parent->txn, dbtype, root, 1, &opd)) != 0)
+ dbc_parent->txn, dbtype, root, 1, dbc_parent->locker, &opd)) != 0)
return (ret);
+ /* !!!
+ * If the parent is a DBC_WRITER, this won't copy anything. That's
+ * not actually a problem--we only need lock information in an
+ * off-page dup cursor in order to upgrade at cursor close time
+ * if we've done a delete, but WRITERs don't need to upgrade.
+ */
CDB_LOCKING_COPY(dbp, dbc_parent, opd);
*dbcp = opd;
+ /*
+ * Check to see if we already have an off-page dup cursor that we've
+ * passed in. If we do, close it. It'd be nice to use it again
+ * if it's a cursor belonging to the right tree, but if we're doing
+ * a cursor-relative operation this might not be safe, so for now
+ * we'll take the easy way out and always close and reopen.
+ *
+ * Note that under no circumstances do we want to close the old
+ * cursor without returning a valid new one; we don't want to
+ * leave the main cursor in our caller with a non-NULL pointer
+ * to a freed off-page dup cursor.
+ */
+ if (oldopd != NULL && (ret = oldopd->c_close(oldopd)) != 0)
+ return (ret);
+
return (0);
}
@@ -502,8 +575,9 @@ __db_c_get(dbc_arg, key, data, flags)
DB *dbp;
DBC *dbc, *dbc_n, *opd;
DBC_INTERNAL *cp, *cp_n;
+ DB_MPOOLFILE *mpf;
db_pgno_t pgno;
- u_int32_t tmp_flags, tmp_rmw;
+ u_int32_t multi, tmp_dirty, tmp_flags, tmp_rmw;
u_int8_t type;
int ret, t_ret;
@@ -517,6 +591,7 @@ __db_c_get(dbc_arg, key, data, flags)
* functions.
*/
dbp = dbc_arg->dbp;
+ mpf = dbp->mpf;
dbc_n = NULL;
opd = NULL;
@@ -531,6 +606,12 @@ __db_c_get(dbc_arg, key, data, flags)
tmp_rmw = LF_ISSET(DB_RMW);
LF_CLR(DB_RMW);
+ tmp_dirty = LF_ISSET(DB_DIRTY_READ);
+ LF_CLR(DB_DIRTY_READ);
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
DEBUG_LREAD(dbc_arg, dbc_arg->txn, "db_c_get",
flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
@@ -538,8 +619,18 @@ __db_c_get(dbc_arg, key, data, flags)
* Return a cursor's record number. It has nothing to do with the
* cursor get code except that it was put into the interface.
*/
- if (flags == DB_GET_RECNO)
- return (__bam_c_rget(dbc_arg, data, flags | tmp_rmw));
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc_arg, DBC_RMW);
+ if (tmp_dirty)
+ F_SET(dbc_arg, DBC_DIRTY_READ);
+ ret = __bam_c_rget(dbc_arg, data);
+ if (tmp_rmw)
+ F_CLR(dbc_arg, DBC_RMW);
+ if (tmp_dirty)
+ F_CLR(dbc_arg, DBC_DIRTY_READ);
+ return (ret);
+ }
if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
CDB_LOCKING_INIT(dbp, dbc_arg);
@@ -564,8 +655,8 @@ __db_c_get(dbc_arg, key, data, flags)
if ((ret = __db_c_idup(cp->opd, &opd, DB_POSITIONI)) != 0)
return (ret);
- switch (ret = opd->c_am_get(
- opd, key, data, flags, NULL)) {
+ switch (ret =
+ opd->c_am_get(opd, key, data, flags, NULL)) {
case 0:
goto done;
case DB_NOTFOUND:
@@ -605,21 +696,49 @@ __db_c_get(dbc_arg, key, data, flags)
break;
}
+ if (tmp_dirty)
+ F_SET(dbc_arg, DBC_DIRTY_READ);
+
/*
* If this cursor is going to be closed immediately, we don't
* need to take precautions to clean it up on error.
*/
if (F_ISSET(dbc_arg, DBC_TRANSIENT))
dbc_n = dbc_arg;
- else if ((ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags)) != 0)
- goto err;
+ else {
+ ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags);
+ if (tmp_dirty)
+ F_CLR(dbc_arg, DBC_DIRTY_READ);
+
+ if (ret != 0)
+ goto err;
+ COPY_RET_MEM(dbc_arg, dbc_n);
+ }
if (tmp_rmw)
F_SET(dbc_n, DBC_RMW);
+
+ switch (multi) {
+ case DB_MULTIPLE:
+ F_SET(dbc_n, DBC_MULTIPLE);
+ break;
+ case DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE_KEY);
+ break;
+ case DB_MULTIPLE | DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ break;
+ case 0:
+ break;
+ }
+
pgno = PGNO_INVALID;
ret = dbc_n->c_am_get(dbc_n, key, data, flags, &pgno);
if (tmp_rmw)
F_CLR(dbc_n, DBC_RMW);
+ if (tmp_dirty)
+ F_CLR(dbc_arg, DBC_DIRTY_READ);
+ F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
if (ret != 0)
goto err;
@@ -630,7 +749,8 @@ __db_c_get(dbc_arg, key, data, flags)
* a new cursor and call the underlying function.
*/
if (pgno != PGNO_INVALID) {
- if ((ret = __db_c_newopd(dbc_arg, pgno, &cp_n->opd)) != 0)
+ if ((ret = __db_c_newopd(dbc_arg,
+ pgno, cp_n->opd, &cp_n->opd)) != 0)
goto err;
switch (flags) {
@@ -648,10 +768,9 @@ __db_c_get(dbc_arg, key, data, flags)
tmp_flags = DB_LAST;
break;
case DB_GET_BOTH:
- tmp_flags = DB_GET_BOTH;
- break;
case DB_GET_BOTHC:
- tmp_flags = DB_GET_BOTHC;
+ case DB_GET_BOTH_RANGE:
+ tmp_flags = flags;
break;
default:
ret =
@@ -680,19 +799,66 @@ done: /*
cp_n = dbc_n == NULL ? dbc_arg->internal : dbc_n->internal;
if (!F_ISSET(key, DB_DBT_ISSET)) {
if (cp_n->page == NULL && (ret =
- memp_fget(dbp->mpf, &cp_n->pgno, 0, &cp_n->page)) != 0)
+ mpf->get(mpf, &cp_n->pgno, 0, &cp_n->page)) != 0)
goto err;
if ((ret = __db_ret(dbp, cp_n->page, cp_n->indx,
- key, &dbc_arg->rkey.data, &dbc_arg->rkey.ulen)) != 0)
+ key, &dbc_arg->rkey->data, &dbc_arg->rkey->ulen)) != 0)
goto err;
}
- dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n;
- if (!F_ISSET(data, DB_DBT_ISSET)) {
+ if (multi != 0) {
+ /*
+ * Even if fetching from the OPD cursor we need a duplicate
+ * primary cursor if we are going after multiple keys.
+ */
+ if (dbc_n == NULL) {
+ /*
+ * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+ * so it's safe to just use dbc_arg, unless dbc_arg
+ * has an open OPD cursor whose state might need to
+ * be preserved.
+ */
+ if ((!(multi & DB_MULTIPLE_KEY) &&
+ dbc_arg->internal->opd == NULL) ||
+ F_ISSET(dbc_arg, DBC_TRANSIENT))
+ dbc_n = dbc_arg;
+ else {
+ if ((ret = __db_c_idup(dbc_arg,
+ &dbc_n, DB_POSITIONI)) != 0)
+ goto err;
+ if ((ret = dbc_n->c_am_get(dbc_n,
+ key, data, DB_CURRENT, &pgno)) != 0)
+ goto err;
+ }
+ cp_n = dbc_n->internal;
+ }
+
+ /*
+ * If opd is set then we dupped the opd that we came in with.
+ * When we return we may have a new opd if we went to another
+ * key.
+ */
+ if (opd != NULL) {
+ DB_ASSERT(cp_n->opd == NULL);
+ cp_n->opd = opd;
+ opd = NULL;
+ }
+
+ /*
+ * Bulk get doesn't use __db_retcopy, so data.size won't
+ * get set up unless there is an error. Assume success
+ * here. This is the only call to c_am_bulk, and it avoids
+ * setting it exactly the same everywhere. If we have an
+ * ENOMEM error, it'll get overwritten with the needed value.
+ */
+ data->size = data->ulen;
+ ret = dbc_n->c_am_bulk(dbc_n, data, flags | multi);
+ } else if (!F_ISSET(data, DB_DBT_ISSET)) {
+ dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n;
type = TYPE(dbc->internal->page);
ret = __db_ret(dbp, dbc->internal->page, dbc->internal->indx +
(type == P_LBTREE || type == P_HASH ? O_INDX : 0),
- data, &dbc_arg->rdata.data, &dbc_arg->rdata.ulen);
+ data, &dbc_arg->rdata->data, &dbc_arg->rdata->ulen);
}
err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
@@ -701,9 +867,8 @@ err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
/* Cleanup and cursor resolution. */
if (opd != NULL) {
- if ((t_ret =
- __db_c_cleanup(dbc_arg->internal->opd,
- opd, ret)) != 0 && ret == 0)
+ if ((t_ret = __db_c_cleanup(
+ dbc_arg->internal->opd, opd, ret)) != 0 && ret == 0)
ret = t_ret;
}
@@ -728,11 +893,12 @@ __db_c_put(dbc_arg, key, data, flags)
DBT *key, *data;
u_int32_t flags;
{
- DB *dbp;
- DBC *dbc_n, *opd;
+ DB *dbp, *sdbp;
+ DBC *dbc_n, *oldopd, *opd, *sdbc, *pdbc;
+ DBT olddata, oldpkey, oldskey, newdata, pkey, save_skey, skey, temp;
db_pgno_t pgno;
- u_int32_t tmp_flags;
- int ret, t_ret;
+ int cmp, have_oldrec, ispartial, nodel, re_pad, ret, rmw, t_ret;
+ u_int32_t re_len, size, tmp_flags;
/*
* Cursor Cleanup Note:
@@ -744,16 +910,30 @@ __db_c_put(dbc_arg, key, data, flags)
* functions.
*/
dbp = dbc_arg->dbp;
- dbc_n = NULL;
+ sdbp = NULL;
+ pdbc = dbc_n = NULL;
+ memset(&newdata, 0, sizeof(DBT));
PANIC_CHECK(dbp->dbenv);
- DB_CHECK_TXN(dbp, dbc_arg->txn);
/* Check for invalid flags. */
- if ((ret = __db_cputchk(dbp, key, data, flags,
- F_ISSET(dbp, DB_AM_RDONLY), IS_INITIALIZED(dbc_arg))) != 0)
+ if ((ret = __db_cputchk(dbp,
+ key, data, flags, IS_INITIALIZED(dbc_arg))) != 0)
+ return (ret);
+
+ /* Check for consistent transaction usage. */
+ if ((ret = __db_check_txn(dbp, dbc_arg->txn, dbc_arg->locker, 0)) != 0)
return (ret);
+ /*
+ * Putting to secondary indices is forbidden; when we need
+ * to internally update one, we'll call this with a private
+ * synonym for DB_KEYLAST, DB_UPDATE_SECONDARY, which does
+ * the right thing but won't return an error from cputchk().
+ */
+ if (flags == DB_UPDATE_SECONDARY)
+ flags = DB_KEYLAST;
+
DEBUG_LWRITE(dbc_arg, dbc_arg->txn, "db_c_put",
flags == DB_KEYFIRST || flags == DB_KEYLAST ||
flags == DB_NODUPDATA ? key : NULL, data, flags);
@@ -761,6 +941,439 @@ __db_c_put(dbc_arg, key, data, flags)
CDB_LOCKING_INIT(dbp, dbc_arg);
/*
+ * Check to see if we are a primary and have secondary indices.
+ * If we are not, we save ourselves a good bit of trouble and
+ * just skip to the "normal" put.
+ */
+ if (LIST_FIRST(&dbp->s_secondaries) == NULL)
+ goto skip_s_update;
+
+ /*
+ * We have at least one secondary which we may need to update.
+ *
+ * There is a rather vile locking issue here. Secondary gets
+ * will always involve acquiring a read lock in the secondary,
+ * then acquiring a read lock in the primary. Ideally, we
+ * would likewise perform puts by updating all the secondaries
+ * first, then doing the actual put in the primary, to avoid
+ * deadlock (since having multiple threads doing secondary
+ * gets and puts simultaneously is probably a common case).
+ *
+ * However, if this put is a put-overwrite--and we have no way to
+ * tell in advance whether it will be--we may need to delete
+ * an outdated secondary key. In order to find that old
+ * secondary key, we need to get the record we're overwriting,
+ * before we overwrite it.
+ *
+ * (XXX: It would be nice to avoid this extra get, and have the
+ * underlying put routines somehow pass us the old record
+ * since they need to traverse the tree anyway. I'm saving
+ * this optimization for later, as it's a lot of work, and it
+ * would be hard to fit into this locking paradigm anyway.)
+ *
+ * The simple thing to do would be to go get the old record before
+ * we do anything else. Unfortunately, though, doing so would
+ * violate our "secondary, then primary" lock acquisition
+ * ordering--even in the common case where no old primary record
+ * exists, we'll still acquire and keep a lock on the page where
+ * we're about to do the primary insert.
+ *
+ * To get around this, we do the following gyrations, which
+ * hopefully solve this problem in the common case:
+ *
+ * 1) If this is a c_put(DB_CURRENT), go ahead and get the
+ * old record. We already hold the lock on this page in
+ * the primary, so no harm done, and we'll need the primary
+ * key (which we weren't passed in this case) to do any
+ * secondary puts anyway.
+ *
+ * 2) If we're doing a partial put, we need to perform the
+ * get on the primary key right away, since we don't have
+ * the whole datum that the secondary key is based on.
+ * We may also need to pad out the record if the primary
+ * has a fixed record length.
+ *
+ * 3) Loop through the secondary indices, putting into each a
+ * new secondary key that corresponds to the new record.
+ *
+ * 4) If we haven't done so in (1) or (2), get the old primary
+ * key/data pair. If one does not exist--the common case--we're
+ * done with secondary indices, and can go straight on to the
+ * primary put.
+ *
+ * 5) If we do have an old primary key/data pair, however, we need
+ * to loop through all the secondaries a second time and delete
+ * the old secondary in each.
+ */
+ memset(&pkey, 0, sizeof(DBT));
+ memset(&olddata, 0, sizeof(DBT));
+ have_oldrec = nodel = 0;
+
+ /*
+ * Primary indices can't have duplicates, so only DB_CURRENT,
+ * DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags
+ * should have been caught by the checking routine, but
+ * add a sprinkling of paranoia.
+ */
+ DB_ASSERT(flags == DB_CURRENT ||
+ flags == DB_KEYFIRST || flags == DB_KEYLAST);
+
+ /*
+ * We'll want to use DB_RMW in a few places, but it's only legal
+ * when locking is on.
+ */
+ rmw = STD_LOCKING(dbc_arg) ? DB_RMW : 0;
+
+ if (flags == DB_CURRENT) { /* Step 1. */
+ /*
+ * This is safe to do on the cursor we already have;
+ * error or no, it won't move.
+ *
+ * We use DB_RMW for all of these gets because we'll be
+ * writing soon enough in the "normal" put code. In
+ * transactional databases we'll hold those write locks
+ * even if we close the cursor we're reading with.
+ */
+ ret = dbc_arg->c_get(dbc_arg,
+ &pkey, &olddata, rmw | DB_CURRENT);
+ if (ret == DB_KEYEMPTY) {
+ nodel = 1; /*
+ * We know we don't need a delete
+ * in the secondary.
+ */
+ have_oldrec = 1; /* We've looked for the old record. */
+ } else if (ret != 0)
+ goto err;
+ else
+ have_oldrec = 1;
+
+ } else {
+ /* So we can just use &pkey everywhere instead of key. */
+ pkey.data = key->data;
+ pkey.size = key->size;
+ }
+
+ /*
+ * Check for partial puts (step 2).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if (!have_oldrec && !nodel) {
+ /*
+ * We're going to have to search the tree for the
+ * specified key. Dup a cursor (so we have the same
+ * locking info) and do a c_get.
+ */
+ if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0)
+ goto err;
+
+ /* We should have gotten DB_CURRENT in step 1. */
+ DB_ASSERT(flags != DB_CURRENT);
+
+ ret = pdbc->c_get(pdbc,
+ &pkey, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ nodel = 1;
+ ret = 0;
+ }
+ if ((t_ret = pdbc->c_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ have_oldrec = 1;
+ }
+
+ /*
+ * Now build the new datum from olddata and the partial
+ * data we were given.
+ */
+ if ((ret =
+ __db_buildpartial(dbp, &olddata, data, &newdata)) != 0)
+ goto err;
+ ispartial = 1;
+ } else
+ ispartial = 0;
+
+ /*
+ * Handle fixed-length records. If the primary database has
+ * fixed-length records, we need to pad out the datum before
+ * we pass it into the callback function; we always index the
+ * "real" record.
+ */
+ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
+ (dbp->type == DB_QUEUE)) {
+ if (dbp->type == DB_QUEUE) {
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ re_pad = ((QUEUE *)dbp->q_internal)->re_pad;
+ } else {
+ re_len = ((BTREE *)dbp->bt_internal)->re_len;
+ re_pad = ((BTREE *)dbp->bt_internal)->re_pad;
+ }
+
+ size = ispartial ? newdata.size : data->size;
+ if (size > re_len) {
+ __db_err(dbp->dbenv,
+ "Length improper for fixed length record %lu",
+ (u_long)size);
+ ret = EINVAL;
+ goto err;
+ } else if (size < re_len) {
+ /*
+ * If we're not doing a partial put, copy
+ * data->data into newdata.data, then pad out
+ * newdata.data.
+ *
+ * If we're doing a partial put, the data
+ * we want are already in newdata.data; we
+ * just need to pad.
+ *
+ * Either way, realloc is safe.
+ */
+ if ((ret = __os_realloc(dbp->dbenv, re_len,
+ &newdata.data)) != 0)
+ goto err;
+ if (!ispartial)
+ memcpy(newdata.data, data->data, size);
+ memset((u_int8_t *)newdata.data + size, re_pad,
+ re_len - size);
+ newdata.size = re_len;
+ ispartial = 1;
+ }
+ }
+
+ /*
+ * Loop through the secondaries. (Step 3.)
+ *
+ * Note that __db_s_first and __db_s_next will take care of
+ * thread-locking and refcounting issues.
+ */
+ for (sdbp = __db_s_first(dbp);
+ sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) {
+ /*
+ * Call the callback for this secondary, to get the
+ * appropriate secondary key.
+ */
+ memset(&skey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp,
+ &pkey, ispartial ? &newdata : data, &skey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ /*
+ * The callback returned a null value--don't
+ * put this key in the secondary. Just
+ * move on to the next one--we'll handle
+ * any necessary deletes in step 5.
+ */
+ continue;
+ else
+ goto err;
+ }
+
+ /*
+ * Save the DBT we just got back from the callback function
+ * off; we want to pass its value into c_get functions
+ * that may stomp on a buffer the callback function
+ * allocated.
+ */
+ memset(&save_skey, 0, sizeof(DBT)); /* Paranoia. */
+ save_skey = skey;
+
+ /*
+ * Open a cursor in this secondary.
+ *
+ * Use the same locker ID as our primary cursor, so that
+ * we're guaranteed that the locks don't conflict (e.g. in CDB
+ * or if we're subdatabases that share and want to lock a
+ * metadata page).
+ */
+ if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0)
+ goto err;
+
+ /*
+ * If we're in CDB, updates will fail since the new cursor
+ * isn't a writer. However, we hold the WRITE lock in the
+ * primary and will for as long as our new cursor lasts,
+ * and the primary and secondary share a lock file ID,
+ * so it's safe to consider this a WRITER. The close
+ * routine won't try to put anything because we don't
+ * really have a lock.
+ */
+ if (CDB_LOCKING(sdbp->dbenv)) {
+ DB_ASSERT(sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * There are three cases here--
+ * 1) The secondary supports sorted duplicates.
+ * If we attempt to put a secondary/primary pair
+ * that already exists, that's a duplicate duplicate,
+ * and c_put will return DB_KEYEXIST (see __db_duperr).
+ * This will leave us with exactly one copy of the
+ * secondary/primary pair, and this is just right--we'll
+ * avoid deleting it later, as the old and new secondaries
+ * will match (since the old secondary is the dup dup
+ * that's already there).
+ * 2) The secondary supports duplicates, but they're not
+ * sorted. We need to avoid putting a duplicate
+ * duplicate, because the matching old and new secondaries
+ * will prevent us from deleting anything and we'll
+ * wind up with two secondary records that point to the
+ * same primary key. Do a c_get(DB_GET_BOTH); if
+ * that returns 0, skip the put.
+ * 3) The secondary doesn't support duplicates at all.
+ * In this case, secondary keys must be unique; if
+ * another primary key already exists for this
+ * secondary key, we have to either overwrite it or
+ * not put this one, and in either case we've
+ * corrupted the secondary index. Do a c_get(DB_SET).
+ * If the secondary/primary pair already exists, do
+ * nothing; if the secondary exists with a different
+ * primary, return an error; and if the secondary
+ * does not exist, put it.
+ */
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ /* Case 3. */
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = sdbc->c_real_get(sdbc,
+ &skey, &oldpkey, rmw | DB_SET);
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp, &oldpkey, &pkey);
+ __os_ufree(sdbp->dbenv, oldpkey.data);
+ if (cmp != 0) {
+ __db_err(sdbp->dbenv, "%s%s",
+ "Put results in a non-unique secondary key in an ",
+ "index not configured to support duplicates");
+ ret = EINVAL;
+ goto skipput;
+ }
+ } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ goto skipput;
+ } else if (!F_ISSET(sdbp, DB_AM_DUPSORT))
+ /* Case 2. */
+ if ((ret = sdbc->c_real_get(sdbc,
+ &skey, &pkey, rmw | DB_GET_BOTH)) == 0)
+ goto skipput;
+
+ ret = sdbc->c_put(sdbc, &skey, &pkey, DB_UPDATE_SECONDARY);
+
+ /*
+ * We don't know yet whether this was a put-overwrite that
+ * in fact changed nothing. If it was, we may get DB_KEYEXIST.
+ * This is not an error.
+ */
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+
+skipput: FREE_IF_NEEDED(sdbp, &save_skey)
+
+ if ((t_ret = sdbc->c_close(sdbc)) != 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* If still necessary, go get the old primary key/data. (Step 4.) */
+ if (!have_oldrec) {
+ /* See the comments in step 2. This is real familiar. */
+ if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(flags != DB_CURRENT);
+ pkey.data = key->data;
+ pkey.size = key->size;
+ ret = pdbc->c_get(pdbc, &pkey, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ nodel = 1;
+ ret = 0;
+ }
+ if ((t_ret = pdbc->c_close(pdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ have_oldrec = 1;
+ }
+
+ /*
+ * If we don't follow this goto, we do in fact have an old record
+ * we may need to go delete. (Step 5).
+ */
+ if (nodel)
+ goto skip_s_update;
+
+ for (sdbp = __db_s_first(dbp);
+ sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) {
+ /*
+ * Call the callback for this secondary to get the
+ * old secondary key.
+ */
+ memset(&oldskey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp,
+ &pkey, &olddata, &oldskey)) != 0) {
+ if (ret == DB_DONOTINDEX)
+ /*
+ * The callback returned a null value--there's
+ * nothing to delete. Go on to the next
+ * secondary.
+ */
+ continue;
+ else
+ goto err;
+ }
+ if ((ret = sdbp->s_callback(sdbp,
+ &pkey, ispartial ? &newdata : data, &skey)) != 0 &&
+ ret != DB_DONOTINDEX)
+ goto err;
+
+ /*
+ * If there is no new secondary key, or if the old secondary
+ * key is different from the new secondary key, then
+ * we need to delete the old one.
+ *
+ * Note that bt_compare is (and must be) set no matter
+ * what access method we're in.
+ */
+ sdbc = NULL;
+ if (ret == DB_DONOTINDEX ||
+ ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ &oldskey, &skey) != 0) {
+ if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0)
+ goto err;
+ if (CDB_LOCKING(sdbp->dbenv)) {
+ DB_ASSERT(sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Don't let c_get(DB_GET_BOTH) stomp on
+ * any secondary key value that the callback
+ * function may have allocated. Use a temp
+ * DBT instead.
+ */
+ memset(&temp, 0, sizeof(DBT));
+ temp.data = oldskey.data;
+ temp.size = oldskey.size;
+ if ((ret = sdbc->c_real_get(sdbc,
+ &temp, &pkey, rmw | DB_GET_BOTH)) == 0)
+ ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY);
+ }
+
+ FREE_IF_NEEDED(sdbp, &skey);
+ FREE_IF_NEEDED(sdbp, &oldskey);
+ if (sdbc != NULL && (t_ret = sdbc->c_close(sdbc)) != 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Secondary index updates are now done. On to the "real" stuff. */
+
+skip_s_update:
+ /*
* If we have an off-page duplicates cursor, and the operation applies
* to it, perform the operation. Duplicate the cursor and call the
* underlying function.
@@ -826,8 +1439,12 @@ __db_c_put(dbc_arg, key, data, flags)
* a new cursor and call the underlying function.
*/
if (pgno != PGNO_INVALID) {
- if ((ret = __db_c_newopd(dbc_arg, pgno, &opd)) != 0)
+ oldopd = dbc_n->internal->opd;
+ if ((ret = __db_c_newopd(dbc_arg, pgno, oldopd, &opd)) != 0) {
+ dbc_n->internal->opd = opd;
goto err;
+ }
+
dbc_n->internal->opd = opd;
if ((ret = opd->c_am_put(
@@ -840,8 +1457,15 @@ err: /* Cleanup and cursor resolution. */
if ((t_ret = __db_c_cleanup(dbc_arg, dbc_n, ret)) != 0 && ret == 0)
ret = t_ret;
+ /* If newdata was used, free its buffer. */
+ if (newdata.data != NULL)
+ __os_free(dbp->dbenv, newdata.data);
+
CDB_LOCKING_DONE(dbp, dbc_arg);
+ if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0)
+ return (t_ret);
+
return (ret);
}
@@ -855,7 +1479,20 @@ __db_duperr(dbp, flags)
DB *dbp;
u_int32_t flags;
{
- if (flags != DB_NODUPDATA)
+
+ /*
+ * If we run into this error while updating a secondary index,
+ * don't yell--there's no clean way to pass DB_NODUPDATA in along
+ * with DB_UPDATE_SECONDARY, but we may run into this problem
+ * in a normal, non-error course of events.
+ *
+ * !!!
+ * If and when we ever permit duplicate duplicates in sorted-dup
+ * databases, we need to either change the secondary index code
+ * to check for dup dups, or we need to maintain the implicit
+ * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set.
+ */
+ if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY))
__db_err(dbp->dbenv,
"Duplicate data items are not supported with sorted data");
return (DB_KEYEXIST);
@@ -873,60 +1510,55 @@ __db_c_cleanup(dbc, dbc_n, failed)
DB *dbp;
DBC *opd;
DBC_INTERNAL *internal;
+ DB_MPOOLFILE *mpf;
int ret, t_ret;
dbp = dbc->dbp;
+ mpf = dbp->mpf;
internal = dbc->internal;
ret = 0;
/* Discard any pages we're holding. */
if (internal->page != NULL) {
- if ((t_ret =
- memp_fput(dbp->mpf, internal->page, 0)) != 0 && ret == 0)
+ if ((t_ret = mpf->put(mpf, internal->page, 0)) != 0 && ret == 0)
ret = t_ret;
internal->page = NULL;
}
opd = internal->opd;
if (opd != NULL && opd->internal->page != NULL) {
- if ((t_ret = memp_fput(dbp->mpf,
- opd->internal->page, 0)) != 0 && ret == 0)
+ if ((t_ret =
+ mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0)
ret = t_ret;
opd->internal->page = NULL;
}
/*
- * If dbc_n is NULL, there's no internal cursor swapping to be
- * done and no dbc_n to close--we probably did the entire
- * operation on an offpage duplicate cursor. Just return.
- */
- if (dbc_n == NULL)
- return (ret);
-
- /*
- * If dbc is marked DBC_TRANSIENT, we're inside a DB->{put/get}
+ * If dbc_n is NULL, there's no internal cursor swapping to be done
+ * and no dbc_n to close--we probably did the entire operation on an
+ * offpage duplicate cursor. Just return.
+ *
+ * If dbc and dbc_n are the same, we're either inside a DB->{put/get}
* operation, and as an optimization we performed the operation on
- * the main cursor rather than on a duplicated one. Assert
- * that dbc_n == dbc (i.e., that we really did skip the
- * duplication). Then just do nothing--even if there was
- * an error, we're about to close the cursor, and the fact that we
- * moved it isn't a user-visible violation of our "cursor
- * stays put on error" rule.
- */
- if (F_ISSET(dbc, DBC_TRANSIENT)) {
- DB_ASSERT(dbc == dbc_n);
+ * the main cursor rather than on a duplicated one, or we're in a
+ * bulk get that can't have moved the cursor (DB_MULTIPLE with the
+ * initial c_get operation on an off-page dup cursor). Just
+ * return--either we know we didn't move the cursor, or we're going
+ * to close it before we return to application code, so we're sure
+ * not to visibly violate the "cursor stays put on error" rule.
+ */
+ if (dbc_n == NULL || dbc == dbc_n)
return (ret);
- }
if (dbc_n->internal->page != NULL) {
- if ((t_ret = memp_fput(dbp->mpf,
- dbc_n->internal->page, 0)) != 0 && ret == 0)
+ if ((t_ret =
+ mpf->put(mpf, dbc_n->internal->page, 0)) != 0 && ret == 0)
ret = t_ret;
dbc_n->internal->page = NULL;
}
opd = dbc_n->internal->opd;
if (opd != NULL && opd->internal->page != NULL) {
- if ((t_ret = memp_fput(dbp->mpf,
- opd->internal->page, 0)) != 0 && ret == 0)
+ if ((t_ret =
+ mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0)
ret = t_ret;
opd->internal->page = NULL;
}
@@ -963,6 +1595,316 @@ __db_c_cleanup(dbc, dbc_n, failed)
}
/*
+ * __db_c_secondary_get --
+ * This wrapper function for DBC->c_pget() is the DBC->c_get() function
+ * for a secondary index cursor.
+ *
+ * PUBLIC: int __db_c_secondary_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_c_secondary_get(dbc, skey, data, flags)
+	DBC *dbc;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+	int ret;
+
+	/* This entry point is only meaningful on a secondary's cursor. */
+	DB_ASSERT(F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+
+	/*
+	 * Forward to the three-DBT form, passing NULL for the primary
+	 * key DBT since the two-DBT caller has nowhere to receive it.
+	 */
+	ret = dbc->c_pget(dbc, skey, NULL, data, flags);
+	return (ret);
+}
+
+/*
+ * __db_c_pget --
+ * Get a primary key/data pair through a secondary index.
+ *
+ * dbc is a cursor on the secondary index; the primary is reached
+ * through dbc->dbp->s_primary. skey and flags are handed to the
+ * secondary's c_real_get. pkey may be NULL, in which case a local
+ * DBT stands in for it and the primary key is not handed back to
+ * the caller. data receives the primary's datum.
+ *
+ * DB_GET_RECNO requests are handed off to __db_c_pget_recno.
+ *
+ * Returns the error from argument checking or from the secondary
+ * get if either fails. Otherwise returns the result of closing the
+ * primary cursor if that is non-zero, else the result of the primary
+ * get (with DB_NOTFOUND replaced by the return value of
+ * __db_secondary_corrupt).
+ *
+ * PUBLIC: int __db_c_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_c_pget(dbc, skey, pkey, data, flags)
+ DBC *dbc;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DBC *pdbc;
+ DBT *save_rdata, nullpkey;
+ int pkeymalloc, ret, save_pkey_flags, t_ret;
+
+ sdbp = dbc->dbp;
+ pdbp = sdbp->s_primary;
+ /*
+ * Both of these are read at the err label, which can be reached
+ * before the primary cursor is opened; start them at 0.
+ */
+ pkeymalloc = t_ret = 0;
+
+ PANIC_CHECK(sdbp->dbenv);
+ if ((ret = __db_cpgetchk(sdbp,
+ skey, pkey, data, flags, IS_INITIALIZED(dbc))) != 0)
+ return (ret);
+
+ /*
+ * The challenging part of this function is getting the behavior
+ * right for all the various permutations of DBT flags. The
+ * next several blocks handle the various cases we need to
+ * deal with specially.
+ */
+
+ /*
+ * We may be called with a NULL pkey argument, if we've been
+ * wrapped by a 2-DBT get call. If so, we need to use our
+ * own DBT.
+ */
+ if (pkey == NULL) {
+ memset(&nullpkey, 0, sizeof(DBT));
+ pkey = &nullpkey;
+ }
+
+ /*
+ * DB_GET_RECNO is a special case, because we're interested not in
+ * the primary key/data pair, but rather in the primary's record
+ * number.
+ */
+ if ((flags & DB_OPFLAGS_MASK) == DB_GET_RECNO)
+ return (__db_c_pget_recno(dbc, pkey, data, flags));
+
+ /*
+ * If the DBTs we've been passed don't have any of the
+ * user-specified memory management flags set, we want to make sure
+ * we return values using the DBTs dbc->rskey, dbc->rkey, and
+ * dbc->rdata, respectively.
+ *
+ * There are two tricky aspects to this: first, we need to pass
+ * skey and pkey *in* to the initial c_get on the secondary key,
+ * since either or both may be looked at by it (depending on the
+ * get flag). Second, we must not use a normal DB->get call
+ * on the secondary, even though that's what we want to accomplish,
+ * because the DB handle may be free-threaded. Instead,
+ * we open a cursor, then take steps to ensure that we actually use
+ * the rkey/rdata from the *secondary* cursor.
+ *
+ * We accomplish all this by passing in the DBTs we started out
+ * with to the c_get, but having swapped the contents of rskey and
+ * rkey, respectively, into rkey and rdata; __db_ret will treat
+ * them like the normal key/data pair in a c_get call, and will
+ * realloc them as need be (this is "step 1"). Then, for "step 2",
+ * we swap back rskey/rkey/rdata to normal, and do a get on the primary
+ * with the secondary dbc appointed as the owner of the returned-data
+ * memory.
+ *
+ * Note that in step 2, we copy the flags field in case we need to
+ * pass down a DB_DBT_PARTIAL or other flag that is compatible with
+ * letting DB do the memory management.
+ */
+ /* Step 1. */
+ save_rdata = dbc->rdata;
+ dbc->rdata = dbc->rkey;
+ dbc->rkey = dbc->rskey;
+
+ /*
+ * It is correct, though slightly sick, to attempt a partial get
+ * of a primary key. However, if we do so here, we'll never find the
+ * primary record; clear the DB_DBT_PARTIAL field of pkey just
+ * for the duration of the next call.
+ */
+ save_pkey_flags = pkey->flags;
+ F_CLR(pkey, DB_DBT_PARTIAL);
+
+ /*
+ * Now we can go ahead with the meat of this call. First, get the
+ * primary key from the secondary index. (What exactly we get depends
+ * on the flags, but the underlying cursor get will take care of the
+ * dirty work.)
+ */
+ if ((ret = dbc->c_real_get(dbc, skey, pkey, flags)) != 0) {
+ /* Restore rskey/rkey/rdata and return. */
+ pkey->flags = save_pkey_flags;
+ dbc->rskey = dbc->rkey;
+ dbc->rkey = dbc->rdata;
+ dbc->rdata = save_rdata;
+ goto err;
+ }
+
+ /* Restore pkey's flags in case we stomped the PARTIAL flag. */
+ pkey->flags = save_pkey_flags;
+
+ /*
+ * Restore the cursor's rskey, rkey, and rdata DBTs. If DB
+ * is handling the memory management, we now have newly
+ * reallocated buffers and ulens in rkey and rdata which we want
+ * to put in rskey and rkey. save_rdata contains the old value
+ * of dbc->rdata.
+ */
+ dbc->rskey = dbc->rkey;
+ dbc->rkey = dbc->rdata;
+ dbc->rdata = save_rdata;
+
+ /*
+ * Now we're ready for "step 2". If either or both of pkey and
+ * data do not have memory management flags set--that is, if DB is
+ * managing their memory--we need to swap around the rkey/rdata
+ * structures so that we don't wind up trying to use memory managed
+ * by the primary database cursor, which we'll close before we return.
+ *
+ * !!!
+ * If you're carefully following the bouncing ball, you'll note
+ * that in the DB-managed case, the buffer hanging off of pkey is
+ * the same as dbc->rkey->data. This is just fine; we may well
+ * realloc and stomp on it when we return, if we're doing a
+ * DB_GET_BOTH and need to return a different partial or key
+ * (depending on the comparison function), but this is safe.
+ *
+ * !!!
+ * We need to use __db_icursor here rather than simply calling
+ * pdbp->cursor, because otherwise, if we're in CDB, we'll
+ * allocate a new locker ID and leave ourselves open to deadlocks.
+ * (Even though we're only acquiring read locks, we'll still block
+ * if there are any waiters.)
+ */
+ if ((ret = __db_icursor(pdbp,
+ dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+ goto err;
+
+ /*
+ * We're about to use pkey a second time. If DB_DBT_MALLOC
+ * is set on it, we'll leak the memory we allocated the first time.
+ * Thus, set DB_DBT_REALLOC instead so that we reuse that memory
+ * instead of leaking it.
+ *
+ * !!!
+ * This assumes that the user must always specify a compatible
+ * realloc function if a malloc function is specified. I think
+ * this is a reasonable requirement.
+ */
+ if (F_ISSET(pkey, DB_DBT_MALLOC)) {
+ F_CLR(pkey, DB_DBT_MALLOC);
+ F_SET(pkey, DB_DBT_REALLOC);
+ pkeymalloc = 1;
+ }
+
+ /*
+ * Do the actual get. Set DBC_TRANSIENT since we don't care
+ * about preserving the position on error, and it's faster.
+ * SET_RET_MEM so that the secondary DBC owns any returned-data
+ * memory.
+ */
+ F_SET(pdbc, DBC_TRANSIENT);
+ SET_RET_MEM(pdbc, dbc);
+ ret = pdbc->c_get(pdbc, pkey, data, DB_SET);
+
+ /*
+ * If the item wasn't found in the primary, this is a bug;
+ * our secondary has somehow gotten corrupted, and contains
+ * elements that don't correspond to anything in the primary.
+ * Complain.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(pdbp);
+
+ /* Now close the primary cursor. */
+ t_ret = pdbc->c_close(pdbc);
+
+err: if (pkeymalloc) {
+ /*
+ * If pkey had a MALLOC flag, we need to restore it;
+ * otherwise, if the user frees the buffer but reuses
+ * the DBT without NULL'ing its data field or changing
+ * the flags, we may drop core.
+ */
+ F_CLR(pkey, DB_DBT_REALLOC);
+ F_SET(pkey, DB_DBT_MALLOC);
+ }
+ /* An error closing the primary cursor overrides any earlier ret. */
+ return (t_ret == 0 ? ret : t_ret);
+}
+
+/*
+ * __db_c_pget_recno --
+ * Perform a DB_GET_RECNO c_pget on a secondary index. Returns
+ * the secondary's record number in the pkey field and the primary's
+ * in the data field.
+ *
+ * For whichever of the two databases does not have DB_AM_RECNUM
+ * set, RECNO_OOB is copied into the corresponding DBT instead.
+ *
+ * sdbc is the cursor on the secondary; only the DB_RMW bit of flags
+ * is applied to the intermediate gets, while the full flags value is
+ * passed to the final get on the secondary.
+ *
+ * Returns 0 on success or the first non-zero return from any of the
+ * gets, the cursor open/close, or __db_retcopy.
+ */
+static int
+__db_c_pget_recno(sdbc, pkey, data, flags)
+ DBC *sdbc;
+ DBT *pkey, *data;
+ u_int32_t flags;
+{
+ DB *pdbp, *sdbp;
+ DB_ENV *dbenv;
+ DBC *pdbc;
+ DBT discardme, primary_key;
+ db_recno_t oob;
+ u_int32_t rmw;
+ int ret, t_ret;
+
+ sdbp = sdbc->dbp;
+ pdbp = sdbp->s_primary;
+ dbenv = sdbp->dbenv;
+ /* perr tests pdbc against NULL to learn whether the open succeeded. */
+ pdbc = NULL;
+ ret = t_ret = 0;
+
+ rmw = LF_ISSET(DB_RMW);
+
+ /*
+ * discardme is a zeroed DBT flagged USERMEM and PARTIAL; it is
+ * passed wherever a get requires a key or datum argument whose
+ * contents we have no use for.
+ */
+ memset(&discardme, 0, sizeof(DBT));
+ F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ oob = RECNO_OOB;
+
+ /*
+ * If the primary is an rbtree, we want its record number, whether
+ * or not the secondary is one too. Fetch the recno into "data".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "data".
+ */
+ if (F_ISSET(pdbp, DB_AM_RECNUM)) {
+ /*
+ * Get the primary key, so we can find the record number
+ * in the primary. (We're uninterested in the secondary key.)
+ */
+ memset(&primary_key, 0, sizeof(DBT));
+ F_SET(&primary_key, DB_DBT_MALLOC);
+ if ((ret = sdbc->c_real_get(sdbc,
+ &discardme, &primary_key, rmw | DB_CURRENT)) != 0)
+ return (ret);
+
+ /*
+ * Open a cursor on the primary, set it to the right record,
+ * and fetch its recno into "data".
+ *
+ * (See __db_c_pget for a comment on the use of __db_icursor.)
+ *
+ * SET_RET_MEM so that the secondary DBC owns any returned-data
+ * memory.
+ */
+ if ((ret = __db_icursor(pdbp, sdbc->txn,
+ pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+ goto perr;
+ SET_RET_MEM(pdbc, sdbc);
+ if ((ret = pdbc->c_get(pdbc,
+ &primary_key, &discardme, rmw | DB_SET)) != 0)
+ goto perr;
+
+ ret = pdbc->c_get(pdbc, &discardme, data, rmw | DB_GET_RECNO);
+
+ /*
+ * primary_key was fetched with DB_DBT_MALLOC, so its buffer is
+ * ours to release. A close failure is reported only if nothing
+ * failed earlier.
+ */
+perr: __os_ufree(sdbp->dbenv, primary_key.data);
+ if (pdbc != NULL &&
+ (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ } else if ((ret = __db_retcopy(dbenv, data, &oob,
+ sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0)
+ return (ret);
+
+ /*
+ * If the secondary is an rbtree, we want its record number, whether
+ * or not the primary is one too. Fetch the recno into "pkey".
+ *
+ * If it's not an rbtree, return RECNO_OOB in "pkey".
+ *
+ * NOTE(review): "data" above is copied through the cursor's rkey
+ * buffer and "pkey" below through its rdata buffer -- confirm this
+ * pairing is intended rather than transposed.
+ */
+ if (F_ISSET(sdbp, DB_AM_RECNUM))
+ return (sdbc->c_real_get(sdbc, &discardme, pkey, flags));
+ else
+ return (__db_retcopy(dbenv, pkey, &oob,
+ sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen));
+}
+
+/*
* __db_wrlock_err -- do not have a write lock.
*/
static int
@@ -972,3 +1914,373 @@ __db_wrlock_err(dbenv)
__db_err(dbenv, "Write attempted on read-only cursor");
return (EPERM);
}
+
+/*
+ * __db_c_del_secondary --
+ *	Perform a delete operation on a secondary index: call through
+ *	to the primary and delete the primary record that this record
+ *	points to.
+ *
+ *	Note that deleting the primary record will call c_del on all
+ *	the secondaries, including this one; thus, it is not necessary
+ *	to execute both this function and an actual delete.
+ *
+ *	dbc is a cursor on the secondary, positioned on the item to
+ *	delete.
+ *
+ *	Returns 0 on success.  On failure, returns the first error
+ *	encountered: from reading the current secondary item, opening
+ *	the primary cursor, the primary get/delete (DB_NOTFOUND from the
+ *	primary get is reported via __db_secondary_corrupt), or, if all
+ *	of those succeeded, from closing the primary cursor.
+ */
+static int
+__db_c_del_secondary(dbc)
+	DBC *dbc;
+{
+	DB *pdbp;
+	DBC *pdbc;
+	DBT skey, pkey;
+	int ret, t_ret;
+
+	memset(&skey, 0, sizeof(DBT));
+	memset(&pkey, 0, sizeof(DBT));
+
+	/*
+	 * Get the current item that we're pointing at.
+	 * We don't actually care about the secondary key, just
+	 * the primary.
+	 */
+	F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+	if ((ret = dbc->c_real_get(dbc,
+	    &skey, &pkey, DB_CURRENT)) != 0)
+		return (ret);
+
+	/*
+	 * Create a cursor on the primary with our locker ID,
+	 * so that when it calls back, we don't conflict.
+	 *
+	 * We create a cursor explicitly because there's no
+	 * way to specify the same locker ID if we're using
+	 * locking but not transactions if we use the DB->del
+	 * interface.  This shouldn't be any less efficient
+	 * anyway.
+	 */
+	pdbp = dbc->dbp->s_primary;
+	if ((ret = __db_icursor(pdbp, dbc->txn,
+	    pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+		return (ret);
+
+	/*
+	 * See comment in __db_c_put--if we're in CDB,
+	 * we already hold the locks we need, and we need to flag
+	 * the cursor as a WRITER so we don't run into errors
+	 * when we try to delete.
+	 */
+	if (CDB_LOCKING(pdbp->dbenv)) {
+		DB_ASSERT(pdbc->mylock.off == LOCK_INVALID);
+		F_SET(pdbc, DBC_WRITER);
+	}
+
+	/*
+	 * Set the new cursor to the correct primary key.  Then
+	 * delete it.  We don't really care about the datum;
+	 * just reuse our skey DBT.
+	 *
+	 * If the primary get returns DB_NOTFOUND, something is amiss--
+	 * every record in the secondary should correspond to some record
+	 * in the primary.
+	 */
+	if ((ret = pdbc->c_get(pdbc, &pkey, &skey,
+	    (STD_LOCKING(dbc) ? DB_RMW : 0) | DB_SET)) == 0)
+		ret = pdbc->c_del(pdbc, 0);
+	else if (ret == DB_NOTFOUND)
+		ret = __db_secondary_corrupt(pdbp);
+
+	/*
+	 * Always close the primary cursor.  Report a close failure only
+	 * when nothing failed earlier, so that the original error is
+	 * preserved and a close error after a successful delete is not
+	 * silently dropped.
+	 */
+	if ((t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_c_del_primary --
+ *	Perform a delete operation on a primary index.  Loop through
+ *	all the secondary indices which correspond to this primary
+ *	database, and delete any secondary keys that point at the current
+ *	record.
+ *
+ *	dbc is a cursor on the primary, positioned on the record whose
+ *	secondary entries are to be removed.
+ *
+ *	Returns 0 on success, otherwise the first error encountered.  An
+ *	error from releasing the in-progress secondary handle is reported
+ *	only if nothing failed earlier.
+ *
+ * PUBLIC: int __db_c_del_primary __P((DBC *));
+ */
+int
+__db_c_del_primary(dbc)
+	DBC *dbc;
+{
+	DB *dbp, *sdbp;
+	DBC *sdbc;
+	DBT data, pkey, skey, temp;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+
+	/*
+	 * If we're called at all, we have at least one secondary.
+	 * (Unfortunately, we can't assert this without grabbing the mutex.)
+	 * Get the current record so that we can construct appropriate
+	 * secondary keys as needed.
+	 */
+	memset(&pkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = dbc->c_get(dbc, &pkey, &data, DB_CURRENT)) != 0)
+		return (ret);
+
+	for (sdbp = __db_s_first(dbp);
+	    sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) {
+		/*
+		 * Get the secondary key for this secondary and the current
+		 * item.
+		 */
+		memset(&skey, 0, sizeof(DBT));
+		if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) {
+			/*
+			 * If the current item isn't in this index, we
+			 * have no work to do.  Proceed.
+			 */
+			if (ret == DB_DONOTINDEX)
+				continue;
+
+			/* We had a substantive error.  Bail. */
+			FREE_IF_NEEDED(sdbp, &skey);
+			goto done;
+		}
+
+		/* Open a secondary cursor. */
+		if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type,
+		    PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) {
+			/*
+			 * The callback may have handed us an allocated
+			 * key; release it here as every other exit from
+			 * this iteration does, or it is leaked.
+			 */
+			FREE_IF_NEEDED(sdbp, &skey);
+			goto done;
+		}
+		/* See comment above and in __db_c_put. */
+		if (CDB_LOCKING(sdbp->dbenv)) {
+			DB_ASSERT(sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+		}
+
+		/*
+		 * Set the secondary cursor to the appropriate item.
+		 * Delete it.
+		 *
+		 * We want to use DB_RMW if locking is on; it's only
+		 * legal then, though.
+		 *
+		 * !!!
+		 * Don't stomp on any callback-allocated buffer in skey
+		 * when we do a c_get(DB_GET_BOTH); use a temp DBT instead.
+		 */
+		memset(&temp, 0, sizeof(DBT));
+		temp.data = skey.data;
+		temp.size = skey.size;
+		if ((ret = sdbc->c_real_get(sdbc, &temp, &pkey,
+		    (STD_LOCKING(dbc) ? DB_RMW : 0) | DB_GET_BOTH)) == 0)
+			ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY);
+
+		FREE_IF_NEEDED(sdbp, &skey);
+
+		/* Close the cursor regardless; keep the earlier error. */
+		if ((t_ret = sdbc->c_close(sdbc)) != 0 || ret != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			goto done;
+		}
+	}
+
+	/*
+	 * If we left the loop early, sdbp still holds the reference that
+	 * __db_s_first/__db_s_next took on our behalf; drop it.
+	 */
+done:	if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0)
+		return (t_ret);
+	return (ret);
+}
+
+/*
+ * __db_s_first --
+ * Get the first secondary, if any are present, from the primary.
+ *
+ * PUBLIC: DB *__db_s_first __P((DB *));
+ */
+DB *
+__db_s_first(pdbp)
+	DB *pdbp;
+{
+	DB *head;
+
+	/*
+	 * Read the list head and take our reference on it inside a
+	 * single critical section; see __db_s_next for the protocol.
+	 */
+	MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp);
+	if ((head = LIST_FIRST(&pdbp->s_secondaries)) != NULL)
+		head->s_refcnt++;
+	MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp);
+
+	return (head);
+}
+
+/*
+ * __db_s_next --
+ * Get the next secondary in the list.
+ *
+ * PUBLIC: int __db_s_next __P((DB **));
+ */
+int
+__db_s_next(sdbpp)
+	DB **sdbpp;
+{
+	DB *cur, *next, *primary;
+	int last_ref;
+
+	/*
+	 * Each primary DB handle keeps its secondary indices on the
+	 * linked list s_secondaries.  When the primary is free-threaded,
+	 * that list may be walked or changed only with the primary's
+	 * thread mutex held.
+	 *
+	 * We deliberately do not hold the mutex across all the secondary
+	 * puts that one primary put requires: each secondary put takes
+	 * roughly as long as the primary's and may do I/O, so doing that
+	 * would effectively single-thread every put on the handle.
+	 * Instead the mutex is held just long enough to step over one
+	 * link, and is dropped before the secondary put itself.
+	 *
+	 * The one hazard is a thread legitimately closing a secondary
+	 * while another thread is mid-put and about to update that same
+	 * secondary.  Secondary handles are therefore refcounted: a
+	 * close issued while we are using the handle merely drops the
+	 * count, and the real close becomes our job, done below.
+	 */
+	cur = *sdbpp;
+	primary = cur->s_primary;
+
+	MUTEX_THREAD_LOCK(primary->dbenv, primary->mutexp);
+	DB_ASSERT(cur->s_refcnt != 0);
+	last_ref = (--cur->s_refcnt == 0);
+	if (last_ref) {
+		LIST_REMOVE(cur, s_links);
+	}
+	next = LIST_NEXT(cur, s_links);
+	if (next != NULL)
+		next->s_refcnt++;
+	MUTEX_THREAD_UNLOCK(primary->dbenv, primary->mutexp);
+
+	*sdbpp = next;
+
+	if (!last_ref)
+		return (0);
+
+	/* DB->close() is a wrapper, so go straight to __db_close. */
+	return (__db_close(cur, 0));
+}
+
+/*
+ * __db_s_done --
+ * Properly decrement the refcount on a secondary database handle we're
+ * using, without calling __db_s_next.
+ *
+ * PUBLIC: int __db_s_done __P((DB *));
+ */
+int
+__db_s_done(sdbp)
+	DB *sdbp;
+{
+	DB *primary;
+	int last_ref;
+
+	primary = sdbp->s_primary;
+
+	/* Drop our reference; unlink the handle if it was the last one. */
+	MUTEX_THREAD_LOCK(primary->dbenv, primary->mutexp);
+	DB_ASSERT(sdbp->s_refcnt != 0);
+	last_ref = (--sdbp->s_refcnt == 0);
+	if (last_ref) {
+		LIST_REMOVE(sdbp, s_links);
+	}
+	MUTEX_THREAD_UNLOCK(primary->dbenv, primary->mutexp);
+
+	/* The close itself happens after the mutex is released. */
+	if (!last_ref)
+		return (0);
+	return (__db_close(sdbp, 0));
+}
+
+/*
+ * __db_buildpartial --
+ * Build the record that will result after a partial put is applied to
+ * an existing record.
+ *
+ * This should probably be merged with __bam_build, but that requires
+ * a little trickery if we plan to keep the overflow-record optimization
+ * in that function.
+ */
+static int
+__db_buildpartial(dbp, oldrec, partial, newrec)
+	DB *dbp;
+	DBT *oldrec, *partial, *newrec;
+{
+	u_int8_t *rec;
+	u_int32_t head, newsize, replaced_end;
+	int fill, ret;
+
+	DB_ASSERT(F_ISSET(partial, DB_DBT_PARTIAL));
+
+	/* Start from a clean DBT sized for the record after the put. */
+	memset(newrec, 0, sizeof(DBT));
+	newsize = __db_partsize(oldrec->size, partial);
+	newrec->size = newsize;
+
+	if ((ret = __os_malloc(dbp->dbenv, newsize, &rec)) != 0)
+		return (ret);
+	newrec->data = rec;
+
+	/*
+	 * Pre-fill the whole buffer so any bytes neither source supplies
+	 * hold the pad byte (fixed-length databases) or nul.
+	 */
+	if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+		fill = ((BTREE *)dbp->bt_internal)->re_pad;
+	else
+		fill = 0;
+	memset(rec, fill, newsize);
+
+	/* Leading bytes of the old record, up to where the put begins. */
+	head = partial->doff;
+	if (head > oldrec->size)
+		head = oldrec->size;
+	memcpy(rec, oldrec->data, head);
+
+	/* The bytes supplied by the partial put. */
+	memcpy(rec + partial->doff, partial->data, partial->size);
+
+	/* Whatever the old record holds beyond the replaced range. */
+	replaced_end = partial->doff + partial->dlen;
+	if (replaced_end < oldrec->size)
+		memcpy(rec + partial->doff + partial->size,
+		    (u_int8_t *)oldrec->data + replaced_end,
+		    oldrec->size - replaced_end);
+
+	return (0);
+}
+
+/*
+ * __db_partsize --
+ * Given the number of bytes in an existing record and a DBT that
+ * is about to be partial-put, calculate the size of the record
+ * after the put.
+ *
+ * This code is called from __bam_partsize.
+ *
+ * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *));
+ */
+u_int32_t
+__db_partsize(nbytes, data)
+	u_int32_t nbytes;
+	DBT *data;
+{
+	u_int32_t replaced_end;
+
+	/* One past the last byte offset the partial put replaces. */
+	replaced_end = data->doff + data->dlen;
+
+	/*
+	 * Every replaced byte exists in the old record: the result is
+	 * the old size (nbytes), less the bytes replaced (dlen), plus
+	 * the bytes added (size).
+	 */
+	if (nbytes >= replaced_end)
+		return (nbytes + data->size - data->dlen);
+
+	/*
+	 * Some replaced bytes lie past the end of the old record, so how
+	 * many we replace no longer matters: the record ends where the
+	 * new bytes end, i.e. their starting offset (doff) plus their
+	 * length (size).
+	 */
+	return (data->doff + data->size);
+}
diff --git a/bdb/db/db_conv.c b/bdb/db/db_conv.c
index df60be06790..f731c82d85e 100644
--- a/bdb/db/db_conv.c
+++ b/bdb/db/db_conv.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -40,7 +40,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_conv.c,v 11.11 2000/11/30 00:58:31 ubell Exp $";
+static const char revid[] = "$Id: db_conv.c,v 11.38 2002/08/15 03:00:13 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -50,12 +50,14 @@ static const char revid[] = "$Id: db_conv.c,v 11.11 2000/11/30 00:58:31 ubell Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_swap.h"
-#include "db_am.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
/*
* __db_pgin --
@@ -70,15 +72,135 @@ __db_pgin(dbenv, pg, pp, cookie)
void *pp;
DBT *cookie;
{
+ DB dummydb, *dbp;
DB_PGINFO *pginfo;
+ DB_CIPHER *db_cipher;
+ DB_LSN not_used;
+ PAGE *pagep;
+ size_t pg_off, pg_len, sum_len;
+ int is_hmac, ret;
+ u_int8_t *chksum, *iv;
pginfo = (DB_PGINFO *)cookie->data;
+ pagep = (PAGE *)pp;
- switch (((PAGE *)pp)->type) {
- case P_HASH:
+ ret = is_hmac = 0;
+ chksum = iv = NULL;
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dummydb.flags = pginfo->flags;
+ db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+ switch (pagep->type) {
case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * If checksumming is set on the meta-page, we must set
+ * it in the dbp.
+ */
+ if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM))
+ F_SET(dbp, DB_AM_CHKSUM);
+ if (((DBMETA *)pp)->encrypt_alg != 0 ||
+ F_ISSET(dbp, DB_AM_ENCRYPT))
+ is_hmac = 1;
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pp)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ /*
+ * We assume that we've read a file hole if we have
+ * a zero LSN, zero page number and P_INVALID. Otherwise
+ * we have an invalid page that might contain real data.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) {
+ sum_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = pginfo->db_pagesize;
+ /*
+ * When reading in a non-meta page, we are using hmac
+ * whenever we have a db_cipher.
+ */
+ is_hmac = CRYPTO_ON(dbenv) ? 1 : 0;
+ break;
+ }
+
+ /*
+ * We expect a checksum error if there was a configuration problem.
+ * If there is no configuration problem and we don't get a match,
+ * it's fatal: panic the system.
+ */
+ if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0)
+ switch (ret = __db_check_chksum(
+ dbenv, db_cipher, chksum, pp, sum_len, is_hmac)) {
+ case 0:
+ break;
+ case -1:
+ if (DBENV_LOGGING(dbenv))
+ __db_cksum_log(
+ dbenv, NULL, &not_used, DB_FLUSH);
+ __db_err(dbenv,
+ "checksum error: catastrophic recovery required");
+ return (__db_panic(dbenv, DB_RUNRECOVERY));
+ default:
+ return (ret);
+ }
+
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ DB_ASSERT(db_cipher != NULL);
+ DB_ASSERT(F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(db_cipher->adj_size(pg_off) == 0);
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pp)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ case P_INVALID:
+ if (IS_ZERO_LSN(LSN(pagep)) &&
+ pagep->pgno == PGNO_INVALID) {
+ pg_len = 0;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = pginfo->db_pagesize;
+ break;
+ }
+ if (pg_len != 0 && (ret = db_cipher->decrypt(dbenv,
+ db_cipher->data, iv, ((u_int8_t *)pagep) + pg_off,
+ pg_len - pg_off)) != 0)
+ return (ret);
+ }
+ switch (pagep->type) {
case P_INVALID:
- return (__ham_pgin(dbenv, pg, pp, cookie));
+ if (pginfo->type == DB_QUEUE)
+ return (__qam_pgin_out(dbenv, pg, pp, cookie));
+ else
+ return (__ham_pgin(dbenv, dbp, pg, pp, cookie));
+ case P_HASH:
+ case P_HASHMETA:
+ return (__ham_pgin(dbenv, dbp, pg, pp, cookie));
case P_BTREEMETA:
case P_IBTREE:
case P_IRECNO:
@@ -86,14 +208,14 @@ __db_pgin(dbenv, pg, pp, cookie)
case P_LDUP:
case P_LRECNO:
case P_OVERFLOW:
- return (__bam_pgin(dbenv, pg, pp, cookie));
+ return (__bam_pgin(dbenv, dbp, pg, pp, cookie));
case P_QAMMETA:
case P_QAMDATA:
return (__qam_pgin_out(dbenv, pg, pp, cookie));
default:
break;
}
- return (__db_unknown_type(dbenv, "__db_pgin", ((PAGE *)pp)->type));
+ return (__db_pgfmt(dbenv, pg));
}
/*
@@ -109,15 +231,33 @@ __db_pgout(dbenv, pg, pp, cookie)
void *pp;
DBT *cookie;
{
+ DB dummydb, *dbp;
+ DB_CIPHER *db_cipher;
DB_PGINFO *pginfo;
+ PAGE *pagep;
+ size_t pg_off, pg_len, sum_len;
+ int ret;
+ u_int8_t *chksum, *iv, *key;
pginfo = (DB_PGINFO *)cookie->data;
+ pagep = (PAGE *)pp;
- switch (((PAGE *)pp)->type) {
+ chksum = iv = key = NULL;
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dummydb.flags = pginfo->flags;
+ ret = 0;
+ switch (pagep->type) {
+ case P_INVALID:
+ if (pginfo->type == DB_QUEUE)
+ ret = __qam_pgin_out(dbenv, pg, pp, cookie);
+ else
+ ret = __ham_pgout(dbenv, dbp, pg, pp, cookie);
+ break;
case P_HASH:
case P_HASHMETA:
- case P_INVALID:
- return (__ham_pgout(dbenv, pg, pp, cookie));
+ ret = __ham_pgout(dbenv, dbp, pg, pp, cookie);
+ break;
case P_BTREEMETA:
case P_IBTREE:
case P_IRECNO:
@@ -125,14 +265,73 @@ __db_pgout(dbenv, pg, pp, cookie)
case P_LDUP:
case P_LRECNO:
case P_OVERFLOW:
- return (__bam_pgout(dbenv, pg, pp, cookie));
+ ret = __bam_pgout(dbenv, dbp, pg, pp, cookie);
+ break;
case P_QAMMETA:
case P_QAMDATA:
- return (__qam_pgin_out(dbenv, pg, pp, cookie));
- default:
+ ret = __qam_pgin_out(dbenv, pg, pp, cookie);
break;
+ default:
+ return (__db_pgfmt(dbenv, pg));
+ }
+ if (ret)
+ return (ret);
+
+ db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+
+ DB_ASSERT(db_cipher != NULL);
+ DB_ASSERT(F_ISSET(dbp, DB_AM_CHKSUM));
+
+ pg_off = P_OVERHEAD(dbp);
+ DB_ASSERT(db_cipher->adj_size(pg_off) == 0);
+
+ key = db_cipher->mac_key;
+
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the iv
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ iv = ((BTMETA *)pp)->iv;
+ pg_len = DBMETASIZE;
+ break;
+ default:
+ iv = P_IV(dbp, pagep);
+ pg_len = pginfo->db_pagesize;
+ break;
+ }
+ if ((ret = db_cipher->encrypt(dbenv, db_cipher->data,
+ iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0)
+ return (ret);
+ }
+ if (F_ISSET(dbp, DB_AM_CHKSUM)) {
+ switch (pagep->type) {
+ case P_HASHMETA:
+ case P_BTREEMETA:
+ case P_QAMMETA:
+ /*
+ * !!!
+ * For all meta pages it is required that the chksum
+ * be at the same location. Use BTMETA to get to it
+ * for any meta type.
+ */
+ chksum = ((BTMETA *)pp)->chksum;
+ sum_len = DBMETASIZE;
+ break;
+ default:
+ chksum = P_CHKSUM(dbp, pagep);
+ sum_len = pginfo->db_pagesize;
+ break;
+ }
+ __db_chksum(pp, sum_len, key, chksum);
}
- return (__db_unknown_type(dbenv, "__db_pgout", ((PAGE *)pp)->type));
+ return (0);
}
/*
@@ -169,11 +368,13 @@ __db_metaswap(pg)
* __db_byteswap --
* Byteswap a page.
*
- * PUBLIC: int __db_byteswap __P((DB_ENV *, db_pgno_t, PAGE *, size_t, int));
+ * PUBLIC: int __db_byteswap
+ * PUBLIC: __P((DB_ENV *, DB *, db_pgno_t, PAGE *, size_t, int));
*/
int
-__db_byteswap(dbenv, pg, h, pagesize, pgin)
+__db_byteswap(dbenv, dbp, pg, h, pagesize, pgin)
DB_ENV *dbenv;
+ DB *dbp;
db_pgno_t pg;
PAGE *h;
size_t pagesize;
@@ -183,11 +384,12 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
BKEYDATA *bk;
BOVERFLOW *bo;
RINTERNAL *ri;
- db_indx_t i, len, tmp;
+ db_indx_t i, *inp, len, tmp;
u_int8_t *p, *end;
COMPQUIET(pg, 0);
+ inp = P_INP(dbp, h);
if (pgin) {
M_32_SWAP(h->lsn.file);
M_32_SWAP(h->lsn.offset);
@@ -202,14 +404,14 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
case P_HASH:
for (i = 0; i < NUM_ENT(h); i++) {
if (pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
- switch (HPAGE_TYPE(h, i)) {
+ switch (HPAGE_TYPE(dbp, h, i)) {
case H_KEYDATA:
break;
case H_DUPLICATE:
- len = LEN_HKEYDATA(h, pagesize, i);
- p = HKEYDATA_DATA(P_ENTRY(h, i));
+ len = LEN_HKEYDATA(dbp, h, pagesize, i);
+ p = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
for (end = p + len; p < end;) {
if (pgin) {
P_16_SWAP(p);
@@ -226,11 +428,11 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
}
break;
case H_OFFDUP:
- p = HOFFPAGE_PGNO(P_ENTRY(h, i));
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
SWAP32(p); /* pgno */
break;
case H_OFFPAGE:
- p = HOFFPAGE_PGNO(P_ENTRY(h, i));
+ p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
SWAP32(p); /* pgno */
SWAP32(p); /* tlen */
break;
@@ -246,14 +448,14 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
*/
if (!pgin)
for (i = 0; i < NUM_ENT(h); i++)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
break;
case P_LBTREE:
case P_LDUP:
case P_LRECNO:
for (i = 0; i < NUM_ENT(h); i++) {
if (pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
/*
* In the case of on-page duplicates, key information
@@ -261,17 +463,17 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
*/
if (h->type == P_LBTREE && i > 1) {
if (pgin) {
- if (h->inp[i] == h->inp[i - 2])
+ if (inp[i] == inp[i - 2])
continue;
} else {
- M_16_SWAP(h->inp[i]);
- if (h->inp[i] == h->inp[i - 2])
+ M_16_SWAP(inp[i]);
+ if (inp[i] == inp[i - 2])
continue;
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
}
}
- bk = GET_BKEYDATA(h, i);
+ bk = GET_BKEYDATA(dbp, h, i);
switch (B_TYPE(bk->type)) {
case B_KEYDATA:
M_16_SWAP(bk->len);
@@ -285,15 +487,15 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
}
if (!pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
}
break;
case P_IBTREE:
for (i = 0; i < NUM_ENT(h); i++) {
if (pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
- bi = GET_BINTERNAL(h, i);
+ bi = GET_BINTERNAL(dbp, h, i);
M_16_SWAP(bi->len);
M_32_SWAP(bi->pgno);
M_32_SWAP(bi->nrecs);
@@ -310,20 +512,20 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
}
if (!pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
}
break;
case P_IRECNO:
for (i = 0; i < NUM_ENT(h); i++) {
if (pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
- ri = GET_RINTERNAL(h, i);
+ ri = GET_RINTERNAL(dbp, h, i);
M_32_SWAP(ri->pgno);
M_32_SWAP(ri->nrecs);
if (!pgin)
- M_16_SWAP(h->inp[i]);
+ M_16_SWAP(inp[i]);
}
break;
case P_OVERFLOW:
@@ -331,7 +533,7 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin)
/* Nothing to do. */
break;
default:
- return (__db_unknown_type(dbenv, "__db_byteswap", h->type));
+ return (__db_pgfmt(dbenv, pg));
}
if (!pgin) {
diff --git a/bdb/db/db_dispatch.c b/bdb/db/db_dispatch.c
index c9beac401a7..2cf29ec2f33 100644
--- a/bdb/db/db_dispatch.c
+++ b/bdb/db/db_dispatch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -39,7 +39,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bostic Exp $";
+static const char revid[] = "$Id: db_dispatch.c,v 11.121 2002/09/07 17:36:31 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -51,16 +51,24 @@ static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bost
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_dispatch.h"
-#include "db_am.h"
-#include "log_auto.h"
-#include "txn.h"
-#include "txn_auto.h"
-#include "log.h"
-
-static int __db_txnlist_find_internal __P((void *, db_txnlist_type,
- u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/fop.h"
+#include "dbinc/rep.h"
+#include "dbinc/txn.h"
+
+static int __db_limbo_fix __P((DB *,
+ DB_TXN *, DB_TXNLIST *, db_pgno_t *, DBMETA *));
+static int __db_limbo_bucket __P((DB_ENV *, DB_TXN *, DB_TXNLIST *));
+static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
+static int __db_lock_move __P((DB_ENV *,
+ u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
+static int __db_default_getpgnos __P((DB_ENV *, DB_LSN *lsnp, void *));
+static int __db_txnlist_find_internal __P((DB_ENV *, void *, db_txnlist_type,
+ u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
+static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
+ int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
/*
* __db_dispatch --
@@ -71,16 +79,21 @@ static int __db_txnlist_find_internal __P((void *, db_txnlist_type,
* scripts in the tools directory). An application using a different
* recovery paradigm will supply a different dispatch function to txn_open.
*
- * PUBLIC: int __db_dispatch __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ * PUBLIC: int __db_dispatch __P((DB_ENV *,
+ * PUBLIC: int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
+ * PUBLIC: size_t, DBT *, DB_LSN *, db_recops, void *));
*/
int
-__db_dispatch(dbenv, db, lsnp, redo, info)
+__db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
DB_ENV *dbenv; /* The environment. */
+ int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t dtabsize; /* Size of the dtab. */
DBT *db; /* The log record upon which to dispatch. */
DB_LSN *lsnp; /* The lsn of the record being dispatched. */
db_recops redo; /* Redo this op (or undo it). */
void *info;
{
+ DB_LSN prev_lsn;
u_int32_t rectype, txnid;
int make_call, ret;
@@ -88,6 +101,9 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
make_call = ret = 0;
+ /* If we don't have a dispatch table, it's hard to dispatch. */
+ DB_ASSERT(dtab != NULL);
+
/*
* If we find a record that is in the user's number space and they
* have specified a recovery routine, let them handle it. If they
@@ -96,17 +112,29 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
*/
switch (redo) {
case DB_TXN_ABORT:
- /*
- * XXX
- * db_printlog depends on DB_TXN_ABORT not examining the TXN
- * list. If that ever changes, fix db_printlog too.
- */
+ case DB_TXN_APPLY:
+ case DB_TXN_PRINT:
make_call = 1;
break;
case DB_TXN_OPENFILES:
- if (rectype == DB_log_register)
- return (dbenv->dtab[rectype](dbenv,
- db, lsnp, redo, info));
+ /*
+ * We collect all the transactions that have
+ * "begin" records, those with no previous LSN,
+ * so that we do not abort partial transactions.
+ * These are known to be undone, otherwise the
+ * log would not have been freeable.
+ */
+ memcpy(&prev_lsn, (u_int8_t *)db->data +
+ sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
+ if (txnid != 0 && prev_lsn.file == 0 && (ret =
+ __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
+ return (ret);
+
+ /* FALLTHROUGH */
+ case DB_TXN_POPENFILES:
+ if (rectype == DB___dbreg_register ||
+ rectype == DB___txn_ckp || rectype == DB___txn_recycle)
+ return (dtab[rectype](dbenv, db, lsnp, redo, info));
break;
case DB_TXN_BACKWARD_ROLL:
/*
@@ -117,43 +145,146 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
* we've never seen it, then we call the appropriate recovery
* routine.
*
- * We need to always undo DB_db_noop records, so that we
+ * We need to always undo DB___db_noop records, so that we
* properly handle any aborts before the file was closed.
*/
- if (rectype == DB_log_register ||
- rectype == DB_txn_ckp || rectype == DB_db_noop
- || rectype == DB_txn_child || (txnid != 0 &&
- (ret = __db_txnlist_find(info, txnid)) != 0)) {
+ switch(rectype) {
+ case DB___txn_regop:
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
+ case DB___fop_file_remove:
+ case DB___txn_child:
make_call = 1;
- if (ret == DB_NOTFOUND && rectype != DB_txn_regop &&
- rectype != DB_txn_xa_regop && (ret =
- __db_txnlist_add(dbenv, info, txnid, 1)) != 0)
- return (ret);
+ break;
+
+ case DB___dbreg_register:
+ if (txnid == 0) {
+ make_call = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ if (txnid != 0 && (ret =
+ __db_txnlist_find(dbenv,
+ info, txnid)) != TXN_COMMIT && ret != TXN_IGNORE) {
+ /*
+ * If not found, then this is an incomplete
+ * abort.
+ */
+ if (ret == TXN_NOTFOUND)
+ return (__db_txnlist_add(dbenv,
+ info, txnid, TXN_IGNORE, lsnp));
+ make_call = 1;
+ if (ret == TXN_OK &&
+ (ret = __db_txnlist_update(dbenv,
+ info, txnid,
+ rectype == DB___txn_xa_regop ?
+ TXN_PREPARE : TXN_ABORT, NULL)) != 0)
+ return (ret);
+ }
}
break;
case DB_TXN_FORWARD_ROLL:
/*
* In the forward pass, if we haven't seen the transaction,
- * do nothing, else recovery it.
+ * do nothing, else recover it.
*
- * We need to always redo DB_db_noop records, so that we
+ * We need to always redo DB___db_noop records, so that we
* properly handle any commits after the file was closed.
*/
- if (rectype == DB_log_register ||
- rectype == DB_txn_ckp ||
- rectype == DB_db_noop ||
- __db_txnlist_find(info, txnid) == 0)
+ switch(rectype) {
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
make_call = 1;
+ break;
+
+ default:
+ if (txnid != 0 && (ret = __db_txnlist_find(dbenv,
+ info, txnid)) == TXN_COMMIT)
+ make_call = 1;
+ else if (ret != TXN_IGNORE &&
+ (rectype == DB___ham_metagroup ||
+ rectype == DB___ham_groupalloc ||
+ rectype == DB___db_pg_alloc)) {
+ /*
+ * Because we cannot undo file extensions
+ * all allocation records must be reprocessed
+ * during rollforward in case the file was
+ * just created. It may not have been
+ * present during the backward pass.
+ */
+ make_call = 1;
+ redo = DB_TXN_BACKWARD_ALLOC;
+ } else if (rectype == DB___dbreg_register) {
+ /*
+ * This may be a transaction dbreg_register.
+ * If it is, we only make the call on a COMMIT,
+ * which we checked above. If it's not, then we
+ * should always make the call, because we need
+ * the file open information.
+ */
+ if (txnid == 0)
+ make_call = 1;
+ }
+ }
break;
+ case DB_TXN_GETPGNOS:
+ /*
+ * If this is one of DB's own log records, we simply
+ * dispatch.
+ */
+ if (rectype < DB_user_BEGIN) {
+ make_call = 1;
+ break;
+ }
+
+ /*
+ * If we're still here, this is a custom record in an
+ * application that's doing app-specific logging. Such a
+ * record doesn't have a getpgno function for the user
+ * dispatch function to call--the getpgnos functions return
+ * which pages replication needs to lock using the TXN_RECS
+ * structure, which is private and not something we want to
+ * document.
+ *
+ * Thus, we leave any necessary locking for the app's
+ * recovery function to do during the upcoming
+ * DB_TXN_APPLY. Fill in default getpgnos info (we need
+ * a stub entry for every log record that will get
+ * DB_TXN_APPLY'd) and return success.
+ */
+ return (__db_default_getpgnos(dbenv, lsnp, info));
default:
return (__db_unknown_flag(dbenv, "__db_dispatch", redo));
}
+ /*
+ * The switch statement uses ret to receive the return value of
+ * __db_txnlist_find, which returns a large number of different
+ * statuses, none of which we will be returning. For safety,
+ * let's reset this here in case we ever do a "return(ret)"
+ * below in the future.
+ */
+ ret = 0;
if (make_call) {
- if (rectype >= DB_user_BEGIN && dbenv->tx_recover != NULL)
- return (dbenv->tx_recover(dbenv, db, lsnp, redo));
- else
- return (dbenv->dtab[rectype](dbenv, db, lsnp, redo, info));
+ if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
+ return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
+ else {
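+ /*
+ * The dtab argument is the same size as the
+ * standard table, so the standard table's size
+ * serves as our sanity check.
+ *
+ * NOTE(review): dtabsize appears to be an element
+ * count, so rectype == dtabsize would index one
+ * past the end; the test below likely wants ">=".
+ */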
+ if (rectype > dtabsize || dtab[rectype] == NULL) {
+ __db_err(dbenv,
+ "Illegal record type %lu in log",
+ (u_long)rectype);
+ return (EINVAL);
+ }
+ return (dtab[rectype](dbenv, db, lsnp, redo, info));
+ }
}
return (0);
@@ -163,75 +294,100 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
* __db_add_recovery --
*
* PUBLIC: int __db_add_recovery __P((DB_ENV *,
- * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ * PUBLIC: int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
*/
int
-__db_add_recovery(dbenv, func, ndx)
+__db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
DB_ENV *dbenv;
+ int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t *dtabsize;
int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
u_int32_t ndx;
{
- u_int32_t i, nsize;
+ size_t i, nsize;
int ret;
/* Check if we have to grow the table. */
- if (ndx >= dbenv->dtab_size) {
+ if (ndx >= *dtabsize) {
nsize = ndx + 40;
- if ((ret = __os_realloc(dbenv,
- nsize * sizeof(dbenv->dtab[0]), NULL, &dbenv->dtab)) != 0)
+ if ((ret =
+ __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
return (ret);
- for (i = dbenv->dtab_size; i < nsize; ++i)
- dbenv->dtab[i] = NULL;
- dbenv->dtab_size = nsize;
+ for (i = *dtabsize; i < nsize; ++i)
+ (*dtab)[i] = NULL;
+ *dtabsize = nsize;
}
- dbenv->dtab[ndx] = func;
+ (*dtab)[ndx] = func;
return (0);
}
/*
- * __deprecated_recover --
- * Stub routine for deprecated recovery functions.
- *
- * PUBLIC: int __deprecated_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__deprecated_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- COMPQUIET(dbenv, NULL);
- COMPQUIET(dbtp, NULL);
- COMPQUIET(lsnp, NULL);
- COMPQUIET(op, 0);
- COMPQUIET(info, NULL);
- return (EINVAL);
-}
-
-/*
* __db_txnlist_init --
* Initialize transaction linked list.
*
- * PUBLIC: int __db_txnlist_init __P((DB_ENV *, void *));
+ * PUBLIC: int __db_txnlist_init __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, void *));
*/
int
-__db_txnlist_init(dbenv, retp)
+__db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
DB_ENV *dbenv;
+ u_int32_t low_txn, hi_txn;
+ DB_LSN *trunc_lsn;
void *retp;
{
DB_TXNHEAD *headp;
- int ret;
+ u_int32_t tmp;
+ int ret, size;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNHEAD), NULL, &headp)) != 0)
+ /*
+ * Size a hash table.
+ * If low is zero then we are being called during rollback
+ * and we need only one slot.
+ * Hi may be lower than low if we have recycled txnid's.
+ * The numbers here are guesses about txn density, we can afford
+ * to look at a few entries in each slot.
+ */
+ if (low_txn == 0)
+ size = 1;
+ else {
+ if (hi_txn < low_txn) {
+ tmp = hi_txn;
+ hi_txn = low_txn;
+ low_txn = tmp;
+ }
+ tmp = hi_txn - low_txn;
+ /* See if we wrapped around. */
+ if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
+ tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
+ size = tmp / 5;
+ if (size < 100)
+ size = 100;
+ }
+ if ((ret = __os_malloc(dbenv,
+ sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
return (ret);
- LIST_INIT(&headp->head);
- headp->maxid = 0;
- headp->generation = 1;
+ memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
+ headp->maxid = hi_txn;
+ headp->generation = 0;
+ headp->nslots = size;
+ headp->gen_alloc = 8;
+ if ((ret = __os_malloc(dbenv, headp->gen_alloc *
+ sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
+ __os_free(dbenv, headp);
+ return (ret);
+ }
+ headp->gen_array[0].generation = 0;
+ headp->gen_array[0].txn_min = TXN_MINIMUM;
+ headp->gen_array[0].txn_max = TXN_MAXIMUM;
+ if (trunc_lsn != NULL)
+ headp->trunc_lsn = *trunc_lsn;
+ else
+ ZERO_LSN(headp->trunc_lsn);
+ ZERO_LSN(headp->maxlsn);
+ ZERO_LSN(headp->ckplsn);
*(void **)retp = headp;
return (0);
@@ -241,132 +397,86 @@ __db_txnlist_init(dbenv, retp)
* __db_txnlist_add --
* Add an element to our transaction linked list.
*
- * PUBLIC: int __db_txnlist_add __P((DB_ENV *, void *, u_int32_t, int32_t));
+ * PUBLIC: int __db_txnlist_add __P((DB_ENV *,
+ * PUBLIC: void *, u_int32_t, int32_t, DB_LSN *));
*/
int
-__db_txnlist_add(dbenv, listp, txnid, aborted)
+__db_txnlist_add(dbenv, listp, txnid, status, lsn)
DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
- int32_t aborted;
+ int32_t status;
+ DB_LSN *lsn;
{
DB_TXNHEAD *hp;
DB_TXNLIST *elp;
int ret;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
return (ret);
hp = (DB_TXNHEAD *)listp;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
elp->type = TXNLIST_TXNID;
elp->u.t.txnid = txnid;
- elp->u.t.aborted = aborted;
+ elp->u.t.status = status;
+ elp->u.t.generation = hp->generation;
if (txnid > hp->maxid)
hp->maxid = txnid;
- elp->u.t.generation = hp->generation;
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ DB_ASSERT(lsn == NULL ||
+ status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0);
return (0);
}
+
/*
* __db_txnlist_remove --
* Remove an element from our transaction linked list.
*
- * PUBLIC: int __db_txnlist_remove __P((void *, u_int32_t));
+ * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, void *, u_int32_t));
*/
int
-__db_txnlist_remove(listp, txnid)
+__db_txnlist_remove(dbenv, listp, txnid)
+ DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
{
DB_TXNLIST *entry;
- return (__db_txnlist_find_internal(listp,
- TXNLIST_TXNID, txnid, NULL, &entry, 1));
-}
-
-/* __db_txnlist_close --
- *
- * Call this when we close a file. It allows us to reconcile whether
- * we have done any operations on this file with whether the file appears
- * to have been deleted. If you never do any operations on a file, then
- * we assume it's OK to appear deleted.
- *
- * PUBLIC: int __db_txnlist_close __P((void *, int32_t, u_int32_t));
- */
-
-int
-__db_txnlist_close(listp, lid, count)
- void *listp;
- int32_t lid;
- u_int32_t count;
-{
- DB_TXNHEAD *hp;
- DB_TXNLIST *p;
-
- hp = (DB_TXNHEAD *)listp;
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
- if (p->type == TXNLIST_DELETE)
- if (lid == p->u.d.fileid &&
- !F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED)) {
- p->u.d.count += count;
- return (0);
- }
- }
-
- return (0);
+ return (__db_txnlist_find_internal(dbenv,
+ listp, TXNLIST_TXNID, txnid,
+ NULL, &entry, 1) == TXN_NOTFOUND ? TXN_NOTFOUND : TXN_OK);
}
/*
- * __db_txnlist_delete --
- *
- * Record that a file was missing or deleted. If the deleted
- * flag is set, then we've encountered a delete of a file, else we've
- * just encountered a file that is missing. The lid is the log fileid
- * and is only meaningful if deleted is not equal to 0.
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
*
- * PUBLIC: int __db_txnlist_delete __P((DB_ENV *,
- * PUBLIC: void *, char *, u_int32_t, int));
+ * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, void *, DB_LSN *));
*/
-int
-__db_txnlist_delete(dbenv, listp, name, lid, deleted)
+void
+__db_txnlist_ckp(dbenv, listp, ckp_lsn)
DB_ENV *dbenv;
void *listp;
- char *name;
- u_int32_t lid;
- int deleted;
+ DB_LSN *ckp_lsn;
{
DB_TXNHEAD *hp;
- DB_TXNLIST *p;
- int ret;
- hp = (DB_TXNHEAD *)listp;
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
- if (p->type == TXNLIST_DELETE)
- if (strcmp(name, p->u.d.fname) == 0) {
- if (deleted)
- F_SET(&p->u.d, TXNLIST_FLAG_DELETED);
- else
- F_CLR(&p->u.d, TXNLIST_FLAG_CLOSED);
- return (0);
- }
- }
-
- /* Need to add it. */
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &p)) != 0)
- return (ret);
- LIST_INSERT_HEAD(&hp->head, p, links);
+ COMPQUIET(dbenv, NULL);
- p->type = TXNLIST_DELETE;
- p->u.d.flags = 0;
- if (deleted)
- F_SET(&p->u.d, TXNLIST_FLAG_DELETED);
- p->u.d.fileid = lid;
- p->u.d.count = 0;
- ret = __os_strdup(dbenv, name, &p->u.d.fname);
+ hp = (DB_TXNHEAD *)listp;
- return (ret);
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ log_compare(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
}
/*
@@ -383,99 +493,156 @@ __db_txnlist_end(dbenv, listp)
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
- DB_LOG *lp;
+ int i;
- hp = (DB_TXNHEAD *)listp;
- lp = (DB_LOG *)dbenv->lg_handle;
- while (hp != NULL && (p = LIST_FIRST(&hp->head)) != NULL) {
- LIST_REMOVE(p, links);
- switch (p->type) {
- case TXNLIST_DELETE:
- /*
- * If we have a file that is not deleted and has
- * some operations, we flag the warning. Since
- * the file could still be open, we need to check
- * the actual log table as well.
- */
- if ((!F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) &&
- p->u.d.count != 0) ||
- (!F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) &&
- p->u.d.fileid != (int32_t) TXNLIST_INVALID_ID &&
- p->u.d.fileid < lp->dbentry_cnt &&
- lp->dbentry[p->u.d.fileid].count != 0))
- __db_err(dbenv, "warning: %s: %s",
- p->u.d.fname, db_strerror(ENOENT));
- __os_freestr(p->u.d.fname);
- break;
- case TXNLIST_LSN:
- __os_free(p->u.l.lsn_array,
- p->u.l.maxn * sizeof(DB_LSN));
- break;
- default:
- /* Possibly an incomplete DB_TXNLIST; just free it. */
- break;
+ if ((hp = (DB_TXNHEAD *)listp) == NULL)
+ return;
+
+ for (i = 0; i < hp->nslots; i++)
+ while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
+ LIST_REMOVE(p, links);
+ switch (p->type) {
+ case TXNLIST_LSN:
+ __os_free(dbenv, p->u.l.lsn_array);
+ break;
+ default:
+ /*
+ * Possibly an incomplete DB_TXNLIST; just
+ * free it.
+ */
+ break;
+ }
+ __os_free(dbenv, p);
}
- __os_free(p, sizeof(DB_TXNLIST));
- }
- __os_free(listp, sizeof(DB_TXNHEAD));
+
+ if (hp->gen_array != NULL)
+ __os_free(dbenv, hp->gen_array);
+ __os_free(dbenv, listp);
}
/*
* __db_txnlist_find --
* Checks to see if a txnid with the current generation is in the
- * txnid list. This returns DB_NOTFOUND if the item isn't in the
- * list otherwise it returns (like __db_txnlist_find_internal) a
- * 1 or 0 indicating if the transaction is aborted or not. A txnid
- * of 0 means the record was generated while not in a transaction.
+ * txnid list. This returns TXN_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ * the status of the transaction. A txnid of 0 means the record
+ * was generated while not in a transaction.
*
- * PUBLIC: int __db_txnlist_find __P((void *, u_int32_t));
+ * PUBLIC: int __db_txnlist_find __P((DB_ENV *, void *, u_int32_t));
*/
int
-__db_txnlist_find(listp, txnid)
+__db_txnlist_find(dbenv, listp, txnid)
+ DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
{
DB_TXNLIST *entry;
if (txnid == 0)
- return (DB_NOTFOUND);
- return (__db_txnlist_find_internal(listp,
- TXNLIST_TXNID, txnid, NULL, &entry, 0));
+ return (TXN_NOTFOUND);
+ return (__db_txnlist_find_internal(dbenv, listp,
+ TXNLIST_TXNID, txnid, NULL, &entry, 0));
+}
+
+/*
+ * __db_txnlist_update --
+ * Set the status of a transaction that is already on the list.
+ * Returns TXN_NOTFOUND if txnid is 0 or no entry exists for it;
+ * otherwise returns the status reported by the lookup, i.e., the
+ * status the entry held before this call changed it. When the new
+ * status is TXN_COMMIT and an lsn is supplied, that lsn becomes the
+ * list's maxlsn if no maxlsn has been recorded yet.
+ *
+ * PUBLIC: int __db_txnlist_update __P((DB_ENV *,
+ * PUBLIC: void *, u_int32_t, u_int32_t, DB_LSN *));
+ */
+int
+__db_txnlist_update(dbenv, listp, txnid, status, lsn)
+ DB_ENV *dbenv;
+ void *listp;
+ u_int32_t txnid;
+ u_int32_t status;
+ DB_LSN *lsn;
+{
+ DB_TXNHEAD *headp;
+ DB_TXNLIST *entry;
+ int prev;
+
+ /* A txnid of 0 is never looked up; report it as not found. */
+ if (txnid == 0)
+ return (TXN_NOTFOUND);
+
+ prev = __db_txnlist_find_internal(dbenv,
+ listp, TXNLIST_TXNID, txnid, NULL, &entry, 0);
+ if (prev == TXN_NOTFOUND)
+ return (TXN_NOTFOUND);
+
+ entry->u.t.status = status;
+
+ /* Only the first commit LSN seen is kept as maxlsn. */
+ headp = (DB_TXNHEAD *)listp;
+ if (status == TXN_COMMIT && lsn != NULL && IS_ZERO_LSN(headp->maxlsn))
+ headp->maxlsn = *lsn;
+
+ return (prev);
+}
/*
* __db_txnlist_find_internal --
- * Find an entry on the transaction list.
- * If the entry is not there or the list pointeris not initialized
- * we return DB_NOTFOUND. If the item is found, we return the aborted
- * status (1 for aborted, 0 for not aborted). Currently we always call
- * this with an initialized list pointer but checking for NULL keeps it general.
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return TXN_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
*/
static int
-__db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete)
+__db_txnlist_find_internal(dbenv, listp, type, txnid, uid, txnlistp, delete)
+ DB_ENV *dbenv;
void *listp;
db_txnlist_type type;
- u_int32_t txnid;
+ u_int32_t txnid;
u_int8_t uid[DB_FILE_ID_LEN];
DB_TXNLIST **txnlistp;
int delete;
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
- int ret;
+ int32_t generation;
+ u_int32_t hash;
+ struct __db_headlink *head;
+ int i, ret;
if ((hp = (DB_TXNHEAD *)listp) == NULL)
- return (DB_NOTFOUND);
+ return (TXN_NOTFOUND);
+
+ switch (type) {
+ case TXNLIST_TXNID:
+ hash = txnid;
+ /* Find the most recent generation containing this ID */
+ for (i = 0; i <= hp->generation; i++)
+ /* The range may wrap around the end. */
+ if (hp->gen_array[i].txn_min <
+ hp->gen_array[i].txn_max ?
+ (txnid >= hp->gen_array[i].txn_min &&
+ txnid <= hp->gen_array[i].txn_max) :
+ (txnid >= hp->gen_array[i].txn_min ||
+ txnid <= hp->gen_array[i].txn_max))
+ break;
+ DB_ASSERT(i <= hp->generation);
+ generation = hp->gen_array[i].generation;
+ break;
+ case TXNLIST_PGNO:
+ memcpy(&hash, uid, sizeof(hash));
+ generation = 0;
+ break;
+ default:
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+
+ head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
+ for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) {
if (p->type != type)
continue;
switch (type) {
case TXNLIST_TXNID:
if (p->u.t.txnid != txnid ||
- hp->generation != p->u.t.generation)
+ generation != p->u.t.generation)
continue;
- ret = p->u.t.aborted;
+ ret = p->u.t.status;
break;
case TXNLIST_PGNO:
@@ -490,42 +657,67 @@ __db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete)
}
if (delete == 1) {
LIST_REMOVE(p, links);
- __os_free(p, sizeof(DB_TXNLIST));
- } else if (p != LIST_FIRST(&hp->head)) {
+ __os_free(dbenv, p);
+ } else if (p != LIST_FIRST(head)) {
/* Move it to head of list. */
LIST_REMOVE(p, links);
- LIST_INSERT_HEAD(&hp->head, p, links);
+ LIST_INSERT_HEAD(head, p, links);
}
*txnlistp = p;
return (ret);
}
- return (DB_NOTFOUND);
+ return (TXN_NOTFOUND);
}
/*
* __db_txnlist_gen --
* Change the current generation number.
*
- * PUBLIC: void __db_txnlist_gen __P((void *, int));
+ * PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
+ * PUBLIC: void *, int, u_int32_t, u_int32_t));
*/
-void
-__db_txnlist_gen(listp, incr)
+int
+__db_txnlist_gen(dbenv, listp, incr, min, max)
+ DB_ENV *dbenv;
void *listp;
int incr;
+ u_int32_t min, max;
{
DB_TXNHEAD *hp;
+ int ret;
/*
- * During recovery generation numbers keep track of how many "restart"
- * checkpoints we've seen. Restart checkpoints occur whenever we take
- * a checkpoint and there are no outstanding transactions. When that
- * happens, we can reset transaction IDs back to 1. It always happens
- * at recovery and it prevents us from exhausting the transaction IDs
- * name space.
+ * During recovery generation numbers keep track of "restart"
+ * checkpoints and recycle records. Restart checkpoints occur
+ * whenever we take a checkpoint and there are no outstanding
+ * transactions. When that happens, we can reset transaction IDs
+ * back to TXNID_MINIMUM. Currently we only do the reset
+ * at the end of recovery. Recycle records occur when txnids
+ * are exhausted during runtime. A free range of ids is identified
+ * and logged. This code maintains a stack of ranges. A txnid
+ * is given the generation number of the first range it falls into
+ * in the stack.
*/
hp = (DB_TXNHEAD *)listp;
hp->generation += incr;
+ if (incr < 0)
+ memmove(hp->gen_array, &hp->gen_array[1],
+ (hp->generation + 1) * sizeof(hp->gen_array[0]));
+ else {
+ if (hp->generation >= hp->gen_alloc) {
+ hp->gen_alloc *= 2;
+ if ((ret = __os_realloc(dbenv, hp->gen_alloc *
+ sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
+ return (ret);
+ }
+ memmove(&hp->gen_array[1], &hp->gen_array[0],
+ hp->generation * sizeof(hp->gen_array[0]));
+ hp->gen_array[0].generation = hp->generation;
+ hp->gen_array[0].txn_min = min;
+ hp->gen_array[0].txn_max = max;
+ }
+ return (0);
}
#define TXN_BUBBLE(AP, MAX) { \
@@ -542,10 +734,10 @@ __db_txnlist_gen(listp, incr)
/*
* __db_txnlist_lsnadd --
- * Add to or re-sort the transaction list lsn entry.
- * Note that since this is used during an abort, the __txn_undo
- * code calls into the "recovery" subsystem explicitly, and there
- * is only a single TXNLIST_LSN entry on the list.
+ * Add to or re-sort the transaction list lsn entry. Note that since this
+ * is used during an abort, the __txn_undo code calls into the "recovery"
+ * subsystem explicitly, and there is only a single TXNLIST_LSN entry on
+ * the list.
*
* PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, void *, DB_LSN *, u_int32_t));
*/
@@ -562,19 +754,19 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
hp = (DB_TXNHEAD *)listp;
- for (elp = LIST_FIRST(&hp->head);
+ for (elp = LIST_FIRST(&hp->head[0]);
elp != NULL; elp = LIST_NEXT(elp, links))
if (elp->type == TXNLIST_LSN)
break;
if (elp == NULL)
- return (EINVAL);
+ return (DB_SURPRISE_KID);
if (LF_ISSET(TXNLIST_NEW)) {
if (elp->u.l.ntxns >= elp->u.l.maxn) {
if ((ret = __os_realloc(dbenv,
2 * elp->u.l.maxn * sizeof(DB_LSN),
- NULL, &elp->u.l.lsn_array)) != 0)
+ &elp->u.l.lsn_array)) != 0)
return (ret);
elp->u.l.maxn *= 2;
}
@@ -584,9 +776,9 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
elp->u.l.lsn_array[0] = *lsnp;
/*
- * If we just added a new entry and there may be NULL
- * entries, so we have to do a complete bubble sort,
- * not just trickle a changed entry around.
+ * If we just added a new entry, there may be NULL entries, so we
+ * have to do a complete bubble sort, not just trickle a changed entry
+ * around.
*/
for (i = 0; i < (!LF_ISSET(TXNLIST_NEW) ? 1 : elp->u.l.ntxns); i++)
TXN_BUBBLE(elp->u.l.lsn_array, elp->u.l.ntxns);
@@ -597,35 +789,6 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
}
/*
- * __db_txnlist_lsnhead --
- * Return a pointer to the beginning of the lsn_array.
- *
- * PUBLIC: int __db_txnlist_lsnhead __P((void *, DB_LSN **));
- */
-int
-__db_txnlist_lsnhead(listp, lsnpp)
- void *listp;
- DB_LSN **lsnpp;
-{
- DB_TXNHEAD *hp;
- DB_TXNLIST *elp;
-
- hp = (DB_TXNHEAD *)listp;
-
- for (elp = LIST_FIRST(&hp->head);
- elp != NULL; elp = LIST_NEXT(elp, links))
- if (elp->type == TXNLIST_LSN)
- break;
-
- if (elp == NULL)
- return (EINVAL);
-
- *lsnpp = &elp->u.l.lsn_array[0];
-
- return (0);
-}
-
-/*
* __db_txnlist_lsninit --
* Initialize a transaction list with an lsn array entry.
*
@@ -642,12 +805,12 @@ __db_txnlist_lsninit(dbenv, hp, lsnp)
elp = NULL;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
goto err;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ LIST_INSERT_HEAD(&hp->head[0], elp, links);
if ((ret = __os_malloc(dbenv,
- 12 * sizeof(DB_LSN), NULL, &elp->u.l.lsn_array)) != 0)
+ 12 * sizeof(DB_LSN), &elp->u.l.lsn_array)) != 0)
goto err;
elp->type = TXNLIST_LSN;
elp->u.l.maxn = 12;
@@ -662,8 +825,7 @@ err: __db_txnlist_end(dbenv, hp);
/*
* __db_add_limbo -- add pages to the limbo list.
- * Get the file information and call pgnoadd
- * for each page.
+ * Get the file information and call pgnoadd for each page.
*
* PUBLIC: int __db_add_limbo __P((DB_ENV *,
* PUBLIC: void *, int32_t, db_pgno_t, int32_t));
@@ -681,7 +843,7 @@ __db_add_limbo(dbenv, info, fileid, pgno, count)
int ret;
dblp = dbenv->lg_handle;
- if ((ret = __log_lid_to_fname(dblp, fileid, &fnp)) != 0)
+ if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
return (ret);
do {
@@ -698,201 +860,429 @@ __db_add_limbo(dbenv, info, fileid, pgno, count)
/*
* __db_do_the_limbo -- move pages from limbo to free.
*
- * If we are in recovery we add things to the free list without
- * logging becasue we want to incrementaly apply logs that
- * may be generated on another copy of this environment.
- * Otherwise we just call __db_free to put the pages on
- * the free list and log the activity.
+ * Limbo processing is what ensures that we correctly handle and
+ * recover from page allocations. During recovery, for each database,
+ * we process each in-question allocation, link them into the free list
+ * and then write out the new meta-data page that contains the pointer
+ * to the new beginning of the free list. On an abort, we use our
+ * standard __db_free mechanism in a compensating transaction which logs
+ * the specific modifications to the free list.
+ *
+ * If we run out of log space during an abort, then we can't write the
+ * compensating transaction, so we abandon the idea of a compensating
+ * transaction, and go back to processing how we do during recovery.
+ * The reason that this is not the norm is that it's expensive: it requires
+ * that we flush any database with an in-question allocation. Thus if
+ * a compensating transaction fails, we never try to restart it.
+ *
+ * Since files may be open and closed within transactions (in particular,
+ * the master database for subdatabases), we must be prepared to open
+ * files during this process. If there is a compensating transaction, we
+ * can open the files in that transaction. If this was an abort and there
+ * is no compensating transaction, then we've got to perform these opens
+ * in the context of the aborting transaction so that we do not deadlock.
+ * During recovery, there's no locking, so this isn't an issue.
*
- * PUBLIC: int __db_do_the_limbo __P((DB_ENV *, DB_TXNHEAD *));
+ * What you want to keep in mind when reading this is that there are two
+ * algorithms going on here: if ctxn == NULL, then we're either in recovery
+ * or our compensating transaction has failed and we're doing the
+ * "create list and write meta-data page" algorithm. Otherwise, we're in
+ * an abort and doing the "use compensating transaction" algorithm.
+ *
+ * PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, DB_TXN *, DB_TXNHEAD *));
*/
int
-__db_do_the_limbo(dbenv, hp)
+__db_do_the_limbo(dbenv, ptxn, txn, hp)
DB_ENV *dbenv;
+ DB_TXN *ptxn, *txn;
DB_TXNHEAD *hp;
{
- DB *dbp;
- DBC *dbc;
- DBMETA *meta;
- DB_TXN *txn;
DB_TXNLIST *elp;
- PAGE *pagep;
- db_pgno_t last_pgno, pgno;
- int i, in_recover, put_page, ret, t_ret;
+ int h, ret;
- dbp = NULL;
- dbc = NULL;
- txn = NULL;
ret = 0;
+ /*
+ * The slots correspond to hash buckets. We've hashed the
+ * fileids into hash buckets and need to pick up all affected
+ * files. (There will only be a single slot for an abort.)
+ */
+ for (h = 0; h < hp->nslots; h++) {
+ if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
+ continue;
+ if (ptxn != NULL) {
+ if ((ret =
+ __db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
+ goto err;
+ } else if ((ret = __db_limbo_bucket(dbenv, txn, elp)) != 0)
+ goto err;
+ }
+
+err: if (ret != 0) {
+ __db_err(dbenv, "Fatal error in abort of an allocation");
+ ret = __db_panic(dbenv, ret);
+ }
- /* Are we in recovery? */
- in_recover = F_ISSET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
+ return (ret);
+}
- for (elp = LIST_FIRST(&hp->head);
- elp != NULL; elp = LIST_NEXT(elp, links)) {
+/* Limbo support routines. */
+
+/*
+ * __db_lock_move --
+ * Move a lock from child to parent.
+ */
+static int
+__db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
+ DB_ENV *dbenv;
+ u_int8_t *fileid;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ DB_TXN *ptxn, *txn;
+{
+ DBT lock_dbt;
+ DB_LOCK lock;
+ DB_LOCK_ILOCK lock_obj;
+ DB_LOCKREQ req;
+ int ret;
+
+ lock_obj.pgno = pgno;
+ memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
+ lock_obj.type = DB_PAGE_LOCK;
+
+ memset(&lock_dbt, 0, sizeof(lock_dbt));
+ lock_dbt.data = &lock_obj;
+ lock_dbt.size = sizeof(lock_obj);
+
+ if ((ret = dbenv->lock_get(dbenv,
+ txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) {
+ memset(&req, 0, sizeof(req));
+ req.lock = lock;
+ req.op = DB_LOCK_TRADE;
+
+ ret = dbenv->lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL);
+ }
+ return (ret);
+}
+
+/*
+ * __db_limbo_move
+ * Move just the metapage lock to the parent.
+ */
+static int
+__db_limbo_move(dbenv, ptxn, txn, elp)
+ DB_ENV *dbenv;
+ DB_TXN *ptxn, *txn;
+ DB_TXNLIST *elp;
+{
+ int ret;
+
+ for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
+ if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
+ continue;
+ if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
+ PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
+ return (ret);
+ elp->u.p.locked = 1;
+ }
+
+ return (0);
+}
+/*
+ * __db_limbo_bucket
+ * Perform limbo processing for a single hash bucket in the txnlist.
+ * txn is the transaction aborting in the case of an abort and ctxn is the
+ * compensating transaction.
+ */
+
+#define T_RESTORED(txn) ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
+static int
+__db_limbo_bucket(dbenv, txn, elp)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ DB_TXNLIST *elp;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DBMETA *meta;
+ DB_TXN *ctxn, *t;
+ db_pgno_t last_pgno, pgno;
+ int dbp_created, in_retry, ret, t_ret;
+
+ ctxn = NULL;
+ in_retry = 0;
+ meta = NULL;
+ mpf = NULL;
+ ret = 0;
+ for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
if (elp->type != TXNLIST_PGNO)
continue;
+retry: dbp_created = 0;
+
+ /*
+ * Pick the transaction in which to potentially
+ * log compensations.
+ */
+ if (!in_retry && !IS_RECOVERING(dbenv) && !T_RESTORED(txn)
+ && (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
+ return (ret);
+
+ /*
+ * Either use the compensating transaction or
+ * the one passed in, which will be null if recovering.
+ */
+ t = ctxn == NULL ? txn : ctxn;
+
+ /* First try to get a dbp by fileid. */
+ ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);
+
+ /*
+ * File is being destroyed. No need to worry about
+ * dealing with recovery of allocations.
+ */
+ if (ret == DB_DELETED ||
+ (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
+ goto next;
- if (in_recover) {
+ if (ret != 0) {
if ((ret = db_create(&dbp, dbenv, 0)) != 0)
goto err;
/*
- * It is ok if the file is nolonger there.
+ * This tells the system not to lock, which is always
+ * OK, whether this is an abort or recovery.
*/
+ F_SET(dbp, DB_AM_COMPENSATE);
+ dbp_created = 1;
+
+ /* It is ok if the file is no longer there. */
dbp->type = DB_UNKNOWN;
- ret = __db_dbopen(dbp,
- elp->u.p.fname, 0, __db_omode("rw----"), 0);
+ ret = __db_dbopen(dbp, t, elp->u.p.fname, NULL,
+ DB_ODDFILESIZE, __db_omode("rw----"), PGNO_BASE_MD);
+ if (ret == ENOENT)
+ goto next;
+ }
+
+ /*
+ * Verify that we are opening the same file that we were
+ * referring to when we wrote this log record.
+ */
+ if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ goto next;
+
+ mpf = dbp->mpf;
+ last_pgno = PGNO_INVALID;
+
+ if (ctxn == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret =
+ mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ last_pgno = meta->free;
+ }
+
+ ret = __db_limbo_fix(dbp, ctxn, elp, &last_pgno, meta);
+ /*
+ * If we were doing compensating transactions, then we are
+ * going to hope this error was due to running out of space.
+ * We'll change modes (into the sync-the-file mode) and keep
+ * trying. If we weren't doing compensating transactions,
+ * then this is a real error and we're sunk.
+ */
+ if (ret != 0) {
+ if (ret == DB_RUNRECOVERY || ctxn == NULL)
+ goto err;
+ in_retry = 1;
+ goto retry;
+ }
+
+ if (ctxn != NULL) {
+ ret = ctxn->commit(ctxn, DB_TXN_NOSYNC);
+ ctxn = NULL;
+ if (ret != 0)
+ goto retry;
+ goto next;
+ }
+
+ /*
+ * This is where we handle the case where we're explicitly
+ * putting together a free list. We need to decide whether
+ * we have to write the meta-data page, and if we do, then
+ * we need to sync it as well.
+ */
+ if (last_pgno == meta->free) {
+ /* No change to page; just put the page back. */
+ if ((ret = mpf->put(mpf, meta, 0)) != 0)
+ goto err;
+ meta = NULL;
} else {
/*
- * If we are in transaction undo, then we know
- * the fileid is still correct.
+ * These changes are unlogged so we cannot have the
+ * metapage pointing at pages that are not on disk.
+ * Therefore, we flush the new free list, then update
+ * the metapage. We have to put the meta-data page
+ * first so that it isn't pinned when we try to sync.
*/
+ if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
+ __db_err(dbenv, "Flushing free list to disk");
+ if ((ret = mpf->put(mpf, meta, 0)) != 0)
+ goto err;
+ meta = NULL;
+ dbp->sync(dbp, 0);
+ pgno = PGNO_BASE_MD;
if ((ret =
- __db_fileid_to_db(dbenv, &dbp,
- elp->u.p.fileid, 0)) != 0 && ret != DB_DELETED)
+ mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ meta->free = last_pgno;
+ if ((ret = mpf->put(mpf, meta, DB_MPOOL_DIRTY)) != 0)
goto err;
- /* File is being destroyed. */
- if (F_ISSET(dbp, DB_AM_DISCARD))
- ret = DB_DELETED;
+ meta = NULL;
}
+
+next:
/*
- * Verify that we are opening the same file that we were
- * referring to when we wrote this log record.
+ * If we get here, either we have processed the list
+ * or the db file has been deleted or could not be opened.
*/
- if (ret == 0 &&
- memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) == 0) {
- last_pgno = PGNO_INVALID;
- if (in_recover) {
- pgno = PGNO_BASE_MD;
- if ((ret = memp_fget(dbp->mpf,
- &pgno, 0, (PAGE **)&meta)) != 0)
- goto err;
- last_pgno = meta->free;
- /*
- * Check to see if the head of the free
- * list is any of the pages we are about
- * to link in. We could have crashed
- * after linking them in and before writing
- * a checkpoint.
- * It may not be the last one since
- * any page may get reallocated before here.
- */
- if (last_pgno != PGNO_INVALID)
- for (i = 0; i < elp->u.p.nentries; i++)
- if (last_pgno
- == elp->u.p.pgno_array[i])
- goto done_it;
- }
+ if (ctxn != NULL &&
+ (t_ret = ctxn->abort(ctxn)) != 0 && ret == 0)
+ ret = t_ret;
- for (i = 0; i < elp->u.p.nentries; i++) {
- pgno = elp->u.p.pgno_array[i];
- if ((ret = memp_fget(dbp->mpf,
- &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
- goto err;
+ if (dbp_created &&
+ (t_ret = __db_close_i(dbp, txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp = NULL;
+ __os_free(dbenv, elp->u.p.fname);
+ __os_free(dbenv, elp->u.p.pgno_array);
+ if (ret == ENOENT)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+ }
- put_page = 1;
- if (IS_ZERO_LSN(LSN(pagep))) {
- P_INIT(pagep, dbp->pgsize,
- pgno, PGNO_INVALID,
- last_pgno, 0, P_INVALID);
-
- if (in_recover) {
- LSN(pagep) = LSN(meta);
- last_pgno = pgno;
- } else {
- /*
- * Starting the transaction
- * is postponed until we know
- * we have something to do.
- */
- if (txn == NULL &&
- (ret = txn_begin(dbenv,
- NULL, &txn, 0)) != 0)
- goto err;
-
- if (dbc == NULL &&
- (ret = dbp->cursor(dbp,
- txn, &dbc, 0)) != 0)
- goto err;
- /* Turn off locking. */
- F_SET(dbc, DBC_COMPENSATE);
-
- /* __db_free puts the page. */
- if ((ret =
- __db_free(dbc, pagep)) != 0)
- goto err;
- put_page = 0;
- }
- }
+err: if (meta != NULL)
+ (void)mpf->put(mpf, meta, 0);
+ return (ret);
+}
- if (put_page == 1 &&
- (ret = memp_fput(dbp->mpf,
- pagep, DB_MPOOL_DIRTY)) != 0)
- goto err;
- }
- if (in_recover) {
- if (last_pgno == meta->free) {
-done_it:
+/*
+ * __db_limbo_fix --
+ * Process a single limbo entry which describes all the page allocations
+ * for a single file.
+ */
+static int
+__db_limbo_fix(dbp, ctxn, elp, lastp, meta)
+ DB *dbp;
+ DB_TXN *ctxn;
+ DB_TXNLIST *elp;
+ db_pgno_t *lastp;
+ DBMETA *meta;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *freep, *pagep;
+ db_pgno_t next, pgno;
+ int i, put_page, ret, t_ret;
+
+ /*
+ * Loop through the entries for this txnlist element and
+ * either link them into the free list or write a compensating
+ * record for each.
+ */
+ put_page = 0;
+ ret = 0;
+ mpf = dbp->mpf;
+ dbc = NULL;
+
+ for (i = 0; i < elp->u.p.nentries; i++) {
+ pgno = elp->u.p.pgno_array[i];
+
+ if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto err;
+ put_page = 1;
+
+ if (IS_ZERO_LSN(LSN(pagep))) {
+ if (ctxn == NULL) {
+ /*
+ * If this is a fatal recovery which
+ * spans a previous crash this page may
+ * be on the free list already.
+ */
+ for (next = *lastp; next != 0; ) {
+ if (next == pgno)
+ break;
+ if ((ret = mpf->get(mpf,
+ &next, 0, &freep)) != 0)
+ goto err;
+ next = NEXT_PGNO(freep);
if ((ret =
- memp_fput(dbp->mpf, meta, 0)) != 0)
+ mpf->put(mpf, freep, 0)) != 0)
goto err;
- } else {
- /*
- * Flush the new free list then
- * update the metapage. This is
- * unlogged so we cannot have the
- * metapage pointing at pages that
- * are not on disk.
- */
- dbp->sync(dbp, 0);
- meta->free = last_pgno;
- if ((ret = memp_fput(dbp->mpf,
- meta, DB_MPOOL_DIRTY)) != 0)
+ }
+
+ if (next != pgno) {
+ P_INIT(pagep, dbp->pgsize, pgno,
+ PGNO_INVALID, *lastp, 0, P_INVALID);
+ LSN(pagep) = LSN(meta);
+ *lastp = pgno;
+ }
+ } else {
+ P_INIT(pagep, dbp->pgsize, pgno,
+ PGNO_INVALID, *lastp, 0, P_INVALID);
+ if (dbc == NULL && (ret =
+ dbp->cursor(dbp, ctxn, &dbc, 0)) != 0)
goto err;
+ /*
+ * If the dbp is compensating (because we
+ * opened it), the dbc will automatically be
+ * marked compensating, but in case we didn't
+ * do the open, we have to mark it explicitly.
+ */
+ F_SET(dbc, DBC_COMPENSATE);
+ ret = __db_free(dbc, pagep);
+ put_page = 0;
+ /*
+ * On any error, we hope that the error was
+ * caused due to running out of space, and we
+ * switch modes, doing the processing where we
+ * sync out files instead of doing compensating
+ * transactions. If this was a real error and
+ * not out of space, we assume that some other
+ * call will fail real soon.
+ */
+ if (ret != 0) {
+ /* Assume that this is out of space. */
+ (void)dbc->c_close(dbc);
+ dbc = NULL;
+ goto err;
}
}
- if (dbc != NULL && (ret = dbc->c_close(dbc)) != 0)
- goto err;
- dbc = NULL;
}
- if (in_recover && (t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
- ret = t_ret;
- dbp = NULL;
- __os_free(elp->u.p.fname, 0);
- __os_free(elp->u.p.pgno_array, 0);
- if (ret == ENOENT)
- ret = 0;
- else if (ret != 0)
+
+ if (put_page == 1) {
+ ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY);
+ put_page = 0;
+ }
+ if (ret != 0)
goto err;
}
- if (txn != NULL) {
- ret = txn_commit(txn, 0);
- txn = NULL;
- }
-err:
- if (dbc != NULL)
- (void)dbc->c_close(dbc);
- if (in_recover && dbp != NULL)
- (void)dbp->close(dbp, 0);
- if (txn != NULL)
- (void)txn_abort(txn);
+err: if (put_page &&
+ (t_ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
return (ret);
-
}
-#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
+#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
/*
* __db_txnlist_pgnoadd --
- * Find the txnlist entry for a file and add this pgno,
- * or add the list entry for the file and then add the pgno.
- *
- * PUBLIC: int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
- * PUBLIC: int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
+ * Find the txnlist entry for a file and add this pgno, or add the list
+ * entry for the file and then add the pgno.
*/
-int
+static int
__db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
DB_ENV *dbenv;
DB_TXNHEAD *hp;
@@ -902,34 +1292,39 @@ __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
db_pgno_t pgno;
{
DB_TXNLIST *elp;
+ u_int32_t hash;
int len, ret;
elp = NULL;
- if (__db_txnlist_find_internal(hp, TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
+ if (__db_txnlist_find_internal(dbenv, hp,
+ TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
if ((ret =
- __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
goto err;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ memcpy(&hash, uid, sizeof(hash));
+ LIST_INSERT_HEAD(
+ &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
elp->u.p.fileid = fileid;
memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);
- len = strlen(fname) + 1;
- if ((ret = __os_malloc(dbenv, len, NULL, &elp->u.p.fname)) != 0)
+ len = (int)strlen(fname) + 1;
+ if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
goto err;
memcpy(elp->u.p.fname, fname, len);
elp->u.p.maxentry = 0;
+ elp->u.p.locked = 0;
elp->type = TXNLIST_PGNO;
if ((ret = __os_malloc(dbenv,
- 8 * sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0)
+ 8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
goto err;
elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
elp->u.p.nentries = 0;
} else if (elp->u.p.nentries == elp->u.p.maxentry) {
elp->u.p.maxentry <<= 1;
if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
- sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0)
+ sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
goto err;
}
@@ -941,6 +1336,36 @@ err: __db_txnlist_end(dbenv, hp);
return (ret);
}
+/*
+ * __db_default_getpgnos --
+ * Fill in default getpgnos information for an application-specific
+ * log record.
+ */
+static int
+__db_default_getpgnos(dbenv, lsnp, summary)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ void *summary;
+{
+ TXN_RECS *t;
+ int ret;
+
+ t = (TXN_RECS *)summary;
+
+ if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0)
+ return (ret);
+
+ t->array[t->npages].flags = LSN_PAGE_NOLOCK;
+ t->array[t->npages].lsn = *lsnp;
+ t->array[t->npages].fid = DB_LOGFILEID_INVALID;
+ memset(&t->array[t->npages].pgdesc, 0,
+ sizeof(t->array[t->npages].pgdesc));
+
+ t->npages++;
+
+ return (0);
+}
+
#ifdef DEBUG
/*
* __db_txnlist_print --
@@ -954,25 +1379,21 @@ __db_txnlist_print(listp)
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
+ int i;
+ char *stats[] = { "ok", "commit", "prepare", "abort", "notfound",
+ "ignore", "expected", "unexpected" };
hp = (DB_TXNHEAD *)listp;
printf("Maxid: %lu Generation: %lu\n",
(u_long)hp->maxid, (u_long)hp->generation);
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
+ for (i = 0; i < hp->nslots; i++)
+ for (p = LIST_FIRST(&hp->head[i]); p != NULL; p = LIST_NEXT(p, links)) {
switch (p->type) {
case TXNLIST_TXNID:
- printf("TXNID: %lu(%lu)\n",
- (u_long)p->u.t.txnid, (u_long)p->u.t.generation);
- break;
- case TXNLIST_DELETE:
- printf("FILE: %s id=%d ops=%d %s %s\n",
- p->u.d.fname, p->u.d.fileid, p->u.d.count,
- F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) ?
- "(deleted)" : "(missing)",
- F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) ?
- "(closed)" : "(open)");
-
+ printf("TXNID: %lx(%lu): %s\n",
+ (u_long)p->u.t.txnid, (u_long)p->u.t.generation,
+ stats[p->u.t.status]);
break;
default:
printf("Unrecognized type: %d\n", p->type);
diff --git a/bdb/db/db_dup.c b/bdb/db/db_dup.c
index 6d8b2df9518..2d33d79153f 100644
--- a/bdb/db/db_dup.c
+++ b/bdb/db/db_dup.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_dup.c,v 11.18 2000/11/30 00:58:32 ubell Exp $";
+static const char revid[] = "$Id: db_dup.c,v 11.32 2002/08/08 03:57:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,12 +18,10 @@ static const char revid[] = "$Id: db_dup.c,v 11.18 2000/11/30 00:58:32 ubell Exp
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "btree.h"
-#include "hash.h"
-#include "lock.h"
-#include "db_am.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/lock.h"
+#include "dbinc/db_am.h"
/*
* __db_ditem --
@@ -39,19 +37,20 @@ __db_ditem(dbc, pagep, indx, nbytes)
{
DB *dbp;
DBT ldbt;
- db_indx_t cnt, offset;
+ db_indx_t cnt, *inp, offset;
int ret;
u_int8_t *from;
dbp = dbc->dbp;
- if (DB_LOGGING(dbc)) {
- ldbt.data = P_ENTRY(pagep, indx);
+ if (DBC_LOGGING(dbc)) {
+ ldbt.data = P_ENTRY(dbp, pagep, indx);
ldbt.size = nbytes;
- if ((ret = __db_addrem_log(dbp->dbenv, dbc->txn,
- &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep),
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_REM_DUP, PGNO(pagep),
(u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
return (ret);
- }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
/*
* If there's only a single item on the page, we don't have to
@@ -63,24 +62,26 @@ __db_ditem(dbc, pagep, indx, nbytes)
return (0);
}
+ inp = P_INP(dbp, pagep);
/*
* Pack the remaining key/data items at the end of the page. Use
* memmove(3), the regions may overlap.
*/
from = (u_int8_t *)pagep + HOFFSET(pagep);
- memmove(from + nbytes, from, pagep->inp[indx] - HOFFSET(pagep));
+ DB_ASSERT((int)inp[indx] - HOFFSET(pagep) >= 0);
+ memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep));
HOFFSET(pagep) += nbytes;
/* Adjust the indices' offsets. */
- offset = pagep->inp[indx];
+ offset = inp[indx];
for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt)
- if (pagep->inp[cnt] < offset)
- pagep->inp[cnt] += nbytes;
+ if (inp[cnt] < offset)
+ inp[cnt] += nbytes;
/* Shift the indices down. */
--NUM_ENT(pagep);
if (indx != NUM_ENT(pagep))
- memmove(&pagep->inp[indx], &pagep->inp[indx + 1],
+ memmove(&inp[indx], &inp[indx + 1],
sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
return (0);
@@ -104,11 +105,13 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data)
DB *dbp;
BKEYDATA bk;
DBT thdr;
+ db_indx_t *inp;
int ret;
u_int8_t *p;
- if (nbytes > P_FREESPACE(pagep)) {
- DB_ASSERT(nbytes <= P_FREESPACE(pagep));
+ dbp = dbc->dbp;
+ if (nbytes > P_FREESPACE(dbp, pagep)) {
+ DB_ASSERT(nbytes <= P_FREESPACE(dbp, pagep));
return (EINVAL);
}
/*
@@ -128,12 +131,13 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data)
* the passed in header sizes must be adjusted for the structure's
* placeholder for the trailing variable-length data field.
*/
- dbp = dbc->dbp;
- if (DB_LOGGING(dbc))
- if ((ret = __db_addrem_log(dbp->dbenv, dbc->txn,
- &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep),
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_ADD_DUP, PGNO(pagep),
(u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0)
return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
if (hdr == NULL) {
B_TSET(bk.type, B_KEYDATA, 0);
@@ -143,16 +147,17 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data)
thdr.size = SSZA(BKEYDATA, data);
hdr = &thdr;
}
+ inp = P_INP(dbp, pagep);
/* Adjust the index table, then put the item on the page. */
if (indx != NUM_ENT(pagep))
- memmove(&pagep->inp[indx + 1], &pagep->inp[indx],
+ memmove(&inp[indx + 1], &inp[indx],
sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
HOFFSET(pagep) -= nbytes;
- pagep->inp[indx] = HOFFSET(pagep);
+ inp[indx] = HOFFSET(pagep);
++NUM_ENT(pagep);
- p = P_ENTRY(pagep, indx);
+ p = P_ENTRY(dbp, pagep, indx);
memcpy(p, hdr->data, hdr->size);
if (data != NULL)
memcpy(p + hdr->size, data->data, data->size);
@@ -177,13 +182,16 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
PAGE *np, *pp;
DB_LOCK npl, ppl;
DB_LSN *nlsnp, *plsnp, ret_lsn;
+ DB_MPOOLFILE *mpf;
int ret;
- ret = 0;
+ dbp = dbc->dbp;
np = pp = NULL;
- npl.off = ppl.off = LOCK_INVALID;
+ LOCK_INIT(npl);
+ LOCK_INIT(ppl);
nlsnp = plsnp = NULL;
- dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ ret = 0;
/*
* Retrieve and lock the one/two pages. For a remove, we may need
@@ -194,9 +202,8 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
if (needlock && (ret = __db_lget(dbc,
0, pagep->next_pgno, DB_LOCK_WRITE, 0, &npl)) != 0)
goto err;
- if ((ret = memp_fget(dbp->mpf,
- &pagep->next_pgno, 0, &np)) != 0) {
- (void)__db_pgerr(dbp, pagep->next_pgno);
+ if ((ret = mpf->get(mpf, &pagep->next_pgno, 0, &np)) != 0) {
+ __db_pgerr(dbp, pagep->next_pgno, ret);
goto err;
}
nlsnp = &np->lsn;
@@ -205,28 +212,27 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
if (needlock && (ret = __db_lget(dbc,
0, pagep->prev_pgno, DB_LOCK_WRITE, 0, &ppl)) != 0)
goto err;
- if ((ret = memp_fget(dbp->mpf,
- &pagep->prev_pgno, 0, &pp)) != 0) {
- (void)__db_pgerr(dbp, pagep->next_pgno);
+ if ((ret = mpf->get(mpf, &pagep->prev_pgno, 0, &pp)) != 0) {
+ __db_pgerr(dbp, pagep->next_pgno, ret);
goto err;
}
plsnp = &pp->lsn;
}
/* Log the change. */
- if (DB_LOGGING(dbc)) {
- if ((ret = __db_relink_log(dbp->dbenv, dbc->txn,
- &ret_lsn, 0, add_rem, dbp->log_fileid,
- pagep->pgno, &pagep->lsn,
- pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0)
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0, add_rem,
+ pagep->pgno, &pagep->lsn, pagep->prev_pgno, plsnp,
+ pagep->next_pgno, nlsnp)) != 0)
goto err;
- if (np != NULL)
- np->lsn = ret_lsn;
- if (pp != NULL)
- pp->lsn = ret_lsn;
- if (add_rem == DB_REM_PAGE)
- pagep->lsn = ret_lsn;
- }
+ } else
+ LSN_NOT_LOGGED(ret_lsn);
+ if (np != NULL)
+ np->lsn = ret_lsn;
+ if (pp != NULL)
+ pp->lsn = ret_lsn;
+ if (add_rem == DB_REM_PAGE)
+ pagep->lsn = ret_lsn;
/*
* Modify and release the two pages.
@@ -242,10 +248,10 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
else
np->prev_pgno = pagep->prev_pgno;
if (new_next == NULL)
- ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY);
+ ret = mpf->put(mpf, np, DB_MPOOL_DIRTY);
else {
*new_next = np;
- ret = memp_fset(dbp->mpf, np, DB_MPOOL_DIRTY);
+ ret = mpf->set(mpf, np, DB_MPOOL_DIRTY);
}
if (ret != 0)
goto err;
@@ -256,7 +262,7 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
if (pp != NULL) {
pp->next_pgno = pagep->next_pgno;
- if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0)
+ if ((ret = mpf->put(mpf, pp, DB_MPOOL_DIRTY)) != 0)
goto err;
if (needlock)
(void)__TLPUT(dbc, ppl);
@@ -264,12 +270,12 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock)
return (0);
err: if (np != NULL)
- (void)memp_fput(dbp->mpf, np, 0);
- if (needlock && npl.off != LOCK_INVALID)
+ (void)mpf->put(mpf, np, 0);
+ if (needlock)
(void)__TLPUT(dbc, npl);
if (pp != NULL)
- (void)memp_fput(dbp->mpf, pp, 0);
- if (needlock && ppl.off != LOCK_INVALID)
+ (void)mpf->put(mpf, pp, 0);
+ if (needlock)
(void)__TLPUT(dbc, ppl);
return (ret);
}
diff --git a/bdb/db/db_iface.c b/bdb/db/db_iface.c
index 3548a2527bb..b518c3b14b2 100644
--- a/bdb/db/db_iface.c
+++ b/bdb/db/db_iface.c
@@ -1,55 +1,69 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_iface.c,v 11.34 2001/01/11 18:19:51 bostic Exp $";
+static const char revid[] = "$Id: db_iface.c,v 11.77 2002/08/08 03:57:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
-
-#include <errno.h>
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-#include "btree.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
static int __db_curinval __P((const DB_ENV *));
+static int __db_fnl __P((const DB_ENV *, const char *));
static int __db_rdonly __P((const DB_ENV *, const char *));
static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
/*
+ * A database is treated as read-only if it was explicitly opened that way,
+ * or if we're a client in a replicated environment without the special
+ * "client-writer" designation.  The argument is evaluated more than once.
+ */
+#define IS_READONLY(dbp) \
+ (F_ISSET((dbp), DB_AM_RDONLY) || \
+ (F_ISSET((dbp)->dbenv, DB_ENV_REP_CLIENT) && \
+ !F_ISSET((dbp), DB_AM_CL_WRITER)))
+
+/*
* __db_cursorchk --
* Common cursor argument checking routine.
*
- * PUBLIC: int __db_cursorchk __P((const DB *, u_int32_t, int));
+ * PUBLIC: int __db_cursorchk __P((const DB *, u_int32_t));
*/
int
-__db_cursorchk(dbp, flags, isrdonly)
+__db_cursorchk(dbp, flags)
const DB *dbp;
u_int32_t flags;
- int isrdonly;
{
+ /* DB_DIRTY_READ is the only valid bit-flag and requires locking. */
+ if (LF_ISSET(DB_DIRTY_READ)) {
+ if (!LOCKING_ON(dbp->dbenv))
+ return (__db_fnl(dbp->dbenv, "DB->cursor"));
+ LF_CLR(DB_DIRTY_READ);
+ }
+
/* Check for invalid function flags. */
switch (flags) {
case 0:
break;
case DB_WRITECURSOR:
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "DB->cursor"));
if (!CDB_LOCKING(dbp->dbenv))
return (__db_ferr(dbp->dbenv, "DB->cursor", 0));
break;
case DB_WRITELOCK:
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "DB->cursor"));
break;
default:
@@ -90,22 +104,25 @@ __db_ccountchk(dbp, flags, isvalid)
* __db_cdelchk --
* Common cursor delete argument checking routine.
*
- * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int));
+ * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int));
*/
int
-__db_cdelchk(dbp, flags, isrdonly, isvalid)
+__db_cdelchk(dbp, flags, isvalid)
const DB *dbp;
u_int32_t flags;
- int isrdonly, isvalid;
+ int isvalid;
{
/* Check for changes to a read-only tree. */
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "c_del"));
/* Check for invalid function flags. */
switch (flags) {
case 0:
break;
+ case DB_UPDATE_SECONDARY:
+ DB_ASSERT(F_ISSET(dbp, DB_AM_SECONDARY));
+ break;
default:
return (__db_ferr(dbp->dbenv, "DBcursor->c_del", 0));
}
@@ -130,7 +147,7 @@ __db_cgetchk(dbp, key, data, flags, isvalid)
u_int32_t flags;
int isvalid;
{
- int ret;
+ int dirty, multi, ret;
/*
* Check for read-modify-write validity. DB_RMW doesn't make sense
@@ -140,44 +157,68 @@ __db_cgetchk(dbp, key, data, flags, isvalid)
* If this changes, confirm that DB does not itself set the DB_RMW
* flag in a path where CDB may have been configured.
*/
- if (LF_ISSET(DB_RMW)) {
- if (!LOCKING_ON(dbp->dbenv)) {
- __db_err(dbp->dbenv,
- "the DB_RMW flag requires locking");
- return (EINVAL);
- }
- LF_CLR(DB_RMW);
+ dirty = 0;
+ if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) {
+ if (!LOCKING_ON(dbp->dbenv))
+ return (__db_fnl(dbp->dbenv, "DBcursor->c_get"));
+ if (LF_ISSET(DB_DIRTY_READ))
+ dirty = 1;
+ LF_CLR(DB_DIRTY_READ | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ multi = 1;
+ if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY);
}
/* Check for invalid function flags. */
switch (flags) {
case DB_CONSUME:
case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_err(dbp->dbenv,
+ "DB_DIRTY_READ is not supported with DB_CONSUME or DB_CONSUME_WAIT");
+ return (EINVAL);
+ }
if (dbp->type != DB_QUEUE)
goto err;
break;
case DB_CURRENT:
case DB_FIRST:
case DB_GET_BOTH:
- case DB_LAST:
+ case DB_GET_BOTH_RANGE:
case DB_NEXT:
case DB_NEXT_DUP:
case DB_NEXT_NODUP:
- case DB_PREV:
- case DB_PREV_NODUP:
case DB_SET:
case DB_SET_RANGE:
break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (multi)
+multi_err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 1));
+ break;
case DB_GET_BOTHC:
if (dbp->type == DB_QUEUE)
goto err;
break;
case DB_GET_RECNO:
- if (!F_ISSET(dbp, DB_BT_RECNUM))
+ /*
+ * The one situation in which this might be legal with a
+ * non-RECNUM dbp is if dbp is a secondary and its primary is
+ * DB_AM_RECNUM.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SECONDARY) ||
+ !F_ISSET(dbp->s_primary, DB_AM_RECNUM)))
goto err;
break;
case DB_SET_RECNO:
- if (!F_ISSET(dbp, DB_BT_RECNUM))
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
goto err;
break;
default:
@@ -190,11 +231,24 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
return (ret);
+ if (multi && !F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_err(dbp->dbenv,
+ "DB_MULTIPLE(_KEY) requires that DB_DBT_USERMEM be set");
+ return (EINVAL);
+ }
+ if (multi &&
+ (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL))) {
+ __db_err(dbp->dbenv,
+ "DB_DBT_PARTIAL forbidden with DB_MULTIPLE(_KEY)");
+ return (EINVAL);
+ }
+
/*
- * The cursor must be initialized for DB_CURRENT or DB_NEXT_DUP,
- * return EINVAL for an invalid cursor, otherwise 0.
+ * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO and
+ * DB_NEXT_DUP. Return EINVAL for an invalid cursor, otherwise 0.
*/
- if (isvalid || (flags != DB_CURRENT && flags != DB_NEXT_DUP))
+ if (isvalid || (flags != DB_CURRENT &&
+ flags != DB_GET_RECNO && flags != DB_NEXT_DUP))
return (0);
return (__db_curinval(dbp->dbenv));
@@ -205,24 +259,35 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0));
* Common cursor put argument checking routine.
*
* PUBLIC: int __db_cputchk __P((const DB *,
- * PUBLIC: const DBT *, DBT *, u_int32_t, int, int));
+ * PUBLIC: const DBT *, DBT *, u_int32_t, int));
*/
int
-__db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
+__db_cputchk(dbp, key, data, flags, isvalid)
const DB *dbp;
const DBT *key;
DBT *data;
u_int32_t flags;
- int isrdonly, isvalid;
+ int isvalid;
{
int key_flags, ret;
key_flags = 0;
/* Check for changes to a read-only tree. */
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "c_put"));
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ if (flags == DB_UPDATE_SECONDARY)
+ flags = DB_KEYLAST;
+ else {
+ __db_err(dbp->dbenv,
+ "DBcursor->c_put forbidden on secondary indices");
+ return (EINVAL);
+ }
+ }
+
/* Check for invalid function flags. */
switch (flags) {
case DB_AFTER:
@@ -238,7 +303,7 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
case DB_QUEUE: /* Not permitted. */
goto err;
case DB_RECNO: /* Only with mutable record numbers. */
- if (!F_ISSET(dbp, DB_RE_RENUMBER))
+ if (!F_ISSET(dbp, DB_AM_RENUMBER))
goto err;
key_flags = 1;
break;
@@ -259,8 +324,6 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid)
/* FALLTHROUGH */
case DB_KEYFIRST:
case DB_KEYLAST:
- if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
- goto err;
key_flags = 1;
break;
default:
@@ -285,48 +348,153 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_put", 0));
}
/*
- * __db_closechk --
- * DB->close flag check.
+ * __db_pgetchk --
+ * DB->pget argument check: secondary handle, flags and primary-key DBT.
*
- * PUBLIC: int __db_closechk __P((const DB *, u_int32_t));
+ * PUBLIC: int __db_pgetchk __P((const DB *, const DBT *, DBT *, DBT *,
+ * PUBLIC: u_int32_t));
*/
int
-__db_closechk(dbp, flags)
+__db_pgetchk(dbp, skey, pkey, data, flags)
const DB *dbp;
+ const DBT *skey;
+ DBT *pkey, *data;
u_int32_t flags;
{
- /* Check for invalid function flags. */
+ int ret;
+ u_int32_t save_flags;
+
+ save_flags = flags;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_err(dbp->dbenv,
+ "DB->pget may only be used on secondary indices");
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_err(dbp->dbenv,
+ "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+ return (EINVAL);
+ }
+
+ /* DB_CONSUME makes no sense on a secondary index. */
+ LF_CLR(DB_RMW);
switch (flags) {
- case 0:
- case DB_NOSYNC:
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ return (__db_ferr(dbp->dbenv, "DB->pget", 0));
+ default:
+ /* __db_getchk will catch the rest. */
+ break;
+ }
+
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0)
+ return (ret);
+
+ /* But pkey can't be NULL for DB_GET_BOTH, whatever modifier bits are set. */
+ if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+ __db_err(dbp->dbenv,
+ "DB_GET_BOTH on a secondary index requires a primary key");
+ return (EINVAL);
+ }
+
+ return (__db_getchk(dbp, skey, data, save_flags));
+}
+
+/*
+ * __db_cpgetchk --
+ * Secondary-index cursor get argument checking routine.
+ *
+ * PUBLIC: int __db_cpgetchk __P((const DB *,
+ * PUBLIC: DBT *, DBT *, DBT *, u_int32_t, int));
+ */
+int
+__db_cpgetchk(dbp, skey, pkey, data, flags, isvalid)
+ const DB *dbp;
+ DBT *skey, *pkey, *data;
+ u_int32_t flags;
+ int isvalid;
+{
+ int ret;
+ u_int32_t save_flags;
+
+ save_flags = flags;
+
+ if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_err(dbp->dbenv,
+ "DBcursor->c_pget may only be used on secondary indices");
+ return (EINVAL);
+ }
+
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ __db_err(dbp->dbenv,
+ "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+ return (EINVAL);
+ }
+
+ LF_CLR(DB_RMW);
+ switch (flags) {
+ case DB_CONSUME:
+ case DB_CONSUME_WAIT:
+ /* DB_CONSUME makes no sense on a secondary index. */
+ return (__db_ferr(dbp->dbenv, "DBcursor->c_pget", 0));
+ case DB_GET_BOTH:
+ /* DB_GET_BOTH is "get both the primary and the secondary". */
+ if (pkey == NULL) {
+ __db_err(dbp->dbenv,
+ "DB_GET_BOTH requires both a secondary and a primary key");
+ return (EINVAL);
+ }
break;
default:
- return (__db_ferr(dbp->dbenv, "DB->close", 0));
+ /* __db_cgetchk will catch the rest. */
+ break;
}
- return (0);
+ /*
+ * We allow the pkey field to be NULL, so that we can make the
+ * two-DBT get calls into wrappers for the three-DBT ones.
+ */
+ if (pkey != NULL &&
+ (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0)
+ return (ret);
+
+ /* Catches DB_GET_BOTH combined with modifier bits, which the switch misses. */
+ if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+ __db_err(dbp->dbenv,
+ "DB_GET_BOTH on a secondary index requires a primary key");
+ return (EINVAL);
+ }
+
+ return (__db_cgetchk(dbp, skey, data, save_flags, isvalid));
}
/*
* __db_delchk --
* Common delete argument checking routine.
*
- * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int));
+ * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t));
*/
int
-__db_delchk(dbp, key, flags, isrdonly)
+__db_delchk(dbp, key, flags)
const DB *dbp;
DBT *key;
u_int32_t flags;
- int isrdonly;
{
COMPQUIET(key, NULL);
/* Check for changes to a read-only tree. */
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "delete"));
/* Check for invalid function flags. */
+ LF_CLR(DB_AUTO_COMMIT);
switch (flags) {
case 0:
break;
@@ -350,7 +518,7 @@ __db_getchk(dbp, key, data, flags)
DBT *data;
u_int32_t flags;
{
- int ret;
+ int dirty, multi, ret;
/*
* Check for read-modify-write validity. DB_RMW doesn't make sense
@@ -360,13 +528,21 @@ __db_getchk(dbp, key, data, flags)
* If this changes, confirm that DB does not itself set the DB_RMW
* flag in a path where CDB may have been configured.
*/
- if (LF_ISSET(DB_RMW)) {
- if (!LOCKING_ON(dbp->dbenv)) {
- __db_err(dbp->dbenv,
- "the DB_RMW flag requires locking");
- return (EINVAL);
- }
- LF_CLR(DB_RMW);
+ dirty = 0;
+ if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) {
+ if (!LOCKING_ON(dbp->dbenv))
+ return (__db_fnl(dbp->dbenv, "DB->get"));
+ if (LF_ISSET(DB_DIRTY_READ))
+ dirty = 1;
+ LF_CLR(DB_DIRTY_READ | DB_RMW);
+ }
+
+ multi = 0;
+ if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+ if (LF_ISSET(DB_MULTIPLE_KEY))
+ goto multi_err;
+ multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0;
+ LF_CLR(DB_MULTIPLE);
}
/* Check for invalid function flags. */
@@ -375,24 +551,48 @@ __db_getchk(dbp, key, data, flags)
case DB_GET_BOTH:
break;
case DB_SET_RECNO:
- if (!F_ISSET(dbp, DB_BT_RECNUM))
+ if (!F_ISSET(dbp, DB_AM_RECNUM))
goto err;
break;
case DB_CONSUME:
case DB_CONSUME_WAIT:
+ if (dirty) {
+ __db_err(dbp->dbenv,
+ "DB_DIRTY_READ is not supported with DB_CONSUME or DB_CONSUME_WAIT");
+ return (EINVAL);
+ }
+ if (multi)
+multi_err: return (__db_ferr(dbp->dbenv, "DB->get", 1));
if (dbp->type == DB_QUEUE)
break;
- /* Fall through */
+ /* FALLTHROUGH */
default:
err: return (__db_ferr(dbp->dbenv, "DB->get", 0));
}
- /* Check for invalid key/data flags. */
+ /*
+ * Check for invalid key/data flags.
+ *
+ * XXX: Dave Krinsky
+ * Remember to modify this when we fix the flag-returning problem.
+ */
if ((ret = __dbt_ferr(dbp, "key", key, flags == DB_SET_RECNO)) != 0)
return (ret);
if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
return (ret);
+ if (multi && !F_ISSET(data, DB_DBT_USERMEM)) {
+ __db_err(dbp->dbenv,
+ "DB_MULTIPLE requires that DB_DBT_USERMEM be set");
+ return (EINVAL);
+ }
+ if (multi &&
+ (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL))) {
+ __db_err(dbp->dbenv,
+ "DB_DBT_PARTIAL forbidden with DB_MULTIPLE(_KEY)");
+ return (EINVAL);
+ }
+
return (0);
}
@@ -449,13 +649,11 @@ __db_joingetchk(dbp, key, flags)
u_int32_t flags;
{
- if (LF_ISSET(DB_RMW)) {
- if (!LOCKING_ON(dbp->dbenv)) {
- __db_err(dbp->dbenv,
- "the DB_RMW flag requires locking");
- return (EINVAL);
- }
- LF_CLR(DB_RMW);
+ if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) {
+ if (!LOCKING_ON(dbp->dbenv))
+ return (__db_fnl(dbp->dbenv, "DBcursor->c_get"));
+
+ LF_CLR(DB_DIRTY_READ | DB_RMW);
}
switch (flags) {
@@ -491,23 +689,32 @@ __db_joingetchk(dbp, key, flags)
* Common put argument checking routine.
*
* PUBLIC: int __db_putchk
- * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int, int));
+ * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int));
*/
int
-__db_putchk(dbp, key, data, flags, isrdonly, isdup)
+__db_putchk(dbp, key, data, flags, isdup)
const DB *dbp;
DBT *key;
const DBT *data;
u_int32_t flags;
- int isrdonly, isdup;
+ int isdup;
{
- int ret;
+ int ret, returnkey;
+
+ returnkey = 0;
/* Check for changes to a read-only tree. */
- if (isrdonly)
+ if (IS_READONLY(dbp))
return (__db_rdonly(dbp->dbenv, "put"));
+ /* Check for puts on a secondary. */
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_err(dbp->dbenv, "DB->put forbidden on secondary indices");
+ return (EINVAL);
+ }
+
/* Check for invalid function flags. */
+ LF_CLR(DB_AUTO_COMMIT);
switch (flags) {
case 0:
case DB_NOOVERWRITE:
@@ -515,6 +722,7 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup)
case DB_APPEND:
if (dbp->type != DB_RECNO && dbp->type != DB_QUEUE)
goto err;
+ returnkey = 1;
break;
case DB_NODUPDATA:
if (F_ISSET(dbp, DB_AM_DUPSORT))
@@ -525,7 +733,7 @@ err: return (__db_ferr(dbp->dbenv, "DB->put", 0));
}
/* Check for invalid key/data flags. */
- if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+ if ((ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0)
return (ret);
if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
return (ret);
@@ -541,28 +749,6 @@ err: return (__db_ferr(dbp->dbenv, "DB->put", 0));
}
/*
- * __db_removechk --
- * DB->remove flag check.
- *
- * PUBLIC: int __db_removechk __P((const DB *, u_int32_t));
- */
-int
-__db_removechk(dbp, flags)
- const DB *dbp;
- u_int32_t flags;
-{
- /* Check for invalid function flags. */
- switch (flags) {
- case 0:
- break;
- default:
- return (__db_ferr(dbp->dbenv, "DB->remove", 0));
- }
-
- return (0);
-}
-
-/*
* __db_statchk --
* Common stat argument checking routine.
*
@@ -576,12 +762,13 @@ __db_statchk(dbp, flags)
/* Check for invalid function flags. */
switch (flags) {
case 0:
- case DB_CACHED_COUNTS:
+ case DB_FAST_STAT:
+ case DB_CACHED_COUNTS: /* Deprecated and undocumented. */
break;
- case DB_RECORDCOUNT:
+ case DB_RECORDCOUNT: /* Deprecated and undocumented. */
if (dbp->type == DB_RECNO)
break;
- if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_BT_RECNUM))
+ if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))
break;
goto err;
default:
@@ -636,9 +823,9 @@ __dbt_ferr(dbp, name, dbt, check_thread)
* database and then specify that same DBT as a key to a primary
* database, without having to clear flags.
*/
- if ((ret = __db_fchk(dbenv, name, dbt->flags,
- DB_DBT_MALLOC | DB_DBT_DUPOK |
- DB_DBT_REALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0)
+ if ((ret = __db_fchk(dbenv, name, dbt->flags, DB_DBT_APPMALLOC |
+ DB_DBT_MALLOC | DB_DBT_DUPOK | DB_DBT_REALLOC | DB_DBT_USERMEM |
+ DB_DBT_PARTIAL)) != 0)
return (ret);
switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM)) {
case 0:
@@ -674,6 +861,20 @@ __db_rdonly(dbenv, name)
}
/*
+ * __db_fnl --
+ * Report that "name" got DB_DIRTY_READ/DB_RMW without locking; return EINVAL.
+ */
+static int
+__db_fnl(dbenv, name)
+ const DB_ENV *dbenv;
+ const char *name;
+{
+ __db_err(dbenv,
+ "%s: the DB_DIRTY_READ and DB_RMW flags require locking", name);
+ return (EINVAL);
+}
+
+/*
* __db_curinval
* Report that a cursor is in an invalid state.
*/
@@ -685,3 +886,98 @@ __db_curinval(dbenv)
"Cursor position must be set before performing this operation");
return (EINVAL);
}
+
+/*
+ * __db_secondary_corrupt --
+ * Log that a secondary index holds a record with no counterpart in the
+ * primary, and return DB_SECONDARY_BAD for the caller to hand back.
+ *
+ * PUBLIC: int __db_secondary_corrupt __P((DB *));
+ */
+int
+__db_secondary_corrupt(dbp)
+ DB *dbp;
+{
+
+ __db_err(dbp->dbenv,
+ "Secondary index corrupt: item in secondary not found in primary");
+ return (DB_SECONDARY_BAD);
+}
+
+/*
+ * __db_associatechk --
+ * Validate the handles, callback and flags passed to DB->associate().
+ *
+ * PUBLIC: int __db_associatechk __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associatechk(dbp, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+
+ dbenv = dbp->dbenv; /* All errors are reported via the primary's env. */
+
+ if (F_ISSET(sdbp, DB_AM_SECONDARY)) {
+ __db_err(dbenv,
+ "Secondary index handles may not be re-associated");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_err(dbenv,
+ "Secondary indices may not be used as primary databases");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_err(dbenv,
+ "Primary databases may not be configured with duplicates");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_err(dbenv,
+ "Renumbering recno databases may not be used as primary databases");
+ return (EINVAL);
+ }
+ if (callback == NULL &&
+ (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) {
+ __db_err(dbenv,
+ "Callback function may be NULL only when database handles are read-only");
+ return (EINVAL);
+ }
+
+ return (__db_fchk(dbenv,
+ "DB->associate", flags, DB_CREATE | DB_AUTO_COMMIT));
+}
+
+/*
+ * __db_txn_auto --
+ * Begin the DB_AUTO_COMMIT transaction and hand it back through txnidp.
+ *
+ * PUBLIC: int __db_txn_auto __P((DB *, DB_TXN **));
+ */
+int
+__db_txn_auto(dbp, txnidp)
+ DB *dbp;
+ DB_TXN **txnidp;
+{
+ DB_ENV *dbenv;
+
+ dbenv = dbp->dbenv;
+
+ if (*txnidp != NULL) {
+ __db_err(dbenv,
+ "DB_AUTO_COMMIT may not be specified along with a transaction handle");
+ return (EINVAL);
+ }
+
+ if (!TXN_ON(dbenv)) {
+ __db_err(dbenv,
+ "DB_AUTO_COMMIT may not be specified in non-transactional environment");
+ return (EINVAL);
+ }
+
+ return (dbenv->txn_begin(dbenv, NULL, txnidp, 0));
+}
diff --git a/bdb/db/db_join.c b/bdb/db/db_join.c
index 881dedde0fc..6281b1a8383 100644
--- a/bdb/db/db_join.c
+++ b/bdb/db/db_join.c
@@ -1,14 +1,14 @@
-/*-
+/*
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1998, 1999, 2000
+ * Copyright (c) 1998-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_join.c,v 11.31 2000/12/20 22:41:54 krinsky Exp $";
+static const char revid[] = "$Id: db_join.c,v 11.55 2002/08/08 03:57:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -19,16 +19,17 @@ static const char revid[] = "$Id: db_join.c,v 11.31 2000/12/20 22:41:54 krinsky
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_join.h"
-#include "db_am.h"
-#include "btree.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_join.h"
+#include "dbinc/btree.h"
static int __db_join_close __P((DBC *));
static int __db_join_cmp __P((const void *, const void *));
static int __db_join_del __P((DBC *, u_int32_t));
static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
-static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+static int __db_join_primget __P((DB *,
+ DB_TXN *, u_int32_t, DBT *, DBT *, u_int32_t));
static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
/*
@@ -84,7 +85,8 @@ __db_join(primary, curslist, dbcp, flags)
DBC *dbc;
JOIN_CURSOR *jc;
int ret;
- u_int32_t i, ncurs, nslots;
+ u_int32_t i;
+ size_t ncurs, nslots;
COMPQUIET(nslots, 0);
@@ -104,11 +106,13 @@ __db_join(primary, curslist, dbcp, flags)
1, sizeof(JOIN_CURSOR), &jc)) != 0)
goto err;
- if ((ret = __os_malloc(dbenv, 256, NULL, &jc->j_key.data)) != 0)
+ if ((ret = __os_malloc(dbenv, 256, &jc->j_key.data)) != 0)
goto err;
jc->j_key.ulen = 256;
F_SET(&jc->j_key, DB_DBT_USERMEM);
+ F_SET(&jc->j_rdata, DB_DBT_REALLOC);
+
for (jc->j_curslist = curslist;
*jc->j_curslist != NULL; jc->j_curslist++)
;
@@ -184,7 +188,7 @@ __db_join(primary, curslist, dbcp, flags)
jc->j_fdupcurs[i] = NULL;
jc->j_exhausted[i] = 0;
}
- jc->j_ncurs = ncurs;
+ jc->j_ncurs = (u_int32_t)ncurs;
/*
* If DB_JOIN_NOSORT is not set, optimize secondary cursors by
@@ -226,20 +230,20 @@ __db_join(primary, curslist, dbcp, flags)
err: if (jc != NULL) {
if (jc->j_curslist != NULL)
- __os_free(jc->j_curslist, nslots * sizeof(DBC *));
+ __os_free(dbenv, jc->j_curslist);
if (jc->j_workcurs != NULL) {
if (jc->j_workcurs[0] != NULL)
- __os_free(jc->j_workcurs[0], sizeof(DBC));
- __os_free(jc->j_workcurs, nslots * sizeof(DBC *));
+ __os_free(dbenv, jc->j_workcurs[0]);
+ __os_free(dbenv, jc->j_workcurs);
}
if (jc->j_fdupcurs != NULL)
- __os_free(jc->j_fdupcurs, nslots * sizeof(DBC *));
+ __os_free(dbenv, jc->j_fdupcurs);
if (jc->j_exhausted != NULL)
- __os_free(jc->j_exhausted, nslots * sizeof(u_int8_t));
- __os_free(jc, sizeof(JOIN_CURSOR));
+ __os_free(dbenv, jc->j_exhausted);
+ __os_free(dbenv, jc);
}
if (dbc != NULL)
- __os_free(dbc, sizeof(DBC));
+ __os_free(dbenv, dbc);
return (ret);
}
@@ -279,8 +283,8 @@ __db_join_get(dbc, key_arg, data_arg, flags)
DB *dbp;
DBC *cp;
JOIN_CURSOR *jc;
- int ret;
- u_int32_t i, j, operation;
+ int db_manage_data, ret;
+ u_int32_t i, j, operation, opmods;
dbp = dbc->dbp;
jc = (JOIN_CURSOR *)dbc->internal;
@@ -289,6 +293,12 @@ __db_join_get(dbc, key_arg, data_arg, flags)
operation = LF_ISSET(DB_OPFLAGS_MASK);
+ /* !!!
+ * If the set of flags here changes, check that __db_join_primget
+ * is updated to handle them properly.
+ */
+ opmods = LF_ISSET(DB_RMW | DB_DIRTY_READ);
+
if ((ret = __db_joingetchk(dbp, key_arg, flags)) != 0)
return (ret);
@@ -319,13 +329,14 @@ __db_join_get(dbc, key_arg, data_arg, flags)
goto samekey;
F_CLR(jc, JOIN_RETRY);
-retry: ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0],
- &jc->j_key, key_n, jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT);
+retry: ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0],
+ &jc->j_key, key_n,
+ opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
if (ret == ENOMEM) {
jc->j_key.ulen <<= 1;
if ((ret = __os_realloc(dbp->dbenv,
- jc->j_key.ulen, NULL, &jc->j_key.data)) != 0)
+ jc->j_key.ulen, &jc->j_key.data)) != 0)
goto mem_err;
goto retry;
}
@@ -379,7 +390,7 @@ retry: ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0],
retry2: cp = jc->j_workcurs[i];
if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
- jc->j_exhausted[i])) == DB_NOTFOUND) {
+ jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
/*
* jc->j_workcurs[i] has no more of the datum we're
* interested in. Go back one cursor and get
@@ -475,7 +486,7 @@ retry2: cp = jc->j_workcurs[i];
if (ret == ENOMEM) {
jc->j_key.ulen <<= 1;
if ((ret = __os_realloc(dbp->dbenv, jc->j_key.ulen,
- NULL, &jc->j_key.data)) != 0) {
+ &jc->j_key.data)) != 0) {
mem_err: __db_err(dbp->dbenv,
"Allocation failed for join key, len = %lu",
(u_long)jc->j_key.ulen);
@@ -523,8 +534,8 @@ samekey: /*
* Get the key we tried and failed to return last time;
* it should be the current datum of all the secondary cursors.
*/
- if ((ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0],
- &jc->j_key, key_n, DB_CURRENT)) != 0)
+ if ((ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0],
+ &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
return (ret);
F_CLR(jc, JOIN_RETRY);
}
@@ -532,36 +543,28 @@ samekey: /*
/*
* ret == 0; we have a key to return.
*
- * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to
- * copy it back into the dbt we were given for the key;
- * call __db_retcopy.
- *
- * Otherwise, assert that we do not in fact need to copy anything
- * and simply proceed.
+ * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
+ * back into the dbt we were given for the key; call __db_retcopy.
+ * Otherwise, assert that we do not need to copy anything and proceed.
*/
- if (F_ISSET(key_arg, DB_DBT_USERMEM) ||
- F_ISSET(key_arg, DB_DBT_MALLOC)) {
+ DB_ASSERT(F_ISSET(
+ key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC) || key_n == key_arg);
+
+ if (F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC) &&
+ (ret = __db_retcopy(dbp->dbenv,
+ key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
/*
- * We need to copy the key back into our original
- * datum. Do so.
+ * The retcopy failed, most commonly because we have a user
+ * buffer for the key which is too small. Set things up to
+ * retry next time, and return.
*/
- if ((ret = __db_retcopy(dbp,
- key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
- /*
- * The retcopy failed, most commonly because we
- * have a user buffer for the key which is too small.
- * Set things up to retry next time, and return.
- */
- F_SET(jc, JOIN_RETRY);
- return (ret);
- }
- } else
- DB_ASSERT(key_n == key_arg);
+ F_SET(jc, JOIN_RETRY);
+ return (ret);
+ }
/*
- * If DB_JOIN_ITEM is
- * set, we return it; otherwise we do the lookup in the
- * primary and then return.
+ * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup
+ * in the primary and then return.
*
* Note that we use key_arg here; it is safe (and appropriate)
* to do so.
@@ -569,14 +572,45 @@ samekey: /*
if (operation == DB_JOIN_ITEM)
return (0);
- if ((ret = jc->j_primary->get(jc->j_primary,
- jc->j_curslist[0]->txn, key_arg, data_arg, 0)) != 0)
- /*
- * The get on the primary failed, most commonly because we're
- * using a user buffer that's not big enough. Flag our
- * failure so we can return the same key next time.
- */
- F_SET(jc, JOIN_RETRY);
+ /*
+ * If data_arg->flags == 0--that is, if DB is managing the
+ * data DBT's memory--it's not safe to just pass the DBT
+ * through to the primary get call, since we don't want that
+ * memory to belong to the primary DB handle (and if the primary
+ * is free-threaded, it can't anyway).
+ *
+ * Instead, use memory that is managed by the join cursor, in
+ * jc->j_rdata.
+ */
+ if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM))
+ db_manage_data = 1;
+ else
+ db_manage_data = 0;
+ if ((ret = __db_join_primget(jc->j_primary,
+ jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_arg,
+ db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
+ if (ret == DB_NOTFOUND)
+ /*
+ * If ret == DB_NOTFOUND, the primary and secondary
+ * are out of sync; every item in each secondary
+ * should correspond to something in the primary,
+ * or we shouldn't have done the join this way.
+ * Wail.
+ */
+ ret = __db_secondary_corrupt(jc->j_primary);
+ else
+ /*
+ * The get on the primary failed for some other
+ * reason, most commonly because we're using a user
+ * buffer that's not big enough. Flag our failure
+ * so we can return the same key next time.
+ */
+ F_SET(jc, JOIN_RETRY);
+ }
+ if (db_manage_data && ret == 0) {
+ data_arg->data = jc->j_rdata.data;
+ data_arg->size = jc->j_rdata.size;
+ }
return (ret);
}
@@ -586,12 +620,14 @@ __db_join_close(dbc)
DBC *dbc;
{
DB *dbp;
+ DB_ENV *dbenv;
JOIN_CURSOR *jc;
int ret, t_ret;
u_int32_t i;
jc = (JOIN_CURSOR *)dbc->internal;
dbp = dbc->dbp;
+ dbenv = dbp->dbenv;
ret = t_ret = 0;
/*
@@ -599,11 +635,11 @@ __db_join_close(dbc)
* must happen before any action that can fail and return, or else
* __db_close may loop indefinitely.
*/
- MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
TAILQ_REMOVE(&dbp->join_queue, dbc, links);
- MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
- PANIC_CHECK(dbc->dbp->dbenv);
+ PANIC_CHECK(dbenv);
/*
* Close any open scratch cursors. In each case, there may
@@ -625,13 +661,15 @@ __db_join_close(dbc)
ret = t_ret;
}
- __os_free(jc->j_exhausted, 0);
- __os_free(jc->j_curslist, 0);
- __os_free(jc->j_workcurs, 0);
- __os_free(jc->j_fdupcurs, 0);
- __os_free(jc->j_key.data, jc->j_key.ulen);
- __os_free(jc, sizeof(JOIN_CURSOR));
- __os_free(dbc, sizeof(DBC));
+ __os_free(dbenv, jc->j_exhausted);
+ __os_free(dbenv, jc->j_curslist);
+ __os_free(dbenv, jc->j_workcurs);
+ __os_free(dbenv, jc->j_fdupcurs);
+ __os_free(dbenv, jc->j_key.data);
+ if (jc->j_rdata.data != NULL)
+ __os_ufree(dbenv, jc->j_rdata.data);
+ __os_free(dbenv, jc);
+ __os_free(dbenv, dbc);
return (ret);
}
@@ -652,10 +690,10 @@ __db_join_close(dbc)
* If no matching datum exists, returns DB_NOTFOUND, else 0.
*/
static int
-__db_join_getnext(dbc, key, data, exhausted)
+__db_join_getnext(dbc, key, data, exhausted, opmods)
DBC *dbc;
DBT *key, *data;
- u_int32_t exhausted;
+ u_int32_t exhausted, opmods;
{
int ret, cmp;
DB *dbp;
@@ -667,10 +705,14 @@ __db_join_getnext(dbc, key, data, exhausted)
switch (exhausted) {
case 0:
+ /*
+ * We don't want to step on data->data; use a new
+ * DBT and malloc so we don't step on dbc's rdata memory.
+ */
memset(&ldata, 0, sizeof(DBT));
- /* We don't want to step on data->data; malloc. */
F_SET(&ldata, DB_DBT_MALLOC);
- if ((ret = dbc->c_get(dbc, key, &ldata, DB_CURRENT)) != 0)
+ if ((ret = dbc->c_real_get(dbc,
+ key, &ldata, opmods | DB_CURRENT)) != 0)
break;
cmp = func(dbp, data, &ldata);
if (cmp == 0) {
@@ -679,10 +721,10 @@ __db_join_getnext(dbc, key, data, exhausted)
* it into data, then free the buffer we malloc'ed
* above.
*/
- if ((ret = __db_retcopy(dbp, data, ldata.data,
+ if ((ret = __db_retcopy(dbp->dbenv, data, ldata.data,
ldata.size, &data->data, &data->size)) != 0)
return (ret);
- __os_free(ldata.data, 0);
+ __os_ufree(dbp->dbenv, ldata.data);
return (0);
}
@@ -691,10 +733,10 @@ __db_join_getnext(dbc, key, data, exhausted)
* dups. We just forget about ldata and free
* its buffer--data contains the value we're searching for.
*/
- __os_free(ldata.data, 0);
+ __os_ufree(dbp->dbenv, ldata.data);
/* FALLTHROUGH */
case 1:
- ret = dbc->c_get(dbc, key, data, DB_GET_BOTHC);
+ ret = dbc->c_real_get(dbc, key, data, opmods | DB_GET_BOTHC);
break;
default:
ret = EINVAL;
@@ -708,7 +750,6 @@ __db_join_getnext(dbc, key, data, exhausted)
* __db_join_cmp --
* Comparison function for sorting DBCs in cardinality order.
*/
-
static int
__db_join_cmp(a, b)
const void *a, *b;
@@ -728,3 +769,54 @@ __db_join_cmp(a, b)
return (counta - countb);
}
+
+/*
+ * __db_join_primget --
+ * Perform a DB->get in the primary, being careful not to use a new
+ * locker ID if we're doing CDB locking.
+ */
+static int
+__db_join_primget(dbp, txn, lockerid, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t lockerid;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ int dirty, ret, rmw, t_ret;
+
+ /*
+ * The only allowable flags here are the two flags copied into
+ * "opmods" in __db_join_get, DB_RMW and DB_DIRTY_READ. The former
+ * is an op on the c_get call, the latter on the cursor call.
+ * It's a DB bug if we allow any other flags down in here.
+ */
+ rmw = LF_ISSET(DB_RMW);
+ dirty = LF_ISSET(DB_DIRTY_READ);
+ LF_CLR(DB_RMW | DB_DIRTY_READ);
+ DB_ASSERT(flags == 0);
+
+ if ((ret = __db_icursor(dbp,
+ txn, dbp->type, PGNO_INVALID, 0, lockerid, &dbc)) != 0)
+ return (ret);
+
+ if (dirty ||
+ (txn != NULL && F_ISSET(txn, TXN_DIRTY_READ)))
+ F_SET(dbc, DBC_DIRTY_READ);
+ F_SET(dbc, DBC_TRANSIENT);
+
+ /*
+ * This shouldn't be necessary, thanks to the fact that join cursors
+ * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
+ * sake, we mirror what __db_get does.
+ */
+ SET_RET_MEM(dbc, dbp);
+
+ ret = dbc->c_get(dbc, key, data, DB_SET | rmw);
+
+ if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/bdb/db/db_meta.c b/bdb/db/db_meta.c
index 5b57c369454..015ef5c8fc7 100644
--- a/bdb/db/db_meta.c
+++ b/bdb/db/db_meta.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -43,7 +43,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_meta.c,v 11.26 2001/01/16 21:57:19 ubell Exp $";
+static const char revid[] = "$Id: db_meta.c,v 11.61 2002/08/08 03:57:48 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -53,12 +53,37 @@ static const char revid[] = "$Id: db_meta.c,v 11.26 2001/01/16 21:57:19 ubell Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "lock.h"
-#include "txn.h"
-#include "db_am.h"
-#include "btree.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/lock.h"
+#include "dbinc/db_am.h"
+
+static void __db_init_meta __P((void *, u_int32_t, db_pgno_t, u_int32_t));
+
+/*
+ * __db_init_meta --
+ * Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT). We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(p, pgsize, pgno, pgtype)
+ void *p;
+ u_int32_t pgsize;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DB_LSN save_lsn;
+ DBMETA *meta;
+
+ meta = (DBMETA *)p;
+ save_lsn = meta->lsn;
+ memset(meta, 0, sizeof(DBMETA));
+ meta->lsn = save_lsn;
+ meta->pagesize = pgsize;
+ meta->pgno = pgno;
+ meta->type = (u_int8_t)pgtype;
+}
/*
* __db_new --
@@ -75,60 +100,110 @@ __db_new(dbc, type, pagepp)
DBMETA *meta;
DB *dbp;
DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
PAGE *h;
- db_pgno_t pgno;
- int ret;
+ db_pgno_t pgno, newnext;
+ int meta_flags, extend, ret;
- dbp = dbc->dbp;
meta = NULL;
+ meta_flags = 0;
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
h = NULL;
+ newnext = PGNO_INVALID;
pgno = PGNO_BASE_MD;
if ((ret = __db_lget(dbc,
LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
goto err;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
goto err;
-
if (meta->free == PGNO_INVALID) {
- if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_NEW, &h)) != 0)
- goto err;
- ZERO_LSN(h->lsn);
- h->pgno = pgno;
+ pgno = meta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ extend = 1;
} else {
pgno = meta->free;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
goto err;
- meta->free = h->next_pgno;
- (void)memp_fset(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+
+ /*
+ * We want to take the first page off the free list and
+ * then set meta->free to that page's next_pgno, but
+ * we need to log the change first.
+ */
+ newnext = h->next_pgno;
+ lsn = h->lsn;
+ extend = 0;
}
- DB_ASSERT(TYPE(h) == P_INVALID);
+ /*
+ * Log the allocation before fetching the new page. If we
+ * don't have room in the log then we don't want to tell
+ * mpool to extend the file.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+ &LSN(meta), PGNO_BASE_MD, &lsn, pgno,
+ (u_int32_t)type, newnext)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
- if (TYPE(h) != P_INVALID)
- return (__db_panic(dbp->dbenv, EINVAL));
+ meta_flags = DB_MPOOL_DIRTY;
+ meta->free = newnext;
- /* Log the change. */
- if (DB_LOGGING(dbc)) {
- if ((ret = __db_pg_alloc_log(dbp->dbenv,
- dbc->txn, &LSN(meta), 0, dbp->log_fileid,
- &LSN(meta), &h->lsn, h->pgno,
- (u_int32_t)type, meta->free)) != 0)
+ if (extend == 1) {
+ meta->last_pgno++;
+ if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_NEW, &h)) != 0)
goto err;
- LSN(h) = LSN(meta);
+ ZERO_LSN(h->lsn);
+ h->pgno = pgno;
+ DB_ASSERT(pgno == meta->last_pgno);
}
+ LSN(h) = LSN(meta);
+
+ DB_ASSERT(TYPE(h) == P_INVALID);
+
+ if (TYPE(h) != P_INVALID)
+ return (__db_panic(dbp->dbenv, EINVAL));
- (void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
+ (void)mpf->put(mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
(void)__TLPUT(dbc, metalock);
- P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+ switch (type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ __db_init_meta(h, dbp->pgsize, h->pgno, type);
+ break;
+ default:
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+ break;
+ }
+
+ /*
+ * If dirty reads are enabled and we are in a transaction, we could
+ * abort this allocation after the page(s) pointing to this
+ * one have their locks downgraded. This would permit dirty readers
+ * to access this page which is ok, but they must be off the
+ * page when we abort. This will also prevent updates happening
+ * to this page until we commit.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_DIRTY) && dbc->txn != NULL) {
+ if ((ret = __db_lget(dbc, 0,
+ h->pgno, DB_LOCK_WWRITE, 0, &metalock)) != 0)
+ goto err;
+ }
*pagepp = h;
return (0);
err: if (h != NULL)
- (void)memp_fput(dbp->mpf, h, 0);
+ (void)mpf->put(mpf, h, 0);
if (meta != NULL)
- (void)memp_fput(dbp->mpf, meta, 0);
+ (void)mpf->put(mpf, meta, meta_flags);
(void)__TLPUT(dbc, metalock);
return (ret);
}
@@ -148,11 +223,13 @@ __db_free(dbc, h)
DB *dbp;
DBT ldbt;
DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
db_pgno_t pgno;
u_int32_t dirty_flag;
int ret, t_ret;
dbp = dbc->dbp;
+ mpf = dbp->mpf;
/*
* Retrieve the metadata page and insert the page at the head of
@@ -163,43 +240,44 @@ __db_free(dbc, h)
dirty_flag = 0;
pgno = PGNO_BASE_MD;
if ((ret = __db_lget(dbc,
- LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
goto err;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) {
+ if ((ret = mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0) {
(void)__TLPUT(dbc, metalock);
goto err;
}
DB_ASSERT(h->pgno != meta->free);
/* Log the change. */
- if (DB_LOGGING(dbc)) {
+ if (DBC_LOGGING(dbc)) {
memset(&ldbt, 0, sizeof(ldbt));
ldbt.data = h;
- ldbt.size = P_OVERHEAD;
- if ((ret = __db_pg_free_log(dbp->dbenv,
- dbc->txn, &LSN(meta), 0, dbp->log_fileid, h->pgno,
- &LSN(meta), &ldbt, meta->free)) != 0) {
- (void)memp_fput(dbp->mpf, (PAGE *)meta, 0);
+ ldbt.size = P_OVERHEAD(dbp);
+ if ((ret = __db_pg_free_log(dbp,
+ dbc->txn, &LSN(meta), 0, h->pgno,
+ &LSN(meta), PGNO_BASE_MD, &ldbt, meta->free)) != 0) {
+ (void)mpf->put(mpf, (PAGE *)meta, 0);
(void)__TLPUT(dbc, metalock);
- return (ret);
+ goto err;
}
- LSN(h) = LSN(meta);
- }
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+ LSN(h) = LSN(meta);
P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, meta->free, 0, P_INVALID);
meta->free = h->pgno;
/* Discard the metadata page. */
- if ((t_ret = memp_fput(dbp->mpf,
- (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ if ((t_ret =
+ mpf->put(mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0)
ret = t_ret;
if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
ret = t_ret;
/* Discard the caller's page reference. */
dirty_flag = DB_MPOOL_DIRTY;
-err: if ((t_ret = memp_fput(dbp->mpf, h, dirty_flag)) != 0 && ret == 0)
+err: if ((t_ret = mpf->put(mpf, h, dirty_flag)) != 0 && ret == 0)
ret = t_ret;
/*
@@ -227,44 +305,63 @@ __db_lprint(dbc)
if (LOCKING_ON(dbp->dbenv)) {
req.op = DB_LOCK_DUMP;
- lock_vec(dbp->dbenv, dbc->locker, 0, &req, 1, NULL);
+ dbp->dbenv->lock_vec(dbp->dbenv, dbc->locker, 0, &req, 1, NULL);
}
return (0);
}
#endif
/*
+ * Implement the rules for transactional locking. We can release the previous
+ * lock if we are not in a transaction or COUPLE_ALWAYS is specified (used in
+ * record locking). If we are doing dirty reads then we can release read locks
+ * and downgrade write locks.
+ */
+#define DB_PUT_ACTION(dbc, action, lockp) \
+ (((action == LCK_COUPLE || action == LCK_COUPLE_ALWAYS) && \
+ LOCK_ISSET(*lockp)) ? \
+ (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS || \
+ (F_ISSET(dbc, DBC_DIRTY_READ) && \
+ (lockp)->mode == DB_LOCK_DIRTY)) ? LCK_COUPLE : \
+ (F_ISSET((dbc)->dbp, DB_AM_DIRTY) && \
+ (lockp)->mode == DB_LOCK_WRITE) ? LCK_DOWNGRADE : 0 : 0)
+
+/*
* __db_lget --
* The standard lock get call.
*
* PUBLIC: int __db_lget __P((DBC *,
- * PUBLIC: int, db_pgno_t, db_lockmode_t, int, DB_LOCK *));
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
*/
int
-__db_lget(dbc, flags, pgno, mode, lkflags, lockp)
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
DBC *dbc;
- int flags, lkflags;
+ int action;
db_pgno_t pgno;
db_lockmode_t mode;
+ u_int32_t lkflags;
DB_LOCK *lockp;
{
DB *dbp;
DB_ENV *dbenv;
DB_LOCKREQ couple[2], *reqp;
- int ret;
+ DB_TXN *txn;
+ int has_timeout, ret;
dbp = dbc->dbp;
dbenv = dbp->dbenv;
+ txn = dbc->txn;
/*
* We do not always check if we're configured for locking before
* calling __db_lget to acquire the lock.
*/
- if (CDB_LOCKING(dbenv)
- || !LOCKING_ON(dbenv) || F_ISSET(dbc, DBC_COMPENSATE)
- || (!LF_ISSET(LCK_ROLLBACK) && F_ISSET(dbc, DBC_RECOVER))
- || (!LF_ISSET(LCK_ALWAYS) && F_ISSET(dbc, DBC_OPD))) {
- lockp->off = LOCK_INVALID;
+ if (CDB_LOCKING(dbenv) ||
+ !LOCKING_ON(dbenv) || F_ISSET(dbc, DBC_COMPENSATE) ||
+ (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || F_ISSET(dbenv, DB_ENV_REP_CLIENT))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
return (0);
}
@@ -282,27 +379,73 @@ __db_lget(dbc, flags, pgno, mode, lkflags, lockp)
if (DB_NONBLOCK(dbc))
lkflags |= DB_LOCK_NOWAIT;
- /*
- * If the object not currently locked, acquire the lock and return,
- * otherwise, lock couple.
- */
- if (LF_ISSET(LCK_COUPLE)) {
- couple[0].op = DB_LOCK_GET;
+ if (F_ISSET(dbc, DBC_DIRTY_READ) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_DIRTY;
+
+ has_timeout = txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT);
+
+ switch (DB_PUT_ACTION(dbc, action, lockp)) {
+ case LCK_COUPLE:
+lck_couple: couple[0].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
couple[0].obj = &dbc->lock_dbt;
couple[0].mode = mode;
- couple[1].op = DB_LOCK_PUT;
- couple[1].lock = *lockp;
+ if (action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ UMRW_SET(couple[0].timeout);
+ if (has_timeout)
+ couple[0].timeout = txn->lock_timeout;
+ if (action == LCK_COUPLE) {
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ }
- ret = lock_vec(dbenv,
- dbc->locker, lkflags, couple, 2, &reqp);
+ ret = dbenv->lock_vec(dbenv, dbc->locker,
+ lkflags, couple, action == LCK_COUPLE ? 2 : 1, &reqp);
if (ret == 0 || reqp == &couple[1])
*lockp = couple[0].lock;
- } else {
- ret = lock_get(dbenv,
+ break;
+ case LCK_DOWNGRADE:
+ if ((ret = dbenv->lock_downgrade(
+ dbenv, lockp, DB_LOCK_WWRITE, 0)) != 0)
+ return (ret);
+ /* FALL THROUGH */
+ default:
+ if (has_timeout)
+ goto lck_couple;
+ ret = dbenv->lock_get(dbenv,
dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_ENV *dbenv;
+ int ret;
- if (ret != 0)
- lockp->off = LOCK_INVALID;
+ dbenv = dbc->dbp->dbenv;
+
+ switch (DB_PUT_ACTION(dbc, LCK_COUPLE, lockp)) {
+ case LCK_COUPLE:
+ ret = dbenv->lock_put(dbenv, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ ret = __lock_downgrade(dbenv, lockp, DB_LOCK_WWRITE, 0);
+ break;
+ default:
+ ret = 0;
+ break;
}
return (ret);
diff --git a/bdb/db/db_method.c b/bdb/db/db_method.c
index 01568a6e144..14712180df0 100644
--- a/bdb/db/db_method.c
+++ b/bdb/db/db_method.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2000
+ * Copyright (c) 1999-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_method.c,v 11.36 2000/12/21 09:17:04 krinsky Exp $";
+static const char revid[] = "$Id: db_method.c,v 11.78 2002/07/02 19:26:55 sue Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -21,50 +21,56 @@ static const char revid[] = "$Id: db_method.c,v 11.36 2000/12/21 09:17:04 krinsk
#include <string.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
-#include "xa.h"
-#include "xa_ext.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/xa.h"
+#include "dbinc_auto/xa_ext.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/lock.h"
#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#include "dbinc_auto/db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
#endif
-static int __db_get_byteswapped __P((DB *));
-static DBTYPE
- __db_get_type __P((DB *));
+static int __db_get_byteswapped __P((DB *, int *));
+static int __db_get_type __P((DB *, DBTYPE *dbtype));
static int __db_init __P((DB *, u_int32_t));
static int __db_key_range
__P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+static int __db_set_alloc __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
+static int __db_set_cache_priority __P((DB *, DB_CACHE_PRIORITY));
static int __db_set_dup_compare
__P((DB *, int (*)(DB *, const DBT *, const DBT *)));
-static void __db_set_errcall __P((DB *, void (*)(const char *, char *)));
-static void __db_set_errfile __P((DB *, FILE *));
+static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
static int __db_set_feedback __P((DB *, void (*)(DB *, int, int)));
static int __db_set_flags __P((DB *, u_int32_t));
-static int __db_set_lorder __P((DB *, int));
-static int __db_set_malloc __P((DB *, void *(*)(size_t)));
static int __db_set_pagesize __P((DB *, u_int32_t));
-static int __db_set_realloc __P((DB *, void *(*)(void *, size_t)));
-static void __db_set_errpfx __P((DB *, const char *));
static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int)));
+static void __db_set_errcall __P((DB *, void (*)(const char *, char *)));
+static void __db_set_errfile __P((DB *, FILE *));
+static void __db_set_errpfx __P((DB *, const char *));
+static int __db_stat_fail __P((DB *, void *, u_int32_t));
static void __dbh_err __P((DB *, int, const char *, ...));
static void __dbh_errx __P((DB *, const char *, ...));
+#ifdef HAVE_RPC
+static int __dbcl_init __P((DB *, DB_ENV *, u_int32_t));
+#endif
+
/*
* db_create --
* DB constructor.
+ *
+ * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t));
*/
int
db_create(dbpp, dbenv, flags)
@@ -102,27 +108,25 @@ db_create(dbpp, dbenv, flags)
if ((ret = __os_calloc(dbenv, 1, sizeof(*dbp), &dbp)) != 0)
return (ret);
#ifdef HAVE_RPC
- if (dbenv != NULL && dbenv->cl_handle != NULL)
+ if (dbenv != NULL && RPC_ON(dbenv))
ret = __dbcl_init(dbp, dbenv, flags);
else
#endif
ret = __db_init(dbp, flags);
if (ret != 0) {
- __os_free(dbp, sizeof(*dbp));
+ __os_free(dbenv, dbp);
return (ret);
}
/* If we don't have an environment yet, allocate a local one. */
if (dbenv == NULL) {
if ((ret = db_env_create(&dbenv, 0)) != 0) {
- __os_free(dbp, sizeof(*dbp));
+ __os_free(dbenv, dbp);
return (ret);
}
- dbenv->dblocal_ref = 0;
F_SET(dbenv, DB_ENV_DBLOCAL);
}
- if (F_ISSET(dbenv, DB_ENV_DBLOCAL))
- ++dbenv->dblocal_ref;
+ ++dbenv->db_ref;
dbp->dbenv = dbenv;
@@ -141,18 +145,21 @@ __db_init(dbp, flags)
{
int ret;
- dbp->log_fileid = DB_LOGFILEID_INVALID;
+ dbp->lid = DB_LOCK_INVALIDID;
+ LOCK_INIT(dbp->handle_lock);
TAILQ_INIT(&dbp->free_queue);
TAILQ_INIT(&dbp->active_queue);
TAILQ_INIT(&dbp->join_queue);
+ LIST_INIT(&dbp->s_secondaries);
FLD_SET(dbp->am_ok,
DB_OK_BTREE | DB_OK_HASH | DB_OK_QUEUE | DB_OK_RECNO);
+ dbp->associate = __db_associate;
dbp->close = __db_close;
dbp->cursor = __db_cursor;
- dbp->del = NULL; /* !!! Must be set by access method. */
+ dbp->del = __db_delete;
dbp->err = __dbh_err;
dbp->errx = __dbh_errx;
dbp->fd = __db_fd;
@@ -162,26 +169,30 @@ __db_init(dbp, flags)
dbp->join = __db_join;
dbp->key_range = __db_key_range;
dbp->open = __db_open;
+ dbp->pget = __db_pget;
dbp->put = __db_put;
dbp->remove = __db_remove;
dbp->rename = __db_rename;
+ dbp->truncate = __db_truncate;
+ dbp->set_alloc = __db_set_alloc;
dbp->set_append_recno = __db_set_append_recno;
dbp->set_cachesize = __db_set_cachesize;
+ dbp->set_cache_priority = __db_set_cache_priority;
dbp->set_dup_compare = __db_set_dup_compare;
+ dbp->set_encrypt = __db_set_encrypt;
dbp->set_errcall = __db_set_errcall;
dbp->set_errfile = __db_set_errfile;
dbp->set_errpfx = __db_set_errpfx;
dbp->set_feedback = __db_set_feedback;
dbp->set_flags = __db_set_flags;
dbp->set_lorder = __db_set_lorder;
- dbp->set_malloc = __db_set_malloc;
dbp->set_pagesize = __db_set_pagesize;
dbp->set_paniccall = __db_set_paniccall;
- dbp->set_realloc = __db_set_realloc;
- dbp->stat = NULL; /* !!! Must be set by access method. */
+ dbp->stat = __db_stat_fail;
dbp->sync = __db_sync;
dbp->upgrade = __db_upgrade;
dbp->verify = __db_verify;
+
/* Access method specific. */
if ((ret = __bam_db_create(dbp)) != 0)
return (ret);
@@ -244,16 +255,7 @@ __dbh_err(dbp, error, fmt, va_alist)
va_dcl
#endif
{
- va_list ap;
-
-#ifdef __STDC__
- va_start(ap, fmt);
-#else
- va_start(ap);
-#endif
- __db_real_err(dbp->dbenv, error, 1, 1, fmt, ap);
-
- va_end(ap);
+ DB_REAL_ERR(dbp->dbenv, error, 1, 1, fmt);
}
/*
@@ -270,16 +272,7 @@ __dbh_errx(dbp, fmt, va_alist)
va_dcl
#endif
{
- va_list ap;
-
-#ifdef __STDC__
- va_start(ap, fmt);
-#else
- va_start(ap);
-#endif
- __db_real_err(dbp->dbenv, 0, 0, 1, fmt, ap);
-
- va_end(ap);
+ DB_REAL_ERR(dbp->dbenv, 0, 0, 1, fmt);
}
/*
@@ -287,25 +280,29 @@ __dbh_errx(dbp, fmt, va_alist)
* Return if database requires byte swapping.
*/
static int
-__db_get_byteswapped(dbp)
+__db_get_byteswapped(dbp, isswapped)
DB *dbp;
+ int *isswapped;
{
DB_ILLEGAL_BEFORE_OPEN(dbp, "get_byteswapped");
- return (F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0);
+ *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
+ return (0);
}
/*
* __db_get_type --
* Return type of underlying database.
*/
-static DBTYPE
-__db_get_type(dbp)
+static int
+__db_get_type(dbp, dbtype)
DB *dbp;
+ DBTYPE *dbtype;
{
DB_ILLEGAL_BEFORE_OPEN(dbp, "get_type");
- return (dbp->type);
+ *dbtype = dbp->type;
+ return (0);
}
/*
@@ -366,6 +363,26 @@ __db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache)
}
/*
+ * __db_set_cache_priority --
+ * Set cache priority for pages from this file.
+ */
+static int
+__db_set_cache_priority(dbp, priority)
+ DB *dbp;
+ DB_CACHE_PRIORITY priority;
+{
+ /*
+ * If an underlying DB_MPOOLFILE exists, call it. Otherwise, save
+ * the information away until DB->open is called.
+ */
+ if (dbp->mpf == NULL) {
+ dbp->priority = priority;
+ return (0);
+ }
+ return (dbp->mpf->set_priority(dbp->mpf, priority));
+}
+
+/*
* __db_set_dup_compare --
* Set duplicate comparison routine.
*/
@@ -374,14 +391,50 @@ __db_set_dup_compare(dbp, func)
DB *dbp;
int (*func) __P((DB *, const DBT *, const DBT *));
{
+ int ret;
+
DB_ILLEGAL_AFTER_OPEN(dbp, "dup_compare");
DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+ if ((ret = dbp->set_flags(dbp, DB_DUPSORT)) != 0)
+ return (ret);
+
dbp->dup_compare = func;
return (0);
}
+/*
+ * __db_set_encrypt --
+ * Set database passwd.
+ */
+static int
+__db_set_encrypt(dbp, passwd, flags)
+ DB *dbp;
+ const char *passwd;
+ u_int32_t flags;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ DB_ILLEGAL_IN_ENV(dbp, "set_encrypt");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_encrypt");
+
+ if ((ret = dbp->dbenv->set_encrypt(dbp->dbenv, passwd, flags)) != 0)
+ return (ret);
+
+ /*
+ * In a real env, this gets initialized with the region. In a local
+ * env, we must do it here.
+ */
+ db_cipher = (DB_CIPHER *)dbp->dbenv->crypto_handle;
+ if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+ (ret = db_cipher->init(dbp->dbenv, db_cipher)) != 0)
+ return (ret);
+
+ return (dbp->set_flags(dbp, DB_ENCRYPT));
+}
+
static void
__db_set_errcall(dbp, errcall)
DB *dbp;
@@ -430,6 +483,21 @@ __db_set_flags(dbp, flags)
*
* The queue access method takes no flags.
*/
+ if (LF_ISSET(DB_ENCRYPT)) {
+ if (!CRYPTO_ON(dbp->dbenv)) {
+ __db_err(dbp->dbenv,
+ "Database environment not configured for encryption");
+ return (EINVAL);
+ }
+ F_SET(dbp, DB_AM_ENCRYPT);
+ F_SET(dbp, DB_AM_CHKSUM);
+ LF_CLR(DB_ENCRYPT);
+ }
+ if (LF_ISSET(DB_CHKSUM_SHA1)) {
+ F_SET(dbp, DB_AM_CHKSUM);
+ LF_CLR(DB_CHKSUM_SHA1);
+ }
+
if ((ret = __bam_set_flags(dbp, &flags)) != 0)
return (ret);
if ((ret = __ram_set_flags(dbp, &flags)) != 0)
@@ -438,7 +506,13 @@ __db_set_flags(dbp, flags)
return (flags == 0 ? 0 : __db_ferr(dbp->dbenv, "DB->set_flags", 0));
}
-static int
+/*
+ * __db_set_lorder --
+ * Set whether lorder is swapped or not.
+ *
+ * PUBLIC: int __db_set_lorder __P((DB *, int));
+ */
+int
__db_set_lorder(dbp, db_lorder)
DB *dbp;
int db_lorder;
@@ -463,14 +537,17 @@ __db_set_lorder(dbp, db_lorder)
}
static int
-__db_set_malloc(dbp, func)
+__db_set_alloc(dbp, mal_func, real_func, free_func)
DB *dbp;
- void *(*func) __P((size_t));
+ void *(*mal_func) __P((size_t));
+ void *(*real_func) __P((void *, size_t));
+ void (*free_func) __P((void *));
{
- DB_ILLEGAL_AFTER_OPEN(dbp, "set_malloc");
+ DB_ILLEGAL_IN_ENV(dbp, "set_alloc");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_alloc");
- dbp->db_malloc = func;
- return (0);
+ return (dbp->dbenv->set_alloc(dbp->dbenv,
+ mal_func, real_func, free_func));
}
static int
@@ -495,7 +572,7 @@ __db_set_pagesize(dbp, db_pagesize)
* We don't want anything that's not a power-of-2, as we rely on that
* for alignment of various types on the pages.
*/
- if ((u_int32_t)1 << __db_log2(db_pagesize) != db_pagesize) {
+ if (!POWER_OF_TWO(db_pagesize)) {
__db_err(dbp->dbenv, "page sizes must be a power-of-2");
return (EINVAL);
}
@@ -511,44 +588,44 @@ __db_set_pagesize(dbp, db_pagesize)
}
static int
-__db_set_realloc(dbp, func)
+__db_set_paniccall(dbp, paniccall)
DB *dbp;
- void *(*func) __P((void *, size_t));
+ void (*paniccall) __P((DB_ENV *, int));
{
- DB_ILLEGAL_AFTER_OPEN(dbp, "set_realloc");
-
- dbp->db_realloc = func;
- return (0);
+ return (dbp->dbenv->set_paniccall(dbp->dbenv, paniccall));
}
static int
-__db_set_paniccall(dbp, paniccall)
+__db_stat_fail(dbp, sp, flags)
DB *dbp;
- void (*paniccall) __P((DB_ENV *, int));
+ void *sp;
+ u_int32_t flags;
{
- return (dbp->dbenv->set_paniccall(dbp->dbenv, paniccall));
+ COMPQUIET(sp, NULL);
+ COMPQUIET(flags, 0);
+
+ /*
+ * DB->stat isn't initialized until the actual DB->open call,
+ * but we don't want to core dump.
+ */
+ PANIC_CHECK(dbp->dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+ /* NOTREACHED */
+ return (EINVAL);
}
#ifdef HAVE_RPC
/*
* __dbcl_init --
* Initialize a DB structure on the server.
- *
- * PUBLIC: #ifdef HAVE_RPC
- * PUBLIC: int __dbcl_init __P((DB *, DB_ENV *, u_int32_t));
- * PUBLIC: #endif
*/
-int
+static int
__dbcl_init(dbp, dbenv, flags)
DB *dbp;
DB_ENV *dbenv;
u_int32_t flags;
{
- CLIENT *cl;
- __db_create_reply *replyp;
- __db_create_msg req;
- int ret;
-
TAILQ_INIT(&dbp->free_queue);
TAILQ_INIT(&dbp->active_queue);
/* !!!
@@ -556,6 +633,7 @@ __dbcl_init(dbp, dbenv, flags)
* not used in RPC clients. See the comment in __dbcl_db_join_ret().
*/
+ dbp->associate = __dbcl_db_associate;
dbp->close = __dbcl_db_close;
dbp->cursor = __dbcl_db_cursor;
dbp->del = __dbcl_db_del;
@@ -563,31 +641,34 @@ __dbcl_init(dbp, dbenv, flags)
dbp->errx = __dbh_errx;
dbp->fd = __dbcl_db_fd;
dbp->get = __dbcl_db_get;
- dbp->get_byteswapped = __dbcl_db_swapped;
+ dbp->get_byteswapped = __db_get_byteswapped;
dbp->get_type = __db_get_type;
dbp->join = __dbcl_db_join;
dbp->key_range = __dbcl_db_key_range;
- dbp->open = __dbcl_db_open;
+ dbp->open = __dbcl_db_open_wrap;
+ dbp->pget = __dbcl_db_pget;
dbp->put = __dbcl_db_put;
dbp->remove = __dbcl_db_remove;
dbp->rename = __dbcl_db_rename;
+ dbp->set_alloc = __dbcl_db_alloc;
dbp->set_append_recno = __dbcl_db_set_append_recno;
dbp->set_cachesize = __dbcl_db_cachesize;
- dbp->set_dup_compare = NULL;
+ dbp->set_cache_priority = __dbcl_db_cache_priority;
+ dbp->set_dup_compare = __dbcl_db_dup_compare;
+ dbp->set_encrypt = __dbcl_db_encrypt;
dbp->set_errcall = __db_set_errcall;
dbp->set_errfile = __db_set_errfile;
dbp->set_errpfx = __db_set_errpfx;
dbp->set_feedback = __dbcl_db_feedback;
dbp->set_flags = __dbcl_db_flags;
dbp->set_lorder = __dbcl_db_lorder;
- dbp->set_malloc = __dbcl_db_malloc;
dbp->set_pagesize = __dbcl_db_pagesize;
dbp->set_paniccall = __dbcl_db_panic;
- dbp->set_q_extentsize = __dbcl_db_extentsize;
- dbp->set_realloc = __dbcl_db_realloc;
dbp->stat = __dbcl_db_stat;
dbp->sync = __dbcl_db_sync;
+ dbp->truncate = __dbcl_db_truncate;
dbp->upgrade = __dbcl_db_upgrade;
+ dbp->verify = __dbcl_db_verify;
/*
* Set all the method specific functions to client funcs as well.
@@ -599,31 +680,12 @@ __dbcl_init(dbp, dbenv, flags)
dbp->set_h_ffactor = __dbcl_db_h_ffactor;
dbp->set_h_hash = __dbcl_db_h_hash;
dbp->set_h_nelem = __dbcl_db_h_nelem;
+ dbp->set_q_extentsize = __dbcl_db_extentsize;
dbp->set_re_delim = __dbcl_db_re_delim;
dbp->set_re_len = __dbcl_db_re_len;
dbp->set_re_pad = __dbcl_db_re_pad;
dbp->set_re_source = __dbcl_db_re_source;
-/*
- dbp->set_q_extentsize = __dbcl_db_q_extentsize;
-*/
-
- cl = (CLIENT *)dbenv->cl_handle;
- req.flags = flags;
- req.envpcl_id = dbenv->cl_id;
-
- /*
- * CALL THE SERVER
- */
- replyp = __db_db_create_1(&req, cl);
- if (replyp == NULL) {
- __db_err(dbenv, clnt_sperror(cl, "Berkeley DB"));
- return (DB_NOSERVER);
- }
- if ((ret = replyp->status) != 0)
- return (ret);
-
- dbp->cl_id = replyp->dbpcl_id;
- return (0);
+ return (__dbcl_db_create(dbp, dbenv, flags));
}
#endif
diff --git a/bdb/db/db_open.c b/bdb/db/db_open.c
new file mode 100644
index 00000000000..f6f96cda547
--- /dev/null
+++ b/bdb/db/db_open.c
@@ -0,0 +1,705 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2002
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_open.c,v 11.215 2002/08/15 15:27:52 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_openchk __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t));
+
+/*
+ * __db_open --
+ * Main library interface to the DB access methods.
+ *
+ * PUBLIC: int __db_open __P((DB *, DB_TXN *,
+ * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int));
+ */
+int
+__db_open(dbp, txn, name, subdb, type, flags, mode)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode;
+{
+ DB_ENV *dbenv;
+ int remove_master, remove_me, ret, t_ret, txn_local;
+
+ dbenv = dbp->dbenv;
+ remove_me = remove_master = txn_local = 0;
+
+ PANIC_CHECK(dbenv);
+
+ if ((ret = __db_openchk(dbp, txn, name, subdb, type, flags)) != 0)
+ return (ret);
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ */
+ if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+ if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+ return (ret);
+ txn_local = 1;
+ } else
+ if (txn != NULL && !TXN_ON(dbenv))
+ return (__db_not_txn_env(dbenv));
+
+ /*
+ * If the environment was configured with threads, the DB handle
+ * must also be free-threaded, so we force the DB_THREAD flag on.
+ * (See SR #2033 for why this is a requirement--recovery needs
+ * to be able to grab a dbp using __db_fileid_to_dbp, and it has
+ * no way of knowing which dbp goes with which thread, so whichever
+ * one it finds has to be usable in any of them.)
+ */
+ if (F_ISSET(dbenv, DB_ENV_THREAD))
+ LF_SET(DB_THREAD);
+
+ /* Convert any DB->open flags. */
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbp, DB_AM_RDONLY);
+ if (LF_ISSET(DB_DIRTY_READ))
+ F_SET(dbp, DB_AM_DIRTY);
+
+ /* Fill in the type. */
+ dbp->type = type;
+
+ /*
+ * If we're opening a subdatabase, we have to open (and potentially
+ * create) the main database, and then get (and potentially store)
+ * our base page number in that database. Then, we can finally open
+ * the subdatabase.
+ */
+ if ((ret = __db_dbopen(
+ dbp, txn, name, subdb, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ /*
+ * You can open the database that describes the subdatabases in the
+ * rest of the file read-only. The content of each key's data is
+ * unspecified and applications should never be adding new records
+ * or updating existing records. However, during recovery, we need
+ * to open these databases R/W so we can redo/undo changes in them.
+ * Likewise, we need to open master databases read/write during
+ * rename and remove so we can be sure they're fully sync'ed, so
+ * we provide an override flag for the purpose.
+ */
+ if (subdb == NULL && !IS_RECOVERING(dbenv) && !LF_ISSET(DB_RDONLY) &&
+ !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_err(dbenv,
+ "files containing multiple databases may only be opened read-only");
+ ret = EINVAL;
+ goto err;
+ }
+
+err: /* If we were successful, don't discard the file on close. */
+ if (ret == 0)
+ /* If we were successful, don't discard the file on close. */
+ F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR);
+ else {
+ /*
+ * If we are not transactional, we need to remove the
+ * databases/subdatabases. If we are transactional, then
+ * the abort of the child transaction should take care of
+ * cleaning them up.
+ */
+ remove_me = txn == NULL && F_ISSET(dbp, DB_AM_CREATED);
+ remove_master = txn == NULL && F_ISSET(dbp, DB_AM_CREATED_MSTR);
+
+ /*
+ * If we had an error, it may have happened before or after
+ * we actually logged the open. If it happened before, then
+ * abort won't know anything about it and won't close or
+ * refresh the dbp, so we need to do it explicitly.
+ */
+ (void)__db_refresh(dbp, txn, DB_NOSYNC);
+ }
+
+ /* Remove anyone we created. */
+ if (remove_master || (subdb == NULL && remove_me))
+ /* Remove file. */
+ (void)dbenv->dbremove(dbenv, txn, name, NULL, 0);
+ else if (remove_me)
+ /* Remove subdatabase. */
+ (void)dbenv->dbremove(dbenv, txn, name, subdb, 0);
+
+ /* Commit for DB_AUTO_COMMIT. */
+ if (txn_local) {
+ if (ret == 0)
+ ret = txn->commit(txn, 0);
+ else
+ if ((t_ret = txn->abort(txn)) != 0)
+ ret = __db_panic(dbenv, t_ret);
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_dbopen --
+ * Open a database. This routine gets called in three different ways.
+ * 1. It can be called to open a file/database. In this case, subdb will
+ * be NULL and meta_pgno will be PGNO_BASE_MD.
+ * 2. It can be called to open a subdatabase during normal operation. In
+ * this case, name and subdb will both be non-NULL and meta_pgno will
+ * be PGNO_BASE_MD (also PGNO_INVALID).
+ * 3. It can be called during recovery to open a subdatabase in which case
+ * name will be non-NULL, subdb may be NULL and meta_pgno will be
+ * a valid pgno (i.e., not PGNO_BASE_MD).
+ *
+ * PUBLIC: int __db_dbopen __P((DB *, DB_TXN *,
+ * PUBLIC: const char *, const char *, u_int32_t, int, db_pgno_t));
+ */
+int
+__db_dbopen(dbp, txn, name, subdb, flags, mode, meta_pgno)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+ int mode;
+ db_pgno_t meta_pgno;
+{
+ DB_ENV *dbenv;
+ int ret;
+ u_int32_t id;
+
+ dbenv = dbp->dbenv;
+ id = TXN_INVALID;
+ if (txn != NULL)
+ F_SET(dbp, DB_AM_TXN);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, name);
+ /*
+ * If name is NULL, it's always a create, so make sure that we
+ * have a type specified. It would be nice if this checking
+ * were done in __db_open where most of the interface checking
+ * is done, but this interface (__db_dbopen) is used by the
+ * recovery and limbo system, so we need to safeguard this
+ * interface as well.
+ */
+ if (name == NULL) {
+ F_SET(dbp, DB_AM_INMEM);
+
+ if (dbp->type == DB_UNKNOWN) {
+ __db_err(dbenv,
+ "DBTYPE of unknown without existing file");
+ return (EINVAL);
+ }
+
+ if (dbp->pgsize == 0)
+ dbp->pgsize = DB_DEF_IOSIZE;
+
+ /*
+ * If the file is a temporary file and we're doing locking,
+ * then we have to create a unique file ID. We can't use our
+ * normal dev/inode pair (or whatever this OS uses in place of
+ * dev/inode pairs) because no backing file will be created
+ * until the mpool cache is filled forcing the buffers to disk.
+ * Grab a random locker ID to use as a file ID. The created
+ * ID must never match a potential real file ID -- we know it
+ * won't because real file IDs contain a time stamp after the
+ * dev/inode pair, and we're simply storing a 4-byte value.
+ *
+ * !!!
+ * Store the locker in the file id structure -- we can get it
+ * from there as necessary, and it saves having two copies.
+ */
+ if (LOCKING_ON(dbenv) && (ret = dbenv->lock_id(dbenv,
+ (u_int32_t *)dbp->fileid)) != 0)
+ return (ret);
+ } else if (subdb == NULL && meta_pgno == PGNO_BASE_MD) {
+ /* Open/create the underlying file. Acquire locks. */
+ if ((ret =
+ __fop_file_setup(dbp, txn, name, mode, flags, &id)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __fop_subdb_setup(dbp,
+ txn, name, subdb, mode, flags)) != 0)
+ return (ret);
+ meta_pgno = dbp->meta_pgno;
+ }
+
+ /*
+ * If we created the file, set the truncate flag for the mpool. This
+ * isn't for anything we've done, it's protection against stupid user
+ * tricks: if the user deleted a file behind Berkeley DB's back, we
+ * may still have pages in the mpool that match the file's "unique" ID.
+ *
+ * Note that if we're opening a subdatabase, we don't want to set
+ * the TRUNCATE flag even if we just created the file--we already
+ * opened and updated the master using access method interfaces,
+ * so we don't want to get rid of any pages that are in the mpool.
+ * If we created the file when we opened the master, we already hit
+ * this check in a non-subdb context then.
+ */
+ if (subdb == NULL && F_ISSET(dbp, DB_AM_CREATED))
+ LF_SET(DB_TRUNCATE);
+
+ /* Set up the underlying environment. */
+ if ((ret = __db_dbenv_setup(dbp, txn, name, id, flags)) != 0)
+ return (ret);
+
+ /*
+ * Set the open flag. We use it to mean that the dbp has gone
+ * through mpf setup, including dbreg_register. Also, below,
+ * the underlying access method open functions may want to do
+ * things like acquire cursors, so the open flag has to be set
+ * before calling them.
+ */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+
+ /*
+ * For unnamed files, we need to actually create the file now
+ * that the mpool is open.
+ */
+ if (name == NULL && (ret = __db_new_file(dbp, txn, NULL, NULL)) != 0)
+ return (ret);
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ ret = __bam_open(dbp, txn, name, meta_pgno, flags);
+ break;
+ case DB_HASH:
+ ret = __ham_open(dbp, txn, name, meta_pgno, flags);
+ break;
+ case DB_RECNO:
+ ret = __ram_open(dbp, txn, name, meta_pgno, flags);
+ break;
+ case DB_QUEUE:
+ ret = __qam_open(dbp, txn, name, meta_pgno, mode, flags);
+ break;
+ case DB_UNKNOWN:
+ return (__db_unknown_type(dbenv, "__db_dbopen", dbp->type));
+ }
+ if (ret != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name);
+
+ /*
+ * Unnamed files don't need handle locks, so we only have to check
+ * for a handle lock downgrade or lockevent in the case of named
+ * files.
+ */
+ if (!F_ISSET(dbp, DB_AM_RECOVER) &&
+ name != NULL && LOCK_ISSET(dbp->handle_lock)) {
+ if (txn != NULL) {
+ ret = __txn_lockevent(dbenv,
+ txn, dbp, &dbp->handle_lock, dbp->lid);
+ } else if (LOCKING_ON(dbenv))
+ /* Trade write handle lock for read handle lock. */
+ ret = __lock_downgrade(dbenv,
+ &dbp->handle_lock, DB_LOCK_READ, 0);
+ }
+DB_TEST_RECOVERY_LABEL
+err:
+ return (ret);
+}
+
+/*
+ * __db_new_file --
+ * Create a new database file.
+ *
+ * PUBLIC: int __db_new_file __P((DB *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__db_new_file(dbp, txn, fhp, name)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
+{
+ int ret;
+
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_file(dbp, txn, fhp, name);
+ break;
+ case DB_HASH:
+ ret = __ham_new_file(dbp, txn, fhp, name);
+ break;
+ case DB_QUEUE:
+ ret = __qam_new_file(dbp, txn, fhp, name);
+ break;
+ default:
+ __db_err(dbp->dbenv,
+ "%s: Invalid type %d specified", name, dbp->type);
+ ret = EINVAL;
+ break;
+ }
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+ /* Sync the file in preparation for moving it into place. */
+ if (ret == 0 && fhp != NULL)
+ ret = __os_fsync(dbp->dbenv, fhp);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+ return (ret);
+}
+
+/*
+ * __db_init_subdb --
+ * Initialize the dbp for a subdb.
+ *
+ * PUBLIC: int __db_init_subdb __P((DB *, DB *, const char *, DB_TXN *));
+ */
+int
+__db_init_subdb(mdbp, dbp, name, txn)
+ DB *mdbp, *dbp;
+ const char *name;
+ DB_TXN *txn;
+{
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ ret = 0;
+ if (!F_ISSET(dbp, DB_AM_CREATED)) {
+ /* Subdb exists; read meta-data page and initialize. */
+ mpf = mdbp->mpf;
+ if ((ret = mpf->get(mpf, &dbp->meta_pgno, 0, &meta)) != 0)
+ goto err;
+ ret = __db_meta_setup(mdbp->dbenv, dbp, name, meta, 0, 0);
+ if ((t_ret = mpf->put(mpf, meta, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If __db_meta_setup found that the meta-page hadn't
+ * been written out during recovery, we can just return.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ goto err;
+ }
+
+ /* Handle the create case here. */
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_new_subdb(mdbp, dbp, txn);
+ break;
+ case DB_HASH:
+ ret = __ham_new_subdb(mdbp, dbp, txn);
+ break;
+ case DB_QUEUE:
+ ret = EINVAL;
+ break;
+ default:
+ __db_err(dbp->dbenv,
+ "Invalid subdatabase type %d specified", dbp->type);
+ return (EINVAL);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __db_chk_meta --
+ * Take a buffer containing a meta-data page and check it for a checksum
+ * (and verify the checksum if necessary) and possibly decrypt it.
+ *
+ * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
+ *
+ * PUBLIC: int __db_chk_meta __P((DB_ENV *, DB *, DBMETA *, int));
+ */
+int
+__db_chk_meta(dbenv, dbp, meta, do_metachk)
+ DB_ENV *dbenv;
+ DB *dbp;
+ DBMETA *meta;
+ int do_metachk;
+{
+ int is_hmac, ret;
+ u_int8_t *chksum;
+
+ ret = 0;
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
+ if (dbp != NULL)
+ F_SET(dbp, DB_AM_CHKSUM);
+
+ is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
+ chksum = ((BTMETA *)meta)->chksum;
+ if (do_metachk && ((ret = __db_check_chksum(dbenv,
+ (DB_CIPHER *)dbenv->crypto_handle, chksum, meta,
+ DBMETASIZE, is_hmac)) != 0))
+ return (ret);
+ }
+
+#ifdef HAVE_CRYPTO
+ ret = __crypto_decrypt_meta(dbenv, dbp, (u_int8_t *)meta, do_metachk);
+#endif
+ return (ret);
+}
+
+/*
+ * __db_meta_setup --
+ *
+ * Take a buffer containing a meta-data page and figure out if it's
+ * valid, and if so, initialize the dbp from the meta-data page.
+ *
+ * PUBLIC: int __db_meta_setup __P((DB_ENV *,
+ * PUBLIC: DB *, const char *, DBMETA *, u_int32_t, int));
+ */
+int
+__db_meta_setup(dbenv, dbp, name, meta, oflags, do_metachk)
+ DB_ENV *dbenv;
+ DB *dbp;
+ const char *name;
+ DBMETA *meta;
+ u_int32_t oflags;
+ int do_metachk;
+{
+ u_int32_t flags, magic;
+ int ret;
+
+ ret = 0;
+
+ /*
+ * Figure out what access method we're dealing with, and then
+ * call access method specific code to check error conditions
+ * based on conflicts between the found file and application
+ * arguments. A found file overrides some user information --
+ * we don't consider it an error, for example, if the user set
+ * an expected byte order and the found file doesn't match it.
+ */
+ F_CLR(dbp, DB_AM_SWAP);
+ magic = meta->magic;
+
+swap_retry:
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ case DB_HASHMAGIC:
+ case DB_QAMMAGIC:
+ case DB_RENAMEMAGIC:
+ break;
+ case 0:
+ /*
+ * The only time this should be 0 is if we're in the
+ * midst of opening a subdb during recovery and that
+ * subdatabase had its meta-data page allocated, but
+ * not yet initialized.
+ */
+ if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(dbenv) &&
+ F_ISSET((DB_LOG *) dbenv->lg_handle, DBLOG_FORCE_OPEN)) ||
+ meta->pgno != PGNO_INVALID))
+ return (ENOENT);
+
+ goto bad_format;
+ default:
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ goto bad_format;
+
+ M_32_SWAP(magic);
+ F_SET(dbp, DB_AM_SWAP);
+ goto swap_retry;
+ }
+
+ /*
+ * We can only check the meta page if we are sure we have a meta page.
+ * If it is random data, then this check can fail. So only now can we
+ * checksum and decrypt. Don't distinguish between configuration and
+ * checksum match errors here, because we haven't opened the database
+ * and even a checksum error isn't a reason to panic the environment.
+ */
+ if ((ret = __db_chk_meta(dbenv, dbp, meta, do_metachk)) != 0) {
+ if (ret == -1) {
+ __db_err(dbenv,
+ "%s: metadata page checksum error", name);
+ ret = EINVAL;
+ }
+ goto bad_format;
+ }
+
+ switch (magic) {
+ case DB_BTREEMAGIC:
+ flags = meta->flags;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(flags);
+ if (LF_ISSET(BTM_RECNO))
+ dbp->type = DB_RECNO;
+ else
+ dbp->type = DB_BTREE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_HASHMAGIC:
+ dbp->type = DB_HASH;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_QAMMAGIC:
+ dbp->type = DB_QUEUE;
+ if ((oflags & DB_TRUNCATE) == 0 && (ret =
+ __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
+ return (ret);
+ break;
+ case DB_RENAMEMAGIC:
+ F_SET(dbp, DB_AM_IN_RENAME);
+ break;
+ }
+ return (0);
+
+bad_format:
+ __db_err(dbenv, "%s: unexpected file type or format", name);
+ return (ret);
+}
+
+/*
+ * __db_openchk --
+ * Interface error checking for open calls.
+ */
+static int
+__db_openchk(dbp, txn, name, subdb, type, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ DBTYPE type;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ int ret;
+ u_int32_t ok_flags;
+
+ dbenv = dbp->dbenv;
+
+ /* Validate arguments. */
+#define OKFLAGS \
+ (DB_AUTO_COMMIT | DB_CREATE | DB_DIRTY_READ | DB_EXCL | \
+ DB_FCNTL_LOCKING | DB_NOMMAP | DB_RDONLY | DB_RDWRMASTER | \
+ DB_THREAD | DB_TRUNCATE | DB_WRITEOPEN)
+ if ((ret = __db_fchk(dbenv, "DB->open", flags, OKFLAGS)) != 0)
+ return (ret);
+ if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
+ return (__db_ferr(dbenv, "DB->open", 1));
+ if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
+ return (__db_ferr(dbenv, "DB->open", 1));
+
+#ifdef HAVE_VXWORKS
+ if (LF_ISSET(DB_TRUNCATE)) {
+ __db_err(dbenv, "DB_TRUNCATE unsupported in VxWorks");
+ return (__db_eopnotsup(dbenv));
+ }
+#endif
+ switch (type) {
+ case DB_UNKNOWN:
+ if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
+ __db_err(dbenv,
+ "%s: DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE",
+ name);
+ return (EINVAL);
+ }
+ ok_flags = 0;
+ break;
+ case DB_BTREE:
+ ok_flags = DB_OK_BTREE;
+ break;
+ case DB_HASH:
+ ok_flags = DB_OK_HASH;
+ break;
+ case DB_QUEUE:
+ ok_flags = DB_OK_QUEUE;
+ break;
+ case DB_RECNO:
+ ok_flags = DB_OK_RECNO;
+ break;
+ default:
+ __db_err(dbenv, "unknown type: %lu", (u_long)type);
+ return (EINVAL);
+ }
+ if (ok_flags)
+ DB_ILLEGAL_METHOD(dbp, ok_flags);
+
+ /* The environment may have been created, but never opened. */
+ if (!F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_OPEN_CALLED)) {
+ __db_err(dbenv, "environment not yet opened");
+ return (EINVAL);
+ }
+
+ /*
+ * Historically, you could pass in an environment that didn't have a
+ * mpool, and DB would create a private one behind the scenes. This
+ * no longer works.
+ */
+ if (!F_ISSET(dbenv, DB_ENV_DBLOCAL) && !MPOOL_ON(dbenv)) {
+ __db_err(dbenv, "environment did not include a memory pool");
+ return (EINVAL);
+ }
+
+ /*
+ * You can't specify threads during DB->open if subsystems in the
+ * environment weren't configured with them.
+ */
+ if (LF_ISSET(DB_THREAD) &&
+ !F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_THREAD)) {
+ __db_err(dbenv, "environment not created using DB_THREAD");
+ return (EINVAL);
+ }
+
+ /* DB_TRUNCATE is not transaction recoverable. */
+ if (LF_ISSET(DB_TRUNCATE) && txn != NULL) {
+ __db_err(dbenv,
+ "DB_TRUNCATE illegal with transaction specified");
+ return (EINVAL);
+ }
+
+ /* Subdatabase checks. */
+ if (subdb != NULL) {
+ /* Subdatabases must be created in named files. */
+ if (name == NULL) {
+ __db_err(dbenv,
+ "multiple databases cannot be created in temporary files");
+ return (EINVAL);
+ }
+
+ /* Truncate is a physical file operation. */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ __db_err(dbenv,
+ "DB_TRUNCATE illegal with multiple databases");
+ return (EINVAL);
+ }
+
+ /* QAM can't be done as a subdatabase. */
+ if (type == DB_QUEUE) {
+ __db_err(dbenv, "Queue databases must be one-per-file");
+ return (EINVAL);
+ }
+ }
+
+ return (0);
+}
diff --git a/bdb/db/db_overflow.c b/bdb/db/db_overflow.c
index 54f0a03aafe..27dcb41a2ff 100644
--- a/bdb/db/db_overflow.c
+++ b/bdb/db/db_overflow.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -43,7 +43,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_overflow.c,v 11.21 2000/11/30 00:58:32 ubell Exp $";
+static const char revid[] = "$Id: db_overflow.c,v 11.46 2002/08/08 03:57:48 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -53,9 +53,9 @@ static const char revid[] = "$Id: db_overflow.c,v 11.21 2000/11/30 00:58:32 ubel
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-#include "db_verify.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
/*
* Big key/data code.
@@ -83,6 +83,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
u_int32_t *bpsz;
{
DB_ENV *dbenv;
+ DB_MPOOLFILE *mpf;
PAGE *h;
db_indx_t bytes;
u_int32_t curoff, needed, start;
@@ -90,6 +91,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
int ret;
dbenv = dbp->dbenv;
+ mpf = dbp->mpf;
/*
* Check if the buffer is big enough; if it is not and we are
@@ -99,7 +101,12 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
*/
if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
start = dbt->doff;
- needed = dbt->dlen;
+ if (start > tlen)
+ needed = 0;
+ else if (dbt->dlen > tlen - start)
+ needed = tlen - start;
+ else
+ needed = dbt->dlen;
} else {
start = 0;
needed = tlen;
@@ -112,15 +119,13 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
return (ENOMEM);
}
} else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
- if ((ret = __os_malloc(dbenv,
- needed, dbp->db_malloc, &dbt->data)) != 0)
+ if ((ret = __os_umalloc(dbenv, needed, &dbt->data)) != 0)
return (ret);
} else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
- if ((ret = __os_realloc(dbenv,
- needed, dbp->db_realloc, &dbt->data)) != 0)
+ if ((ret = __os_urealloc(dbenv, needed, &dbt->data)) != 0)
return (ret);
} else if (*bpsz == 0 || *bpsz < needed) {
- if ((ret = __os_realloc(dbenv, needed, NULL, bpp)) != 0)
+ if ((ret = __os_realloc(dbenv, needed, bpp)) != 0)
return (ret);
*bpsz = needed;
dbt->data = *bpp;
@@ -133,13 +138,12 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
*/
dbt->size = needed;
for (curoff = 0, p = dbt->data; pgno != PGNO_INVALID && needed > 0;) {
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
- (void)__db_pgerr(dbp, pgno);
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
return (ret);
- }
+
/* Check if we need any bytes from this page. */
if (curoff + OV_LEN(h) >= start) {
- src = (u_int8_t *)h + P_OVERHEAD;
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
bytes = OV_LEN(h);
if (start > curoff) {
src += start - curoff;
@@ -153,7 +157,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz)
}
curoff += OV_LEN(h);
pgno = h->next_pgno;
- memp_fput(dbp->mpf, h, 0);
+ (void)mpf->put(mpf, h, 0);
}
return (0);
}
@@ -171,13 +175,14 @@ __db_poff(dbc, dbt, pgnop)
db_pgno_t *pgnop;
{
DB *dbp;
- PAGE *pagep, *lastp;
- DB_LSN new_lsn, null_lsn;
DBT tmp_dbt;
+ DB_LSN new_lsn, null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *lastp;
db_indx_t pagespace;
u_int32_t sz;
u_int8_t *p;
- int ret;
+ int ret, t_ret;
/*
* Allocate pages and copy the key/data item into them. Calculate the
@@ -185,8 +190,10 @@ __db_poff(dbc, dbt, pgnop)
* item.
*/
dbp = dbc->dbp;
- pagespace = P_MAXSPACE(dbp->pgsize);
+ mpf = dbp->mpf;
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ ret = 0;
lastp = NULL;
for (p = dbt->data,
sz = dbt->size; sz > 0; p += pagespace, sz -= pagespace) {
@@ -203,30 +210,36 @@ __db_poff(dbc, dbt, pgnop)
* have a partial record.
*/
if ((ret = __db_new(dbc, P_OVERFLOW, &pagep)) != 0)
- return (ret);
- if (DB_LOGGING(dbc)) {
+ break;
+ if (DBC_LOGGING(dbc)) {
tmp_dbt.data = p;
tmp_dbt.size = pagespace;
ZERO_LSN(null_lsn);
- if ((ret = __db_big_log(dbp->dbenv, dbc->txn,
- &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid,
- PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID,
+ if ((ret = __db_big_log(dbp, dbc->txn,
+ &new_lsn, 0, DB_ADD_BIG, PGNO(pagep),
+ lastp ? PGNO(lastp) : PGNO_INVALID,
PGNO_INVALID, &tmp_dbt, &LSN(pagep),
lastp == NULL ? &null_lsn : &LSN(lastp),
- &null_lsn)) != 0)
- return (ret);
+ &null_lsn)) != 0) {
+ if (lastp != NULL)
+ (void)mpf->put(mpf,
+ lastp, DB_MPOOL_DIRTY);
+ lastp = pagep;
+ break;
+ }
+ } else
+ LSN_NOT_LOGGED(new_lsn);
- /* Move lsn onto page. */
- if (lastp)
- LSN(lastp) = new_lsn;
- LSN(pagep) = new_lsn;
- }
+ /* Move LSN onto page. */
+ if (lastp != NULL)
+ LSN(lastp) = new_lsn;
+ LSN(pagep) = new_lsn;
P_INIT(pagep, dbp->pgsize,
PGNO(pagep), PGNO_INVALID, PGNO_INVALID, 0, P_OVERFLOW);
OV_LEN(pagep) = pagespace;
OV_REF(pagep) = 1;
- memcpy((u_int8_t *)pagep + P_OVERHEAD, p, pagespace);
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace);
/*
* If this is the first entry, update the user's info.
@@ -238,12 +251,14 @@ __db_poff(dbc, dbt, pgnop)
else {
lastp->next_pgno = PGNO(pagep);
pagep->prev_pgno = PGNO(lastp);
- (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY);
+ (void)mpf->put(mpf, lastp, DB_MPOOL_DIRTY);
}
lastp = pagep;
}
- (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY);
- return (0);
+ if (lastp != NULL &&
+ (t_ret = mpf->put(mpf, lastp, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
}
/*
@@ -259,23 +274,29 @@ __db_ovref(dbc, pgno, adjust)
int32_t adjust;
{
DB *dbp;
+ DB_MPOOLFILE *mpf;
PAGE *h;
int ret;
dbp = dbc->dbp;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
- (void)__db_pgerr(dbp, pgno);
+ mpf = dbp->mpf;
+
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) {
+ __db_pgerr(dbp, pgno, ret);
return (ret);
}
- if (DB_LOGGING(dbc))
- if ((ret = __db_ovref_log(dbp->dbenv, dbc->txn,
- &LSN(h), 0, dbp->log_fileid, h->pgno, adjust,
- &LSN(h))) != 0)
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_ovref_log(dbp,
+ dbc->txn, &LSN(h), 0, h->pgno, adjust, &LSN(h))) != 0) {
+ (void)mpf->put(mpf, h, 0);
return (ret);
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(h));
OV_REF(h) += adjust;
- (void)memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
+ (void)mpf->put(mpf, h, DB_MPOOL_DIRTY);
return (0);
}
@@ -293,13 +314,16 @@ __db_doff(dbc, pgno)
DB *dbp;
PAGE *pagep;
DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
DBT tmp_dbt;
int ret;
dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
do {
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) {
- (void)__db_pgerr(dbp, pgno);
+ if ((ret = mpf->get(mpf, &pgno, 0, &pagep)) != 0) {
+ __db_pgerr(dbp, pgno, ret);
return (ret);
}
@@ -309,20 +333,24 @@ __db_doff(dbc, pgno)
* decrement the reference count and return.
*/
if (OV_REF(pagep) > 1) {
- (void)memp_fput(dbp->mpf, pagep, 0);
+ (void)mpf->put(mpf, pagep, 0);
return (__db_ovref(dbc, pgno, -1));
}
- if (DB_LOGGING(dbc)) {
- tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD;
+ if (DBC_LOGGING(dbc)) {
+ tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp);
tmp_dbt.size = OV_LEN(pagep);
ZERO_LSN(null_lsn);
- if ((ret = __db_big_log(dbp->dbenv, dbc->txn,
- &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid,
- PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep),
- &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0)
+ if ((ret = __db_big_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_REM_BIG,
+ PGNO(pagep), PREV_PGNO(pagep),
+ NEXT_PGNO(pagep), &tmp_dbt,
+ &LSN(pagep), &null_lsn, &null_lsn)) != 0) {
+ (void)mpf->put(mpf, pagep, 0);
return (ret);
- }
+ }
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
pgno = pagep->next_pgno;
if ((ret = __db_free(dbc, pagep)) != 0)
return (ret);
@@ -352,13 +380,16 @@ __db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp)
u_int32_t tlen;
int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
{
- PAGE *pagep;
DBT local_dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
void *buf;
u_int32_t bufsize, cmp_bytes, key_left;
u_int8_t *p1, *p2;
int ret;
+ mpf = dbp->mpf;
+
/*
* If there is a user-specified comparison function, build a
* contiguous copy of the key, and call it.
@@ -373,27 +404,27 @@ __db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp)
return (ret);
/* Pass the key as the first argument */
*cmpp = cmpfunc(dbp, dbt, &local_dbt);
- __os_free(buf, bufsize);
+ __os_free(dbp->dbenv, buf);
return (0);
}
/* While there are both keys to compare. */
for (*cmpp = 0, p1 = dbt->data,
key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &pagep)) != 0)
return (ret);
cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
tlen -= cmp_bytes;
key_left -= cmp_bytes;
- for (p2 =
- (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2)
+ for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+ cmp_bytes-- > 0; ++p1, ++p2)
if (*p1 != *p2) {
*cmpp = (long)*p1 - (long)*p2;
break;
}
pgno = NEXT_PGNO(pagep);
- if ((ret = memp_fput(dbp->mpf, pagep, 0)) != 0)
+ if ((ret = mpf->put(mpf, pagep, 0)) != 0)
return (ret);
if (*cmpp != 0)
return (0);
@@ -440,7 +471,7 @@ __db_vrfy_overflow(dbp, vdp, h, pgno, flags)
pip->refcount = OV_REF(h);
if (pip->refcount < 1) {
EPRINT((dbp->dbenv,
- "Overflow page %lu has zero reference count",
+ "Page %lu: overflow page has zero reference count",
(u_long)pgno));
isbad = 1;
}
@@ -448,7 +479,7 @@ __db_vrfy_overflow(dbp, vdp, h, pgno, flags)
/* Just store for now. */
pip->olen = HOFFSET(h);
-err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+err: if ((t_ret = __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0)
ret = t_ret;
return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}
@@ -495,7 +526,7 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
if (pip->type != P_OVERFLOW) {
EPRINT((dbp->dbenv,
- "Overflow page %lu of invalid type",
+ "Page %lu: overflow page of invalid type %lu",
(u_long)pgno, (u_long)pip->type));
ret = DB_VERIFY_BAD;
goto err; /* Unsafe to continue. */
@@ -504,7 +535,8 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
prev = pip->prev_pgno;
if (prev != PGNO_INVALID) {
EPRINT((dbp->dbenv,
- "First overflow page %lu has a prev_pgno", (u_long)pgno));
+ "Page %lu: first page in overflow chain has a prev_pgno %lu",
+ (u_long)pgno, (u_long)prev));
isbad = 1;
}
@@ -543,7 +575,7 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
*/
if ((u_int32_t)p > refcount) {
EPRINT((dbp->dbenv,
- "Page %lu encountered twice in overflow traversal",
+ "Page %lu: encountered twice in overflow traversal",
(u_long)pgno));
ret = DB_VERIFY_BAD;
goto err;
@@ -571,19 +603,20 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
if (!IS_VALID_PGNO(next)) {
DB_ASSERT(0);
EPRINT((dbp->dbenv,
- "Overflow page %lu has bad next_pgno",
- (u_long)pgno));
+ "Page %lu: bad next_pgno %lu on overflow page",
+ (u_long)pgno, (u_long)next));
ret = DB_VERIFY_BAD;
goto err;
}
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 ||
+ if ((ret = __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0 ||
(ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
return (ret);
if (pip->prev_pgno != pgno) {
EPRINT((dbp->dbenv,
- "Overflow page %lu has bogus prev_pgno value",
- (u_long)next));
+ "Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
+ (u_long)next, (u_long)pip->prev_pgno,
+ (u_long)pgno));
isbad = 1;
/*
* It's safe to continue because we have separate
@@ -597,10 +630,11 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
if (tlen > 0) {
isbad = 1;
EPRINT((dbp->dbenv,
- "Overflow item incomplete on page %lu", (u_long)pgno));
+ "Page %lu: overflow item incomplete", (u_long)pgno));
}
-err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+err: if ((t_ret =
+ __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}
@@ -622,13 +656,15 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags)
void **buf;
u_int32_t flags;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
- int ret, err_ret;
+ int ret, t_ret;
u_int32_t bytesgot, bytes;
u_int8_t *src, *dest;
- ret = DB_VERIFY_BAD;
- err_ret = 0;
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = t_ret = 0;
bytesgot = bytes = 0;
while ((pgno != PGNO_INVALID) && (IS_VALID_PGNO(pgno))) {
@@ -639,7 +675,7 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags)
if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
break;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
break;
/*
@@ -651,14 +687,14 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags)
break;
}
- src = (u_int8_t *)h + P_OVERHEAD;
+ src = (u_int8_t *)h + P_OVERHEAD(dbp);
bytes = OV_LEN(h);
- if (bytes + P_OVERHEAD > dbp->pgsize)
- bytes = dbp->pgsize - P_OVERHEAD;
+ if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
+ bytes = dbp->pgsize - P_OVERHEAD(dbp);
if ((ret = __os_realloc(dbp->dbenv,
- bytesgot + bytes, 0, buf)) != 0)
+ bytesgot + bytes, buf)) != 0)
break;
dest = (u_int8_t *)*buf + bytesgot;
@@ -667,15 +703,24 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags)
memcpy(dest, src, bytes);
pgno = NEXT_PGNO(h);
- /* Not much we can do here--we don't want to quit. */
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
- err_ret = ret;
+
+ if ((ret = mpf->put(mpf, h, 0)) != 0)
+ break;
+ h = NULL;
}
- if (ret == 0) {
+ /*
+ * If we're being aggressive, salvage a partial datum if there
+ * was an error somewhere along the way.
+ */
+ if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
dbt->size = bytesgot;
dbt->data = *buf;
}
- return ((err_ret != 0 && ret == 0) ? err_ret : ret);
+ /* If we broke out on error, don't leave pages pinned. */
+ if (h != NULL && (t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
}
diff --git a/bdb/db/db_pr.c b/bdb/db/db_pr.c
index cb977cadfda..235e7187f7c 100644
--- a/bdb/db/db_pr.c
+++ b/bdb/db/db_pr.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_pr.c,v 11.46 2001/01/22 17:25:06 krinsky Exp $";
+static const char revid[] = "$Id: db_pr.c,v 11.84 2002/09/10 02:45:20 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -21,34 +21,24 @@ static const char revid[] = "$Id: db_pr.c,v 11.46 2001/01/22 17:25:06 krinsky Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
-#include "db_am.h"
-#include "db_verify.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/db_verify.h"
static int __db_bmeta __P((DB *, FILE *, BTMETA *, u_int32_t));
static int __db_hmeta __P((DB *, FILE *, HMETA *, u_int32_t));
static void __db_meta __P((DB *, DBMETA *, FILE *, FN const *, u_int32_t));
-static const char *__db_dbtype_to_string __P((DB *));
-static void __db_prdb __P((DB *, FILE *, u_int32_t));
-static FILE *__db_prinit __P((FILE *));
-static void __db_proff __P((void *));
-static int __db_prtree __P((DB *, u_int32_t));
-static void __db_psize __P((DB *));
+static const char *__db_pagetype_to_string __P((u_int32_t));
+static void __db_prdb __P((DB *, FILE *));
+static void __db_proff __P((void *, FILE *));
+static int __db_prtree __P((DB *, FILE *, u_int32_t));
static int __db_qmeta __P((DB *, FILE *, QMETA *, u_int32_t));
/*
- * 64K is the maximum page size, so by default we check for offsets larger
- * than that, and, where possible, we refine the test.
- */
-#define PSIZE_BOUNDARY (64 * 1024 + 1)
-static size_t set_psize = PSIZE_BOUNDARY;
-
-static FILE *set_fp; /* Output file descriptor. */
-
-/*
* __db_loadme --
* A nice place to put a breakpoint.
*
@@ -57,7 +47,9 @@ static FILE *set_fp; /* Output file descriptor. */
void
__db_loadme()
{
- getpid();
+ u_int32_t id;
+
+ __os_id(&id);
}
/*
@@ -71,21 +63,9 @@ __db_dump(dbp, op, name)
DB *dbp;
char *op, *name;
{
- FILE *fp, *save_fp;
+ FILE *fp;
u_int32_t flags;
-
- COMPQUIET(save_fp, NULL);
-
- if (set_psize == PSIZE_BOUNDARY)
- __db_psize(dbp);
-
- if (name != NULL) {
- if ((fp = fopen(name, "w")) == NULL)
- return (__os_get_errno());
- save_fp = set_fp;
- set_fp = fp;
- } else
- fp = __db_prinit(NULL);
+ int ret;
for (flags = 0; *op != '\0'; ++op)
switch (*op) {
@@ -101,60 +81,93 @@ __db_dump(dbp, op, name)
return (EINVAL);
}
- __db_prdb(dbp, fp, flags);
+ if (name == NULL)
+ fp = stdout;
+ else {
+ if ((fp = fopen(name, "w")) == NULL)
+ return (__os_get_errno());
+ }
+
+ __db_prdb(dbp, fp);
fprintf(fp, "%s\n", DB_LINE);
- (void)__db_prtree(dbp, flags);
+ ret = __db_prtree(dbp, fp, flags);
fflush(fp);
-
- if (name != NULL) {
+ if (name != NULL)
fclose(fp);
- set_fp = save_fp;
- }
- return (0);
+
+ return (ret);
}
/*
- * __db_prdb --
- * Print out the DB structure information.
+ * __db_inmemdbflags --
+ * Call a callback for printing or other handling of strings associated
+ * with whatever in-memory DB structure flags are set.
+ *
+ * PUBLIC: void __db_inmemdbflags __P((u_int32_t, void *,
+ * PUBLIC: void (*)(u_int32_t, const FN *, void *)));
*/
-static void
-__db_prdb(dbp, fp, flags)
- DB *dbp;
- FILE *fp;
+void
+__db_inmemdbflags(flags, cookie, callback)
u_int32_t flags;
+ void *cookie;
+ void (*callback) __P((u_int32_t, const FN *, void *));
{
static const FN fn[] = {
+ { DB_AM_CHKSUM, "checksumming" },
+ { DB_AM_CL_WRITER, "client replica writer" },
+ { DB_AM_COMPENSATE, "created by compensating transaction" },
+ { DB_AM_CREATED, "database created" },
+ { DB_AM_CREATED_MSTR, "encompassing file created" },
+ { DB_AM_DBM_ERROR, "dbm/ndbm error" },
+ { DB_AM_DELIMITER, "variable length" },
+ { DB_AM_DIRTY, "dirty reads" },
{ DB_AM_DISCARD, "discard cached pages" },
{ DB_AM_DUP, "duplicates" },
+ { DB_AM_DUPSORT, "sorted duplicates" },
+ { DB_AM_ENCRYPT, "encrypted" },
+ { DB_AM_FIXEDLEN, "fixed-length records" },
{ DB_AM_INMEM, "in-memory" },
+ { DB_AM_IN_RENAME, "file is being renamed" },
+ { DB_AM_OPEN_CALLED, "DB->open called" },
+ { DB_AM_PAD, "pad value" },
{ DB_AM_PGDEF, "default page size" },
{ DB_AM_RDONLY, "read-only" },
- { DB_AM_SUBDB, "multiple-databases" },
+ { DB_AM_RECNUM, "Btree record numbers" },
+ { DB_AM_RECOVER, "opened for recovery" },
+ { DB_AM_RENUMBER, "renumber" },
+ { DB_AM_REVSPLITOFF, "no reverse splits" },
+ { DB_AM_SECONDARY, "secondary" },
+ { DB_AM_SNAPSHOT, "load on open" },
+ { DB_AM_SUBDB, "subdatabases" },
{ DB_AM_SWAP, "needswap" },
- { DB_BT_RECNUM, "btree:recnum" },
- { DB_BT_REVSPLIT, "btree:no reverse split" },
- { DB_DBM_ERROR, "dbm/ndbm error" },
- { DB_OPEN_CALLED, "DB->open called" },
- { DB_RE_DELIMITER, "recno:delimiter" },
- { DB_RE_FIXEDLEN, "recno:fixed-length" },
- { DB_RE_PAD, "recno:pad" },
- { DB_RE_RENUMBER, "recno:renumber" },
- { DB_RE_SNAPSHOT, "recno:snapshot" },
+ { DB_AM_TXN, "transactional" },
+ { DB_AM_VERIFYING, "verifier" },
{ 0, NULL }
};
+
+ callback(flags, fn, cookie);
+}
+
+/*
+ * __db_prdb --
+ * Print out the DB structure information.
+ */
+static void
+__db_prdb(dbp, fp)
+ DB *dbp;
+ FILE *fp;
+{
BTREE *bt;
HASH *h;
QUEUE *q;
- COMPQUIET(flags, 0);
-
fprintf(fp,
"In-memory DB structure:\n%s: %#lx",
- __db_dbtype_to_string(dbp), (u_long)dbp->flags);
- __db_prflags(dbp->flags, fn, fp);
+ __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
+ __db_inmemdbflags(dbp->flags, fp, __db_prflags);
fprintf(fp, "\n");
switch (dbp->type) {
@@ -166,7 +179,7 @@ __db_prdb(dbp, fp, flags)
fprintf(fp, "bt_maxkey: %lu bt_minkey: %lu\n",
(u_long)bt->bt_maxkey, (u_long)bt->bt_minkey);
fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n",
- (u_long)bt->bt_compare, (u_long)bt->bt_prefix);
+ P_TO_ULONG(bt->bt_compare), P_TO_ULONG(bt->bt_prefix));
fprintf(fp, "bt_lpgno: %lu\n", (u_long)bt->bt_lpgno);
if (dbp->type == DB_RECNO) {
fprintf(fp,
@@ -183,7 +196,7 @@ __db_prdb(dbp, fp, flags)
fprintf(fp, "meta_pgno: %lu\n", (u_long)h->meta_pgno);
fprintf(fp, "h_ffactor: %lu\n", (u_long)h->h_ffactor);
fprintf(fp, "h_nelem: %lu\n", (u_long)h->h_nelem);
- fprintf(fp, "h_hash: %#lx\n", (u_long)h->h_hash);
+ fprintf(fp, "h_hash: %#lx\n", P_TO_ULONG(h->h_hash));
break;
case DB_QUEUE:
q = dbp->q_internal;
@@ -204,39 +217,34 @@ __db_prdb(dbp, fp, flags)
* Print out the entire tree.
*/
static int
-__db_prtree(dbp, flags)
+__db_prtree(dbp, fp, flags)
DB *dbp;
+ FILE *fp;
u_int32_t flags;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
db_pgno_t i, last;
int ret;
- if (set_psize == PSIZE_BOUNDARY)
- __db_psize(dbp);
+ mpf = dbp->mpf;
- if (dbp->type == DB_QUEUE) {
- ret = __db_prqueue(dbp, flags);
- goto done;
- }
-
- /* Find out the page number of the last page in the database. */
- if ((ret = memp_fget(dbp->mpf, &last, DB_MPOOL_LAST, &h)) != 0)
- return (ret);
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
- return (ret);
+ if (dbp->type == DB_QUEUE)
+ return (__db_prqueue(dbp, fp, flags));
- /* Dump each page. */
+ /*
+ * Find out the page number of the last page in the database, then
+ * dump each page.
+ */
+ mpf->last_pgno(mpf, &last);
for (i = 0; i <= last; ++i) {
- if ((ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &i, 0, &h)) != 0)
return (ret);
- (void)__db_prpage(dbp, h, flags);
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ (void)__db_prpage(dbp, h, fp, flags);
+ if ((ret = mpf->put(mpf, h, 0)) != 0)
return (ret);
}
-done:
- (void)fflush(__db_prinit(NULL));
return (0);
}
@@ -252,13 +260,15 @@ __db_meta(dbp, dbmeta, fp, fn, flags)
FN const *fn;
u_int32_t flags;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
- int cnt;
db_pgno_t pgno;
u_int8_t *p;
- int ret;
+ int cnt, ret;
const char *sep;
+ mpf = dbp->mpf;
+
fprintf(fp, "\tmagic: %#lx\n", (u_long)dbmeta->magic);
fprintf(fp, "\tversion: %lu\n", (u_long)dbmeta->version);
fprintf(fp, "\tpagesize: %lu\n", (u_long)dbmeta->pagesize);
@@ -275,14 +285,14 @@ __db_meta(dbp, dbmeta, fp, fn, flags)
fprintf(fp, "\tfree list: %lu", (u_long)dbmeta->free);
for (pgno = dbmeta->free,
cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) {
fprintf(fp,
"Unable to retrieve free-list page: %lu: %s\n",
(u_long)pgno, db_strerror(ret));
break;
}
pgno = h->next_pgno;
- (void)memp_fput(dbp->mpf, h, 0);
+ (void)mpf->put(mpf, h, 0);
fprintf(fp, "%s%lu", sep, (u_long)pgno);
if (++cnt % 10 == 0) {
fprintf(fp, "\n");
@@ -292,6 +302,7 @@ __db_meta(dbp, dbmeta, fp, fn, flags)
sep = ", ";
}
fprintf(fp, "\n");
+ fprintf(fp, "\tlast_pgno: %lu\n", (u_long)dbmeta->last_pgno);
}
if (fn != NULL) {
@@ -404,26 +415,28 @@ __db_qmeta(dbp, fp, h, flags)
* __db_prnpage
* -- Print out a specific page.
*
- * PUBLIC: int __db_prnpage __P((DB *, db_pgno_t));
+ * PUBLIC: int __db_prnpage __P((DB *, db_pgno_t, FILE *));
*/
int
-__db_prnpage(dbp, pgno)
+__db_prnpage(dbp, pgno, fp)
DB *dbp;
db_pgno_t pgno;
+ FILE *fp;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
- int ret;
+ int ret, t_ret;
- if (set_psize == PSIZE_BOUNDARY)
- __db_psize(dbp);
+ mpf = dbp->mpf;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
return (ret);
- ret = __db_prpage(dbp, h, DB_PR_PAGE);
- (void)fflush(__db_prinit(NULL));
+ ret = __db_prpage(dbp, h, fp, DB_PR_PAGE);
+
+ if ((t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
- (void)memp_fput(dbp->mpf, h, 0);
return (ret);
}
@@ -431,32 +444,29 @@ __db_prnpage(dbp, pgno)
* __db_prpage
* -- Print out a page.
*
- * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
+ * PUBLIC: int __db_prpage __P((DB *, PAGE *, FILE *, u_int32_t));
*/
int
-__db_prpage(dbp, h, flags)
+__db_prpage(dbp, h, fp, flags)
DB *dbp;
PAGE *h;
+ FILE *fp;
u_int32_t flags;
{
BINTERNAL *bi;
BKEYDATA *bk;
- BTREE *t;
- FILE *fp;
HOFFPAGE a_hkd;
QAMDATA *qp, *qep;
RINTERNAL *ri;
- db_indx_t dlen, len, i;
+ db_indx_t dlen, len, i, *inp;
db_pgno_t pgno;
db_recno_t recno;
+ u_int32_t pagesize, qlen;
+ u_int8_t *ep, *hk, *p;
int deleted, ret;
const char *s;
- u_int32_t qlen;
- u_int8_t *ep, *hk, *p;
void *sp;
- fp = __db_prinit(NULL);
-
/*
* If we're doing recovery testing and this page is P_INVALID,
* assume it's a page that's on the free list, and don't display it.
@@ -471,6 +481,14 @@ __db_prpage(dbp, h, flags)
return (1);
}
+ /*
+ * !!!
+ * Find out the page size. We don't want to do it the "right" way,
+ * by reading the value from the meta-data page, that's going to be
+ * slow. Reach down into the mpool region.
+ */
+ pagesize = (u_int32_t)dbp->mpf->mfp->stat.st_pagesize;
+
/* Page number, page type. */
fprintf(fp, "page %lu: %s level: %lu",
(u_long)h->pgno, s, (u_long)h->level);
@@ -500,7 +518,7 @@ __db_prpage(dbp, h, flags)
qlen = ((QUEUE *)dbp->q_internal)->re_len;
recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
i = 0;
- qep = (QAMDATA *)((u_int8_t *)h + set_psize - qlen);
+ qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
if (!F_ISSET(qp, QAM_SET))
@@ -508,9 +526,9 @@ __db_prpage(dbp, h, flags)
fprintf(fp, "%s",
F_ISSET(qp, QAM_VALID) ? "\t" : " D");
- fprintf(fp, "[%03lu] %4lu ",
- (u_long)recno, (u_long)qp - (u_long)h);
- __db_pr(qp->data, qlen);
+ fprintf(fp, "[%03lu] %4lu ", (u_long)recno,
+ (u_long)((u_int8_t *)qp - (u_int8_t *)h));
+ __db_pr(qp->data, qlen, fp);
}
return (0);
}
@@ -520,8 +538,6 @@ __db_prpage(dbp, h, flags)
fprintf(fp, " (lsn.file: %lu lsn.offset: %lu)\n",
(u_long)LSN(h).file, (u_long)LSN(h).offset);
- t = dbp->bt_internal;
-
s = "\t";
if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
fprintf(fp, "%sprev: %4lu next: %4lu",
@@ -530,7 +546,7 @@ __db_prpage(dbp, h, flags)
}
if (TYPE(h) == P_OVERFLOW) {
fprintf(fp, "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
- __db_pr((u_int8_t *)h + P_OVERHEAD, OV_LEN(h));
+ __db_pr((u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h), fp);
return (0);
}
fprintf(fp, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
@@ -540,12 +556,14 @@ __db_prpage(dbp, h, flags)
return (0);
ret = 0;
+ inp = P_INP(dbp, h);
for (i = 0; i < NUM_ENT(h); i++) {
- if (P_ENTRY(h, i) - (u_int8_t *)h < P_OVERHEAD ||
- (size_t)(P_ENTRY(h, i) - (u_int8_t *)h) >= set_psize) {
+ if ((db_alignp_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
+ (db_alignp_t)(P_OVERHEAD(dbp)) ||
+ (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
fprintf(fp,
"ILLEGAL PAGE OFFSET: indx: %lu of %lu\n",
- (u_long)i, (u_long)h->inp[i]);
+ (u_long)i, (u_long)inp[i]);
ret = EINVAL;
continue;
}
@@ -554,17 +572,17 @@ __db_prpage(dbp, h, flags)
case P_HASH:
case P_IBTREE:
case P_IRECNO:
- sp = P_ENTRY(h, i);
+ sp = P_ENTRY(dbp, h, i);
break;
case P_LBTREE:
- sp = P_ENTRY(h, i);
+ sp = P_ENTRY(dbp, h, i);
deleted = i % 2 == 0 &&
- B_DISSET(GET_BKEYDATA(h, i + O_INDX)->type);
+ B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
break;
case P_LDUP:
case P_LRECNO:
- sp = P_ENTRY(h, i);
- deleted = B_DISSET(GET_BKEYDATA(h, i)->type);
+ sp = P_ENTRY(dbp, h, i);
+ deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
break;
default:
fprintf(fp,
@@ -573,7 +591,7 @@ __db_prpage(dbp, h, flags)
continue;
}
fprintf(fp, "%s", deleted ? " D" : "\t");
- fprintf(fp, "[%03lu] %4lu ", (u_long)i, (u_long)h->inp[i]);
+ fprintf(fp, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
switch (TYPE(h)) {
case P_HASH:
hk = sp;
@@ -592,7 +610,7 @@ __db_prpage(dbp, h, flags)
* set.
*/
if (i != 0)
- len = LEN_HKEYDATA(h, 0, i);
+ len = LEN_HKEYDATA(dbp, h, 0, i);
else
len = 1;
@@ -602,13 +620,14 @@ __db_prpage(dbp, h, flags)
memcpy(&dlen, p, sizeof(db_indx_t));
p += sizeof(db_indx_t);
fprintf(fp, "\t\t");
- __db_pr(p, dlen);
+ __db_pr(p, dlen, fp);
p += sizeof(db_indx_t) + dlen;
}
break;
case H_KEYDATA:
__db_pr(HKEYDATA_DATA(hk),
- LEN_HKEYDATA(h, i == 0 ? set_psize : 0, i));
+ LEN_HKEYDATA(dbp, h, i == 0 ?
+ pagesize : 0, i), fp);
break;
case H_OFFPAGE:
memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
@@ -625,11 +644,11 @@ __db_prpage(dbp, h, flags)
(u_long)bi->type);
switch (B_TYPE(bi->type)) {
case B_KEYDATA:
- __db_pr(bi->data, bi->len);
+ __db_pr(bi->data, bi->len, fp);
break;
case B_DUPLICATE:
case B_OVERFLOW:
- __db_proff(bi->data);
+ __db_proff(bi->data, fp);
break;
default:
fprintf(fp, "ILLEGAL BINTERNAL TYPE: %lu\n",
@@ -649,11 +668,11 @@ __db_prpage(dbp, h, flags)
bk = sp;
switch (B_TYPE(bk->type)) {
case B_KEYDATA:
- __db_pr(bk->data, bk->len);
+ __db_pr(bk->data, bk->len, fp);
break;
case B_DUPLICATE:
case B_OVERFLOW:
- __db_proff(bk);
+ __db_proff(bk, fp);
break;
default:
fprintf(fp,
@@ -673,19 +692,17 @@ __db_prpage(dbp, h, flags)
* __db_pr --
* Print out a data element.
*
- * PUBLIC: void __db_pr __P((u_int8_t *, u_int32_t));
+ * PUBLIC: void __db_pr __P((u_int8_t *, u_int32_t, FILE *));
*/
void
-__db_pr(p, len)
+__db_pr(p, len, fp)
u_int8_t *p;
u_int32_t len;
-{
FILE *fp;
+{
u_int lastch;
int i;
- fp = __db_prinit(NULL);
-
fprintf(fp, "len: %3lu", (u_long)len);
lastch = '.';
if (len != 0) {
@@ -744,6 +761,13 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
handle, callback, vdp, 0);
F_CLR(vdp, SALVAGE_PRINTHEADER);
F_SET(vdp, SALVAGE_PRINTFOOTER);
+
+ /*
+ * Even if the printable flag wasn't set by our immediate
+ * caller, it may be set on a salvage-wide basis.
+ */
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ checkprint = 1;
}
/*
@@ -760,12 +784,12 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
* in a platform-independent way. So we use the numeral in
* straight ASCII.
*/
- __ua_memcpy(&recno, dbtp->data, sizeof(recno));
+ (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);
/* If we're printing data as hex, print keys as hex too. */
if (!checkprint) {
- for (len = strlen(buf), p = buf, hp = hbuf;
+ for (len = (u_int32_t)strlen(buf), p = buf, hp = hbuf;
len-- > 0; ++p) {
*hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
*hp++ = hex[*p & 0x0f];
@@ -810,14 +834,12 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
* Print out an off-page element.
*/
static void
-__db_proff(vp)
+__db_proff(vp, fp)
void *vp;
-{
FILE *fp;
+{
BOVERFLOW *bo;
- fp = __db_prinit(NULL);
-
bo = vp;
switch (B_TYPE(bo->type)) {
case B_OVERFLOW:
@@ -834,18 +856,25 @@ __db_proff(vp)
* __db_prflags --
* Print out flags values.
*
- * PUBLIC: void __db_prflags __P((u_int32_t, const FN *, FILE *));
+ * PUBLIC: void __db_prflags __P((u_int32_t, const FN *, void *));
*/
void
-__db_prflags(flags, fn, fp)
+__db_prflags(flags, fn, vfp)
u_int32_t flags;
FN const *fn;
- FILE *fp;
+ void *vfp;
{
+ FILE *fp;
const FN *fnp;
int found;
const char *sep;
+ /*
+ * We pass the FILE * through a void * so that we can use
+ * this function as a callback.
+ */
+ fp = (FILE *)vfp;
+
sep = " (";
for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
if (LF_ISSET(fnp->mask)) {
@@ -858,62 +887,21 @@ __db_prflags(flags, fn, fp)
}
/*
- * __db_prinit --
- * Initialize tree printing routines.
- */
-static FILE *
-__db_prinit(fp)
- FILE *fp;
-{
- if (set_fp == NULL)
- set_fp = fp == NULL ? stdout : fp;
- return (set_fp);
-}
-
-/*
- * __db_psize --
- * Get the page size.
- */
-static void
-__db_psize(dbp)
- DB *dbp;
-{
- DBMETA *mp;
- db_pgno_t pgno;
-
- set_psize = PSIZE_BOUNDARY - 1;
-
- pgno = PGNO_BASE_MD;
- if (memp_fget(dbp->mpf, &pgno, 0, &mp) != 0)
- return;
-
- switch (mp->magic) {
- case DB_BTREEMAGIC:
- case DB_HASHMAGIC:
- case DB_QAMMAGIC:
- set_psize = mp->pagesize;
- break;
- }
- (void)memp_fput(dbp->mpf, mp, 0);
-}
-
-/*
* __db_dbtype_to_string --
* Return the name of the database type.
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE));
*/
-static const char *
-__db_dbtype_to_string(dbp)
- DB *dbp;
+const char *
+__db_dbtype_to_string(type)
+ DBTYPE type;
{
- switch (dbp->type) {
+ switch (type) {
case DB_BTREE:
return ("btree");
case DB_HASH:
return ("hash");
- break;
case DB_RECNO:
return ("recno");
- break;
case DB_QUEUE:
return ("queue");
default:
@@ -925,10 +913,8 @@ __db_dbtype_to_string(dbp)
/*
* __db_pagetype_to_string --
* Return the name of the specified page type.
- *
- * PUBLIC: const char *__db_pagetype_to_string __P((u_int32_t));
*/
-const char *
+static const char *
__db_pagetype_to_string(type)
u_int32_t type;
{
@@ -1000,6 +986,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
DB_ENV *dbenv;
DB_HASH_STAT *hsp;
DB_QUEUE_STAT *qsp;
+ DBT dbt;
VRFY_PAGEINFO *pip;
char *buf;
int buflen, ret, t_ret;
@@ -1021,10 +1008,16 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
* If we've been passed a verifier statistics object, use
* that; we're being called in a context where dbp->stat
* is unsafe.
+ *
+ * Also, the verifier may set the pflag on a per-salvage basis.
+ * If so, respect that.
*/
if (vdp != NULL) {
if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
return (ret);
+
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ pflag = 1;
} else
pip = NULL;
@@ -1071,16 +1064,22 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
/*
* 64 bytes is long enough, as a minimum bound, for any of the
- * fields besides subname. Subname can be anything, and so
- * 64 + subname is big enough for all the things we need to print here.
+ * fields besides subname. Subname uses __db_prdbt and therefore
+ * does not need buffer space here.
*/
- buflen = 64 + ((subname != NULL) ? strlen(subname) : 0);
- if ((ret = __os_malloc(dbenv, buflen, NULL, &buf)) != 0)
+ buflen = 64;
+ if ((ret = __os_malloc(dbenv, buflen, &buf)) != 0)
goto err;
if (subname != NULL) {
- snprintf(buf, buflen, "database=%s\n", subname);
+ snprintf(buf, buflen, "database=");
if ((ret = callback(handle, buf)) != 0)
goto err;
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.data = subname;
+ dbt.size = (u_int32_t)strlen(subname);
+ if ((ret = __db_prdbt(&dbt,
+ 1, NULL, handle, callback, 0, NULL)) != 0)
+ goto err;
}
switch (dbtype) {
case DB_BTREE:
@@ -1106,11 +1105,11 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
}
break;
}
- if ((ret = dbp->stat(dbp, &btsp, NULL, 0)) != 0) {
+ if ((ret = dbp->stat(dbp, &btsp, 0)) != 0) {
dbp->err(dbp, ret, "DB->stat");
goto err;
}
- if (F_ISSET(dbp, DB_BT_RECNUM))
+ if (F_ISSET(dbp, DB_AM_RECNUM))
if ((ret = callback(handle, "recnum=1\n")) != 0)
goto err;
if (btsp->bt_maxkey != 0) {
@@ -1144,7 +1143,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
}
break;
}
- if ((ret = dbp->stat(dbp, &hsp, NULL, 0)) != 0) {
+ if ((ret = dbp->stat(dbp, &hsp, 0)) != 0) {
dbp->err(dbp, ret, "DB->stat");
goto err;
}
@@ -1154,10 +1153,9 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
if ((ret = callback(handle, buf)) != 0)
goto err;
}
- if (hsp->hash_nelem != 0 || hsp->hash_nkeys != 0) {
- snprintf(buf, buflen, "h_nelem=%lu\n",
- hsp->hash_nelem > hsp->hash_nkeys ?
- (u_long)hsp->hash_nelem : (u_long)hsp->hash_nkeys);
+ if (hsp->hash_nkeys != 0) {
+ snprintf(buf, buflen,
+ "h_nelem=%lu\n", (u_long)hsp->hash_nkeys);
if ((ret = callback(handle, buf)) != 0)
goto err;
}
@@ -1172,15 +1170,24 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
goto err;
break;
}
- if ((ret = dbp->stat(dbp, &qsp, NULL, 0)) != 0) {
+ if ((ret = dbp->stat(dbp, &qsp, 0)) != 0) {
dbp->err(dbp, ret, "DB->stat");
goto err;
}
snprintf(buf, buflen, "re_len=%lu\n", (u_long)qsp->qs_re_len);
- if (qsp->qs_re_pad != 0 && qsp->qs_re_pad != ' ')
- snprintf(buf, buflen, "re_pad=%#x\n", qsp->qs_re_pad);
if ((ret = callback(handle, buf)) != 0)
goto err;
+ if (qsp->qs_re_pad != 0 && qsp->qs_re_pad != ' ') {
+ snprintf(buf, buflen, "re_pad=%#x\n", qsp->qs_re_pad);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ if (qsp->qs_extentsize != 0) {
+ snprintf(buf, buflen,
+ "extentsize=%lu\n", (u_long)qsp->qs_extentsize);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
break;
case DB_RECNO:
if ((ret = callback(handle, "type=recno\n")) != 0)
@@ -1198,14 +1205,14 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
}
break;
}
- if ((ret = dbp->stat(dbp, &btsp, NULL, 0)) != 0) {
+ if ((ret = dbp->stat(dbp, &btsp, 0)) != 0) {
dbp->err(dbp, ret, "DB->stat");
goto err;
}
- if (F_ISSET(dbp, DB_RE_RENUMBER))
+ if (F_ISSET(dbp, DB_AM_RENUMBER))
if ((ret = callback(handle, "renumber=1\n")) != 0)
goto err;
- if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
snprintf(buf, buflen,
"re_len=%lu\n", (u_long)btsp->bt_re_len);
if ((ret = callback(handle, buf)) != 0)
@@ -1233,6 +1240,9 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
goto err;
/* We should handle page size. XXX */
} else {
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
if (F_ISSET(dbp, DB_AM_DUP))
if ((ret = callback(handle, "duplicates=1\n")) != 0)
goto err;
@@ -1253,16 +1263,16 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
ret = callback(handle, "HEADER=END\n");
err: if (pip != NULL &&
- (t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ (t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
if (btsp != NULL)
- __os_free(btsp, 0);
+ __os_ufree(dbenv, btsp);
if (hsp != NULL)
- __os_free(hsp, 0);
+ __os_ufree(dbenv, hsp);
if (qsp != NULL)
- __os_free(qsp, 0);
+ __os_ufree(dbenv, qsp);
if (buf != NULL)
- __os_free(buf, buflen);
+ __os_free(dbenv, buf);
return (ret);
}
diff --git a/bdb/db/db_rec.c b/bdb/db/db_rec.c
index 998d074290d..303ab2fe1d4 100644
--- a/bdb/db/db_rec.c
+++ b/bdb/db/db_rec.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_rec.c,v 11.10 2000/08/03 15:32:19 ubell Exp $";
+static const char revid[] = "$Id: db_rec.c,v 11.35 2002/08/08 03:57:49 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,9 +18,9 @@ static const char revid[] = "$Id: db_rec.c,v 11.10 2000/08/03 15:32:19 ubell Exp
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/hash.h"
/*
* PUBLIC: int __db_addrem_recover
@@ -45,11 +45,12 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info)
u_int32_t change;
int cmp_n, cmp_p, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__db_addrem_print);
REC_INTRO(__db_addrem_read, 1);
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
if (DB_UNDO(op)) {
/*
* We are undoing and the page doesn't exist. That
@@ -59,7 +60,7 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info)
*/
goto done;
} else
- if ((ret = memp_fget(mpf,
+ if ((ret = mpf->get(mpf,
&argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
goto out;
}
@@ -95,13 +96,16 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info)
LSN(pagep) = argp->pagelsn;
}
- if ((ret = memp_fput(mpf, pagep, change)) != 0)
+ if ((ret = mpf->put(mpf, pagep, change)) != 0)
goto out;
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
ret = 0;
-out: REC_CLOSE;
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
}
/*
@@ -124,11 +128,12 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info)
u_int32_t change;
int cmp_n, cmp_p, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__db_big_print);
REC_INTRO(__db_big_read, 1);
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
if (DB_UNDO(op)) {
/*
* We are undoing and the page doesn't exist. That
@@ -139,7 +144,7 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info)
ret = 0;
goto ppage;
} else
- if ((ret = memp_fget(mpf,
+ if ((ret = mpf->get(mpf,
&argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
goto out;
}
@@ -161,7 +166,7 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info)
argp->next_pgno, 0, P_OVERFLOW);
OV_LEN(pagep) = argp->dbt.size;
OV_REF(pagep) = 1;
- memcpy((u_int8_t *)pagep + P_OVERHEAD, argp->dbt.data,
+ memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
argp->dbt.size);
PREV_PGNO(pagep) = argp->prev_pgno;
change = DB_MPOOL_DIRTY;
@@ -177,13 +182,21 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info)
if (change)
LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
- if ((ret = memp_fput(mpf, pagep, change)) != 0)
+ if ((ret = mpf->put(mpf, pagep, change)) != 0)
goto out;
+ pagep = NULL;
+
+ /*
+ * We only delete a whole chain of overflow.
+ * Each page is handled individually
+ */
+ if (argp->opcode == DB_REM_BIG)
+ goto done;
/* Now check the previous page. */
ppage: if (argp->prev_pgno != PGNO_INVALID) {
change = 0;
- if ((ret = memp_fget(mpf, &argp->prev_pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->prev_pgno, 0, &pagep)) != 0) {
if (DB_UNDO(op)) {
/*
* We are undoing and the page doesn't exist.
@@ -195,7 +208,7 @@ ppage: if (argp->prev_pgno != PGNO_INVALID) {
ret = 0;
goto npage;
} else
- if ((ret = memp_fget(mpf, &argp->prev_pgno,
+ if ((ret = mpf->get(mpf, &argp->prev_pgno,
DB_MPOOL_CREATE, &pagep)) != 0)
goto out;
}
@@ -204,28 +217,27 @@ ppage: if (argp->prev_pgno != PGNO_INVALID) {
cmp_p = log_compare(&LSN(pagep), &argp->prevlsn);
CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->prevlsn);
- if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
- (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
+ if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
/* Redo add, undo delete. */
NEXT_PGNO(pagep) = argp->pgno;
change = DB_MPOOL_DIRTY;
- } else if ((cmp_n == 0 &&
- DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
- (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
+ } else if (cmp_n == 0 &&
+ DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
/* Redo delete, undo add. */
NEXT_PGNO(pagep) = argp->next_pgno;
change = DB_MPOOL_DIRTY;
}
if (change)
LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
- if ((ret = memp_fput(mpf, pagep, change)) != 0)
+ if ((ret = mpf->put(mpf, pagep, change)) != 0)
goto out;
}
+ pagep = NULL;
/* Now check the next page. Can only be set on a delete. */
npage: if (argp->next_pgno != PGNO_INVALID) {
change = 0;
- if ((ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->next_pgno, 0, &pagep)) != 0) {
if (DB_UNDO(op)) {
/*
* We are undoing and the page doesn't exist.
@@ -235,7 +247,7 @@ npage: if (argp->next_pgno != PGNO_INVALID) {
*/
goto done;
} else
- if ((ret = memp_fget(mpf, &argp->next_pgno,
+ if ((ret = mpf->get(mpf, &argp->next_pgno,
DB_MPOOL_CREATE, &pagep)) != 0)
goto out;
}
@@ -252,21 +264,25 @@ npage: if (argp->next_pgno != PGNO_INVALID) {
}
if (change)
LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
- if ((ret = memp_fput(mpf, pagep, change)) != 0)
+ if ((ret = mpf->put(mpf, pagep, change)) != 0)
goto out;
}
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
ret = 0;
-out: REC_CLOSE;
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
}
/*
* __db_ovref_recover --
* Recovery function for __db_ovref().
*
- * PUBLIC: int __db_ovref_recover __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ * PUBLIC: int __db_ovref_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
*/
int
__db_ovref_recover(dbenv, dbtp, lsnp, op, info)
@@ -283,14 +299,15 @@ __db_ovref_recover(dbenv, dbtp, lsnp, op, info)
PAGE *pagep;
int cmp, modified, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__db_ovref_print);
REC_INTRO(__db_ovref_read, 1);
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
if (DB_UNDO(op))
goto done;
- (void)__db_pgerr(file_dbp, argp->pgno);
+ __db_pgerr(file_dbp, argp->pgno, ret);
goto out;
}
@@ -310,13 +327,16 @@ __db_ovref_recover(dbenv, dbtp, lsnp, op, info)
pagep->lsn = argp->lsn;
modified = 1;
}
- if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
goto out;
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
ret = 0;
-out: REC_CLOSE;
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
}
/*
@@ -341,6 +361,7 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info)
PAGE *pagep;
int cmp_n, cmp_p, modified, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__db_relink_print);
REC_INTRO(__db_relink_read, 1);
@@ -351,9 +372,9 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info)
* the current page is the result of a split and is being recovered
* elsewhere, so all we need do is recover the next page.
*/
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
if (DB_REDO(op)) {
- (void)__db_pgerr(file_dbp, argp->pgno);
+ __db_pgerr(file_dbp, argp->pgno, ret);
goto out;
}
goto next2;
@@ -376,12 +397,13 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info)
pagep->lsn = argp->lsn;
modified = 1;
}
-next1: if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+next1: if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
goto out;
+ pagep = NULL;
-next2: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
+next2: if ((ret = mpf->get(mpf, &argp->next, 0, &pagep)) != 0) {
if (DB_REDO(op)) {
- (void)__db_pgerr(file_dbp, argp->next);
+ __db_pgerr(file_dbp, argp->next, ret);
goto out;
}
goto prev;
@@ -409,14 +431,15 @@ next2: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) {
else
pagep->lsn = *lsnp;
}
- if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
goto out;
+ pagep = NULL;
if (argp->opcode == DB_ADD_PAGE)
goto done;
-prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
+prev: if ((ret = mpf->get(mpf, &argp->prev, 0, &pagep)) != 0) {
if (DB_REDO(op)) {
- (void)__db_pgerr(file_dbp, argp->prev);
+ __db_pgerr(file_dbp, argp->prev, ret);
goto out;
}
goto done;
@@ -441,13 +464,16 @@ prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) {
else
pagep->lsn = *lsnp;
}
- if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
goto out;
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
ret = 0;
-out: REC_CLOSE;
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
}
/*
@@ -468,8 +494,8 @@ __db_debug_recover(dbenv, dbtp, lsnp, op, info)
__db_debug_args *argp;
int ret;
- COMPQUIET(op, 0);
COMPQUIET(dbenv, NULL);
+ COMPQUIET(op, DB_TXN_ABORT);
COMPQUIET(info, NULL);
REC_PRINT(__db_debug_print);
@@ -504,11 +530,12 @@ __db_noop_recover(dbenv, dbtp, lsnp, op, info)
u_int32_t change;
int cmp_n, cmp_p, ret;
+ pagep = NULL;
COMPQUIET(info, NULL);
REC_PRINT(__db_noop_print);
REC_INTRO(__db_noop_read, 0);
- if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0)
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0)
goto out;
cmp_n = log_compare(lsnp, &LSN(pagep));
@@ -522,8 +549,349 @@ __db_noop_recover(dbenv, dbtp, lsnp, op, info)
LSN(pagep) = argp->prevlsn;
change = DB_MPOOL_DIRTY;
}
- ret = memp_fput(mpf, pagep, change);
+ ret = mpf->put(mpf, pagep, change);
+ pagep = NULL;
done: *lsnp = argp->prev_lsn;
-out: REC_CLOSE;
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * Pins the metadata page (PGNO_BASE_MD) and the allocated page
+ * (argp->pgno), compares each page's LSN with the LSNs stored in the
+ * log record, and redoes or undoes the allocation on each page as
+ * required by op. On the success path *lsnp is set to argp->prev_lsn
+ * and ret is 0. Any page still pinned at "out" is put back unmodified.
+ * An ENOENT result is turned into 0 when op is DB_TXN_BACKWARD_ALLOC.
+ *
+ * NOTE(review): REC_PRINT, REC_INTRO and REC_CLOSE are macros defined
+ * elsewhere; presumably REC_INTRO sets argp, file_dbp, dbc and mpf and
+ * REC_CLOSE returns ret -- confirm against their definitions.
+ *
+ * PUBLIC: int __db_pg_alloc_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_alloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, created, level, modified, ret;
+
+ /* NULL marks "not pinned" so the cleanup at "out" knows what to put. */
+ meta = NULL;
+ pagep = NULL;
+ REC_PRINT(__db_pg_alloc_print);
+ REC_INTRO(__db_pg_alloc_read, 0);
+
+ /*
+ * Fix up the allocated page. If we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list.
+ *
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = mpf->get(mpf, &pgno, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ created = modified = 0;
+ if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ /*
+ * We have to be able to identify if a page was newly
+ * created so we can recover it properly. We cannot simply
+ * look for an empty header, because hash uses a pgin
+ * function that will set the header. Instead, we explicitly
+ * try for the page without CREATE and if that fails, then
+ * create it.
+ */
+ if ((ret =
+ mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
+ __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ }
+ created = modified = 1;
+ }
+
+ /* Fix up the allocated page. */
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->page_lsn);
+
+ /*
+ * If an initial allocation is aborted and then reallocated
+ * during an archival restore the log record will have
+ * an LSN for the page but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+ * If we rolled back this allocation previously during an
+ * archive restore, the page may have the LSN of the meta page
+ * at the point of the roll back. This will be no more
+ * than the LSN of the metadata page at the time of this allocation.
+ * Another special case we have to handle is if we ended up with a
+ * page of all 0's which can happen if we abort between allocating a
+ * page in mpool and initializing it. In that case, even if we're
+ * undoing, we need to re-initialize the page.
+ */
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(argp->page_lsn) &&
+ log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo update described. */
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ modified = 1;
+ } else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+ /*
+ * This is where we handle the case of a 0'd page (pagep->pgno
+ * is equal to PGNO_INVALID).
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ modified = 1;
+ }
+
+ /*
+ * If the page was newly created, put it on the limbo list.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+ /* Put the page in limbo.*/
+ if ((ret = __db_add_limbo(dbenv,
+ info, argp->fileid, argp->pgno, 1)) != 0)
+ goto out;
+ }
+
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+ /* The page is no longer pinned; keep "out" from putting it again. */
+ pagep = NULL;
+
+ /* Fix up the metadata page. */
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ LSN(meta) = argp->meta_lsn;
+
+ /*
+ * If the page has a zero LSN then it's newly created
+ * and will go into limbo rather than directly on the
+ * free list.
+ */
+ if (!IS_ZERO_LSN(argp->page_lsn))
+ meta->free = argp->pgno;
+ modified = 1;
+ }
+ if ((ret = mpf->put(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+ meta = NULL;
+ /*
+ * This could be the metapage from a subdb which is read from disk
+ * to recover its creation.
+ *
+ * NOTE(review): the page type used for P_INIT above comes from
+ * argp->ptype, while this switch tests argp->type against page-type
+ * constants -- confirm argp->type is the intended field. The return
+ * value of sync is not checked here.
+ */
+ if (F_ISSET(file_dbp, DB_AM_SUBDB))
+ switch (argp->type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ file_dbp->sync(file_dbp, 0);
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ /* Release whatever is still pinned; errors from these puts are ignored. */
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ if (meta != NULL)
+ (void)mpf->put(mpf, meta, 0);
+ /* A missing page is not reported as an error for DB_TXN_BACKWARD_ALLOC. */
+ if (ret == ENOENT && op == DB_TXN_BACKWARD_ALLOC)
+ ret = 0;
+ REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover --
+ * Recovery function for pg_free.
+ *
+ * First fixes up the freed page (argp->pgno, created in mpool if it does
+ * not exist), then the metadata page (PGNO_BASE_MD). For each page the
+ * page LSN is compared with the LSNs in the log record to decide whether
+ * the free must be redone or undone. On the success path *lsnp is set
+ * to argp->prev_lsn and ret is 0. Any page still pinned at "out" is put
+ * back unmodified.
+ *
+ * NOTE(review): REC_PRINT, REC_INTRO and REC_CLOSE are macros defined
+ * elsewhere; presumably REC_INTRO sets argp, file_dbp, dbc and mpf and
+ * REC_CLOSE returns ret -- confirm against their definitions.
+ *
+ * PUBLIC: int __db_pg_free_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __db_pg_free_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ /* NULL marks "not pinned" so the cleanup at "out" knows what to put. */
+ meta = NULL;
+ pagep = NULL;
+ REC_PRINT(__db_pg_free_print);
+ REC_INTRO(__db_pg_free_read, 1);
+
+ /*
+ * Fix up the freed page. If we're redoing the operation we get the
+ * page and explicitly discard its contents, then update its LSN. If
+ * we're undoing the operation, we get the page and restore its header.
+ * Create the page if necessary, we may be freeing an aborted
+ * create.
+ */
+ if ((ret = mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ modified = 0;
+ /*
+ * Copy the LSN out of the logged page header into a local DB_LSN
+ * before comparing (presumably because header.data may not be
+ * suitably aligned -- confirm against __ua_memcpy).
+ */
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn);
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo update described. */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ memcpy(pagep, argp->header.data, argp->header.size);
+
+ modified = 1;
+ }
+ if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+ /* The page is no longer pinned; keep "out" from putting it again. */
+ pagep = NULL;
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN and free pointer.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = mpf->get(mpf, &pgno, 0, &meta)) != 0) {
+ /* The metadata page must always exist. */
+ __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo the deallocation. */
+ meta->free = argp->pgno;
+ LSN(meta) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ meta->free = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ modified = 1;
+ }
+ if ((ret = mpf->put(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ /* Release whatever is still pinned; errors from these puts are ignored. */
+out: if (pagep != NULL)
+ (void)mpf->put(mpf, pagep, 0);
+ if (meta != NULL)
+ (void)mpf->put(mpf, meta, 0);
+ REC_CLOSE;
+}
+
+/*
+ * __db_cksum_recover --
+ *	Recovery function for the checksum-failure log record.
+ *
+ *	Reads the record, then either returns 0 (when DB_ENV_FATAL is set on
+ *	the environment) or reports the failure and returns the result of
+ *	__db_panic(dbenv, DB_RUNRECOVERY). A failure to read the record is
+ *	returned as-is. lsnp, op and info are unused.
+ *
+ * PUBLIC: int __db_cksum_recover __P((DB_ENV *,
+ * PUBLIC: DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_cksum_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_cksum_args *argp;
+	int ret;
+
+	COMPQUIET(info, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+
+	REC_PRINT(__db_cksum_print);
+
+	ret = __db_cksum_read(dbenv, dbtp->data, &argp);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * A checksum failure can only be repaired by catastrophic recovery.
+	 * ret is already 0 here, which is the answer when DB_ENV_FATAL is
+	 * set; otherwise complain and panic the environment.
+	 */
+	if (!F_ISSET(dbenv, DB_ENV_FATAL)) {
+		__db_err(dbenv,
+		    "Checksum failure requires catastrophic recovery");
+		ret = __db_panic(dbenv, DB_RUNRECOVERY);
+	}
+
+	__os_free(dbenv, argp);
+	return (ret);
}
diff --git a/bdb/db/db_reclaim.c b/bdb/db/db_reclaim.c
index 739f348407d..9aa39bcfa9b 100644
--- a/bdb/db/db_reclaim.c
+++ b/bdb/db/db_reclaim.c
@@ -1,74 +1,26 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_reclaim.c,v 11.5 2000/04/07 14:26:58 bostic Exp $";
+static const char revid[] = "$Id: db_reclaim.c,v 11.28 2002/08/06 06:11:17 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
+#include <string.h>
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-
-/*
- * Assume that we enter with a valid pgno. We traverse a set of
- * duplicate pages. The format of the callback routine is:
- * callback(dbp, page, cookie, did_put). did_put is an output
- * value that will be set to 1 by the callback routine if it
- * already put the page back. Otherwise, this routine must
- * put the page.
- *
- * PUBLIC: int __db_traverse_dup __P((DB *,
- * PUBLIC: db_pgno_t, int (*)(DB *, PAGE *, void *, int *), void *));
- */
-int
-__db_traverse_dup(dbp, pgno, callback, cookie)
- DB *dbp;
- db_pgno_t pgno;
- int (*callback) __P((DB *, PAGE *, void *, int *));
- void *cookie;
-{
- PAGE *p;
- int did_put, i, opgno, ret;
-
- do {
- did_put = 0;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &p)) != 0)
- return (ret);
- pgno = NEXT_PGNO(p);
-
- for (i = 0; i < NUM_ENT(p); i++) {
- if (B_TYPE(GET_BKEYDATA(p, i)->type) == B_OVERFLOW) {
- opgno = GET_BOVERFLOW(p, i)->pgno;
- if ((ret = __db_traverse_big(dbp,
- opgno, callback, cookie)) != 0)
- goto err;
- }
- }
-
- if ((ret = callback(dbp, p, cookie, &did_put)) != 0)
- goto err;
-
- if (!did_put)
- if ((ret = memp_fput(dbp->mpf, p, 0)) != 0)
- return (ret);
- } while (pgno != PGNO_INVALID);
-
- if (0) {
-err: if (did_put == 0)
- (void)memp_fput(dbp->mpf, p, 0);
- }
- return (ret);
-}
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
/*
* __db_traverse_big
@@ -88,17 +40,20 @@ __db_traverse_big(dbp, pgno, callback, cookie)
int (*callback) __P((DB *, PAGE *, void *, int *));
void *cookie;
{
+ DB_MPOOLFILE *mpf;
PAGE *p;
int did_put, ret;
+ mpf = dbp->mpf;
+
do {
did_put = 0;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &p)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &p)) != 0)
return (ret);
pgno = NEXT_PGNO(p);
if ((ret = callback(dbp, p, cookie, &did_put)) == 0 &&
!did_put)
- ret = memp_fput(dbp->mpf, p, 0);
+ ret = mpf->put(mpf, p, 0);
} while (ret == 0 && pgno != PGNO_INVALID);
return (ret);
@@ -132,3 +87,162 @@ __db_reclaim_callback(dbp, p, cookie, putp)
return (0);
}
+
+/*
+ * __db_truncate_callback
+ *	This is the callback routine used during a truncate.
+ *	We are traversing a btree or hash table and trying to free all the
+ *	pages.
+ *
+ *	dbp	database handle being truncated
+ *	p	the pinned page to process
+ *	cookie	a db_trunc_param; its dbc is used for logging, locking and
+ *		freeing, and its count is incremented for each item counted
+ *	putp	set to 1 on every successful return: by then the page has
+ *		either been handed to __db_free or put back dirty
+ *
+ *	Root pages and hash bucket heads are not freed; they are
+ *	re-initialized as empty pages (logged as a free followed by an
+ *	alloc when logging is on). An overflow page whose reference count
+ *	is still non-zero after the decrement is put back rather than freed.
+ *
+ *	Returns 0 on success, otherwise the first error encountered.
+ *
+ * PUBLIC: int __db_truncate_callback __P((DB *, PAGE *, void *, int *));
+ */
+int
+__db_truncate_callback(dbp, p, cookie, putp)
+	DB *dbp;
+	PAGE *p;
+	void *cookie;
+	int *putp;
+{
+	DBMETA *meta;
+	DBT ldbt;
+	DB_LOCK metalock;
+	DB_MPOOLFILE *mpf;
+	db_indx_t indx, len, off, tlen, top;
+	db_pgno_t pgno;
+	db_trunc_param *param;
+	u_int8_t *hk, type;
+	int ret;
+
+	top = NUM_ENT(p);
+	mpf = dbp->mpf;
+	param = cookie;
+	/* Within this function, *putp == 1 means "free the page". */
+	*putp = 1;
+
+	switch (TYPE(p)) {
+	case P_LBTREE:
+		/* Skip for off-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
+			if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+				++param->count;
+		}
+		/* FALLTHROUGH */
+	case P_IBTREE:
+	case P_IRECNO:
+	case P_INVALID:
+		if (dbp->type != DB_HASH &&
+		    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+			goto reinit;
+		}
+		break;
+	case P_OVERFLOW:
+		if (DBC_LOGGING(param->dbc)) {
+			if ((ret = __db_ovref_log(dbp, param->dbc->txn,
+			    &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
+				return (ret);
+		} else
+			LSN_NOT_LOGGED(LSN(p));
+		/* Still referenced elsewhere: keep the page. */
+		if (--OV_REF(p) != 0)
+			*putp = 0;
+		break;
+	case P_LRECNO:
+		param->count += top;
+		if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = P_LRECNO;
+			goto reinit;
+		}
+		break;
+	case P_LDUP:
+		/* Correct for deleted items. */
+		for (indx = 0; indx < top; indx += O_INDX)
+			if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
+				++param->count;
+
+		break;
+	case P_HASH:
+		/* Correct for on-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			switch (*H_PAIRDATA(dbp, p, indx)) {
+			case H_OFFDUP:
+			case H_OFFPAGE:
+				break;
+			case H_KEYDATA:
+				++param->count;
+				break;
+			case H_DUPLICATE:
+				tlen = LEN_HDATA(dbp, p, 0, indx);
+				hk = H_PAIRDATA(dbp, p, indx);
+				/*
+				 * len is loaded in the loop body before
+				 * the increment expression uses it.
+				 */
+				for (off = 0; off < tlen;
+				    off += len + 2 * sizeof (db_indx_t)) {
+					++param->count;
+					memcpy(&len,
+					    HKEYDATA_DATA(hk)
+					    + off, sizeof(db_indx_t));
+				}
+			}
+		}
+		/* Don't free the head of the bucket. */
+		if (PREV_PGNO(p) == PGNO_INVALID) {
+			type = P_HASH;
+
+reinit:			*putp = 0;
+			if (DBC_LOGGING(param->dbc)) {
+				pgno = PGNO_BASE_MD;
+				if ((ret = __db_lget(param->dbc, LCK_ALWAYS,
+				    pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+					return (ret);
+				if ((ret = mpf->get(mpf,
+				    &pgno, 0, (PAGE **)&meta)) != 0) {
+					/*
+					 * The meta page was never pinned, so
+					 * it must not be put back; only the
+					 * lock has to be released.
+					 */
+					(void)__TLPUT(param->dbc, metalock);
+					return (ret);
+				}
+				memset(&ldbt, 0, sizeof(ldbt));
+				ldbt.data = p;
+				ldbt.size = P_OVERHEAD(dbp);
+				if ((ret = __db_pg_free_log(dbp,
+				    param->dbc->txn, &LSN(meta), 0,
+				    p->pgno, &LSN(meta),
+				    PGNO_BASE_MD, &ldbt, meta->free)) != 0)
+					goto err;
+				LSN(p) = LSN(meta);
+
+				if ((ret =
+				    __db_pg_alloc_log(dbp,
+				    param->dbc->txn, &LSN(meta), 0,
+				    &LSN(meta), PGNO_BASE_MD,
+				    &p->lsn, p->pgno, type, meta->free)) != 0) {
+					/* Meta page is pinned and locked. */
+err:					(void)mpf->put(mpf, (PAGE *)meta, 0);
+					(void)__TLPUT(param->dbc, metalock);
+					return (ret);
+				}
+				LSN(p) = LSN(meta);
+
+				if ((ret = mpf->put(mpf,
+				    (PAGE *)meta, DB_MPOOL_DIRTY)) != 0) {
+					(void)__TLPUT(param->dbc, metalock);
+					return (ret);
+				}
+				if ((ret = __TLPUT(param->dbc, metalock)) != 0)
+					return (ret);
+			} else
+				LSN_NOT_LOGGED(LSN(p));
+
+			P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
+			    PGNO_INVALID, type == P_HASH ? 0 : 1, type);
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp->dbenv, p->pgno));
+	}
+
+	if (*putp == 1) {
+		if ((ret = __db_free(param->dbc, p)) != 0)
+			return (ret);
+	} else {
+		if ((ret = mpf->put(mpf, p, DB_MPOOL_DIRTY)) != 0)
+			return (ret);
+		/* Tell the caller the page has already been put. */
+		*putp = 1;
+	}
+
+	return (0);
+}
diff --git a/bdb/db/db_remove.c b/bdb/db/db_remove.c
new file mode 100644
index 00000000000..ef11c342555
--- /dev/null
+++ b/bdb/db/db_remove.c
@@ -0,0 +1,318 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2002
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_remove.c,v 11.203 2002/08/19 18:34:18 margo Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/lock.h"
+
+static int __db_subdb_remove __P((DB *, DB_TXN *, const char *, const char *));
+static int __db_dbtxn_remove __P((DB *, DB_TXN *, const char *));
+
+/*
+ * __dbenv_dbremove
+ *	Remove method for DB_ENV.
+ *
+ *	Creates a temporary DB handle, removes the file (or the subdatabase
+ *	when subdb is non-NULL) through __db_remove_i, and closes the handle
+ *	again. flags is validated by __db_fchk against DB_AUTO_COMMIT. When
+ *	IS_AUTO_COMMIT selects it, a local transaction is created and then
+ *	committed on success or aborted on failure.
+ *
+ *	Returns 0 on success, otherwise the first error encountered. Once
+ *	the temporary handle exists it is closed on every path, including
+ *	the early error exits.
+ *
+ * PUBLIC: int __dbenv_dbremove __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__dbenv_dbremove(dbenv, txn, name, subdb, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB *dbp;
+	int ret, t_ret, txn_local;
+
+	txn_local = 0;
+
+	PANIC_CHECK(dbenv);
+	ENV_ILLEGAL_BEFORE_OPEN(dbenv, "DB_ENV->dbremove");
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(dbenv, "DB->remove", flags, DB_AUTO_COMMIT)) != 0)
+		return (ret);
+
+	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+		return (ret);
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage. From here on every exit must go through
+	 * "err" so that the handle created above is not leaked.
+	 */
+	if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+		if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(dbenv)) {
+			ret = __db_not_txn_env(dbenv);
+			goto err;
+		}
+
+	ret = __db_remove_i(dbp, txn, name, subdb);
+
+	/* Commit for DB_AUTO_COMMIT. */
+	if (txn_local) {
+		if (ret == 0)
+			ret = txn->commit(txn, 0);
+		else
+			if ((t_ret = txn->abort(txn)) != 0)
+				ret = __db_panic(dbenv, t_ret);
+		/*
+		 * We created the DBP here and when we committed/aborted,
+		 * we release all the transactional locks, which includes
+		 * the handle lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->lid = DB_LOCK_INVALIDID;
+	}
+
+	/*
+	 * We never opened this dbp for real, so don't call the transactional
+	 * version of DB->close, and use NOSYNC to avoid calling into mpool.
+	 */
+err:	if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_remove
+ *	Remove method for DB.
+ *
+ *	After the initial PANIC_CHECK the handle is always closed (with
+ *	DB_NOSYNC) before returning, whether or not the remove itself
+ *	succeeded. The first error seen is the one returned.
+ *
+ * PUBLIC: int __db_remove __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove(dbp, name, subdb, flags)
+	DB *dbp;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB_ENV *dbenv;
+	int close_ret, ret;
+
+	dbenv = dbp->dbenv;
+
+	PANIC_CHECK(dbenv);
+
+	/*
+	 * Each check below runs only if the previous one passed; a failure
+	 * falls straight through to the close so the handle is still
+	 * destroyed. DB_ILLEGAL_AFTER_OPEN cannot be used for the first
+	 * check because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		ret = __db_mi_open(dbenv, "DB->remove", 1);
+	else if ((ret = __db_fchk(dbenv, "DB->remove", flags, 0)) == 0 &&
+	    (ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) == 0)
+		ret = __db_remove_i(dbp, NULL, name, subdb);
+
+	/*
+	 * We never opened this dbp for real, use NOSYNC to avoid calling into
+	 * mpool. A close error is reported only if nothing failed earlier.
+	 */
+	close_ret = dbp->close(dbp, DB_NOSYNC);
+	if (ret == 0 && close_ret != 0)
+		ret = close_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_remove_i
+ *	Internal remove method for DB.
+ *
+ *	Dispatches subdatabase removes to __db_subdb_remove and transactional
+ *	file removes to __db_dbtxn_remove. Everything else is handled here
+ *	as a plain, non-transactional file remove.
+ *
+ * PUBLIC: int __db_remove_i __P((DB *, DB_TXN *, const char *, const char *));
+ */
+int
+__db_remove_i(dbp, txn, name, subdb)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	DB_ENV *dbenv;
+	DB_LSN lsn;
+	char *path;
+	int ret;
+
+	dbenv = dbp->dbenv;
+	path = NULL;
+
+	if (subdb != NULL)
+		return (__db_subdb_remove(dbp, txn, name, subdb));
+	if (txn != NULL)
+		return (__db_dbtxn_remove(dbp, txn, name));
+
+	/* Non-transactional file remove: resolve the real file name first. */
+	ret = __db_appname(dbenv, DB_APP_DATA, name, 0, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	/* Each step runs only while every earlier step has succeeded. */
+	ret = __fop_remove_setup(dbp, NULL, path, 0);
+	if (ret == 0 && dbp->db_am_remove != NULL)
+		ret = dbp->db_am_remove(dbp, NULL, name, subdb, &lsn);
+	if (ret == 0)
+		ret = __fop_remove(dbenv,
+		    NULL, dbp->fileid, name, DB_APP_DATA);
+
+	if (path != NULL)
+		__os_free(dbenv, path);
+
+	return (ret);
+}
+
+/*
+ * __db_subdb_remove --
+ *	Remove a subdatabase.
+ *
+ *	Opens the subdatabase on a fresh handle, reclaims its pages
+ *	(__bam_reclaim for btree/recno, __ham_reclaim for hash), then opens
+ *	the master database and removes the subdatabase's entry from it
+ *	with MU_REMOVE. Both handles are closed before returning.
+ *
+ *	Returns 0 on success, otherwise the first error encountered.
+ */
+static int
+__db_subdb_remove(dbp, txn, name, subdb)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	DB *mdbp, *sdbp;
+	int ret, t_ret;
+
+	mdbp = sdbp = NULL;
+
+	/* Open the subdatabase. */
+	if ((ret = db_create(&sdbp, dbp->dbenv, 0)) != 0)
+		goto err;
+	if ((ret = __db_open(sdbp,
+	    txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
+
+	/* Free up the pages in the subdatabase. */
+	switch (sdbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		if ((ret = __bam_reclaim(sdbp, txn)) != 0)
+			goto err;
+		break;
+	case DB_HASH:
+		if ((ret = __ham_reclaim(sdbp, txn)) != 0)
+			goto err;
+		break;
+	default:
+		ret = __db_unknown_type(
+		    sdbp->dbenv, "__db_subdb_remove", sdbp->type);
+		goto err;
+	}
+
+	/*
+	 * Remove the entry from the main database and free the subdatabase
+	 * metadata page.
+	 */
+	if ((ret = __db_master_open(sdbp, txn, name, 0, 0, &mdbp)) != 0)
+		goto err;
+
+	if ((ret = __db_master_update(
+	    mdbp, sdbp, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+	/*
+	 * Close the main and subdatabases. Either handle may still be NULL
+	 * if we failed before it was created, so check both.
+	 */
+	if (sdbp != NULL &&
+	    (t_ret = __db_close_i(sdbp, txn, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (mdbp != NULL &&
+	    (t_ret = __db_close_i(mdbp, txn, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_dbtxn_remove --
+ * Remove a file inside a transaction.
+ *
+ * Renames the file to a backup name obtained from __db_backup_name,
+ * invokes the access-method remove hook (if set) on the renamed file,
+ * and then calls __fop_remove on the renamed file. The backup name is
+ * freed before returning. Returns 0 on success, otherwise the first
+ * error encountered.
+ *
+ * NOTE(review): DB_TEST_RECOVERY and DB_TEST_RECOVERY_LABEL are macros
+ * defined elsewhere; presumably DB_TEST_RECOVERY may jump to the label
+ * -- confirm against their definitions.
+ */
+static int
+__db_dbtxn_remove(dbp, txn, name)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name;
+{
+ DB_ENV *dbenv;
+ DB_LSN newlsn;
+ int ret;
+ char *tmpname;
+
+ dbenv = dbp->dbenv;
+ tmpname = NULL;
+
+ /*
+ * This is a transactional remove, so we have to keep the name
+ * of the file locked until the transaction commits. As a result,
+ * we implement remove by renaming the file to some other name
+ * (which creates a dummy named file as a placeholder for the
+ * file being renamed/removed) and then deleting that file as
+ * a delayed remove at commit.
+ */
+ if ((ret = __db_backup_name(dbenv, name, txn, &tmpname)) != 0)
+ return (ret);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if ((ret = __db_rename_i(dbp, txn, name, NULL, tmpname)) != 0)
+ goto err;
+
+ /* The internal removes will also translate into delayed removes. */
+ if (dbp->db_am_remove != NULL &&
+ (ret = dbp->db_am_remove(dbp, txn, tmpname, NULL, &newlsn)) != 0)
+ goto err;
+
+ ret = __fop_remove(dbenv, txn, dbp->fileid, tmpname, DB_APP_DATA);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+err:
+DB_TEST_RECOVERY_LABEL
+ /* tmpname is only non-NULL once __db_backup_name has filled it in. */
+ if (tmpname != NULL)
+ __os_free(dbenv, tmpname);
+
+ return (ret);
+}
diff --git a/bdb/db/db_rename.c b/bdb/db/db_rename.c
new file mode 100644
index 00000000000..87f88232cda
--- /dev/null
+++ b/bdb/db/db_rename.c
@@ -0,0 +1,297 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2002
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_rename.c,v 11.203 2002/08/07 16:16:47 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+
+static int __db_subdb_rename __P(( DB *, DB_TXN *,
+ const char *, const char *, const char *));
+
+/*
+ * __dbenv_dbrename
+ *	Rename method for DB_ENV.
+ *
+ *	Creates a temporary DB handle, renames the file (or the subdatabase
+ *	when subdb is non-NULL) through __db_rename_i, and closes the handle
+ *	again. flags is validated by __db_fchk against DB_AUTO_COMMIT. When
+ *	IS_AUTO_COMMIT selects it, a local transaction is created and then
+ *	committed on success or aborted on failure.
+ *
+ *	Returns 0 on success, otherwise the first error encountered. Once
+ *	the temporary handle exists it is closed on every path, including
+ *	the early error exits.
+ *
+ * PUBLIC: int __dbenv_dbrename __P((DB_ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__dbenv_dbrename(dbenv, txn, name, subdb, newname, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB *dbp;
+	int ret, t_ret, txn_local;
+
+	txn_local = 0;
+
+	PANIC_CHECK(dbenv);
+	ENV_ILLEGAL_BEFORE_OPEN(dbenv, "DB_ENV->dbrename");
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(dbenv, "DB->rename", flags, DB_AUTO_COMMIT)) != 0)
+		return (ret);
+
+	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
+		return (ret);
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage. From here on every exit must go through
+	 * "err" so that the handle created above is not leaked.
+	 */
+	if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+		if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(dbenv)) {
+			ret = __db_not_txn_env(dbenv);
+			goto err;
+		}
+
+	ret = __db_rename_i(dbp, txn, name, subdb, newname);
+
+	/* Commit for DB_AUTO_COMMIT. */
+	if (txn_local) {
+		if (ret == 0)
+			ret = txn->commit(txn, 0);
+		else
+			if ((t_ret = txn->abort(txn)) != 0)
+				ret = __db_panic(dbenv, t_ret);
+
+		/*
+		 * We created the DBP here and when we committed/aborted,
+		 * we release all the transactional locks, which includes
+		 * the handle lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->lid = DB_LOCK_INVALIDID;
+	}
+
+	/*
+	 * We never opened this dbp for real, so don't call the transactional
+	 * version of DB->close, and use NOSYNC to avoid calling into mpool.
+	 */
+err:	if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_rename
+ *	Rename method for DB.
+ *
+ *	After the initial PANIC_CHECK the handle is always closed (with
+ *	DB_NOSYNC) before returning, whether or not the rename itself
+ *	succeeded. The first error seen is the one returned.
+ *
+ * PUBLIC: int __db_rename __P((DB *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename(dbp, name, subdb, newname, flags)
+	DB *dbp;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB_ENV *dbenv;
+	int close_ret, ret;
+
+	dbenv = dbp->dbenv;
+
+	PANIC_CHECK(dbenv);
+
+	/*
+	 * Each check below runs only if the previous one passed; a failure
+	 * falls straight through to the close so the handle is still
+	 * destroyed. DB_ILLEGAL_AFTER_OPEN cannot be used for the first
+	 * check because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		ret = __db_mi_open(dbenv, "DB->rename", 1);
+	else if ((ret = __db_fchk(dbenv, "DB->rename", flags, 0)) == 0 &&
+	    (ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) == 0)
+		ret = __db_rename_i(dbp, NULL, name, subdb, newname);
+
+	/*
+	 * We never opened this dbp for real, use NOSYNC to avoid calling into
+	 * mpool. A close error is reported only if nothing failed earlier.
+	 */
+	close_ret = dbp->close(dbp, DB_NOSYNC);
+	if (ret == 0 && close_ret != 0)
+		ret = close_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_rename_i
+ * Internal rename method for DB.
+ *
+ * A non-NULL subdb is handed to __db_subdb_rename. Otherwise the real
+ * file name is resolved, __fop_remove_setup is run on it, the
+ * access-method rename hook is called if set, and the file is renamed
+ * through __fop_dummy (transactional) or __fop_dbrename
+ * (non-transactional). Returns 0 on success, otherwise the first error
+ * encountered.
+ *
+ * NOTE(review): DB_TEST_RECOVERY and DB_TEST_RECOVERY_LABEL are macros
+ * defined elsewhere; presumably DB_TEST_RECOVERY may jump to the label
+ * -- confirm against their definitions.
+ *
+ * PUBLIC: int __db_rename_i __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__db_rename_i(dbp, txn, name, subdb, newname)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ DB_ENV *dbenv;
+ int ret;
+ char *real_name;
+
+ dbenv = dbp->dbenv;
+ real_name = NULL;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+ if (subdb != NULL) {
+ ret = __db_subdb_rename(dbp, txn, name, subdb, newname);
+ goto err;
+ }
+
+ /* From here on down, this pertains to files. */
+
+ /* Find the real name of the file. */
+ if ((ret = __db_appname(dbenv,
+ DB_APP_DATA, name, 0, NULL, &real_name)) != 0)
+ goto err;
+
+ if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
+ goto err;
+
+ if (dbp->db_am_rename != NULL &&
+ (ret = dbp->db_am_rename(dbp, txn, name, subdb, newname)) != 0)
+ goto err;
+
+ /*
+ * The transactional case and non-transactional case are
+ * quite different. In the non-transactional case, we simply
+ * do the rename. In the transactional case, since we need
+ * the ability to back out and maintain locking, we have to
+ * create a temporary object as a placeholder. This is all
+ * taken care of in the fop layer.
+ */
+ if (txn != NULL) {
+ if ((ret = __fop_dummy(dbp, txn, name, newname, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __fop_dbrename(dbp, name, newname)) != 0)
+ goto err;
+ }
+
+ /*
+ * I am pretty sure that we haven't gotten a dbreg id, so calling
+ * dbreg_filelist_update is not necessary.
+ */
+ DB_ASSERT(dbp->log_filename == NULL ||
+ dbp->log_filename->id == DB_LOGFILEID_INVALID);
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ /* real_name is only non-NULL once __db_appname has filled it in. */
+ if (real_name != NULL)
+ __os_free(dbenv, real_name);
+
+ return (ret);
+}
+
+/*
+ * __db_subdb_rename --
+ * Rename a subdatabase.
+ *
+ * Opens the master database, looks up the subdatabase with MU_OPEN,
+ * reads its metadata page to copy the file id into dbp and take a write
+ * handle lock, and then renames the entry in the master with MU_RENAME.
+ * The metadata page (if still pinned) and the master handle are released
+ * before returning. Returns 0 on success, otherwise the first error
+ * encountered.
+ *
+ * NOTE(review): DB_TEST_RECOVERY and DB_TEST_RECOVERY_LABEL are macros
+ * defined elsewhere; presumably DB_TEST_RECOVERY may jump to the label
+ * -- confirm against their definitions.
+ */
+static int
+__db_subdb_rename(dbp, txn, name, subdb, newname)
+ DB *dbp;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ DB *mdbp;
+ DB_ENV *dbenv;
+ PAGE *meta;
+ int ret, t_ret;
+
+ /* NULL marks "not acquired" for the cleanup code at "err". */
+ mdbp = NULL;
+ meta = NULL;
+ dbenv = dbp->dbenv;
+
+ /*
+ * We have not opened this dbp so it isn't marked as a subdb,
+ * but it ought to be.
+ */
+ F_SET(dbp, DB_AM_SUBDB);
+
+ /*
+ * Rename the entry in the main database. We need to first
+ * get the meta-data page number (via MU_OPEN) so that we can
+ * read the meta-data page and obtain a handle lock. Once we've
+ * done that, we can proceed to do the rename in the master.
+ */
+ if ((ret = __db_master_open(dbp, txn, name, 0, 0, &mdbp)) != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, txn, subdb, dbp->type,
+ MU_OPEN, NULL, 0)) != 0)
+ goto err;
+
+ if ((ret = mdbp->mpf->get(mdbp->mpf, &dbp->meta_pgno, 0, &meta)) != 0)
+ goto err;
+ /* Take the file id from the metadata page before locking the handle. */
+ memcpy(&dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+ if ((ret = __fop_lock_handle(dbenv,
+ dbp, mdbp->lid, DB_LOCK_WRITE, NULL, 0)) != 0)
+ goto err;
+
+ /*
+ * Clear meta before testing ret so that "err" does not put the
+ * page a second time when this put fails.
+ */
+ ret = mdbp->mpf->put(mdbp->mpf, meta, 0);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_master_update(mdbp, dbp, txn,
+ subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
+ goto err;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ /* Cleanup errors are reported only if nothing failed earlier. */
+ if (meta != NULL &&
+ (t_ret = mdbp->mpf->put(mdbp->mpf, meta, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (mdbp != NULL &&
+ (t_ret = __db_close_i(mdbp, txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/bdb/db/db_ret.c b/bdb/db/db_ret.c
index 0782de3e450..b1af7b4ffeb 100644
--- a/bdb/db/db_ret.c
+++ b/bdb/db/db_ret.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_ret.c,v 11.12 2000/11/30 00:58:33 ubell Exp $";
+static const char revid[] = "$Id: db_ret.c,v 11.21 2002/03/28 19:21:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,9 +18,8 @@ static const char revid[] = "$Id: db_ret.c,v 11.12 2000/11/30 00:58:33 ubell Exp
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "btree.h"
-#include "db_am.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
/*
* __db_ret --
@@ -47,19 +46,19 @@ __db_ret(dbp, h, indx, dbt, memp, memsize)
switch (TYPE(h)) {
case P_HASH:
- hk = P_ENTRY(h, indx);
+ hk = P_ENTRY(dbp, h, indx);
if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
memcpy(&ho, hk, sizeof(HOFFPAGE));
return (__db_goff(dbp, dbt,
ho.tlen, ho.pgno, memp, memsize));
}
- len = LEN_HKEYDATA(h, dbp->pgsize, indx);
+ len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
data = HKEYDATA_DATA(hk);
break;
case P_LBTREE:
case P_LDUP:
case P_LRECNO:
- bk = GET_BKEYDATA(h, indx);
+ bk = GET_BKEYDATA(dbp, h, indx);
if (B_TYPE(bk->type) == B_OVERFLOW) {
bo = (BOVERFLOW *)bk;
return (__db_goff(dbp, dbt,
@@ -69,33 +68,30 @@ __db_ret(dbp, h, indx, dbt, memp, memsize)
data = bk->data;
break;
default:
- return (__db_pgfmt(dbp, h->pgno));
+ return (__db_pgfmt(dbp->dbenv, h->pgno));
}
- return (__db_retcopy(dbp, dbt, data, len, memp, memsize));
+ return (__db_retcopy(dbp->dbenv, dbt, data, len, memp, memsize));
}
/*
* __db_retcopy --
* Copy the returned data into the user's DBT, handling special flags.
*
- * PUBLIC: int __db_retcopy __P((DB *, DBT *,
+ * PUBLIC: int __db_retcopy __P((DB_ENV *, DBT *,
* PUBLIC: void *, u_int32_t, void **, u_int32_t *));
*/
int
-__db_retcopy(dbp, dbt, data, len, memp, memsize)
- DB *dbp;
+__db_retcopy(dbenv, dbt, data, len, memp, memsize)
+ DB_ENV *dbenv;
DBT *dbt;
void *data;
u_int32_t len;
void **memp;
u_int32_t *memsize;
{
- DB_ENV *dbenv;
int ret;
- dbenv = dbp == NULL ? NULL : dbp->dbenv;
-
/* If returning a partial record, reset the length. */
if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
data = (u_int8_t *)data + dbt->doff;
@@ -131,12 +127,10 @@ __db_retcopy(dbp, dbt, data, len, memp, memsize)
* memory pointer is allowed to be NULL.
*/
if (F_ISSET(dbt, DB_DBT_MALLOC)) {
- if ((ret = __os_malloc(dbenv, len,
- dbp == NULL ? NULL : dbp->db_malloc, &dbt->data)) != 0)
+ if ((ret = __os_umalloc(dbenv, len, &dbt->data)) != 0)
return (ret);
} else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
- if ((ret = __os_realloc(dbenv, len,
- dbp == NULL ? NULL : dbp->db_realloc, &dbt->data)) != 0)
+ if ((ret = __os_urealloc(dbenv, len, &dbt->data)) != 0)
return (ret);
} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
@@ -145,7 +139,7 @@ __db_retcopy(dbp, dbt, data, len, memp, memsize)
return (EINVAL);
} else {
if (len != 0 && (*memsize == 0 || *memsize < len)) {
- if ((ret = __os_realloc(dbenv, len, NULL, memp)) != 0) {
+ if ((ret = __os_realloc(dbenv, len, memp)) != 0) {
*memsize = 0;
return (ret);
}
diff --git a/bdb/db/db_truncate.c b/bdb/db/db_truncate.c
new file mode 100644
index 00000000000..49546ae51b9
--- /dev/null
+++ b/bdb/db/db_truncate.c
@@ -0,0 +1,95 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2002
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_truncate.c,v 11.185 2002/08/07 16:16:48 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_truncate
+ * truncate method for DB.
+ *
+ * Checks the flags and the transaction arguments, then dispatches on
+ * dbp->type to the btree/recno, hash or queue truncate routine.
+ *
+ * dbp: database handle to truncate.
+ * txn: caller's transaction; when IS_AUTO_COMMIT() is true it is
+ * replaced by a transaction obtained from __db_txn_auto().
+ * countp: handed unchanged to the access-method routine (presumably
+ * where the discarded-record count is stored -- confirm
+ * against those routines).
+ * flags: checked by __db_fchk() against DB_AUTO_COMMIT only.
+ *
+ * Returns 0 on success, otherwise the error from the failing check or
+ * access-method routine. When a local transaction was created, the
+ * result of its commit becomes the return value on success; on failure
+ * it is aborted, and a failed abort returns whatever __db_panic()
+ * returns.
+ *
+ * PUBLIC: int __db_truncate __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ */
+int
+__db_truncate(dbp, txn, countp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t *countp, flags;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret, txn_local;
+
+ dbenv = dbp->dbenv;
+ ret = txn_local = 0;
+
+ PANIC_CHECK(dbenv);
+
+ /* Check for invalid flags; DB_AUTO_COMMIT is the only one allowed. */
+ if ((ret =
+ __db_fchk(dbenv, "DB->truncate", flags, DB_AUTO_COMMIT)) != 0)
+ return (ret);
+
+ /*
+ * Create local transaction as necessary, check for consistent
+ * transaction usage.
+ *
+ * txn_local records that the transaction was created here, so the
+ * exit code below knows it has to commit or abort it. Outside the
+ * auto-commit case, a caller-supplied txn is rejected when
+ * TXN_ON(dbenv) is false.
+ */
+ if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+ if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+ return (ret);
+ txn_local = 1;
+ } else
+ if (txn != NULL && !TXN_ON(dbenv))
+ return (__db_not_txn_env(dbenv));
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL);
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bam_truncate(dbp, txn, countp)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __ham_truncate(dbp, txn, countp)) != 0)
+ goto err;
+ break;
+ case DB_QUEUE:
+ if ((ret = __qam_truncate(dbp, txn, countp)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_type(
+ dbenv, "__db_truncate", dbp->type);
+ goto err;
+ }
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
+
+DB_TEST_RECOVERY_LABEL
+err:
+ /*
+ * Commit for DB_AUTO_COMMIT.
+ *
+ * Only a transaction created above is resolved here: it is
+ * committed when ret is 0 and aborted otherwise. If the abort
+ * itself fails, ret is replaced by the value of __db_panic().
+ */
+ if (txn_local) {
+ if (ret == 0)
+ ret = txn->commit(txn, 0);
+ else
+ if ((t_ret = txn->abort(txn)) != 0)
+ ret = __db_panic(dbenv, t_ret);
+ }
+
+ return (ret);
+}
diff --git a/bdb/db/db_upg.c b/bdb/db/db_upg.c
index d8573146ad6..c0eb72f3713 100644
--- a/bdb/db/db_upg.c
+++ b/bdb/db/db_upg.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_upg.c,v 11.20 2000/12/12 17:35:30 bostic Exp $";
+static const char revid[] = "$Id: db_upg.c,v 11.29 2002/03/27 18:59:04 krinsky Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,11 +18,11 @@ static const char revid[] = "$Id: db_upg.c,v 11.20 2000/12/12 17:35:30 bostic Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_swap.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
static int (* const func_31_list[P_PAGETYPE_MAX])
__P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
@@ -68,7 +68,7 @@ __db_upgrade(dbp, fname, flags)
/* Get the real backing file name. */
if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, fname, 0, NULL, &real_name)) != 0)
+ DB_APP_DATA, fname, 0, NULL, &real_name)) != 0)
return (ret);
/* Open the file. */
@@ -117,6 +117,7 @@ __db_upgrade(dbp, fname, flags)
goto err;
/* FALLTHROUGH */
case 8:
+ case 9:
break;
default:
__db_err(dbenv, "%s: unsupported btree version: %lu",
@@ -173,6 +174,7 @@ __db_upgrade(dbp, fname, flags)
goto err;
/* FALLTHROUGH */
case 7:
+ case 8:
break;
default:
__db_err(dbenv, "%s: unsupported hash version: %lu",
@@ -202,6 +204,7 @@ __db_upgrade(dbp, fname, flags)
goto err;
/* FALLTHROUGH */
case 3:
+ case 4:
break;
default:
__db_err(dbenv, "%s: unsupported queue version: %lu",
@@ -231,9 +234,9 @@ __db_upgrade(dbp, fname, flags)
ret = __os_fsync(dbenv, &fh);
-err: if ((t_ret = __os_closehandle(&fh)) != 0 && ret == 0)
+err: if ((t_ret = __os_closehandle(dbenv, &fh)) != 0 && ret == 0)
ret = t_ret;
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
/* We're done. */
if (dbp->db_feedback != NULL)
@@ -268,7 +271,7 @@ __db_page_pass(dbp, real_name, flags, fl, fhp)
return (ret);
/* Allocate memory for a single page. */
- if ((ret = __os_malloc(dbenv, dbp->pgsize, NULL, &page)) != 0)
+ if ((ret = __os_malloc(dbenv, dbp->pgsize, &page)) != 0)
return (ret);
/* Walk the file, calling the underlying conversion functions. */
@@ -294,7 +297,7 @@ __db_page_pass(dbp, real_name, flags, fl, fhp)
}
}
- __os_free(page, dbp->pgsize);
+ __os_free(dbp->dbenv, page);
return (ret);
}
diff --git a/bdb/db/db_upg_opd.c b/bdb/db/db_upg_opd.c
index a7be784afb8..f410b797bff 100644
--- a/bdb/db/db_upg_opd.c
+++ b/bdb/db/db_upg_opd.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_upg_opd.c,v 11.9 2000/11/30 00:58:33 ubell Exp $";
+static const char revid[] = "$Id: db_upg_opd.c,v 11.18 2002/08/06 06:11:18 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,11 +18,8 @@ static const char revid[] = "$Id: db_upg_opd.c,v 11.9 2000/11/30 00:58:33 ubell
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_swap.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
@@ -71,7 +68,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
pgno_cur = pgno_next = NULL;
/* Allocate room to hold a page. */
- if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &page)) != 0)
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &page)) != 0)
goto err;
/*
@@ -85,7 +82,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
if (pgno_max == cur_cnt) {
pgno_max += 20;
if ((ret = __os_realloc(dbp->dbenv, pgno_max *
- sizeof(db_pgno_t), NULL, &pgno_cur)) != 0)
+ sizeof(db_pgno_t), &pgno_cur)) != 0)
goto err;
}
pgno_cur[cur_cnt++] = pgno;
@@ -112,7 +109,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
* list while we do so.
*/
if ((ret = __os_malloc(dbp->dbenv,
- cur_cnt * sizeof(db_pgno_t), NULL, &pgno_next)) != 0)
+ cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0)
goto err;
/* Figure out where we can start allocating new pages. */
@@ -121,7 +118,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
/* Allocate room for an internal page. */
if ((ret = __os_malloc(dbp->dbenv,
- dbp->pgsize, NULL, &ipage)) != 0)
+ dbp->pgsize, &ipage)) != 0)
goto err;
PGNO(ipage) = PGNO_INVALID;
}
@@ -187,13 +184,13 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
*pgnop = pgno_cur[0];
err: if (pgno_cur != NULL)
- __os_free(pgno_cur, 0);
+ __os_free(dbp->dbenv, pgno_cur);
if (pgno_next != NULL)
- __os_free(pgno_next, 0);
+ __os_free(dbp->dbenv, pgno_next);
if (ipage != NULL)
- __os_free(ipage, dbp->pgsize);
+ __os_free(dbp->dbenv, ipage);
if (page != NULL)
- __os_free(page, dbp->pgsize);
+ __os_free(dbp->dbenv, page);
return (ret);
}
@@ -214,22 +211,24 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
BKEYDATA *child_bk;
u_int8_t *p;
int ret;
+ db_indx_t *inp;
+ inp = P_INP(dbp, ipage);
switch (TYPE(page)) {
case P_IBTREE:
- child_bi = GET_BINTERNAL(page, 0);
- if (P_FREESPACE(ipage) < BINTERNAL_PSIZE(child_bi->len)) {
+ child_bi = GET_BINTERNAL(dbp, page, 0);
+ if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) {
*nomemp = 1;
return (0);
}
- ipage->inp[indx] =
- HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
- p = P_ENTRY(ipage, indx);
+ inp[indx] =
+ HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
+ p = P_ENTRY(dbp, ipage, indx);
bi.len = child_bi->len;
B_TSET(bi.type, child_bi->type, 0);
bi.pgno = PGNO(page);
- bi.nrecs = __bam_total(page);
+ bi.nrecs = __bam_total(dbp, page);
memcpy(p, &bi, SSZA(BINTERNAL, data));
p += SSZA(BINTERNAL, data);
memcpy(p, child_bi->data, child_bi->len);
@@ -241,40 +240,40 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
return (ret);
break;
case P_LDUP:
- child_bk = GET_BKEYDATA(page, 0);
+ child_bk = GET_BKEYDATA(dbp, page, 0);
switch (B_TYPE(child_bk->type)) {
case B_KEYDATA:
- if (P_FREESPACE(ipage) <
+ if (P_FREESPACE(dbp, ipage) <
BINTERNAL_PSIZE(child_bk->len)) {
*nomemp = 1;
return (0);
}
- ipage->inp[indx] =
+ inp[indx] =
HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len);
- p = P_ENTRY(ipage, indx);
+ p = P_ENTRY(dbp, ipage, indx);
bi.len = child_bk->len;
B_TSET(bi.type, child_bk->type, 0);
bi.pgno = PGNO(page);
- bi.nrecs = __bam_total(page);
+ bi.nrecs = __bam_total(dbp, page);
memcpy(p, &bi, SSZA(BINTERNAL, data));
p += SSZA(BINTERNAL, data);
memcpy(p, child_bk->data, child_bk->len);
break;
case B_OVERFLOW:
- if (P_FREESPACE(ipage) <
+ if (P_FREESPACE(dbp, ipage) <
BINTERNAL_PSIZE(BOVERFLOW_SIZE)) {
*nomemp = 1;
return (0);
}
- ipage->inp[indx] =
+ inp[indx] =
HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE);
- p = P_ENTRY(ipage, indx);
+ p = P_ENTRY(dbp, ipage, indx);
bi.len = BOVERFLOW_SIZE;
B_TSET(bi.type, child_bk->type, 0);
bi.pgno = PGNO(page);
- bi.nrecs = __bam_total(page);
+ bi.nrecs = __bam_total(dbp, page);
memcpy(p, &bi, SSZA(BINTERNAL, data));
p += SSZA(BINTERNAL, data);
memcpy(p, child_bk, BOVERFLOW_SIZE);
@@ -285,11 +284,11 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
return (ret);
break;
default:
- return (__db_pgfmt(dbp, PGNO(page)));
+ return (__db_pgfmt(dbp->dbenv, PGNO(page)));
}
break;
default:
- return (__db_pgfmt(dbp, PGNO(page)));
+ return (__db_pgfmt(dbp->dbenv, PGNO(page)));
}
return (0);
@@ -308,19 +307,19 @@ __db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
int *nomemp;
{
RINTERNAL ri;
+ db_indx_t *inp;
- COMPQUIET(dbp, NULL);
COMPQUIET(fhp, NULL);
-
- if (P_FREESPACE(ipage) < RINTERNAL_PSIZE) {
+ inp = P_INP(dbp, ipage);
+ if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) {
*nomemp = 1;
return (0);
}
ri.pgno = PGNO(page);
- ri.nrecs = __bam_total(page);
- ipage->inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
- memcpy(P_ENTRY(ipage, indx), &ri, RINTERNAL_SIZE);
+ ri.nrecs = __bam_total(dbp, page);
+ inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
+ memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE);
return (0);
}
@@ -340,14 +339,14 @@ __db_up_ovref(dbp, fhp, pgno)
int ret;
/* Allocate room to hold a page. */
- if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &page)) != 0)
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &page)) != 0)
return (ret);
GET_PAGE(dbp, fhp, pgno, page);
++OV_REF(page);
PUT_PAGE(dbp, fhp, pgno, page);
-err: __os_free(page, dbp->pgsize);
+err: __os_free(dbp->dbenv, page);
return (ret);
}
diff --git a/bdb/db/db_vrfy.c b/bdb/db/db_vrfy.c
index 3509e05e91f..1bbecdbd87a 100644
--- a/bdb/db/db_vrfy.c
+++ b/bdb/db/db_vrfy.c
@@ -1,16 +1,16 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000
+ * Copyright (c) 2000-2002
* Sleepycat Software. All rights reserved.
*
- * $Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Exp $
+ * $Id: db_vrfy.c,v 1.107 2002/09/03 17:27:15 bostic Exp $
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Exp $";
+static const char revid[] = "$Id: db_vrfy.c,v 1.107 2002/09/03 17:27:15 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -20,19 +20,25 @@ static const char revid[] = "$Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Ex
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_swap.h"
-#include "db_verify.h"
-#include "db_ext.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
static int __db_guesspgsize __P((DB_ENV *, DB_FH *));
static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *));
static int __db_is_valid_pagetype __P((u_int32_t));
static int __db_meta2pgset
__P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *));
+static int __db_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
static int __db_salvage_subdbs
__P((DB *, VRFY_DBINFO *, void *,
int(*)(void *, const void *), u_int32_t, int *));
@@ -136,9 +142,7 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags)
DB *dbp;
DB_ENV *dbenv;
DB_FH fh, *fhp;
- PAGE *h;
VRFY_DBINFO *vdp;
- db_pgno_t last;
int has, ret, isbad;
char *real_name;
@@ -153,16 +157,22 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags)
PANIC_CHECK(dbenv);
DB_ILLEGAL_AFTER_OPEN(dbp_orig, "verify");
-#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_ORDERCHKONLY | DB_SALVAGE)
+#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_ORDERCHKONLY | \
+ DB_PRINTABLE | DB_SALVAGE)
if ((ret = __db_fchk(dbenv, "DB->verify", flags, OKFLAGS)) != 0)
return (ret);
/*
* DB_SALVAGE is mutually exclusive with the other flags except
- * DB_AGGRESSIVE.
+ * DB_AGGRESSIVE and DB_PRINTABLE.
*/
if (LF_ISSET(DB_SALVAGE) &&
- (flags & ~DB_AGGRESSIVE) != DB_SALVAGE)
+ (flags & ~DB_AGGRESSIVE & ~DB_PRINTABLE) != DB_SALVAGE)
+ return (__db_ferr(dbenv, "__db_verify", 1));
+
+ /* DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging. */
+ if ((LF_ISSET(DB_AGGRESSIVE) || LF_ISSET(DB_PRINTABLE)) &&
+ !LF_ISSET(DB_SALVAGE))
return (__db_ferr(dbenv, "__db_verify", 1));
if (LF_ISSET(DB_ORDERCHKONLY) && flags != DB_ORDERCHKONLY)
@@ -232,9 +242,17 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags)
if ((ret = __db_vrfy_dbinfo_create(dbenv, 1024, &vdp)) != 0)
goto err;
+ /*
+ * Note whether the user has requested that we use printable
+ * chars where possible. We won't get here with this flag if
+ * we're not salvaging.
+ */
+ if (LF_ISSET(DB_PRINTABLE))
+ F_SET(vdp, SALVAGE_PRINTABLE);
+
/* Find the real name of the file. */
if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0)
+ DB_APP_DATA, name, 0, NULL, &real_name)) != 0)
goto err;
/*
@@ -271,25 +289,15 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags)
* the [safe] part of __db_open that initializes the environment--
* and the mpool--manually.
*/
- if ((ret = __db_dbenv_setup(dbp,
- name, DB_ODDFILESIZE | DB_RDONLY)) != 0)
+ if ((ret = __db_dbenv_setup(dbp, NULL,
+ name, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0)
return (ret);
/* Mark the dbp as opened, so that we correctly handle its close. */
- F_SET(dbp, DB_OPEN_CALLED);
-
- /*
- * Find out the page number of the last page in the database.
- *
- * XXX: This currently fails if the last page is of bad type,
- * because it calls __db_pgin and that pukes. This is bad.
- */
- if ((ret = memp_fget(dbp->mpf, &last, DB_MPOOL_LAST, &h)) != 0)
- goto err;
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
- goto err;
+ F_SET(dbp, DB_AM_OPEN_CALLED);
- vdp->last_pgno = last;
+ /* Find out the page number of the last page in the database. */
+ dbp->mpf->last_pgno(dbp->mpf, &vdp->last_pgno);
/*
* DB_ORDERCHKONLY is a special case; our file consists of
@@ -373,7 +381,10 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags)
}
if (0) {
-err: (void)__db_err(dbenv, "%s: %s", name, db_strerror(ret));
+ /* Don't try to strerror() DB_VERIFY_FATAL; it's private. */
+err: if (ret == DB_VERIFY_FATAL)
+ ret = DB_VERIFY_BAD;
+ (void)__db_err(dbenv, "%s: %s", name, db_strerror(ret));
}
if (LF_ISSET(DB_SALVAGE) &&
@@ -385,13 +396,13 @@ done: if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
dbp->db_feedback(dbp, DB_VERIFY, 100);
if (F_ISSET(fhp, DB_FH_VALID))
- (void)__os_closehandle(fhp);
+ (void)__os_closehandle(dbenv, fhp);
if (dbp)
(void)dbp->close(dbp, 0);
if (vdp)
- (void)__db_vrfy_dbinfo_destroy(vdp);
+ (void)__db_vrfy_dbinfo_destroy(dbenv, vdp);
if (real_name)
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
if ((ret == 0 && isbad == 1) || ret == DB_VERIFY_FATAL)
ret = DB_VERIFY_BAD;
@@ -417,10 +428,11 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
db_pgno_t freelist;
- int t_ret, ret, nr, swapped;
+ size_t nr;
+ int isbad, ret, swapped;
u_int8_t mbuf[DBMETASIZE];
- swapped = ret = t_ret = 0;
+ isbad = ret = swapped = 0;
freelist = 0;
dbenv = dbp->dbenv;
meta = (DBMETA *)mbuf;
@@ -432,29 +444,43 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
* may be zero; this is okay, as we want page zero anyway and
* 0*0 == 0.
*/
- if ((ret = __os_seek(dbenv, fhp, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0)
- goto err;
-
- if ((ret = __os_read(dbenv, fhp, mbuf, DBMETASIZE, (size_t *)&nr)) != 0)
- goto err;
+ if ((ret = __os_seek(dbenv, fhp, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0 ||
+ (ret = __os_read(dbenv, fhp, mbuf, DBMETASIZE, &nr)) != 0) {
+ __db_err(dbenv,
+ "Metadata page %lu cannot be read: %s",
+ (u_long)PGNO_BASE_MD, db_strerror(ret));
+ return (ret);
+ }
if (nr != DBMETASIZE) {
- EPRINT((dbp->dbenv,
- "Incomplete metadata page %lu", (u_long)PGNO_BASE_MD));
- t_ret = DB_VERIFY_FATAL;
- goto err;
+ EPRINT((dbenv,
+ "Page %lu: Incomplete metadata page",
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+
+ /*
+ * Run the generic metadata check. Any nonzero return marks the
+ * database as bad; a return other than -1 additionally means the
+ * check itself could not be carried out, so verification stops
+ * with DB_VERIFY_FATAL.
+ *
+ * The page number is passed as a separate argument to match the
+ * %lu conversion in each message.
+ */
+ if ((ret = __db_chk_meta(dbenv, dbp, meta, 1)) != 0) {
+ EPRINT((dbenv,
+ "Page %lu: metadata page corrupted",
+ (u_long)PGNO_BASE_MD));
+ isbad = 1;
+ if (ret != -1) {
+ EPRINT((dbenv,
+ "Page %lu: could not check metadata page",
+ (u_long)PGNO_BASE_MD));
+ return (DB_VERIFY_FATAL);
+ }
+ }
/*
* Check all of the fields that we can.
+ *
+ * 08-11: Current page number. Must == pgno.
+ * Note that endianness doesn't matter--it's zero.
*/
-
- /* 08-11: Current page number. Must == pgno. */
- /* Note that endianness doesn't matter--it's zero. */
if (meta->pgno != PGNO_BASE_MD) {
- EPRINT((dbp->dbenv, "Bad pgno: was %lu, should be %lu",
- (u_long)meta->pgno, (u_long)PGNO_BASE_MD));
- ret = DB_VERIFY_BAD;
+ isbad = 1;
+ EPRINT((dbenv, "Page %lu: pgno incorrectly set to %lu",
+ (u_long)PGNO_BASE_MD, (u_long)meta->pgno));
}
/* 12-15: Magic number. Must be one of valid set. */
@@ -466,9 +492,10 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
&dbp->type))
swapped = 1;
else {
- EPRINT((dbp->dbenv,
- "Bad magic number: %lu", (u_long)meta->magic));
- ret = DB_VERIFY_BAD;
+ isbad = 1;
+ EPRINT((dbenv,
+ "Page %lu: bad magic number %lu",
+ (u_long)PGNO_BASE_MD, (u_long)meta->magic));
}
}
@@ -478,12 +505,19 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
*/
if (swapped)
M_32_SWAP(meta->version);
- if ((dbp->type == DB_BTREE && meta->version != DB_BTREEVERSION) ||
- (dbp->type == DB_HASH && meta->version != DB_HASHVERSION) ||
- (dbp->type == DB_QUEUE && meta->version != DB_QAMVERSION)) {
- ret = DB_VERIFY_BAD;
- EPRINT((dbp->dbenv, "%s%s", "Old or incorrect DB ",
- "version; extraneous errors may result"));
+ if ((dbp->type == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbp->type == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbp->type == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
+ isbad = 1;
+ EPRINT((dbenv,
+ "Page %lu: unsupported DB version %lu; extraneous errors may result",
+ (u_long)PGNO_BASE_MD, (u_long)meta->version));
}
/*
@@ -495,9 +529,9 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
if (IS_VALID_PAGESIZE(meta->pagesize))
dbp->pgsize = meta->pagesize;
else {
- EPRINT((dbp->dbenv,
- "Bad page size: %lu", (u_long)meta->pagesize));
- ret = DB_VERIFY_BAD;
+ isbad = 1;
+ EPRINT((dbenv, "Page %lu: bad page size %lu",
+ (u_long)PGNO_BASE_MD, (u_long)meta->pagesize));
/*
* Now try to settle on a pagesize to use.
@@ -516,8 +550,9 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) ||
(dbp->type == DB_HASH && meta->type != P_HASHMETA) ||
(dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) {
- ret = DB_VERIFY_BAD;
- EPRINT((dbp->dbenv, "Bad page type: %lu", (u_long)meta->type));
+ isbad = 1;
+ EPRINT((dbenv, "Page %lu: bad page type %lu",
+ (u_long)PGNO_BASE_MD, (u_long)meta->type));
}
/*
@@ -547,21 +582,16 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags)
pip->free = freelist;
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
return (ret);
/* Set up the dbp's fileid. We don't use the regular open path. */
memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
- if (0) {
-err: __db_err(dbenv, "%s", db_strerror(ret));
- }
-
if (swapped == 1)
F_SET(dbp, DB_AM_SWAP);
- if (t_ret != 0)
- ret = t_ret;
- return (ret);
+
+ return (isbad ? DB_VERIFY_BAD : 0);
}
/*
@@ -578,12 +608,14 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
u_int32_t flags;
{
DB_ENV *dbenv;
+ DB_MPOOLFILE *mpf;
PAGE *h;
db_pgno_t i;
int ret, t_ret, isbad;
- ret = isbad = t_ret = 0;
dbenv = dbp->dbenv;
+ mpf = dbp->mpf;
+ ret = isbad = t_ret = 0;
if ((ret = __db_fchk(dbenv,
"__db_vrfy_walkpages", flags, OKFLAGS)) != 0)
@@ -598,11 +630,17 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
continue;
- /* If an individual page get fails, keep going. */
- if ((t_ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0) {
+ /*
+ * If an individual page get fails, keep going if and only
+ * if we're salvaging.
+ */
+ if ((t_ret = mpf->get(mpf, &i, 0, &h)) != 0) {
if (ret == 0)
ret = t_ret;
- continue;
+ if (LF_ISSET(DB_SALVAGE))
+ continue;
+ else
+ return (ret);
}
if (LF_ISSET(DB_SALVAGE)) {
@@ -619,63 +657,75 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
}
} else {
/*
+ * If we are not salvaging, and we get any error
+ * other than DB_VERIFY_BAD, return immediately;
+ * it may not be safe to proceed. If we get
+ * DB_VERIFY_BAD, keep going; listing more errors
+ * may make it easier to diagnose problems and
+ * determine the magnitude of the corruption.
+ */
+
+ /*
* Verify info common to all page
* types.
*/
- if (i != PGNO_BASE_MD)
- if ((t_ret = __db_vrfy_common(dbp,
- vdp, h, i, flags)) == DB_VERIFY_BAD)
+ if (i != PGNO_BASE_MD) {
+ ret = __db_vrfy_common(dbp, vdp, h, i, flags);
+ if (ret == DB_VERIFY_BAD)
isbad = 1;
+ else if (ret != 0)
+ goto err;
+ }
switch (TYPE(h)) {
case P_INVALID:
- t_ret = __db_vrfy_invalid(dbp,
- vdp, h, i, flags);
+ ret = __db_vrfy_invalid(dbp, vdp, h, i, flags);
break;
case __P_DUPLICATE:
isbad = 1;
- EPRINT((dbp->dbenv,
- "Old-style duplicate page: %lu",
+ EPRINT((dbenv,
+ "Page %lu: old-style duplicate page",
(u_long)i));
break;
case P_HASH:
- t_ret = __ham_vrfy(dbp,
+ ret = __ham_vrfy(dbp,
vdp, h, i, flags);
break;
case P_IBTREE:
case P_IRECNO:
case P_LBTREE:
case P_LDUP:
- t_ret = __bam_vrfy(dbp,
+ ret = __bam_vrfy(dbp,
vdp, h, i, flags);
break;
case P_LRECNO:
- t_ret = __ram_vrfy_leaf(dbp,
+ ret = __ram_vrfy_leaf(dbp,
vdp, h, i, flags);
break;
case P_OVERFLOW:
- t_ret = __db_vrfy_overflow(dbp,
+ ret = __db_vrfy_overflow(dbp,
vdp, h, i, flags);
break;
case P_HASHMETA:
- t_ret = __ham_vrfy_meta(dbp,
+ ret = __ham_vrfy_meta(dbp,
vdp, (HMETA *)h, i, flags);
break;
case P_BTREEMETA:
- t_ret = __bam_vrfy_meta(dbp,
+ ret = __bam_vrfy_meta(dbp,
vdp, (BTMETA *)h, i, flags);
break;
case P_QAMMETA:
- t_ret = __qam_vrfy_meta(dbp,
+ ret = __qam_vrfy_meta(dbp,
vdp, (QMETA *)h, i, flags);
break;
case P_QAMDATA:
- t_ret = __qam_vrfy_data(dbp,
+ ret = __qam_vrfy_data(dbp,
vdp, (QPAGE *)h, i, flags);
break;
default:
- EPRINT((dbp->dbenv,
- "Unknown page type: %lu", (u_long)TYPE(h)));
+ EPRINT((dbenv,
+ "Page %lu: unknown page type %lu",
+ (u_long)i, (u_long)TYPE(h)));
isbad = 1;
break;
}
@@ -683,12 +733,10 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
/*
* Set up error return.
*/
- if (t_ret == DB_VERIFY_BAD)
+ if (ret == DB_VERIFY_BAD)
isbad = 1;
- else if (t_ret == DB_VERIFY_FATAL)
+ else if (ret != 0)
goto err;
- else
- ret = t_ret;
/*
* Provide feedback to the application about our
@@ -701,14 +749,21 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
(i + 1) * 50 / (vdp->last_pgno + 1));
}
- if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
- ret = t_ret;
+ /*
+ * Just as with the page get, bail if and only if we're
+ * not salvaging.
+ */
+ if ((t_ret = mpf->put(mpf, h, 0)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ if (!LF_ISSET(DB_SALVAGE))
+ return (ret);
+ }
}
if (0) {
-err: if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0)
+err: if ((t_ret = mpf->put(mpf, h, 0)) != 0)
return (ret == 0 ? t_ret : ret);
- return (DB_VERIFY_BAD);
}
return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
@@ -786,8 +841,8 @@ __db_vrfy_structure(dbp, vdp, dbname, meta_pgno, flags)
*/
if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0)
goto err;
- hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS);
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0;
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
goto err;
if (isbad == 0 && hassubs)
@@ -855,23 +910,23 @@ __db_vrfy_structure(dbp, vdp, dbname, meta_pgno, flags)
if ((ret = __db_vrfy_pgset_get(pgset, i, &p)) != 0)
goto err;
if (p == 0) {
- EPRINT((dbp->dbenv,
- "Unreferenced page %lu", (u_long)i));
+ EPRINT((dbenv,
+ "Page %lu: unreferenced page", (u_long)i));
isbad = 1;
}
if (F_ISSET(pip, VRFY_IS_ALLZEROES)) {
- EPRINT((dbp->dbenv,
- "Totally zeroed page %lu", (u_long)i));
+ EPRINT((dbenv,
+ "Page %lu: totally zeroed page", (u_long)i));
isbad = 1;
}
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
goto err;
pip = NULL;
}
err: if (pip != NULL)
- (void)__db_vrfy_putpageinfo(vdp, pip);
+ (void)__db_vrfy_putpageinfo(dbenv, vdp, pip);
return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
}
@@ -936,10 +991,13 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
int ret, t_ret;
u_int8_t *p;
+ dbenv = dbp->dbenv;
+
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
@@ -957,8 +1015,8 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags)
if (pgno != 0 && PGNO(h) == 0) {
for (p = (u_int8_t *)h; p < (u_int8_t *)h + dbp->pgsize; p++)
if (*p != 0) {
- EPRINT((dbp->dbenv,
- "Page %lu should be zeroed and is not",
+ EPRINT((dbenv,
+ "Page %lu: partially zeroed page",
(u_long)pgno));
ret = DB_VERIFY_BAD;
goto err;
@@ -976,19 +1034,19 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags)
}
if (PGNO(h) != pgno) {
- EPRINT((dbp->dbenv,
- "Bad page number: %lu should be %lu",
- (u_long)h->pgno, (u_long)pgno));
+ EPRINT((dbenv, "Page %lu: bad page number %lu",
+ (u_long)pgno, (u_long)h->pgno));
ret = DB_VERIFY_BAD;
}
if (!__db_is_valid_pagetype(h->type)) {
- EPRINT((dbp->dbenv, "Bad page type: %lu", (u_long)h->type));
+ EPRINT((dbenv, "Page %lu: bad page type %lu",
+ (u_long)pgno, (u_long)h->type));
ret = DB_VERIFY_BAD;
}
pip->type = h->type;
-err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
return (ret);
@@ -1007,22 +1065,24 @@ __db_vrfy_invalid(dbp, vdp, h, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
int ret, t_ret;
+ dbenv = dbp->dbenv;
+
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
pip->next_pgno = pip->prev_pgno = 0;
if (!IS_VALID_PGNO(NEXT_PGNO(h))) {
- EPRINT((dbp->dbenv,
- "Invalid next_pgno %lu on page %lu",
- (u_long)NEXT_PGNO(h), (u_long)pgno));
+ EPRINT((dbenv, "Page %lu: invalid next_pgno %lu",
+ (u_long)pgno, (u_long)NEXT_PGNO(h)));
ret = DB_VERIFY_BAD;
} else
pip->next_pgno = NEXT_PGNO(h);
- if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
@@ -1048,9 +1108,12 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
int isbad, ret, t_ret;
+ dbenv = dbp->dbenv;
+
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
isbad = 0;
@@ -1066,12 +1129,12 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) {
isbad = 1;
- EPRINT((dbp->dbenv, "Page %lu: Invalid prev_pgno %lu",
+ EPRINT((dbenv, "Page %lu: invalid prev_pgno %lu",
(u_long)pip->pgno, (u_long)PREV_PGNO(h)));
}
if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) {
isbad = 1;
- EPRINT((dbp->dbenv, "Page %lu: Invalid next_pgno %lu",
+ EPRINT((dbenv, "Page %lu: invalid next_pgno %lu",
(u_long)pip->pgno, (u_long)NEXT_PGNO(h)));
}
pip->prev_pgno = PREV_PGNO(h);
@@ -1089,8 +1152,7 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
if (TYPE(h) != P_OVERFLOW) {
if (BKEYDATA_PSIZE(0) * NUM_ENT(h) > dbp->pgsize) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Page %lu: Too many entries: %lu",
+ EPRINT((dbenv, "Page %lu: too many entries: %lu",
(u_long)pgno, (u_long)NUM_ENT(h)));
}
pip->entries = NUM_ENT(h);
@@ -1106,8 +1168,8 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
case P_IRECNO:
if (LEVEL(h) < LEAFLEVEL + 1 || LEVEL(h) > MAXBTREELEVEL) {
isbad = 1;
- EPRINT((dbp->dbenv, "Bad btree level %lu on page %lu",
- (u_long)LEVEL(h), (u_long)pgno));
+ EPRINT((dbenv, "Page %lu: bad btree level %lu",
+ (u_long)pgno, (u_long)LEVEL(h)));
}
pip->bt_level = LEVEL(h);
break;
@@ -1116,17 +1178,17 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
case P_LRECNO:
if (LEVEL(h) != LEAFLEVEL) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Btree leaf page %lu has incorrect level %lu",
+ EPRINT((dbenv,
+ "Page %lu: btree leaf page has incorrect level %lu",
(u_long)pgno, (u_long)LEVEL(h)));
}
break;
default:
if (LEVEL(h) != 0) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Nonzero level %lu in non-btree database page %lu",
- (u_long)LEVEL(h), (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: nonzero level %lu in non-btree database",
+ (u_long)pgno, (u_long)LEVEL(h)));
}
break;
}
@@ -1139,7 +1201,7 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags)
* by offset and length--cover the right part of the page
* without overlaps, gaps, or violations of the page boundary.
*/
- if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
@@ -1161,11 +1223,14 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ DB_ENV *dbenv;
DBTYPE dbtype, magtype;
VRFY_PAGEINFO *pip;
int isbad, ret, t_ret;
isbad = 0;
+ dbenv = dbp->dbenv;
+
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
return (ret);
@@ -1190,31 +1255,37 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
/* magic number valid */
if (!__db_is_valid_magicno(meta->magic, &magtype)) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Magic number invalid on page %lu", (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: invalid magic number", (u_long)pgno));
}
if (magtype != dbtype) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Magic number does not match type of page %lu",
+ EPRINT((dbenv,
+ "Page %lu: magic number does not match database type",
(u_long)pgno));
}
/* version */
- if ((dbtype == DB_BTREE && meta->version != DB_BTREEVERSION) ||
- (dbtype == DB_HASH && meta->version != DB_HASHVERSION) ||
- (dbtype == DB_QUEUE && meta->version != DB_QAMVERSION)) {
+ if ((dbtype == DB_BTREE &&
+ (meta->version > DB_BTREEVERSION ||
+ meta->version < DB_BTREEOLDVER)) ||
+ (dbtype == DB_HASH &&
+ (meta->version > DB_HASHVERSION ||
+ meta->version < DB_HASHOLDVER)) ||
+ (dbtype == DB_QUEUE &&
+ (meta->version > DB_QAMVERSION ||
+ meta->version < DB_QAMOLDVER))) {
isbad = 1;
- EPRINT((dbp->dbenv, "%s%s", "Old of incorrect DB ",
- "version; extraneous errors may result"));
+ EPRINT((dbenv,
+ "Page %lu: unsupported database version %lu; extraneous errors may result",
+ (u_long)pgno, (u_long)meta->version));
}
/* pagesize */
if (meta->pagesize != dbp->pgsize) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Invalid pagesize %lu on page %lu",
- (u_long)meta->pagesize, (u_long)pgno));
+ EPRINT((dbenv, "Page %lu: invalid pagesize %lu",
+ (u_long)pgno, (u_long)meta->pagesize));
}
/* free list */
@@ -1224,9 +1295,9 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
*/
if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Nonempty free list on subdatabase metadata page %lu",
- pgno));
+ EPRINT((dbenv,
+ "Page %lu: nonempty free list on subdatabase metadata page",
+ (u_long)pgno));
}
/* Can correctly be PGNO_INVALID--that's just the end of the list. */
@@ -1234,9 +1305,9 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
pip->free = meta->free;
else if (!IS_VALID_PGNO(meta->free)) {
isbad = 1;
- EPRINT((dbp->dbenv,
- "Nonsensical free list pgno %lu on page %lu",
- (u_long)meta->free, (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: nonsensical free list pgno %lu",
+ (u_long)pgno, (u_long)meta->free));
}
/*
@@ -1245,7 +1316,7 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags)
*/
F_CLR(pip, VRFY_INCOMPLETE);
-err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0)
ret = t_ret;
return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
@@ -1264,51 +1335,56 @@ __db_vrfy_freelist(dbp, vdp, meta, flags)
u_int32_t flags;
{
DB *pgset;
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
- db_pgno_t pgno;
+ db_pgno_t cur_pgno, next_pgno;
int p, ret, t_ret;
pgset = vdp->pgset;
DB_ASSERT(pgset != NULL);
+ dbenv = dbp->dbenv;
if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0)
return (ret);
- for (pgno = pip->free; pgno != PGNO_INVALID; pgno = pip->next_pgno) {
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ for (next_pgno = pip->free;
+ next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) {
+ cur_pgno = pip->pgno;
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
return (ret);
/* This shouldn't happen, but just in case. */
- if (!IS_VALID_PGNO(pgno)) {
- EPRINT((dbp->dbenv,
- "Invalid next_pgno on free list page %lu",
- (u_long)pgno));
+ if (!IS_VALID_PGNO(next_pgno)) {
+ EPRINT((dbenv,
+ "Page %lu: invalid next_pgno %lu on free list page",
+ (u_long)cur_pgno, (u_long)next_pgno));
return (DB_VERIFY_BAD);
}
/* Detect cycles. */
- if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0)
+ if ((ret = __db_vrfy_pgset_get(pgset, next_pgno, &p)) != 0)
return (ret);
if (p != 0) {
- EPRINT((dbp->dbenv,
- "Page %lu encountered a second time on free list",
- (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: page %lu encountered a second time on free list",
+ (u_long)cur_pgno, (u_long)next_pgno));
return (DB_VERIFY_BAD);
}
- if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0)
+ if ((ret = __db_vrfy_pgset_inc(pgset, next_pgno)) != 0)
return (ret);
- if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
return (ret);
if (pip->type != P_INVALID) {
- EPRINT((dbp->dbenv,
- "Non-invalid page %lu on free list", (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: non-invalid page %lu on free list",
+ (u_long)cur_pgno, (u_long)next_pgno));
ret = DB_VERIFY_BAD; /* unsafe to continue */
break;
}
}
- if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
ret = t_ret;
return (ret);
}
@@ -1328,6 +1404,7 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags)
DB *mdbp;
DBC *dbc;
DBT key, data;
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
db_pgno_t meta_pgno;
int ret, t_ret, isbad;
@@ -1335,19 +1412,22 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags)
isbad = 0;
dbc = NULL;
+ dbenv = dbp->dbenv;
- if ((ret = __db_master_open(dbp, dbname, DB_RDONLY, 0, &mdbp)) != 0)
+ if ((ret =
+ __db_master_open(dbp, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0)
return (ret);
- if ((ret =
- __db_icursor(mdbp, NULL, DB_BTREE, PGNO_INVALID, 0, &dbc)) != 0)
+ if ((ret = __db_icursor(mdbp,
+ NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
goto err;
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
while ((ret = dbc->c_get(dbc, &key, &data, DB_NEXT)) == 0) {
if (data.size != sizeof(db_pgno_t)) {
- EPRINT((dbp->dbenv, "Database entry of invalid size"));
+ EPRINT((dbenv,
+ "Subdatabase entry not page-number size"));
isbad = 1;
goto err;
}
@@ -1358,8 +1438,8 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags)
*/
DB_NTOHL(&meta_pgno);
if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) {
- EPRINT((dbp->dbenv,
- "Database entry references invalid page %lu",
+ EPRINT((dbenv,
+ "Subdatabase entry references invalid page %lu",
(u_long)meta_pgno));
isbad = 1;
goto err;
@@ -1367,7 +1447,7 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags)
if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
goto err;
type = pip->type;
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
goto err;
switch (type) {
case P_BTREEMETA:
@@ -1390,8 +1470,8 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags)
break;
case P_QAMMETA:
default:
- EPRINT((dbp->dbenv,
- "Database entry references page %lu of invalid type %lu",
+ EPRINT((dbenv,
+ "Subdatabase entry references page %lu of invalid type %lu",
(u_long)meta_pgno, (u_long)type));
ret = DB_VERIFY_BAD;
goto err;
@@ -1416,9 +1496,9 @@ err: if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
* Provide feedback during top-down database structure traversal.
* (See comment at the beginning of __db_vrfy_structure.)
*
- * PUBLIC: int __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+ * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
*/
-int
+void
__db_vrfy_struct_feedback(dbp, vdp)
DB *dbp;
VRFY_DBINFO *vdp;
@@ -1426,7 +1506,7 @@ __db_vrfy_struct_feedback(dbp, vdp)
int progress;
if (dbp->db_feedback == NULL)
- return (0);
+ return;
if (vdp->pgs_remaining > 0)
vdp->pgs_remaining--;
@@ -1434,8 +1514,6 @@ __db_vrfy_struct_feedback(dbp, vdp)
/* Don't allow a feedback call of 100 until we're really done. */
progress = 100 - (vdp->pgs_remaining * 50 / (vdp->last_pgno + 1));
dbp->db_feedback(dbp, DB_VERIFY, progress == 100 ? 99 : progress);
-
- return (0);
}
/*
@@ -1453,6 +1531,8 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
DB *mdbp, *pgset;
DBC *pgsc;
DBT key, data;
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *mpf;
HASH *h_internal;
HMETA *hmeta;
PAGE *h, *currpg;
@@ -1460,36 +1540,45 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
u_int32_t bucket;
int t_ret, ret;
- currpg = h = NULL;
- pgsc = NULL;
pgset = NULL;
+ pgsc = NULL;
+ dbenv = dbp->dbenv;
+ mpf = dbp->mpf;
+ currpg = h = NULL;
LF_CLR(DB_NOORDERCHK);
/* Open the master database and get the meta_pgno for the subdb. */
if ((ret = db_create(&mdbp, NULL, 0)) != 0)
return (ret);
- if ((ret = __db_master_open(dbp, name, DB_RDONLY, 0, &mdbp)) != 0)
+ if ((ret = __db_master_open(dbp, NULL, name, DB_RDONLY, 0, &mdbp)) != 0)
goto err;
memset(&key, 0, sizeof(key));
key.data = (void *)subdb;
+ key.size = (u_int32_t)strlen(subdb);
memset(&data, 0, sizeof(data));
- if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) != 0)
+ if ((ret = mdbp->get(mdbp, NULL, &key, &data, 0)) != 0)
goto err;
if (data.size != sizeof(db_pgno_t)) {
- EPRINT((dbp->dbenv, "Database entry of invalid size"));
+ EPRINT((dbenv, "Subdatabase entry of invalid size"));
ret = DB_VERIFY_BAD;
goto err;
}
memcpy(&meta_pgno, data.data, data.size);
- if ((ret = memp_fget(dbp->mpf, &meta_pgno, 0, &h)) != 0)
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL(&meta_pgno);
+
+ if ((ret = mpf->get(mpf, &meta_pgno, 0, &h)) != 0)
goto err;
- if ((ret = __db_vrfy_pgset(dbp->dbenv, dbp->pgsize, &pgset)) != 0)
+ if ((ret = __db_vrfy_pgset(dbenv, dbp->pgsize, &pgset)) != 0)
goto err;
switch (TYPE(h)) {
@@ -1506,18 +1595,24 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0)
goto err;
while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
- if ((ret = memp_fget(dbp->mpf, &p, 0, &currpg)) != 0)
+ if ((ret = mpf->get(mpf, &p, 0, &currpg)) != 0)
goto err;
if ((ret = __bam_vrfy_itemorder(dbp,
NULL, currpg, p, NUM_ENT(currpg), 1,
F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0)
goto err;
- if ((ret = memp_fput(dbp->mpf, currpg, 0)) != 0)
+ if ((ret = mpf->put(mpf, currpg, 0)) != 0)
goto err;
currpg = NULL;
}
- if ((ret = pgsc->c_close(pgsc)) != 0)
- goto err;
+
+ /*
+ * The normal exit condition for the loop above is DB_NOTFOUND.
+ * If we see that, zero it and continue on to cleanup.
+ * Otherwise, it's a real error and will be returned.
+ */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
break;
case P_HASHMETA:
hmeta = (HMETA *)h;
@@ -1525,16 +1620,21 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
/*
* Make sure h_charkey is right.
*/
- if (h_internal == NULL || h_internal->h_hash == NULL) {
- EPRINT((dbp->dbenv,
- "DB_ORDERCHKONLY requires that a hash function be set"));
+ if (h_internal == NULL) {
+ EPRINT((dbenv,
+ "Page %lu: DB->h_internal field is NULL",
+ (u_long)meta_pgno));
ret = DB_VERIFY_BAD;
goto err;
}
+ if (h_internal->h_hash == NULL)
+ h_internal->h_hash = hmeta->dbmeta.version < 5
+ ? __ham_func4 : __ham_func5;
if (hmeta->h_charkey !=
h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) {
- EPRINT((dbp->dbenv,
- "Incorrect hash function for database"));
+ EPRINT((dbenv,
+ "Page %lu: incorrect hash function for database",
+ (u_long)meta_pgno));
ret = DB_VERIFY_BAD;
goto err;
}
@@ -1546,34 +1646,35 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
pgno = BS_TO_PAGE(bucket, hmeta->spares);
while (pgno != PGNO_INVALID) {
- if ((ret = memp_fget(dbp->mpf,
+ if ((ret = mpf->get(mpf,
&pgno, 0, &currpg)) != 0)
goto err;
if ((ret = __ham_vrfy_hashing(dbp,
- NUM_ENT(currpg),hmeta, bucket, pgno,
+ NUM_ENT(currpg), hmeta, bucket, pgno,
flags, h_internal->h_hash)) != 0)
goto err;
pgno = NEXT_PGNO(currpg);
- if ((ret = memp_fput(dbp->mpf, currpg, 0)) != 0)
+ if ((ret = mpf->put(mpf, currpg, 0)) != 0)
goto err;
currpg = NULL;
}
}
break;
default:
- EPRINT((dbp->dbenv, "Database meta page %lu of bad type %lu",
+ EPRINT((dbenv, "Page %lu: database metapage of bad type %lu",
(u_long)meta_pgno, (u_long)TYPE(h)));
ret = DB_VERIFY_BAD;
break;
}
-err: if (pgsc != NULL)
- (void)pgsc->c_close(pgsc);
- if (pgset != NULL)
- (void)pgset->close(pgset, 0);
- if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0)
+err: if (pgsc != NULL && (t_ret = pgsc->c_close(pgsc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pgset != NULL &&
+ (t_ret = pgset->close(pgset, 0)) != 0 && ret == 0)
ret = t_ret;
- if (currpg != NULL && (t_ret = memp_fput(dbp->mpf, currpg, 0)) != 0)
+ if (h != NULL && (t_ret = mpf->put(mpf, h, 0)) != 0)
+ ret = t_ret;
+ if (currpg != NULL && (t_ret = mpf->put(mpf, currpg, 0)) != 0)
ret = t_ret;
if ((t_ret = mdbp->close(mdbp, 0)) != 0)
ret = t_ret;
@@ -1584,11 +1685,8 @@ err: if (pgsc != NULL)
* __db_salvage --
* Walk through a page, salvaging all likely or plausible (w/
* DB_AGGRESSIVE) key/data pairs.
- *
- * PUBLIC: int __db_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
- * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
*/
-int
+static int
__db_salvage(dbp, vdp, pgno, h, handle, callback, flags)
DB *dbp;
VRFY_DBINFO *vdp;
@@ -1659,24 +1757,29 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags)
u_int32_t flags;
{
DBT unkdbt, key, *dbt;
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *mpf;
PAGE *h;
db_pgno_t pgno;
u_int32_t pgtype;
int ret, err_ret;
void *ovflbuf;
+ dbenv = dbp->dbenv;
+ mpf = dbp->mpf;
+
memset(&unkdbt, 0, sizeof(DBT));
- unkdbt.size = strlen("UNKNOWN") + 1;
+ unkdbt.size = (u_int32_t)strlen("UNKNOWN") + 1;
unkdbt.data = "UNKNOWN";
- if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, 0, &ovflbuf)) != 0)
+ if ((ret = __os_malloc(dbenv, dbp->pgsize, &ovflbuf)) != 0)
return (ret);
err_ret = 0;
while ((ret = __db_salvage_getnext(vdp, &pgno, &pgtype)) == 0) {
dbt = NULL;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) {
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) {
err_ret = ret;
continue;
}
@@ -1699,17 +1802,11 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags)
* a database with no dups. What to do?
*/
if ((ret = __db_safe_goff(dbp,
- vdp, pgno, &key, &ovflbuf, flags)) != 0) {
- err_ret = ret;
- continue;
- }
- if ((ret = __db_prdbt(&key,
- 0, " ", handle, callback, 0, NULL)) != 0) {
- err_ret = ret;
- continue;
- }
- if ((ret = __db_prdbt(&unkdbt,
- 0, " ", handle, callback, 0, NULL)) != 0)
+ vdp, pgno, &key, &ovflbuf, flags)) != 0 ||
+ (ret = __db_prdbt(&key,
+ 0, " ", handle, callback, 0, vdp)) != 0 ||
+ (ret = __db_prdbt(&unkdbt,
+ 0, " ", handle, callback, 0, vdp)) != 0)
err_ret = ret;
break;
case SALVAGE_HASH:
@@ -1727,11 +1824,11 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags)
DB_ASSERT(0);
break;
}
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ if ((ret = mpf->put(mpf, h, 0)) != 0)
err_ret = ret;
}
- __os_free(ovflbuf, 0);
+ __os_free(dbenv, ovflbuf);
if (err_ret != 0 && ret == 0)
ret = err_ret;
@@ -1743,8 +1840,8 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags)
* Offset of the ith inp array entry, which we can compare to the offset
* the entry stores.
*/
-#define INP_OFFSET(h, i) \
- ((db_indx_t)((u_int8_t *)(h)->inp + (i) - (u_int8_t *)(h)))
+#define INP_OFFSET(dbp, h, i) \
+ ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))
/*
* __db_vrfy_inpitem --
@@ -1770,33 +1867,35 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
u_int32_t flags, *himarkp, *offsetp;
{
BKEYDATA *bk;
- db_indx_t offset, len;
+ DB_ENV *dbenv;
+ db_indx_t *inp, offset, len;
+
+ dbenv = dbp->dbenv;
DB_ASSERT(himarkp != NULL);
+ inp = P_INP(dbp, h);
/*
* Check that the inp array, which grows from the beginning of the
* page forward, has not collided with the data, which grow from the
* end of the page backward.
*/
- if (h->inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
+ if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
/* We've collided with the data. We need to bail. */
- EPRINT((dbp->dbenv,
- "Page %lu entries listing %lu overlaps data",
+ EPRINT((dbenv, "Page %lu: entries listing %lu overlaps data",
(u_long)pgno, (u_long)i));
return (DB_VERIFY_FATAL);
}
- offset = h->inp[i];
+ offset = inp[i];
/*
* Check that the item offset is reasonable: it points somewhere
* after the inp array and before the end of the page.
*/
- if (offset <= INP_OFFSET(h, i) || offset > dbp->pgsize) {
- EPRINT((dbp->dbenv,
- "Bad offset %lu at page %lu index %lu",
- (u_long)offset, (u_long)pgno, (u_long)i));
+ if (offset <= INP_OFFSET(dbp, h, i) || offset > dbp->pgsize) {
+ EPRINT((dbenv, "Page %lu: bad offset %lu at page index %lu",
+ (u_long)pgno, (u_long)offset, (u_long)i));
return (DB_VERIFY_BAD);
}
@@ -1808,7 +1907,7 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
/*
* Check that the item length remains on-page.
*/
- bk = GET_BKEYDATA(h, i);
+ bk = GET_BKEYDATA(dbp, h, i);
/*
* We need to verify the type of the item here;
@@ -1826,16 +1925,16 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
len = BOVERFLOW_SIZE;
break;
default:
- EPRINT((dbp->dbenv,
- "Item %lu on page %lu of unrecognizable type",
- i, pgno));
+ EPRINT((dbenv,
+ "Page %lu: item %lu of unrecognizable type",
+ (u_long)pgno, (u_long)i));
return (DB_VERIFY_BAD);
}
if ((size_t)(offset + len) > dbp->pgsize) {
- EPRINT((dbp->dbenv,
- "Item %lu on page %lu extends past page boundary",
- (u_long)i, (u_long)pgno));
+ EPRINT((dbenv,
+ "Page %lu: item %lu extends past page boundary",
+ (u_long)pgno, (u_long)i));
return (DB_VERIFY_BAD);
}
}
@@ -1861,9 +1960,11 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags)
db_pgno_t pgno;
u_int32_t flags;
{
+ DB_ENV *dbenv;
VRFY_PAGEINFO *pip;
int ret, isbad;
+ dbenv = dbp->dbenv;
isbad = 0;
if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
@@ -1873,8 +1974,8 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags)
case P_IBTREE:
case P_LDUP:
if (!LF_ISSET(ST_DUPSORT)) {
- EPRINT((dbp->dbenv,
- "Sorted duplicate set at page %lu in unsorted-dup database",
+ EPRINT((dbenv,
+ "Page %lu: sorted duplicate set in unsorted-dup database",
(u_long)pgno));
isbad = 1;
}
@@ -1882,21 +1983,29 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags)
case P_IRECNO:
case P_LRECNO:
if (LF_ISSET(ST_DUPSORT)) {
- EPRINT((dbp->dbenv,
- "Unsorted duplicate set at page %lu in sorted-dup database",
+ EPRINT((dbenv,
+ "Page %lu: unsorted duplicate set in sorted-dup database",
(u_long)pgno));
isbad = 1;
}
break;
default:
- EPRINT((dbp->dbenv,
- "Duplicate page %lu of inappropriate type %lu",
- (u_long)pgno, (u_long)pip->type));
+ /*
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(dbenv, pgno, "duplicate page");
+ else
+ EPRINT((dbenv,
+ "Page %lu: duplicate page of inappropriate type %lu",
+ (u_long)pgno, (u_long)pip->type));
isbad = 1;
break;
}
- if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0)
+ if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0)
return (ret);
return (isbad == 1 ? DB_VERIFY_BAD : 0);
}
@@ -1934,14 +2043,17 @@ __db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
int (*callback) __P((void *, const void *));
u_int32_t flags;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
int ret, t_ret;
+ mpf = dbp->mpf;
+
if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
return (DB_VERIFY_BAD);
/* We have a plausible page. Try it. */
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
return (ret);
switch (TYPE(h)) {
@@ -1972,7 +2084,7 @@ __db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
/* NOTREACHED */
}
-err: if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+err: if ((t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
@@ -1994,16 +2106,18 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp)
BTMETA *btmeta;
DB *pgset;
DBC *pgsc;
+ DB_MPOOLFILE *mpf;
PAGE *h;
db_pgno_t p, meta_pgno;
int ret, err_ret;
- err_ret = 0;
- pgsc = NULL;
pgset = NULL;
+ pgsc = NULL;
+ mpf = dbp->mpf;
+ err_ret = 0;
meta_pgno = PGNO_BASE_MD;
- if ((ret = memp_fget(dbp->mpf, &meta_pgno, 0, &h)) != 0)
+ if ((ret = mpf->get(mpf, &meta_pgno, 0, &h)) != 0)
return (ret);
if (TYPE(h) == P_BTREEMETA)
@@ -2028,7 +2142,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp)
/* We think we've got subdbs. Mark it so. */
*hassubsp = 1;
- if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ if ((ret = mpf->put(mpf, h, 0)) != 0)
return (ret);
/*
@@ -2048,7 +2162,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp)
if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0)
goto err;
while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
- if ((ret = memp_fget(dbp->mpf, &p, 0, &h)) != 0) {
+ if ((ret = mpf->get(mpf, &p, 0, &h)) != 0) {
err_ret = ret;
continue;
}
@@ -2061,7 +2175,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp)
else if ((ret = __db_salvage_subdbpg(
dbp, vdp, h, handle, callback, flags)) != 0)
err_ret = ret;
-nextpg: if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+nextpg: if ((ret = mpf->put(mpf, h, 0)) != 0)
err_ret = ret;
}
@@ -2079,7 +2193,7 @@ err: if (pgsc != NULL)
(void)pgsc->c_close(pgsc);
if (pgset != NULL)
(void)pgset->close(pgset, 0);
- (void)memp_fput(dbp->mpf, h, 0);
+ (void)mpf->put(mpf, h, 0);
return (ret);
}
@@ -2087,12 +2201,8 @@ err: if (pgsc != NULL)
* __db_salvage_subdbpg --
* Given a known-good leaf page in the master database, salvage all
* leaf pages corresponding to each subdb.
- *
- * PUBLIC: int __db_salvage_subdbpg
- * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, void *,
- * PUBLIC: int (*)(void *, const void *), u_int32_t));
*/
-int
+static int
__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
DB *dbp;
VRFY_DBINFO *vdp;
@@ -2106,16 +2216,20 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
DB *pgset;
DBC *pgsc;
DBT key;
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *mpf;
PAGE *subpg;
db_indx_t i;
db_pgno_t meta_pgno, p;
int ret, err_ret, t_ret;
char *subdbname;
+ dbenv = dbp->dbenv;
+ mpf = dbp->mpf;
ret = err_ret = 0;
subdbname = NULL;
- if ((ret = __db_vrfy_pgset(dbp->dbenv, dbp->pgsize, &pgset)) != 0)
+ if ((ret = __db_vrfy_pgset(dbenv, dbp->pgsize, &pgset)) != 0)
return (ret);
/*
@@ -2123,8 +2237,8 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
* corresponding to that entry.
*/
for (i = 0; i < NUM_ENT(master); i += P_INDX) {
- bkkey = GET_BKEYDATA(master, i);
- bkdata = GET_BKEYDATA(master, i + O_INDX);
+ bkkey = GET_BKEYDATA(dbp, master, i);
+ bkdata = GET_BKEYDATA(dbp, master, i + O_INDX);
/* Get the subdatabase name. */
if (B_TYPE(bkkey->type) == B_OVERFLOW) {
@@ -2140,13 +2254,13 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
}
/* Nul-terminate it. */
- if ((ret = __os_realloc(dbp->dbenv,
- key.size + 1, NULL, &subdbname)) != 0)
+ if ((ret = __os_realloc(dbenv,
+ key.size + 1, &subdbname)) != 0)
goto err;
subdbname[key.size] = '\0';
} else if (B_TYPE(bkkey->type == B_KEYDATA)) {
- if ((ret = __os_realloc(dbp->dbenv,
- bkkey->len + 1, NULL, &subdbname)) != 0)
+ if ((ret = __os_realloc(dbenv,
+ bkkey->len + 1, &subdbname)) != 0)
goto err;
memcpy(subdbname, bkkey->data, bkkey->len);
subdbname[bkkey->len] = '\0';
@@ -2159,9 +2273,15 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
}
memcpy(&meta_pgno, bkdata->data, sizeof(db_pgno_t));
+ /*
+ * Subdatabase meta pgnos are stored in network byte
+ * order for cross-endian compatibility. Swap if appropriate.
+ */
+ DB_NTOHL(&meta_pgno);
+
/* If we can't get the subdb meta page, just skip the subdb. */
if (!IS_VALID_PGNO(meta_pgno) ||
- (ret = memp_fget(dbp->mpf, &meta_pgno, 0, &subpg)) != 0) {
+ (ret = mpf->get(mpf, &meta_pgno, 0, &subpg)) != 0) {
err_ret = ret;
continue;
}
@@ -2177,7 +2297,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
if ((ret =
__db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) {
err_ret = ret;
- (void)memp_fput(dbp->mpf, subpg, 0);
+ (void)mpf->put(mpf, subpg, 0);
continue;
}
switch (TYPE(subpg)) {
@@ -2185,7 +2305,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
if ((ret = __bam_vrfy_meta(dbp,
vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) {
err_ret = ret;
- (void)memp_fput(dbp->mpf, subpg, 0);
+ (void)mpf->put(mpf, subpg, 0);
continue;
}
break;
@@ -2193,7 +2313,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
if ((ret = __ham_vrfy_meta(dbp,
vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) {
err_ret = ret;
- (void)memp_fput(dbp->mpf, subpg, 0);
+ (void)mpf->put(mpf, subpg, 0);
continue;
}
break;
@@ -2204,7 +2324,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
/* NOTREACHED */
}
- if ((ret = memp_fput(dbp->mpf, subpg, 0)) != 0) {
+ if ((ret = mpf->put(mpf, subpg, 0)) != 0) {
err_ret = ret;
continue;
}
@@ -2223,14 +2343,14 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0)
goto err;
while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
- if ((ret = memp_fget(dbp->mpf, &p, 0, &subpg)) != 0) {
+ if ((ret = mpf->get(mpf, &p, 0, &subpg)) != 0) {
err_ret = ret;
continue;
}
if ((ret = __db_salvage(dbp, vdp, p, subpg,
handle, callback, flags)) != 0)
err_ret = ret;
- if ((ret = memp_fput(dbp->mpf, subpg, 0)) != 0)
+ if ((ret = mpf->put(mpf, subpg, 0)) != 0)
err_ret = ret;
}
@@ -2243,7 +2363,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
goto err;
}
err: if (subdbname)
- __os_free(subdbname, 0);
+ __os_free(dbenv, subdbname);
if ((t_ret = pgset->close(pgset, 0)) != 0)
ret = t_ret;
@@ -2268,10 +2388,13 @@ __db_meta2pgset(dbp, vdp, pgno, flags, pgset)
u_int32_t flags;
DB *pgset;
{
+ DB_MPOOLFILE *mpf;
PAGE *h;
int ret, t_ret;
- if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ mpf = dbp->mpf;
+
+ if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0)
return (ret);
switch (TYPE(h)) {
@@ -2286,7 +2409,7 @@ __db_meta2pgset(dbp, vdp, pgno, flags, pgset)
break;
}
- if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ if ((t_ret = mpf->put(mpf, h, 0)) != 0)
return (t_ret);
return (ret);
}
@@ -2305,7 +2428,6 @@ __db_guesspgsize(dbenv, fhp)
size_t nr;
u_int32_t guess;
u_int8_t type;
- int ret;
for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) {
/*
@@ -2321,11 +2443,11 @@ __db_guesspgsize(dbenv, fhp)
* our previous guess; that last one was probably the page size.
*/
for (i = 1; i <= 3; i++) {
- if ((ret = __os_seek(dbenv, fhp, guess,
- i, SSZ(DBMETA, type), 0, DB_OS_SEEK_SET)) != 0)
+ if (__os_seek(dbenv, fhp, guess,
+ i, SSZ(DBMETA, type), 0, DB_OS_SEEK_SET) != 0)
break;
- if ((ret = __os_read(dbenv,
- fhp, &type, 1, &nr)) != 0 || nr == 0)
+ if (__os_read(dbenv,
+ fhp, &type, 1, &nr) != 0 || nr == 0)
break;
if (type == P_INVALID || type >= P_PAGETYPE_MAX)
return (guess << 1);
diff --git a/bdb/db/db_vrfyutil.c b/bdb/db/db_vrfyutil.c
index 89dccdcc760..44344ceed11 100644
--- a/bdb/db/db_vrfyutil.c
+++ b/bdb/db/db_vrfyutil.c
@@ -1,16 +1,16 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2000
+ * Copyright (c) 2000-2002
* Sleepycat Software. All rights reserved.
*
- * $Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bostic Exp $
+ * $Id: db_vrfyutil.c,v 11.29 2002/08/08 03:57:50 bostic Exp $
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bostic Exp $";
+static const char revid[] = "$Id: db_vrfyutil.c,v 11.29 2002/08/08 03:57:50 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -20,10 +20,11 @@ static const char revid[] = "$Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bost
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_verify.h"
-#include "db_ext.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+static int __db_vrfy_pageinfo_create __P((DB_ENV *, VRFY_PAGEINFO **));
static int __db_vrfy_pgset_iinc __P((DB *, db_pgno_t, int));
/*
@@ -34,7 +35,7 @@ static int __db_vrfy_pgset_iinc __P((DB *, db_pgno_t, int));
* PUBLIC: __P((DB_ENV *, u_int32_t, VRFY_DBINFO **));
*/
int
-__db_vrfy_dbinfo_create (dbenv, pgsize, vdpp)
+__db_vrfy_dbinfo_create(dbenv, pgsize, vdpp)
DB_ENV *dbenv;
u_int32_t pgsize;
VRFY_DBINFO **vdpp;
@@ -53,14 +54,14 @@ __db_vrfy_dbinfo_create (dbenv, pgsize, vdpp)
if ((ret = db_create(&cdbp, dbenv, 0)) != 0)
goto err;
- if ((ret = cdbp->set_flags(cdbp, DB_DUP | DB_DUPSORT)) != 0)
+ if ((ret = cdbp->set_flags(cdbp, DB_DUP)) != 0)
goto err;
if ((ret = cdbp->set_pagesize(cdbp, pgsize)) != 0)
goto err;
if ((ret =
- cdbp->open(cdbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0)
+ cdbp->open(cdbp, NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0)
goto err;
if ((ret = db_create(&pgdbp, dbenv, 0)) != 0)
@@ -69,8 +70,8 @@ __db_vrfy_dbinfo_create (dbenv, pgsize, vdpp)
if ((ret = pgdbp->set_pagesize(pgdbp, pgsize)) != 0)
goto err;
- if ((ret =
- pgdbp->open(pgdbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0)
+ if ((ret = pgdbp->open(pgdbp,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0)
goto err;
if ((ret = __db_vrfy_pgset(dbenv, pgsize, &pgset)) != 0)
@@ -90,7 +91,7 @@ err: if (cdbp != NULL)
if (pgdbp != NULL)
(void)pgdbp->close(pgdbp, 0);
if (vdp != NULL)
- __os_free(vdp, sizeof(VRFY_DBINFO));
+ __os_free(dbenv, vdp);
return (ret);
}
@@ -99,10 +100,11 @@ err: if (cdbp != NULL)
* Destructor for VRFY_DBINFO. Destroys VRFY_PAGEINFOs and deallocates
* structure.
*
- * PUBLIC: int __db_vrfy_dbinfo_destroy __P((VRFY_DBINFO *));
+ * PUBLIC: int __db_vrfy_dbinfo_destroy __P((DB_ENV *, VRFY_DBINFO *));
*/
int
-__db_vrfy_dbinfo_destroy(vdp)
+__db_vrfy_dbinfo_destroy(dbenv, vdp)
+ DB_ENV *dbenv;
VRFY_DBINFO *vdp;
{
VRFY_CHILDINFO *c, *d;
@@ -112,7 +114,7 @@ __db_vrfy_dbinfo_destroy(vdp)
for (c = LIST_FIRST(&vdp->subdbs); c != NULL; c = d) {
d = LIST_NEXT(c, links);
- __os_free(c, 0);
+ __os_free(NULL, c);
}
if ((t_ret = vdp->pgdbp->close(vdp->pgdbp, 0)) != 0)
@@ -126,7 +128,7 @@ __db_vrfy_dbinfo_destroy(vdp)
DB_ASSERT(LIST_FIRST(&vdp->activepips) == NULL);
- __os_free(vdp, sizeof(VRFY_DBINFO));
+ __os_free(dbenv, vdp);
return (ret);
}
@@ -192,7 +194,7 @@ __db_vrfy_getpageinfo(vdp, pgno, pipp)
return (ret);
/* Case 3 */
- if ((ret = __db_vrfy_pageinfo_create(&pip)) != 0)
+ if ((ret = __db_vrfy_pageinfo_create(pgdbp->dbenv, &pip)) != 0)
return (ret);
LIST_INSERT_HEAD(&vdp->activepips, pip, links);
@@ -208,10 +210,12 @@ found: pip->pi_refcount++;
* __db_vrfy_putpageinfo --
* Put back a VRFY_PAGEINFO that we're done with.
*
- * PUBLIC: int __db_vrfy_putpageinfo __P((VRFY_DBINFO *, VRFY_PAGEINFO *));
+ * PUBLIC: int __db_vrfy_putpageinfo __P((DB_ENV *,
+ * PUBLIC: VRFY_DBINFO *, VRFY_PAGEINFO *));
*/
int
-__db_vrfy_putpageinfo(vdp, pip)
+__db_vrfy_putpageinfo(dbenv, vdp, pip)
+ DB_ENV *dbenv;
VRFY_DBINFO *vdp;
VRFY_PAGEINFO *pip;
{
@@ -255,7 +259,7 @@ __db_vrfy_putpageinfo(vdp, pip)
#endif
DB_ASSERT(pip->pi_refcount == 0);
- __os_free(pip, 0);
+ __os_ufree(dbenv, pip);
return (0);
}
@@ -280,7 +284,8 @@ __db_vrfy_pgset(dbenv, pgsize, dbpp)
return (ret);
if ((ret = dbp->set_pagesize(dbp, pgsize)) != 0)
goto err;
- if ((ret = dbp->open(dbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) == 0)
+ if ((ret = dbp->open(dbp,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) == 0)
*dbpp = dbp;
else
err: (void)dbp->close(dbp, 0);
@@ -382,7 +387,7 @@ __db_vrfy_pgset_iinc(dbp, pgno, i)
F_SET(&data, DB_DBT_USERMEM);
if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) == 0) {
- DB_ASSERT(data.size = sizeof(int));
+ DB_ASSERT(data.size == sizeof(int));
memcpy(&val, data.data, sizeof(int));
} else if (ret != DB_NOTFOUND)
return (ret);
@@ -463,8 +468,10 @@ __db_vrfy_childput(vdp, pgno, cip)
db_pgno_t pgno;
VRFY_CHILDINFO *cip;
{
- DBT key, data;
DB *cdbp;
+ DBC *cc;
+ DBT key, data;
+ VRFY_CHILDINFO *oldcip;
int ret;
cdbp = vdp->cdbp;
@@ -474,17 +481,44 @@ __db_vrfy_childput(vdp, pgno, cip)
key.data = &pgno;
key.size = sizeof(db_pgno_t);
+ /*
+ * We want to avoid adding multiple entries for a single child page;
+ * we only need to verify each child once, even if a child (such
+ * as an overflow key) is multiply referenced.
+ *
+ * However, we also need to make sure that when walking the list
+ * of children, we encounter them in the order they're referenced
+ * on a page. (This permits us, for example, to verify the
+ * prev_pgno/next_pgno chain of Btree leaf pages.)
+ *
+ * Check the child database to make sure that this page isn't
+ * already a child of the specified page number. If it's not,
+ * put it at the end of the duplicate set.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ return (ret);
+ for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &oldcip))
+ if (oldcip->pgno == cip->pgno) {
+ /*
+ * Found a matching child. Return without
+ * putting it again.
+ */
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+ return (0);
+ }
+ if (ret != DB_NOTFOUND) {
+ (void)__db_vrfy_ccclose(cc);
+ return (ret);
+ }
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ return (ret);
+
data.data = cip;
data.size = sizeof(VRFY_CHILDINFO);
- /*
- * Don't add duplicate (data) entries for a given child, and accept
- * DB_KEYEXIST as a successful return; we only need to verify
- * each child once, even if a child (such as an overflow key) is
- * multiply referenced.
- */
- ret = cdbp->put(cdbp, NULL, &key, &data, DB_NODUPDATA);
- return (ret == DB_KEYEXIST ? 0 : ret);
+ return (cdbp->put(cdbp, NULL, &key, &data, 0));
}
/*
@@ -568,19 +602,26 @@ __db_vrfy_ccclose(dbc)
/*
* __db_vrfy_pageinfo_create --
* Constructor for VRFY_PAGEINFO; allocates and initializes.
- *
- * PUBLIC: int __db_vrfy_pageinfo_create __P((VRFY_PAGEINFO **));
*/
-int
-__db_vrfy_pageinfo_create(pgipp)
+static int
+__db_vrfy_pageinfo_create(dbenv, pgipp)
+ DB_ENV *dbenv;
VRFY_PAGEINFO **pgipp;
{
VRFY_PAGEINFO *pgip;
int ret;
- if ((ret = __os_calloc(NULL,
- 1, sizeof(VRFY_PAGEINFO), (void **)&pgip)) != 0)
+ /*
+ * pageinfo structs are sometimes allocated here and sometimes
+ * allocated by fetching them from a database with DB_DBT_MALLOC.
+ * There's no easy way for the destructor to tell which was
+ * used, and so we always allocate with __os_umalloc so we can free
+ * with __os_ufree.
+ */
+ if ((ret = __os_umalloc(dbenv,
+ sizeof(VRFY_PAGEINFO), (void **)&pgip)) != 0)
return (ret);
+ memset(pgip, 0, sizeof(VRFY_PAGEINFO));
DB_ASSERT(pgip->pi_refcount == 0);
@@ -607,7 +648,8 @@ __db_salvage_init(vdp)
if ((ret = dbp->set_pagesize(dbp, 1024)) != 0)
goto err;
- if ((ret = dbp->open(dbp, NULL, NULL, DB_BTREE, DB_CREATE, 0)) != 0)
+ if ((ret = dbp->open(dbp,
+ NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0)) != 0)
goto err;
vdp->salvage_pages = dbp;