diff options
author | unknown <ram@mysql.r18.ru> | 2002-10-30 15:57:05 +0400 |
---|---|---|
committer | unknown <ram@mysql.r18.ru> | 2002-10-30 15:57:05 +0400 |
commit | 155e78f014de1a2e259ae5119f4621fbb210a784 (patch) | |
tree | 6881a3cca88bea0bb9eeffd5aae34be437152786 /bdb/db | |
parent | b8798d25ab71436bf690ee8ae48285a655c5487e (diff) | |
download | mariadb-git-155e78f014de1a2e259ae5119f4621fbb210a784.tar.gz |
BDB 4.1.24
BitKeeper/deleted/.del-ex_access.wpj~3df6ae8c99bf7c5f:
Delete: bdb/build_vxworks/ex_access/ex_access.wpj
BitKeeper/deleted/.del-ex_btrec.wpj~a7622f1c6f432dc6:
Delete: bdb/build_vxworks/ex_btrec/ex_btrec.wpj
BitKeeper/deleted/.del-ex_dbclient.wpj~7345440f3b204cdd:
Delete: bdb/build_vxworks/ex_dbclient/ex_dbclient.wpj
BitKeeper/deleted/.del-ex_env.wpj~fbe1ab10b04e8b74:
Delete: bdb/build_vxworks/ex_env/ex_env.wpj
BitKeeper/deleted/.del-ex_mpool.wpj~4479cfd5c45f327d:
Delete: bdb/build_vxworks/ex_mpool/ex_mpool.wpj
BitKeeper/deleted/.del-ex_tpcb.wpj~f78093006e14bf41:
Delete: bdb/build_vxworks/ex_tpcb/ex_tpcb.wpj
BitKeeper/deleted/.del-db_buildall.dsp~bd749ff6da11682:
Delete: bdb/build_win32/db_buildall.dsp
BitKeeper/deleted/.del-cxx_app.cpp~ad8df8e0791011ed:
Delete: bdb/cxx/cxx_app.cpp
BitKeeper/deleted/.del-cxx_log.cpp~a50ff3118fe06952:
Delete: bdb/cxx/cxx_log.cpp
BitKeeper/deleted/.del-cxx_table.cpp~ecd751e79b055556:
Delete: bdb/cxx/cxx_table.cpp
BitKeeper/deleted/.del-namemap.txt~796a3acd3885d8fd:
Delete: bdb/cxx/namemap.txt
BitKeeper/deleted/.del-Design.fileop~3ca4da68f1727373:
Delete: bdb/db/Design.fileop
BitKeeper/deleted/.del-db185_int.h~61bee3736e7959ef:
Delete: bdb/db185/db185_int.h
BitKeeper/deleted/.del-acconfig.h~411e8854d67ad8b5:
Delete: bdb/dist/acconfig.h
BitKeeper/deleted/.del-mutex.m4~a13383cde18a64e1:
Delete: bdb/dist/aclocal/mutex.m4
BitKeeper/deleted/.del-options.m4~b9d0ca637213750a:
Delete: bdb/dist/aclocal/options.m4
BitKeeper/deleted/.del-programs.m4~3ce7890b47732b30:
Delete: bdb/dist/aclocal/programs.m4
BitKeeper/deleted/.del-tcl.m4~f944e2db93c3b6db:
Delete: bdb/dist/aclocal/tcl.m4
BitKeeper/deleted/.del-types.m4~59cae158c9a32cff:
Delete: bdb/dist/aclocal/types.m4
BitKeeper/deleted/.del-script~d38f6d3a4f159cb4:
Delete: bdb/dist/build/script
BitKeeper/deleted/.del-configure.in~ac795a92c8fe049c:
Delete: bdb/dist/configure.in
BitKeeper/deleted/.del-ltconfig~66bbd007d8024af:
Delete: bdb/dist/ltconfig
BitKeeper/deleted/.del-rec_ctemp~a28554362534f00a:
Delete: bdb/dist/rec_ctemp
BitKeeper/deleted/.del-s_tcl~2ffe4326459fcd9f:
Delete: bdb/dist/s_tcl
BitKeeper/deleted/.del-.IGNORE_ME~d8148b08fa7d5d15:
Delete: bdb/dist/template/.IGNORE_ME
BitKeeper/deleted/.del-btree.h~179f2aefec1753d:
Delete: bdb/include/btree.h
BitKeeper/deleted/.del-cxx_int.h~6b649c04766508f8:
Delete: bdb/include/cxx_int.h
BitKeeper/deleted/.del-db.src~6b433ae615b16a8d:
Delete: bdb/include/db.src
BitKeeper/deleted/.del-db_185.h~ad8b373d9391d35c:
Delete: bdb/include/db_185.h
BitKeeper/deleted/.del-db_am.h~a714912b6b75932f:
Delete: bdb/include/db_am.h
BitKeeper/deleted/.del-db_cxx.h~fcafadf45f5d19e9:
Delete: bdb/include/db_cxx.h
BitKeeper/deleted/.del-db_dispatch.h~6844f20f7eb46904:
Delete: bdb/include/db_dispatch.h
BitKeeper/deleted/.del-db_int.src~419a3f48b6a01da7:
Delete: bdb/include/db_int.src
BitKeeper/deleted/.del-db_join.h~76f9747a42c3399a:
Delete: bdb/include/db_join.h
BitKeeper/deleted/.del-db_page.h~e302ca3a4db3abdc:
Delete: bdb/include/db_page.h
BitKeeper/deleted/.del-db_server_int.h~e1d20b6ba3bca1ab:
Delete: bdb/include/db_server_int.h
BitKeeper/deleted/.del-db_shash.h~5fbf2d696fac90f3:
Delete: bdb/include/db_shash.h
BitKeeper/deleted/.del-db_swap.h~1e60887550864a59:
Delete: bdb/include/db_swap.h
BitKeeper/deleted/.del-db_upgrade.h~c644eee73701fc8d:
Delete: bdb/include/db_upgrade.h
BitKeeper/deleted/.del-db_verify.h~b8d6c297c61f342e:
Delete: bdb/include/db_verify.h
BitKeeper/deleted/.del-debug.h~dc2b4f2cf27ccebc:
Delete: bdb/include/debug.h
BitKeeper/deleted/.del-hash.h~2aaa548b28882dfb:
Delete: bdb/include/hash.h
BitKeeper/deleted/.del-lock.h~a761c1b7de57b77f:
Delete: bdb/include/lock.h
BitKeeper/deleted/.del-log.h~ff20184238e35e4d:
Delete: bdb/include/log.h
BitKeeper/deleted/.del-mp.h~7e317597622f3411:
Delete: bdb/include/mp.h
BitKeeper/deleted/.del-mutex.h~d3ae7a2977a68137:
Delete: bdb/include/mutex.h
BitKeeper/deleted/.del-os.h~91867cc8757cd0e3:
Delete: bdb/include/os.h
BitKeeper/deleted/.del-os_jump.h~e1b939fa5151d4be:
Delete: bdb/include/os_jump.h
BitKeeper/deleted/.del-qam.h~6fad0c1b5723d597:
Delete: bdb/include/qam.h
BitKeeper/deleted/.del-queue.h~4c72c0826c123d5:
Delete: bdb/include/queue.h
BitKeeper/deleted/.del-region.h~513fe04d977ca0fc:
Delete: bdb/include/region.h
BitKeeper/deleted/.del-shqueue.h~525fc3e6c2025c36:
Delete: bdb/include/shqueue.h
BitKeeper/deleted/.del-tcl_db.h~c536fd61a844f23f:
Delete: bdb/include/tcl_db.h
BitKeeper/deleted/.del-txn.h~c8d94b221ec147e4:
Delete: bdb/include/txn.h
BitKeeper/deleted/.del-xa.h~ecc466493aae9d9a:
Delete: bdb/include/xa.h
BitKeeper/deleted/.del-DbRecoveryInit.java~756b52601a0b9023:
Delete: bdb/java/src/com/sleepycat/db/DbRecoveryInit.java
BitKeeper/deleted/.del-DbTxnRecover.java~74607cba7ab89d6d:
Delete: bdb/java/src/com/sleepycat/db/DbTxnRecover.java
BitKeeper/deleted/.del-lock_conflict.c~fc5e0f14cf597a2b:
Delete: bdb/lock/lock_conflict.c
BitKeeper/deleted/.del-log.src~53ac9e7b5cb023f2:
Delete: bdb/log/log.src
BitKeeper/deleted/.del-log_findckp.c~24287f008916e81f:
Delete: bdb/log/log_findckp.c
BitKeeper/deleted/.del-log_rec.c~d51711f2cac09297:
Delete: bdb/log/log_rec.c
BitKeeper/deleted/.del-log_register.c~b40bb4efac75ca15:
Delete: bdb/log/log_register.c
BitKeeper/deleted/.del-Design~b3d0f179f2767b:
Delete: bdb/mp/Design
BitKeeper/deleted/.del-os_finit.c~95dbefc6fe79b26c:
Delete: bdb/os/os_finit.c
BitKeeper/deleted/.del-os_abs.c~df95d1e7db81924:
Delete: bdb/os_vxworks/os_abs.c
BitKeeper/deleted/.del-os_finit.c~803b484bdb9d0122:
Delete: bdb/os_vxworks/os_finit.c
BitKeeper/deleted/.del-os_map.c~3a6d7926398b76d3:
Delete: bdb/os_vxworks/os_map.c
BitKeeper/deleted/.del-os_finit.c~19a227c6d3c78ad:
Delete: bdb/os_win32/os_finit.c
BitKeeper/deleted/.del-log-corruption.patch~1cf2ecc7c6408d5d:
Delete: bdb/patches/log-corruption.patch
BitKeeper/deleted/.del-Btree.pm~af6d0c5eaed4a98e:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Btree.pm
BitKeeper/deleted/.del-BerkeleyDB.pm~7244036d4482643:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pm
BitKeeper/deleted/.del-BerkeleyDB.pod~e7b18fd6132448e3:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod
BitKeeper/deleted/.del-Hash.pm~10292a26c06a5c95:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB/Hash.pm
BitKeeper/deleted/.del-BerkeleyDB.pod.P~79f76a1495eda203:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB.pod.P
BitKeeper/deleted/.del-BerkeleyDB.xs~80c99afbd98e392c:
Delete: bdb/perl.BerkeleyDB/BerkeleyDB.xs
BitKeeper/deleted/.del-Changes~729c1891efa60de9:
Delete: bdb/perl.BerkeleyDB/Changes
BitKeeper/deleted/.del-MANIFEST~63a1e34aecf157a0:
Delete: bdb/perl.BerkeleyDB/MANIFEST
BitKeeper/deleted/.del-Makefile.PL~c68797707d8df87a:
Delete: bdb/perl.BerkeleyDB/Makefile.PL
BitKeeper/deleted/.del-README~5f2f579b1a241407:
Delete: bdb/perl.BerkeleyDB/README
BitKeeper/deleted/.del-Todo~dca3c66c193adda9:
Delete: bdb/perl.BerkeleyDB/Todo
BitKeeper/deleted/.del-config.in~ae81681e450e0999:
Delete: bdb/perl.BerkeleyDB/config.in
BitKeeper/deleted/.del-dbinfo~28ad67d83be4f68e:
Delete: bdb/perl.BerkeleyDB/dbinfo
BitKeeper/deleted/.del-mkconsts~543ab60669c7a04e:
Delete: bdb/perl.BerkeleyDB/mkconsts
BitKeeper/deleted/.del-mkpod~182c0ca54e439afb:
Delete: bdb/perl.BerkeleyDB/mkpod
BitKeeper/deleted/.del-5.004~e008cb5a48805543:
Delete: bdb/perl.BerkeleyDB/patches/5.004
BitKeeper/deleted/.del-irix_6_5.pl~61662bb08afcdec8:
Delete: bdb/perl.BerkeleyDB/hints/irix_6_5.pl
BitKeeper/deleted/.del-solaris.pl~6771e7182394e152:
Delete: bdb/perl.BerkeleyDB/hints/solaris.pl
BitKeeper/deleted/.del-typemap~783b8f5295b05f3d:
Delete: bdb/perl.BerkeleyDB/typemap
BitKeeper/deleted/.del-5.004_01~6081ce2fff7b0bc:
Delete: bdb/perl.BerkeleyDB/patches/5.004_01
BitKeeper/deleted/.del-5.004_02~87214eac35ad9e6:
Delete: bdb/perl.BerkeleyDB/patches/5.004_02
BitKeeper/deleted/.del-5.004_03~9a672becec7cb40f:
Delete: bdb/perl.BerkeleyDB/patches/5.004_03
BitKeeper/deleted/.del-5.004_04~e326cb51af09d154:
Delete: bdb/perl.BerkeleyDB/patches/5.004_04
BitKeeper/deleted/.del-5.004_05~7ab457a1e41a92fe:
Delete: bdb/perl.BerkeleyDB/patches/5.004_05
BitKeeper/deleted/.del-5.005~f9e2d59b5964cd4b:
Delete: bdb/perl.BerkeleyDB/patches/5.005
BitKeeper/deleted/.del-5.005_01~3eb9fb7b5842ea8e:
Delete: bdb/perl.BerkeleyDB/patches/5.005_01
BitKeeper/deleted/.del-5.005_02~67477ce0bef717cb:
Delete: bdb/perl.BerkeleyDB/patches/5.005_02
BitKeeper/deleted/.del-5.005_03~c4c29a1fb21e290a:
Delete: bdb/perl.BerkeleyDB/patches/5.005_03
BitKeeper/deleted/.del-5.6.0~e1fb9897d124ee22:
Delete: bdb/perl.BerkeleyDB/patches/5.6.0
BitKeeper/deleted/.del-btree.t~e4a1a3c675ddc406:
Delete: bdb/perl.BerkeleyDB/t/btree.t
BitKeeper/deleted/.del-db-3.0.t~d2c60991d84558f2:
Delete: bdb/perl.BerkeleyDB/t/db-3.0.t
BitKeeper/deleted/.del-db-3.1.t~6ee88cd13f55e018:
Delete: bdb/perl.BerkeleyDB/t/db-3.1.t
BitKeeper/deleted/.del-db-3.2.t~f73b6461f98fd1cf:
Delete: bdb/perl.BerkeleyDB/t/db-3.2.t
BitKeeper/deleted/.del-destroy.t~cc6a2ae1980a2ecd:
Delete: bdb/perl.BerkeleyDB/t/destroy.t
BitKeeper/deleted/.del-env.t~a8604a4499c4bd07:
Delete: bdb/perl.BerkeleyDB/t/env.t
BitKeeper/deleted/.del-examples.t~2571b77c3cc75574:
Delete: bdb/perl.BerkeleyDB/t/examples.t
BitKeeper/deleted/.del-examples.t.T~8228bdd75ac78b88:
Delete: bdb/perl.BerkeleyDB/t/examples.t.T
BitKeeper/deleted/.del-examples3.t.T~66a186897a87026d:
Delete: bdb/perl.BerkeleyDB/t/examples3.t.T
BitKeeper/deleted/.del-examples3.t~fe3822ba2f2d7f83:
Delete: bdb/perl.BerkeleyDB/t/examples3.t
BitKeeper/deleted/.del-filter.t~f87b045c1b708637:
Delete: bdb/perl.BerkeleyDB/t/filter.t
BitKeeper/deleted/.del-hash.t~616bfb4d644de3a3:
Delete: bdb/perl.BerkeleyDB/t/hash.t
BitKeeper/deleted/.del-join.t~29fc39f74a83ca22:
Delete: bdb/perl.BerkeleyDB/t/join.t
BitKeeper/deleted/.del-mldbm.t~31f5015341eea040:
Delete: bdb/perl.BerkeleyDB/t/mldbm.t
BitKeeper/deleted/.del-queue.t~8f338034ce44a641:
Delete: bdb/perl.BerkeleyDB/t/queue.t
BitKeeper/deleted/.del-recno.t~d4ddbd3743add63e:
Delete: bdb/perl.BerkeleyDB/t/recno.t
BitKeeper/deleted/.del-strict.t~6885cdd2ea71ca2d:
Delete: bdb/perl.BerkeleyDB/t/strict.t
BitKeeper/deleted/.del-subdb.t~aab62a5d5864c603:
Delete: bdb/perl.BerkeleyDB/t/subdb.t
BitKeeper/deleted/.del-txn.t~65033b8558ae1216:
Delete: bdb/perl.BerkeleyDB/t/txn.t
BitKeeper/deleted/.del-unknown.t~f3710458682665e1:
Delete: bdb/perl.BerkeleyDB/t/unknown.t
BitKeeper/deleted/.del-Changes~436f74a5c414c65b:
Delete: bdb/perl.DB_File/Changes
BitKeeper/deleted/.del-DB_File.pm~ae0951c6c7665a82:
Delete: bdb/perl.DB_File/DB_File.pm
BitKeeper/deleted/.del-DB_File.xs~89e49a0b5556f1d8:
Delete: bdb/perl.DB_File/DB_File.xs
BitKeeper/deleted/.del-DB_File_BS~290fad5dbbb87069:
Delete: bdb/perl.DB_File/DB_File_BS
BitKeeper/deleted/.del-MANIFEST~90ee581572bdd4ac:
Delete: bdb/perl.DB_File/MANIFEST
BitKeeper/deleted/.del-Makefile.PL~ac0567bb5a377e38:
Delete: bdb/perl.DB_File/Makefile.PL
BitKeeper/deleted/.del-README~77e924a5a9bae6b3:
Delete: bdb/perl.DB_File/README
BitKeeper/deleted/.del-config.in~ab4c2792b86a810b:
Delete: bdb/perl.DB_File/config.in
BitKeeper/deleted/.del-dbinfo~461c43b30fab2cb:
Delete: bdb/perl.DB_File/dbinfo
BitKeeper/deleted/.del-dynixptx.pl~50dcddfae25d17e9:
Delete: bdb/perl.DB_File/hints/dynixptx.pl
BitKeeper/deleted/.del-typemap~55cffb3288a9e587:
Delete: bdb/perl.DB_File/typemap
BitKeeper/deleted/.del-version.c~a4df0e646f8b3975:
Delete: bdb/perl.DB_File/version.c
BitKeeper/deleted/.del-5.004_01~d6830d0082702af7:
Delete: bdb/perl.DB_File/patches/5.004_01
BitKeeper/deleted/.del-5.004_02~78b082dc80c91031:
Delete: bdb/perl.DB_File/patches/5.004_02
BitKeeper/deleted/.del-5.004~4411ec2e3c9e008b:
Delete: bdb/perl.DB_File/patches/5.004
BitKeeper/deleted/.del-sco.pl~1e795fe14fe4dcfe:
Delete: bdb/perl.DB_File/hints/sco.pl
BitKeeper/deleted/.del-5.004_03~33f274648b160d95:
Delete: bdb/perl.DB_File/patches/5.004_03
BitKeeper/deleted/.del-5.004_04~8f3d1b3cf18bb20a:
Delete: bdb/perl.DB_File/patches/5.004_04
BitKeeper/deleted/.del-5.004_05~9c0f02e7331e142:
Delete: bdb/perl.DB_File/patches/5.004_05
BitKeeper/deleted/.del-5.005~c2108cb2e3c8d951:
Delete: bdb/perl.DB_File/patches/5.005
BitKeeper/deleted/.del-5.005_01~3b45e9673afc4cfa:
Delete: bdb/perl.DB_File/patches/5.005_01
BitKeeper/deleted/.del-5.005_02~9fe5766bb02a4522:
Delete: bdb/perl.DB_File/patches/5.005_02
BitKeeper/deleted/.del-5.005_03~ffa1c38c19ae72ea:
Delete: bdb/perl.DB_File/patches/5.005_03
BitKeeper/deleted/.del-5.6.0~373be3a5ce47be85:
Delete: bdb/perl.DB_File/patches/5.6.0
BitKeeper/deleted/.del-db-btree.t~3231595a1c241eb3:
Delete: bdb/perl.DB_File/t/db-btree.t
BitKeeper/deleted/.del-db-hash.t~7c4ad0c795c7fad2:
Delete: bdb/perl.DB_File/t/db-hash.t
BitKeeper/deleted/.del-db-recno.t~6c2d3d80b9ba4a50:
Delete: bdb/perl.DB_File/t/db-recno.t
BitKeeper/deleted/.del-db_server.sed~cdb00ebcd48a64e2:
Delete: bdb/rpc_server/db_server.sed
BitKeeper/deleted/.del-db_server_proc.c~d46c8f409c3747f4:
Delete: bdb/rpc_server/db_server_proc.c
BitKeeper/deleted/.del-db_server_svc.sed~3f5e59f334fa4607:
Delete: bdb/rpc_server/db_server_svc.sed
BitKeeper/deleted/.del-db_server_util.c~a809f3a4629acda:
Delete: bdb/rpc_server/db_server_util.c
BitKeeper/deleted/.del-log.tcl~ff1b41f1355b97d7:
Delete: bdb/test/log.tcl
BitKeeper/deleted/.del-mpool.tcl~b0df4dc1b04db26c:
Delete: bdb/test/mpool.tcl
BitKeeper/deleted/.del-mutex.tcl~52fd5c73a150565:
Delete: bdb/test/mutex.tcl
BitKeeper/deleted/.del-txn.tcl~c4ff071550b5446e:
Delete: bdb/test/txn.tcl
BitKeeper/deleted/.del-README~e800a12a5392010a:
Delete: bdb/test/upgrade/README
BitKeeper/deleted/.del-pack-2.6.6.pl~89d5076d758d3e98:
Delete: bdb/test/upgrade/generate-2.X/pack-2.6.6.pl
BitKeeper/deleted/.del-test-2.6.patch~4a52dc83d447547b:
Delete: bdb/test/upgrade/generate-2.X/test-2.6.patch
Diffstat (limited to 'bdb/db')
-rw-r--r-- | bdb/db/Design.fileop | 452 | ||||
-rw-r--r-- | bdb/db/crdel.src | 85 | ||||
-rw-r--r-- | bdb/db/crdel_rec.c | 577 | ||||
-rw-r--r-- | bdb/db/db.c | 2087 | ||||
-rw-r--r-- | bdb/db/db.src | 133 | ||||
-rw-r--r-- | bdb/db/db_am.c | 926 | ||||
-rw-r--r-- | bdb/db/db_cam.c | 1538 | ||||
-rw-r--r-- | bdb/db/db_conv.c | 290 | ||||
-rw-r--r-- | bdb/db/db_dispatch.c | 1305 | ||||
-rw-r--r-- | bdb/db/db_dup.c | 118 | ||||
-rw-r--r-- | bdb/db/db_iface.c | 504 | ||||
-rw-r--r-- | bdb/db/db_join.c | 250 | ||||
-rw-r--r-- | bdb/db/db_meta.c | 287 | ||||
-rw-r--r-- | bdb/db/db_method.c | 288 | ||||
-rw-r--r-- | bdb/db/db_open.c | 705 | ||||
-rw-r--r-- | bdb/db/db_overflow.c | 213 | ||||
-rw-r--r-- | bdb/db/db_pr.c | 444 | ||||
-rw-r--r-- | bdb/db/db_rec.c | 456 | ||||
-rw-r--r-- | bdb/db/db_reclaim.c | 228 | ||||
-rw-r--r-- | bdb/db/db_remove.c | 318 | ||||
-rw-r--r-- | bdb/db/db_rename.c | 297 | ||||
-rw-r--r-- | bdb/db/db_ret.c | 36 | ||||
-rw-r--r-- | bdb/db/db_truncate.c | 95 | ||||
-rw-r--r-- | bdb/db/db_upg.c | 27 | ||||
-rw-r--r-- | bdb/db/db_upg_opd.c | 79 | ||||
-rw-r--r-- | bdb/db/db_vrfy.c | 704 | ||||
-rw-r--r-- | bdb/db/db_vrfyutil.c | 118 |
27 files changed, 7954 insertions, 4606 deletions
diff --git a/bdb/db/Design.fileop b/bdb/db/Design.fileop deleted file mode 100644 index 187f1ffaf22..00000000000 --- a/bdb/db/Design.fileop +++ /dev/null @@ -1,452 +0,0 @@ -# $Id: Design.fileop,v 11.4 2000/02/19 20:57:54 bostic Exp $ - -The design of file operation recovery. - -Keith has asked me to write up notes on our current status of database -create and delete and recovery, why it's so hard, and how we've violated -all the cornerstone assumptions on which our recovery framework is based. - -I am including two documents at the end of this one. The first is the -initial design of the recoverability of file create and delete (there is -no talk of subdatabases there, because we didn't think we'd have to do -anything special there). I will annotate this document on where things -changed. - -The second is the design of recd007 which is supposed to test our ability -to recover these operations regardless of where one crashes. This test -is fundamentally different from our other recovery tests in the following -manner. Normally, the application controls transaction boundaries. -Therefore, we can perform an operation and then decide whether to commit -or abort it. In the normal recovery tests, we force the database into -each of the four possible states from a recovery perspective: - - database is pre-op, undo (do nothing) - database is pre-op, redo - database is post-op, undo - database is post-op, redo (do nothing) - -By copying databases at various points and initiating txn_commit and abort -appropriately, we can make all these things happen. Notice that the one -case we don't handle is where page A is in one state (e.g., pre-op) and -page B is in another state (e.g., post-op). I will argue that these don't -matter because each page is recovered independently. If anyone can poke -holes in this, I'm interested. - -The problem with create/delete recovery testing is that the transaction -is begun and ended all inside the library. 
Therefore, there is never any -point (outside the library) where we can copy files and or initiate -abort/commit. In order to still put the recovery code through its paces, -Sue designed an infrastructure that lets you tell the library where to -make copies of things and where to suddenly inject errors so that the -transaction gets aborted. This level of detail allows us to push the -create/delete recovery code through just about every recovery path -possible (although I'm sure Mike will tell me I'm wrong when he starts to -run code coverage tools). - -OK, so that's all preamble and a brief discussion of the documents I'm -enclosing. - -Why was this so hard and painful and why is the code so Q@#$!% complicated? -The following is a discussion/explanation, but to the best of my knowledge, -the structure we have in place now works. The key question we need to be -asking is, "Does this need to have to be so complex or should we redesign -portions to simplify it?" At this point, there is no obvious way to simplify -it in my book, but I may be having difficulty seeing this because my mind is -too polluted at this point. - -Our overall strategy for recovery is that we do write-ahead logging, -that is we log an operation and make sure it is on disk before any -data corresponding to the data that log record describes is on disk. -Typically we use log sequence numbers (LSNs) to mark the data so that -during recovery, we can look at the data and determine if it is in a -state before a particular log record or after a particular log record. - -In the good old days, opens were not transaction protected, so we could -do regular old opens during recovery and if the file existed, we opened -it and if it didn't (or appeared corrupt), we didn't and treated it like -a missing file. As will be discussed below in detail, our states are much -more complicated and recovery can't make such simplistic assumptions. 
- -Also, since we are now dealing with file system operations, we have less -control about when they actually happen and what the state of the system -can be. That is, we have to write create log records synchronously, because -the create/open system call may force a newly created (0-length) file to -disk. This file has to now be identified as being in the "being-created" -state. - -A. We used to make a number of assumptions during recovery: - -1. We could call db_open at any time and one of three things would happen: - a) the file would be opened cleanly - b) the file would not exist - c) we would encounter an error while opening the file - -Case a posed no difficulty. -In Case b, we simply spit out a warning that a file was missing and then - ignored all subsequent operations to that file. -In Case c, we reported a fatal error. - -2. We can always generate a warning if a file is missing. - -3. We never encounter NULL file names in the log. - -B. We also made some assumptions in the main-line library: - -1. If you try to open a file and it exists but is 0-length, then -someone else is trying to open it. - -2. You can write pages anywhere in a file and any non-existent pages -are 0-filled. [This breaks on Windows.] - -3. If you have proper permissions then you can always evict pages from -the buffer pool. - -4. During open, we can close the master database handle as soon as -we're done with it since all the rest of the activity will take place -on the subdatabase handle. - -In our brave new world, most of these assumptions are no longer valid. -Let's address them one at a time. - -A.1 We could call db_open at any time and one of three things would happen: - a) the file would be opened cleanly - b) the file would not exist - c) we would encounter an error while opening the file -There are now additional states. Since we are trying to make file -operations recoverable, you can now die in the middle of such an -operation and we have to be able to pick up the pieces. 
What this -now means is that: - - * a 0-length file can be an indication of a create in-progress - * you can have a meta-data page but no root page (of a btree) - * if a file doesn't exist, it could mean that it was just about - to be created and needs to be rolled forward. - * if you encounter an error in a file (e.g., the meta-data page - is all 0's) you could still be in mid-open. - -I have now made this all work, but it required significant changes to the -db_open code and error handling and this is the sort of change that makes -everyone nervous. - -A.2. We can always generate a warning if a file is missing. - -Now that we have a delete file method in the API, we need to make sure -that we do not generate warning messages for files that don't exist if -we see that they were explicitly deleted. - -This means that we need to save state during recovery, determine which -files were missing and were not being recreated and were not deleted and -only complain about those. - -A.3. We never encounter NULL file names in the log. - -Now that we allow transaction protection on memory-resident files, we write -log messages for files with NULL file names. This means that our assumption -of always being able to call "db_open" on any log_register OPEN message found -in the log is no longer valid. - -B.1. If you try to open a file and it exists but is 0-length, then -someone else is trying to open it. - -As discussed for A.1, this is no longer true. It may be instead that you -are in the process of recovering a create. - -B.2. You can write pages anywhere in a file and any non-existent pages -are 0-filled. - -It turns out that this is not true on Windows. This means that places -we do group allocation (hash) must explicitly allocate each page, because -we can't count on recognizing the uninitialized pages later. - -B.3. If you have proper permissions then you can always evict pages from -the buffer pool. 
- -In the brave new world though, files can be deleted and they may -have pages in the mpool. If you later try to evict these, you -discover that the file doesn't exist. We'd get here when we had -to dirty pages during a remove operation. - -B.4. You can close files any time you want. - -However, if the file takes part in the open/remove transaction, -then we had better not close it until after the transaction -commits/aborts, because we need to be able to get our hands on the -dbp and the open happened in a different transaction. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -Design for recovering file create and delete in the presence of subdatabases. - -Assumptions: - Remove the O_TRUNCATE flag. - Single-thread all open/create/delete operations. - (Well, almost all; we'll optimize opens without DB_CREATE set.) - The reasoning for this is that with two simultaneous - open/creators, during recovery, we cannot identify which - transaction successfully created files and therefore cannot - recover correctly. - File system creates/deletes are synchronous - Once the file is open, subdatabase creates look like regular - get/put operations and a metadata page creation. - -There are 4 cases to deal with: - 1. Open/create file - 2. Open/create subdatabase - 3. Delete - 4. Recovery records - - __db_fileopen_recover - __db_metapage_recover - __db_delete_recover - existing c_put and c_get routines for subdatabase creation - - Note that the open/create of the file and the open/create of the - subdatabase need to be in the same transaction. - -1. 
Open/create (full file and subdb version) - -If create - LOCK_FILEOP - txn_begin - log create message (open message below) - do file system open/create - if we did not create - abort transaction (before going to open_only) - if (!subdb) - set dbp->open_txn = NULL - else - txn_begin a new transaction for the subdb open - - construct meta-data page - log meta-data page (see metapage) - write the meta-data page - * It may be the case that btrees need to log both meta-data pages - and root pages. If that is the case, I believe that we can use - this same record and recovery routines for both - - txn_commit - UNLOCK_FILEOP - -2. Delete - LOCK_FILEOP - txn_begin - log delete message (delete message below) - mv file __db.file.lsn - txn_commit - unlink __db.file.lsn - UNLOCK_FILEOP - -3. Recovery Routines - -__db_fileopen_recover - if (argp->name.size == 0 - done; - - if (redo) /* Commit */ - __os_open(argp->name, DB_OSO_CREATE, argp->mode, &fh) - __os_closehandle(fh) - if (undo) /* Abort */ - if (argp->name exists) - unlink(argp->name); - -__db_metapage_recover - if (redo) - __os_open(argp->name, 0, 0, &fh) - __os_lseek(meta data page) - __os_write(meta data page) - __os_closehandle(fh); - if (undo) - done = 0; - if (argp->name exists) - if (length of argp->name != 0) - __os_open(argp->name, 0, 0, &fh) - __os_lseek(meta data page) - __os_read(meta data page) - if (read succeeds && page lsn != current_lsn) - done = 1 - __os_closehandle(fh); - if (!done) - unlink(argp->name) - -__db_delete_recover - if (redo) - Check if the backup file still exists and if so, delete it. 
- - if (undo) - if (__db_appname(__db.file.lsn exists)) - mv __db_appname(__db.file.lsn) __db_appname(file) - -__db_metasub_recover - /* This is like a normal recovery routine */ - Get the metadata page - if (cmp_n && redo) - copy the log page onto the page - update the lsn - make sure page gets put dirty - else if (cmp_p && undo) - update the lsn to the lsn in the log record - make sure page gets put dirty - - if the page was modified, put it back dirty - -In db.src - -# name: filename (before call to __db_appname) -# mode: file system mode -BEGIN open -DBT name DBT s -ARG mode u_int32_t o -END - -# opcode: indicate if it is a create/delete and if it is a subdatabase -# pgsize: page size on which we're going to write the meta-data page -# pgno: page number on which to write this meta-data page -# page: the actual meta-data page -# lsn: LSN of the meta-data page -- 0 for new databases, may be non-0 -# for subdatabases. - -BEGIN metapage -ARG opcode u_int32_t x -DBT name DBT s -ARG pgno db_pgno_t d -DBT page DBT s -POINTER lsn DB_LSN * lu -END - -# We do not need a subdatabase name here because removing a subdatabase -# name is simply a regular bt_delete operation from the master database. -# It will get logged normally. -# name: filename -BEGIN delete -DBT name DBT s -END - -# We also need to reclaim pages, but we can use the existing -# bt_pg_alloc routines. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -Testing recoverability of create/delete. - -These tests are unlike other tests in that they are going to -require hooks in the library. The reason is that the create -and delete calls are internally wrapped in a transaction, so -that if the call returns, the transaction has already either -commited or aborted. Using only that interface limits what -kind of testing we can do. To match our other recovery testing -efforts, we need to add hooks to trigger aborts at particular -times in the create/delete path. 
- -The general recovery testing strategy is that we wish to -execute every path through every recovery routine. That -means that we try to: - catch each operation in its pre-operation state - call the recovery function with redo - call the recovery function with undo - catch each operation in its post-operation state - call the recovery function with redo - call the recovery function with undo - -In addition, there are a few critical points in the create and -delete path that we want to make sure we capture. - -1. Test Structure - -The test structure should be similar to the existing recovery -tests. We will want to have a structure in place where we -can execute different commands: - create a file/database - create a file that will contain subdatabases. - create a subdatabase - remove a subdatabase (that contains valid data) - remove a subdatabase (that does not contain any data) - remove a file that used to contain subdatabases - remove a file that contains a database - -The tricky part is capturing the state of the world at the -various points in the create/delete process. - -The critical points in the create process are: - - 1. After we've logged the create, but before we've done anything. - in db/db.c - after the open_retry - after the __crdel_fileopen_log call (and before we've - called __os_open). - - 2. Immediately after the __os_open - - 3. Immediately after each __db_log_page call - in bt_open.c - log meta-data page - log root page - in hash.c - log meta-data page - - 4. With respect to the log records above, shortly after each - log write is an memp_fput. We need to do a sync after - each memp_fput and trigger a point after that sync. - -The critical points in the remove process are: - - 1. Right after the crdel_delete_log in db/db.c - - 2. Right after the __os_rename call (below the crdel_delete_log) - - 3. After the __db_remove_callback call. - -I believe that these are the places where we'll need some sort of hook. - -2. Adding hooks to the library. 
- -The hooks need two components. One component is to capture the state of -the database at the hook point and the other is to trigger a txn_abort at -the hook point. The second part is fairly trivial. - -The first part requires more thought. Let me explain what we do in a -"normal" recovery test. In a normal recovery test, we save an initial -copy of the database (this copy is called init). Then we execute one -or more operations. Then, right before the commit/abort, we sync the -file, and save another copy (the afterop copy). Finally, we call txn_commit -or txn_abort, sync the file again, and save the database one last time (the -final copy). - -Then we run recovery. The first time, this should be a no-op, because -we've either committed the transaction and are checking to redo it or -we aborted the transaction, undid it on the abort and are checking to -undo it again. - -We then run recovery again on whatever database will force us through -the path that requires work. In the commit case, this means we start -with the init copy of the database and run recovery. This pushes us -through all the redo paths. In the abort case, we start with the afterop -copy which pushes us through all the undo cases. - -In some sense, we're asking the create/delete test to be more exhaustive -by defining all the trigger points, but I think that's the correct thing -to do, since the create/delete is not initiated by a user transaction. - -So, what do we have to do at the hook points? - 1. sync the file to disk. - 2. save the file itself - 3. save any files named __db_backup_name(name, &backup_name, lsn) - Since we may not know the right lsns, I think we should save - every file of the form __db.name.0xNNNNNNNN.0xNNNNNNNN into - some temporary files from which we can restore it to run - recovery. - -3. 
Putting it all together - -So, the three pieces are writing the test structure, putting in the hooks -and then writing the recovery portions so that we restore the right thing -that the hooks saved in order to initiate recovery. - -Some of the technical issues that need to be solved are: - How does the hook code become active (i.e., we don't - want it in there normally, but it's got to be - there when you configure for testing)? - How do you (the test) tell the library that you want a - particular hook to abort? - How do you (the test) tell the library that you want the - hook code doing its copies (do we really want - *every* test doing these copies during testing? - Maybe it's not a big deal, but maybe it is; we - should at least think about it). diff --git a/bdb/db/crdel.src b/bdb/db/crdel.src index 17c061d6887..d89fa7a0382 100644 --- a/bdb/db/crdel.src +++ b/bdb/db/crdel.src @@ -1,13 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
* - * $Id: crdel.src,v 11.12 2000/12/12 17:41:48 bostic Exp $ + * $Id: crdel.src,v 11.24 2002/04/17 19:02:57 krinsky Exp $ */ -PREFIX crdel +PREFIX __crdel +DBPRIVATE INCLUDE #include "db_config.h" INCLUDE @@ -15,30 +16,20 @@ INCLUDE #ifndef NO_SYSTEM_INCLUDES INCLUDE #include <sys/types.h> INCLUDE INCLUDE #include <ctype.h> -INCLUDE #include <errno.h> INCLUDE #include <string.h> INCLUDE #endif INCLUDE INCLUDE #include "db_int.h" -INCLUDE #include "db_page.h" -INCLUDE #include "db_dispatch.h" -INCLUDE #include "db_am.h" -INCLUDE #include "txn.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/rep.h" +INCLUDE #include "dbinc/txn.h" INCLUDE /* - * Fileopen -- log a potential file create operation - * - * name: filename - * subname: sub database name - * mode: file system mode - */ -BEGIN fileopen 141 -DBT name DBT s -ARG mode u_int32_t o -END - -/* * Metasub: log the creation of a subdatabase meta data page. * * fileid: identifies the file being acted upon. @@ -47,57 +38,9 @@ END * lsn: lsn of the page. */ BEGIN metasub 142 -ARG fileid int32_t ld -ARG pgno db_pgno_t d -DBT page DBT s +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu +PGDBT page DBT s POINTER lsn DB_LSN * lu END -/* - * Metapage: log the creation of a meta data page for a new file. - * - * fileid: identifies the file being acted upon. - * name: file containing the page. - * pgno: page number on which to write this meta-data page - * page: the actual meta-data page - */ -BEGIN metapage 143 -ARG fileid int32_t ld -DBT name DBT s -ARG pgno db_pgno_t d -DBT page DBT s -END - -/* - * Delete: remove a file. - * Note that we don't need a special log record for subdatabase - * removes, because we use normal btree operations to remove them. - * - * name: name of the file being removed (relative to DBHOME). 
- */ -DEPRECATED old_delete 144 -DBT name DBT s -END - -/* - * Rename: rename a file - * We do not need this for subdatabases - * - * name: name of the file being removed (relative to DBHOME). - */ -BEGIN rename 145 -ARG fileid int32_t ld -DBT name DBT s -DBT newname DBT s -END -/* - * Delete: remove a file. - * Note that we don't need a special log record for subdatabase - * removes, because we use normal btree operations to remove them. - * - * name: name of the file being removed (relative to DBHOME). - */ -BEGIN delete 146 -ARG fileid int32_t ld -DBT name DBT s -END diff --git a/bdb/db/crdel_rec.c b/bdb/db/crdel_rec.c index 495b92a0ad7..542a0c358dd 100644 --- a/bdb/db/crdel_rec.c +++ b/bdb/db/crdel_rec.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: crdel_rec.c,v 11.43 2000/12/13 08:06:34 krinsky Exp $"; +static const char revid[] = "$Id: crdel_rec.c,v 11.64 2002/08/14 20:27:34 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,112 +18,9 @@ static const char revid[] = "$Id: crdel_rec.c,v 11.43 2000/12/13 08:06:34 krinsk #endif #include "db_int.h" -#include "db_page.h" -#include "log.h" -#include "hash.h" -#include "mp.h" -#include "db_dispatch.h" - -/* - * __crdel_fileopen_recover -- - * Recovery function for fileopen. 
- * - * PUBLIC: int __crdel_fileopen_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__crdel_fileopen_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - __crdel_fileopen_args *argp; - DBMETA ondisk; - DB_FH fh; - size_t nr; - int do_unlink, ret; - u_int32_t b, mb, io; - char *real_name; - - COMPQUIET(info, NULL); - - real_name = NULL; - REC_PRINT(__crdel_fileopen_print); - - if ((ret = __crdel_fileopen_read(dbenv, dbtp->data, &argp)) != 0) - goto out; - /* - * If this is an in-memory database, then the name is going to - * be NULL, which looks like a 0-length name in recovery. - */ - if (argp->name.size == 0) - goto done; - - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->name.data, 0, NULL, &real_name)) != 0) - goto out; - if (DB_REDO(op)) { - /* - * The create commited, so we need to make sure that the file - * exists. A simple open should suffice. - */ - if ((ret = __os_open(dbenv, real_name, - DB_OSO_CREATE, argp->mode, &fh)) != 0) - goto out; - if ((ret = __os_closehandle(&fh)) != 0) - goto out; - } else if (DB_UNDO(op)) { - /* - * If the file is 0-length then it was in the process of being - * created, so we should unlink it. If it is non-0 length, then - * either someone else created it and we need to leave it - * untouched or we were in the process of creating it, allocated - * the first page on a system that requires you to actually - * write pages as you allocate them, but never got any data - * on it. - * If the file doesn't exist, we never got around to creating - * it, so that's fine. - */ - if (__os_exists(real_name, NULL) != 0) - goto done; - - if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0) - goto out; - if ((ret = __os_ioinfo(dbenv, - real_name, &fh, &mb, &b, &io)) != 0) - goto out; - do_unlink = 0; - if (mb != 0 || b != 0) { - /* - * We need to read the first page - * to see if its got valid data on it. 
- */ - if ((ret = __os_read(dbenv, &fh, - &ondisk, sizeof(ondisk), &nr)) != 0 || - nr != sizeof(ondisk)) - goto out; - if (ondisk.magic == 0) - do_unlink = 1; - } - if ((ret = __os_closehandle(&fh)) != 0) - goto out; - /* Check for 0-length and if it is, delete it. */ - if (do_unlink || (mb == 0 && b == 0)) - if ((ret = __os_unlink(dbenv, real_name)) != 0) - goto out; - } - -done: *lsnp = argp->prev_lsn; - ret = 0; - -out: if (argp != NULL) - __os_free(argp, 0); - if (real_name != NULL) - __os_freestr(real_name); - return (ret); -} +#include "dbinc/db_page.h" +#include "dbinc/hash.h" +#include "dbinc/log.h" /* * __crdel_metasub_recover -- @@ -145,16 +42,16 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info) DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; - u_int8_t *file_uid, ptype; - int cmp_p, modified, reopen, ret; + int cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__crdel_metasub_print); REC_INTRO(__crdel_metasub_read, 0); - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_REDO(op)) { - if ((ret = memp_fget(mpf, + if ((ret = mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } else { @@ -165,7 +62,6 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info) } modified = 0; - reopen = 0; cmp_p = log_compare(&LSN(pagep), &argp->lsn); CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn); @@ -173,14 +69,6 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info) memcpy(pagep, argp->page.data, argp->page.size); LSN(pagep) = *lsnp; modified = 1; - /* - * If this is a meta-data page, then we must reopen; - * if it was a root page, then we do not. - */ - ptype = ((DBMETA *)argp->page.data)->type; - if (ptype == P_HASHMETA || ptype == P_BTREEMETA || - ptype == P_QAMMETA) - reopen = 1; } else if (DB_UNDO(op)) { /* * We want to undo this page creation. 
The page creation @@ -196,451 +84,14 @@ __crdel_metasub_recover(dbenv, dbtp, lsnp, op, info) LSN(pagep) = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - - /* - * If we are redoing a subdatabase create, we must close and reopen the - * file to be sure that we have the proper meta information in the - * in-memory structures - */ - if (reopen) { - /* Close cursor if it's open. */ - if (dbc != NULL) { - dbc->c_close(dbc); - dbc = NULL; - } - - if ((ret = __os_malloc(dbenv, - DB_FILE_ID_LEN, NULL, &file_uid)) != 0) - goto out; - memcpy(file_uid, &file_dbp->fileid[0], DB_FILE_ID_LEN); - ret = __log_reopen_file(dbenv, - NULL, argp->fileid, file_uid, argp->pgno); - (void)__os_free(file_uid, DB_FILE_ID_LEN); - if (ret != 0) - goto out; - } - -done: *lsnp = argp->prev_lsn; - ret = 0; - -out: REC_CLOSE; -} - -/* - * __crdel_metapage_recover -- - * Recovery function for metapage. - * - * PUBLIC: int __crdel_metapage_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__crdel_metapage_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - __crdel_metapage_args *argp; - DB *dbp; - DBMETA *meta, ondisk; - DB_FH fh; - size_t nr; - u_int32_t b, io, mb, pagesize; - int is_done, ret; - char *real_name; - - COMPQUIET(info, NULL); - - real_name = NULL; - memset(&fh, 0, sizeof(fh)); - REC_PRINT(__crdel_metapage_print); - - if ((ret = __crdel_metapage_read(dbenv, dbtp->data, &argp)) != 0) - goto out; - - /* - * If this is an in-memory database, then the name is going to - * be NULL, which looks like a 0-length name in recovery. 
- */ - if (argp->name.size == 0) - goto done; - - meta = (DBMETA *)argp->page.data; - __ua_memcpy(&pagesize, &meta->pagesize, sizeof(pagesize)); - - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->name.data, 0, NULL, &real_name)) != 0) - goto out; - if (DB_REDO(op)) { - if ((ret = __db_fileid_to_db(dbenv, - &dbp, argp->fileid, 0)) != 0) { - if (ret == DB_DELETED) - goto done; - else - goto out; - } - - /* - * We simply read the first page and if the LSN is 0, we - * write the meta-data page. - */ - if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0) - goto out; - if ((ret = __os_seek(dbenv, &fh, - pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0) - goto out; - /* - * If the read succeeds then the page exists, then we need - * to vrify that the page has actually been written, because - * on some systems (e.g., Windows) we preallocate pages because - * files aren't allowed to have holes in them. If the page - * looks good then we're done. - */ - if ((ret = __os_read(dbenv, &fh, &ondisk, - sizeof(ondisk), &nr)) == 0 && nr == sizeof(ondisk)) { - if (ondisk.magic != 0) - goto done; - if ((ret = __os_seek(dbenv, &fh, - pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0) - goto out; - } - - /* - * Page didn't exist, update the LSN and write a new one. - * (seek pointer shouldn't have moved) - */ - __ua_memcpy(&meta->lsn, lsnp, sizeof(DB_LSN)); - if ((ret = __os_write(dbp->dbenv, &fh, - argp->page.data, argp->page.size, &nr)) != 0) - goto out; - if (nr != (size_t)argp->page.size) { - __db_err(dbenv, "Write failed during recovery"); - ret = EIO; - goto out; - } - - /* - * We must close and reopen the file to be sure - * that we have the proper meta information - * in the in memory structures - */ - - if ((ret = __log_reopen_file(dbenv, - argp->name.data, argp->fileid, - meta->uid, argp->pgno)) != 0) - goto out; - - /* Handle will be closed on exit. */ - } else if (DB_UNDO(op)) { - is_done = 0; - - /* If file does not exist, there is nothing to undo. 
*/ - if (__os_exists(real_name, NULL) != 0) - goto done; - - /* - * Before we can look at anything on disk, we have to check - * if there is a valid dbp for this, and if there is, we'd - * better flush it. - */ - dbp = NULL; - if ((ret = - __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0)) == 0) - (void)dbp->sync(dbp, 0); - - /* - * We need to make sure that we do not remove a file that - * someone else created. If the file is 0-length, then we - * can assume that we created it and remove it. If it is - * not 0-length, then we need to check the LSN and make - * sure that it's the file we created. - */ - if ((ret = __os_open(dbenv, real_name, 0, 0, &fh)) != 0) - goto out; - if ((ret = __os_ioinfo(dbenv, - real_name, &fh, &mb, &b, &io)) != 0) - goto out; - if (mb != 0 || b != 0) { - /* The file has something in it. */ - if ((ret = __os_seek(dbenv, &fh, - pagesize, argp->pgno, 0, 0, DB_OS_SEEK_SET)) != 0) - goto out; - if ((ret = __os_read(dbenv, &fh, - &ondisk, sizeof(ondisk), &nr)) != 0) - goto out; - if (log_compare(&ondisk.lsn, lsnp) != 0) - is_done = 1; - } - - /* - * Must close here, because unlink with the file open fails - * on some systems. - */ - if ((ret = __os_closehandle(&fh)) != 0) - goto out; - - if (!is_done) { - /* - * On some systems, you cannot unlink an open file so - * we close the fd in the dbp here and make sure we - * don't try to close it again. First, check for a - * saved_open_fhp, then close down the mpool. 
- */ - if (dbp != NULL && dbp->saved_open_fhp != NULL && - F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) && - (ret = __os_closehandle(dbp->saved_open_fhp)) != 0) - goto out; - if (dbp != NULL && dbp->mpf != NULL) { - (void)__memp_fremove(dbp->mpf); - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto out; - F_SET(dbp, DB_AM_DISCARD); - dbp->mpf = NULL; - } - if ((ret = __os_unlink(dbenv, real_name)) != 0) - goto out; - } - } + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: if (argp != NULL) - __os_free(argp, 0); - if (real_name != NULL) - __os_freestr(real_name); - if (F_ISSET(&fh, DB_FH_VALID)) - (void)__os_closehandle(&fh); - return (ret); -} - -/* - * __crdel_delete_recover -- - * Recovery function for delete. - * - * PUBLIC: int __crdel_delete_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__crdel_delete_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - DB *dbp; - __crdel_delete_args *argp; - int ret; - char *backup, *real_back, *real_name; - - REC_PRINT(__crdel_delete_print); - - backup = real_back = real_name = NULL; - if ((ret = __crdel_delete_read(dbenv, dbtp->data, &argp)) != 0) - goto out; - - if (DB_REDO(op)) { - /* - * On a recovery, as we recreate what was going on, we - * recreate the creation of the file. And so, even though - * it committed, we need to delete it. Try to delete it, - * but it is not an error if that delete fails. - */ - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->name.data, 0, NULL, &real_name)) != 0) - goto out; - if (__os_exists(real_name, NULL) == 0) { - /* - * If a file is deleted and then recreated, it's - * possible for the __os_exists call above to - * return success and for us to get here, but for - * the fileid we're looking for to be marked - * deleted. In that case, we needn't redo the - * unlink even though the file exists, and it's - * not an error. 
- */ - ret = __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0); - if (ret == 0) { - /* - * On Windows, the underlying file must be - * closed to perform a remove. - */ - (void)__memp_fremove(dbp->mpf); - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto out; - dbp->mpf = NULL; - if ((ret = __os_unlink(dbenv, real_name)) != 0) - goto out; - } else if (ret != DB_DELETED) - goto out; - } - /* - * The transaction committed, so the only thing that might - * be true is that the backup file is still around. Try - * to delete it, but it's not an error if that delete fails. - */ - if ((ret = __db_backup_name(dbenv, argp->name.data, - &backup, lsnp)) != 0) - goto out; - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0) - goto out; - if (__os_exists(real_back, NULL) == 0) - if ((ret = __os_unlink(dbenv, real_back)) != 0) - goto out; - if ((ret = __db_txnlist_delete(dbenv, info, - argp->name.data, TXNLIST_INVALID_ID, 1)) != 0) - goto out; - } else if (DB_UNDO(op)) { - /* - * Trying to undo. File may or may not have been deleted. - * Try to move the backup to the original. If the backup - * exists, then this is right. If it doesn't exist, then - * nothing will happen and that's OK. - */ - if ((ret = __db_backup_name(dbenv, argp->name.data, - &backup, lsnp)) != 0) - goto out; - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0) - goto out; - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->name.data, 0, NULL, &real_name)) != 0) - goto out; - if (__os_exists(real_back, NULL) == 0) - if ((ret = - __os_rename(dbenv, real_back, real_name)) != 0) - goto out; - } - - *lsnp = argp->prev_lsn; - ret = 0; - -out: if (argp != NULL) - __os_free(argp, 0); - if (backup != NULL) - __os_freestr(backup); - if (real_back != NULL) - __os_freestr(real_back); - if (real_name != NULL) - __os_freestr(real_name); - return (ret); -} -/* - * __crdel_rename_recover -- - * Recovery function for rename. 
- * - * PUBLIC: int __crdel_rename_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__crdel_rename_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - DB *dbp; - __crdel_rename_args *argp; - char *new_name, *real_name; - int ret, set; - - COMPQUIET(info, NULL); - - REC_PRINT(__crdel_rename_print); - - new_name = real_name = NULL; - - if ((ret = __crdel_rename_read(dbenv, dbtp->data, &argp)) != 0) - goto out; - - if ((ret = __db_fileid_to_db(dbenv, &dbp, argp->fileid, 0)) != 0) - goto out; - if (DB_REDO(op)) { - /* - * We don't use the dbp parameter to __log_filelist_update - * in the rename case, so passing NULL for it is OK. - */ - if ((ret = __log_filelist_update(dbenv, NULL, - argp->fileid, argp->newname.data, &set)) != 0) - goto out; - if (set != 0) { - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->name.data, 0, NULL, &real_name)) != 0) - goto out; - if (__os_exists(real_name, NULL) == 0) { - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, argp->newname.data, - 0, NULL, &new_name)) != 0) - goto out; - /* - * On Windows, the underlying file - * must be closed to perform a remove. - * The db will be closed by a - * log_register record. Rename - * has exclusive access to the db. - */ - (void)__memp_fremove(dbp->mpf); - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto out; - dbp->mpf = NULL; - if ((ret = __os_rename(dbenv, - real_name, new_name)) != 0) - goto out; - } - } - } else { - /* - * We don't use the dbp parameter to __log_filelist_update - * in the rename case, so passing NULL for it is OK. 
- */ - if ((ret = __log_filelist_update(dbenv, NULL, - argp->fileid, argp->name.data, &set)) != 0) - goto out; - if (set != 0) { - if ((ret = __db_appname(dbenv, DB_APP_DATA, - NULL, argp->newname.data, 0, NULL, &new_name)) != 0) - goto out; - if (__os_exists(new_name, NULL) == 0) { - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, argp->name.data, - 0, NULL, &real_name)) != 0) - goto out; - /* - * On Windows, the underlying file - * must be closed to perform a remove. - * The file may have already been closed - * if we are aborting the transaction. - */ - if (dbp->mpf != NULL) { - (void)__memp_fremove(dbp->mpf); - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto out; - dbp->mpf = NULL; - } - if ((ret = __os_rename(dbenv, - new_name, real_name)) != 0) - goto out; - } - } - } - - *lsnp = argp->prev_lsn; - ret = 0; - -out: if (argp != NULL) - __os_free(argp, 0); - - if (new_name != NULL) - __os_free(new_name, 0); - - if (real_name != NULL) - __os_free(real_name, 0); - - return (ret); +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; } diff --git a/bdb/db/db.c b/bdb/db/db.c index 6e74b4b21bd..986167d5ade 100644 --- a/bdb/db/db.c +++ b/bdb/db/db.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ /* @@ -40,7 +40,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db.c,v 11.117 2001/01/11 18:19:50 bostic Exp $"; +static const char revid[] = "$Id: db.c,v 11.246 2002/08/20 14:40:00 margo Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -52,352 +52,41 @@ static const char revid[] = "$Id: db.c,v 11.117 2001/01/11 18:19:50 bostic Exp $ #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "db_swap.h" -#include "btree.h" -#include "db_am.h" -#include "hash.h" -#include "lock.h" -#include "log.h" -#include "mp.h" -#include "qam.h" -#include "common_ext.h" - -/* Actions that __db_master_update can take. */ -typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN } mu_action; - -/* Flag values that __db_file_setup can return. */ -#define DB_FILE_SETUP_CREATE 0x01 -#define DB_FILE_SETUP_ZERO 0x02 - -static int __db_file_setup __P((DB *, - const char *, u_int32_t, int, db_pgno_t, int *)); -static int __db_master_update __P((DB *, - const char *, u_int32_t, - db_pgno_t *, mu_action, const char *, u_int32_t)); -static int __db_refresh __P((DB *)); -static int __db_remove_callback __P((DB *, void *)); -static int __db_set_pgsize __P((DB *, DB_FH *, char *)); -static int __db_subdb_remove __P((DB *, const char *, const char *)); -static int __db_subdb_rename __P(( DB *, - const char *, const char *, const char *)); -#if CONFIG_TEST +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +static int __db_disassociate __P((DB *)); +#if CONFIG_TEST static void __db_makecopy __P((const char *, const char *)); -static int __db_testdocopy __P((DB *, const char *)); -static int __qam_testdocopy __P((DB *, const char *)); +static int __db_testdocopy __P((DB_ENV *, const char *)); +static int 
__qam_testdocopy __P((DB *, const char *)); #endif /* - * __db_open -- - * Main library interface to the DB access methods. - * - * PUBLIC: int __db_open __P((DB *, - * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int)); + * DB.C -- + * This file contains the utility functions for the DBP layer. */ -int -__db_open(dbp, name, subdb, type, flags, mode) - DB *dbp; - const char *name, *subdb; - DBTYPE type; - u_int32_t flags; - int mode; -{ - DB_ENV *dbenv; - DB_LOCK open_lock; - DB *mdbp; - db_pgno_t meta_pgno; - u_int32_t ok_flags; - int ret, t_ret; - - dbenv = dbp->dbenv; - mdbp = NULL; - - /* Validate arguments. */ -#define OKFLAGS \ - (DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \ - DB_NOMMAP | DB_RDONLY | DB_RDWRMASTER | DB_THREAD | DB_TRUNCATE) - if ((ret = __db_fchk(dbenv, "DB->open", flags, OKFLAGS)) != 0) - return (ret); - if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE)) - return (__db_ferr(dbenv, "DB->open", 1)); - if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE)) - return (__db_ferr(dbenv, "DB->open", 1)); -#ifdef HAVE_VXWORKS - if (LF_ISSET(DB_TRUNCATE)) { - __db_err(dbenv, "DB_TRUNCATE unsupported in VxWorks"); - return (__db_eopnotsup(dbenv)); - } -#endif - switch (type) { - case DB_UNKNOWN: - if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) { - __db_err(dbenv, - "%s: DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE", - name); - return (EINVAL); - } - ok_flags = 0; - break; - case DB_BTREE: - ok_flags = DB_OK_BTREE; - break; - case DB_HASH: - ok_flags = DB_OK_HASH; - break; - case DB_QUEUE: - ok_flags = DB_OK_QUEUE; - break; - case DB_RECNO: - ok_flags = DB_OK_RECNO; - break; - default: - __db_err(dbenv, "unknown type: %lu", (u_long)type); - return (EINVAL); - } - if (ok_flags) - DB_ILLEGAL_METHOD(dbp, ok_flags); - - /* The environment may have been created, but never opened. 
*/ - if (!F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_OPEN_CALLED)) { - __db_err(dbenv, "environment not yet opened"); - return (EINVAL); - } - - /* - * Historically, you could pass in an environment that didn't have a - * mpool, and DB would create a private one behind the scenes. This - * no longer works. - */ - if (!F_ISSET(dbenv, DB_ENV_DBLOCAL) && !MPOOL_ON(dbenv)) { - __db_err(dbenv, "environment did not include a memory pool."); - return (EINVAL); - } - - /* - * You can't specify threads during DB->open if subsystems in the - * environment weren't configured with them. - */ - if (LF_ISSET(DB_THREAD) && - !F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_THREAD)) { - __db_err(dbenv, "environment not created using DB_THREAD"); - return (EINVAL); - } - - /* - * If the environment was configured with threads, the DB handle - * must also be free-threaded, so we force the DB_THREAD flag on. - * (See SR #2033 for why this is a requirement--recovery needs - * to be able to grab a dbp using __db_fileid_to_dbp, and it has - * no way of knowing which dbp goes with which thread, so whichever - * one it finds has to be usable in any of them.) - */ - if (F_ISSET(dbenv, DB_ENV_THREAD)) - LF_SET(DB_THREAD); - - /* DB_TRUNCATE is not transaction recoverable. */ - if (LF_ISSET(DB_TRUNCATE) && TXN_ON(dbenv)) { - __db_err(dbenv, - "DB_TRUNCATE illegal in a transaction protected environment"); - return (EINVAL); - } - - /* Subdatabase checks. */ - if (subdb != NULL) { - /* Subdatabases must be created in named files. */ - if (name == NULL) { - __db_err(dbenv, - "multiple databases cannot be created in temporary files"); - return (EINVAL); - } - - /* QAM can't be done as a subdatabase. */ - if (type == DB_QUEUE) { - __db_err(dbenv, "Queue databases must be one-per-file"); - return (EINVAL); - } - } - - /* Convert any DB->open flags. */ - if (LF_ISSET(DB_RDONLY)) - F_SET(dbp, DB_AM_RDONLY); - - /* Fill in the type. 
*/ - dbp->type = type; - - /* - * If we're potentially creating a database, wrap the open inside of - * a transaction. - */ - if (TXN_ON(dbenv) && LF_ISSET(DB_CREATE)) - if ((ret = __db_metabegin(dbp, &open_lock)) != 0) - return (ret); - - /* - * If we're opening a subdatabase, we have to open (and potentially - * create) the main database, and then get (and potentially store) - * our base page number in that database. Then, we can finally open - * the subdatabase. - */ - if (subdb == NULL) - meta_pgno = PGNO_BASE_MD; - else { - /* - * Open the master database, optionally creating or updating - * it, and retrieve the metadata page number. - */ - if ((ret = - __db_master_open(dbp, name, flags, mode, &mdbp)) != 0) - goto err; - - /* Copy the page size and file id from the master. */ - dbp->pgsize = mdbp->pgsize; - F_SET(dbp, DB_AM_SUBDB); - memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN); - - if ((ret = __db_master_update(mdbp, - subdb, type, &meta_pgno, MU_OPEN, NULL, flags)) != 0) - goto err; - - /* - * Clear the exclusive open and truncation flags, they only - * apply to the open of the master database. - */ - LF_CLR(DB_EXCL | DB_TRUNCATE); - } - - ret = __db_dbopen(dbp, name, flags, mode, meta_pgno); - - /* - * You can open the database that describes the subdatabases in the - * rest of the file read-only. The content of each key's data is - * unspecified and applications should never be adding new records - * or updating existing records. However, during recovery, we need - * to open these databases R/W so we can redo/undo changes in them. - * Likewise, we need to open master databases read/write during - * rename and remove so we can be sure they're fully sync'ed, so - * we provide an override flag for the purpose. 
- */ - if (subdb == NULL && !IS_RECOVERING(dbenv) && !LF_ISSET(DB_RDONLY) && - !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) { - __db_err(dbenv, - "files containing multiple databases may only be opened read-only"); - ret = EINVAL; - goto err; - } - -err: /* - * End any transaction, committing if we were successful, aborting - * otherwise. - */ - if (TXN_ON(dbenv) && LF_ISSET(DB_CREATE)) - if ((t_ret = __db_metaend(dbp, - &open_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) - ret = t_ret; - - /* If we were successful, don't discard the file on close. */ - if (ret == 0) - F_CLR(dbp, DB_AM_DISCARD); - - /* If we were unsuccessful, destroy the DB handle. */ - if (ret != 0) { - /* In recovery we set log_fileid early. */ - if (IS_RECOVERING(dbenv)) - dbp->log_fileid = DB_LOGFILEID_INVALID; - __db_refresh(dbp); - } - - if (mdbp != NULL) { - /* If we were successful, don't discard the file on close. */ - if (ret == 0) - F_CLR(mdbp, DB_AM_DISCARD); - if ((t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0) - ret = t_ret; - } - - return (ret); -} - -/* - * __db_dbopen -- - * Open a database. - * PUBLIC: int __db_dbopen __P((DB *, const char *, u_int32_t, int, db_pgno_t)); - */ -int -__db_dbopen(dbp, name, flags, mode, meta_pgno) - DB *dbp; - const char *name; - u_int32_t flags; - int mode; - db_pgno_t meta_pgno; -{ - DB_ENV *dbenv; - int ret, retinfo; - - dbenv = dbp->dbenv; - - /* Set up the underlying file. */ - if ((ret = __db_file_setup(dbp, - name, flags, mode, meta_pgno, &retinfo)) != 0) - return (ret); - - /* - * If we created the file, set the truncate flag for the mpool. This - * isn't for anything we've done, it's protection against stupid user - * tricks: if the user deleted a file behind Berkeley DB's back, we - * may still have pages in the mpool that match the file's "unique" ID. - */ - if (retinfo & DB_FILE_SETUP_CREATE) - flags |= DB_TRUNCATE; - - /* Set up the underlying environment. 
*/ - if ((ret = __db_dbenv_setup(dbp, name, flags)) != 0) - return (ret); - - /* - * Do access method specific initialization. - * - * !!! - * Set the open flag. (The underlying access method open functions - * may want to do things like acquire cursors, so the open flag has - * to be set before calling them.) - */ - F_SET(dbp, DB_OPEN_CALLED); - - if (retinfo & DB_FILE_SETUP_ZERO) - return (0); - - switch (dbp->type) { - case DB_BTREE: - ret = __bam_open(dbp, name, meta_pgno, flags); - break; - case DB_HASH: - ret = __ham_open(dbp, name, meta_pgno, flags); - break; - case DB_RECNO: - ret = __ram_open(dbp, name, meta_pgno, flags); - break; - case DB_QUEUE: - ret = __qam_open(dbp, name, meta_pgno, mode, flags); - break; - case DB_UNKNOWN: - return (__db_unknown_type(dbp->dbenv, - "__db_dbopen", dbp->type)); - break; - } - return (ret); -} /* * __db_master_open -- * Open up a handle on a master database. * * PUBLIC: int __db_master_open __P((DB *, - * PUBLIC: const char *, u_int32_t, int, DB **)); + * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **)); */ int -__db_master_open(subdbp, name, flags, mode, dbpp) +__db_master_open(subdbp, txn, name, flags, mode, dbpp) DB *subdbp; + DB_TXN *txn; const char *name; u_int32_t flags; int mode; @@ -417,30 +106,62 @@ __db_master_open(subdbp, name, flags, mode, dbpp) * Flag that we're creating a database with subdatabases. */ dbp->type = DB_BTREE; - dbp->open_txn = subdbp->open_txn; dbp->pgsize = subdbp->pgsize; F_SET(dbp, DB_AM_SUBDB); + F_SET(dbp, F_ISSET(subdbp, + DB_AM_RECOVER | DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)); - if ((ret = __db_dbopen(dbp, name, flags, mode, PGNO_BASE_MD)) != 0) { - if (!F_ISSET(dbp, DB_AM_DISCARD)) - dbp->close(dbp, 0); - return (ret); - } + /* + * If there was a subdb specified, then we only want to apply + * DB_EXCL to the subdb, not the actual file. We only got here + * because there was a subdb specified. 
+ */ + LF_CLR(DB_EXCL); + LF_SET(DB_RDWRMASTER); + if ((ret = __db_dbopen(dbp, txn, name, NULL, flags, mode, PGNO_BASE_MD)) + != 0) + goto err; - *dbpp = dbp; - return (0); + /* + * Verify that pagesize is the same on both. + * The items in dbp were now initialized from the meta + * page. The items in dbp were set in __db_dbopen + * when we either read or created the master file. + * Other items such as checksum and encryption are + * checked when we read the meta-page. So we do not + * check those here. However, if the meta-page caused + * chksumming to be turned on and it wasn't already, set + * it here. + */ + if (F_ISSET(dbp, DB_AM_CHKSUM)) + F_SET(subdbp, DB_AM_CHKSUM); + if (subdbp->pgsize != 0 && dbp->pgsize != subdbp->pgsize) { + ret = EINVAL; + __db_err(dbp->dbenv, + "Different pagesize specified on existent file"); + goto err; + } +err: + if (ret != 0 && !F_ISSET(dbp, DB_AM_DISCARD)) + __db_close_i(dbp, txn, 0); + else + *dbpp = dbp; + return (ret); } /* * __db_master_update -- - * Add/Remove a subdatabase from a master database. + * Add/Open/Remove a subdatabase from a master database. + * + * PUBLIC: int __db_master_update __P((DB *, DB *, DB_TXN *, const char *, + * PUBLIC: DBTYPE, mu_action, const char *, u_int32_t)); */ -static int -__db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) - DB *mdbp; +int +__db_master_update(mdbp, sdbp, txn, subdb, type, action, newname, flags) + DB *mdbp, *sdbp; + DB_TXN *txn; const char *subdb; - u_int32_t type; - db_pgno_t *meta_pgnop; /* may be NULL on MU_RENAME */ + DBTYPE type; mu_action action; const char *newname; u_int32_t flags; @@ -456,33 +177,37 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) dbc = ndbc = NULL; p = NULL; - /* Might we modify the master database? If so, we'll need to lock. */ - modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0; - memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); + /* Might we modify the master database? 
If so, we'll need to lock. */ + modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0; + /* * Open up a cursor. If this is CDB and we're creating the database, * make it an update cursor. */ - if ((ret = mdbp->cursor(mdbp, mdbp->open_txn, &dbc, + if ((ret = mdbp->cursor(mdbp, txn, &dbc, (CDB_LOCKING(dbenv) && modify) ? DB_WRITECURSOR : 0)) != 0) goto err; /* - * Try to point the cursor at the record. + * Point the cursor at the record. * * If we're removing or potentially creating an entry, lock the page * with DB_RMW. * + * We do multiple cursor operations with the cursor in some cases and + * subsequently access the data DBT information. Set DB_DBT_MALLOC so + * we don't risk modification of the data between our uses of it. + * * !!! * We don't include the name's nul termination in the database. */ - key.data = (char *)subdb; - key.size = strlen(subdb); - /* In the rename case, we do multiple cursor ops, so MALLOC is safer. */ + key.data = (void *)subdb; + key.size = (u_int32_t)strlen(subdb); F_SET(&data, DB_DBT_MALLOC); + ret = dbc->c_get(dbc, &key, &data, DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0)); @@ -514,9 +239,10 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) * so it hasn't been converted to/from opposite * endian architectures. Do it explicitly, now. */ - memcpy(meta_pgnop, data.data, sizeof(db_pgno_t)); - DB_NTOHL(meta_pgnop); - if ((ret = memp_fget(mdbp->mpf, meta_pgnop, 0, &p)) != 0) + memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t)); + DB_NTOHL(&sdbp->meta_pgno); + if ((ret = + mdbp->mpf->get(mdbp->mpf, &sdbp->meta_pgno, 0, &p)) != 0) goto err; /* Free and put the page. */ @@ -538,11 +264,11 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) * for the existence of newname; it shouldn't appear under * us since we hold the metadata lock. 
*/ - if ((ret = mdbp->cursor(mdbp, mdbp->open_txn, &ndbc, 0)) != 0) + if ((ret = mdbp->cursor(mdbp, txn, &ndbc, 0)) != 0) goto err; DB_ASSERT(newname != NULL); - key.data = (void *) newname; - key.size = strlen(newname); + key.data = (void *)newname; + key.size = (u_int32_t)strlen(newname); /* * We don't actually care what the meta page of the potentially- @@ -583,8 +309,12 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) */ switch (ret) { case 0: - memcpy(meta_pgnop, data.data, sizeof(db_pgno_t)); - DB_NTOHL(meta_pgnop); + if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) { + ret = EEXIST; + goto err; + } + memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t)); + DB_NTOHL(&sdbp->meta_pgno); goto done; case DB_NOTFOUND: if (LF_ISSET(DB_CREATE)) @@ -599,10 +329,22 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) goto err; } + /* + * We need to check against the master lorder here because + * we only want to check this if we are creating. In the + * case where we don't create we just want to inherit. + */ + if (F_ISSET(mdbp, DB_AM_SWAP) != F_ISSET(sdbp, DB_AM_SWAP)) { + ret = EINVAL; + __db_err(mdbp->dbenv, + "Different lorder specified on existent file"); + goto err; + } + /* Create a subdatabase. */ if ((ret = __db_new(dbc, type == DB_HASH ? 
P_HASHMETA : P_BTREEMETA, &p)) != 0) goto err; - *meta_pgnop = PGNO(p); + sdbp->meta_pgno = PGNO(p); /* * XXX @@ -617,6 +359,7 @@ __db_master_update(mdbp, subdb, type, meta_pgnop, action, newname, flags) ndata.size = sizeof(db_pgno_t); if ((ret = dbc->c_put(dbc, &key, &ndata, DB_KEYLAST)) != 0) goto err; + F_SET(sdbp, DB_AM_CREATED); break; } @@ -628,7 +371,7 @@ done: /* if (p != NULL) { if (ret == 0) { if ((t_ret = - memp_fput(mdbp->mpf, p, DB_MPOOL_DIRTY)) != 0) + mdbp->mpf->put(mdbp->mpf, p, DB_MPOOL_DIRTY)) != 0) ret = t_ret; /* * Since we cannot close this file until after @@ -639,12 +382,12 @@ done: /* if ((t_ret = mdbp->sync(mdbp, 0)) != 0 && ret == 0) ret = t_ret; } else - (void)__db_free(dbc, p); + (void)mdbp->mpf->put(mdbp->mpf, p, 0); } /* Discard the cursor(s) and data. */ if (data.data != NULL) - __os_free(data.data, data.size); + __os_ufree(dbenv, data.data); if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0) ret = t_ret; if (ndbc != NULL && (t_ret = ndbc->c_close(ndbc)) != 0 && ret == 0) @@ -657,21 +400,25 @@ done: /* * __db_dbenv_setup -- * Set up the underlying environment during a db_open. * - * PUBLIC: int __db_dbenv_setup __P((DB *, const char *, u_int32_t)); + * PUBLIC: int __db_dbenv_setup __P((DB *, + * PUBLIC: DB_TXN *, const char *, u_int32_t, u_int32_t)); */ int -__db_dbenv_setup(dbp, name, flags) +__db_dbenv_setup(dbp, txn, name, id, flags) DB *dbp; + DB_TXN *txn; const char *name; + u_int32_t id; u_int32_t flags; { DB *ldbp; - DB_ENV *dbenv; DBT pgcookie; - DB_MPOOL_FINFO finfo; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_MPOOLFILE *mpf; DB_PGINFO pginfo; - int ret; u_int32_t maxid; + int ftype, ret; dbenv = dbp->dbenv; @@ -690,8 +437,18 @@ __db_dbenv_setup(dbp, name, flags) } /* Register DB's pgin/pgout functions. 
*/ - if ((ret = - memp_register(dbenv, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0) + if ((ret = dbenv->memp_register( + dbenv, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0) + return (ret); + + /* Create the DB_MPOOLFILE structure. */ + if ((ret = dbenv->memp_fcreate(dbenv, &dbp->mpf, 0)) != 0) + return (ret); + mpf = dbp->mpf; + + /* Set the database's cache priority if we've been given one. */ + if (dbp->priority != 0 && + (ret = mpf->set_priority(mpf, dbp->priority)) != 0) return (ret); /* @@ -704,22 +461,26 @@ __db_dbenv_setup(dbp, name, flags) * need to page the file in and out. This has to be right -- we can't * mmap files that are being paged in and out. */ - memset(&finfo, 0, sizeof(finfo)); switch (dbp->type) { case DB_BTREE: case DB_RECNO: - finfo.ftype = - F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET; - finfo.clear_len = DB_PAGE_DB_LEN; + ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) + ? DB_FTYPE_SET : DB_FTYPE_NOTSET; + (void)mpf->set_ftype(mpf, ftype); + (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ? + dbp->pgsize : DB_PAGE_DB_LEN)); break; case DB_HASH: - finfo.ftype = DB_FTYPE_SET; - finfo.clear_len = DB_PAGE_DB_LEN; + (void)mpf->set_ftype(mpf, DB_FTYPE_SET); + (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ? + dbp->pgsize : DB_PAGE_DB_LEN)); break; case DB_QUEUE: - finfo.ftype = - F_ISSET(dbp, DB_AM_SWAP) ? DB_FTYPE_SET : DB_FTYPE_NOTSET; - finfo.clear_len = DB_PAGE_QUEUE_LEN; + ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) + ? DB_FTYPE_SET : DB_FTYPE_NOTSET; + (void)mpf->set_ftype(mpf, ftype); + (void)mpf->set_clear_len(mpf, (CRYPTO_ON(dbenv) ? + dbp->pgsize : DB_PAGE_QUEUE_LEN)); break; case DB_UNKNOWN: /* @@ -735,48 +496,63 @@ __db_dbenv_setup(dbp, name, flags) * to salvage some data even with no metadata page. 
*/ if (F_ISSET(dbp, DB_AM_VERIFYING)) { - finfo.ftype = DB_FTYPE_NOTSET; - finfo.clear_len = DB_PAGE_DB_LEN; + (void)mpf->set_ftype(mpf, DB_FTYPE_NOTSET); + (void)mpf->set_clear_len(mpf, DB_PAGE_DB_LEN); break; } - return (__db_unknown_type(dbp->dbenv, - "__db_dbenv_setup", dbp->type)); + /* FALLTHROUGH */ + default: + return ( + __db_unknown_type(dbenv, "__db_dbenv_setup", dbp->type)); } - finfo.pgcookie = &pgcookie; - finfo.fileid = dbp->fileid; - finfo.lsn_offset = 0; + + (void)mpf->set_fileid(mpf, dbp->fileid); + (void)mpf->set_lsn_offset(mpf, 0); pginfo.db_pagesize = dbp->pgsize; - pginfo.needswap = F_ISSET(dbp, DB_AM_SWAP); + pginfo.flags = + F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP)); + pginfo.type = dbp->type; pgcookie.data = &pginfo; pgcookie.size = sizeof(DB_PGINFO); + (void)mpf->set_pgcookie(mpf, &pgcookie); - if ((ret = memp_fopen(dbenv, name, - LF_ISSET(DB_RDONLY | DB_NOMMAP | DB_ODDFILESIZE | DB_TRUNCATE), - 0, dbp->pgsize, &finfo, &dbp->mpf)) != 0) + if ((ret = mpf->open(mpf, name, + LF_ISSET(DB_RDONLY | DB_NOMMAP | DB_ODDFILESIZE | DB_TRUNCATE) | + (F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0), + 0, dbp->pgsize)) != 0) return (ret); /* - * We may need a per-thread mutex. Allocate it from the environment + * We may need a per-thread mutex. Allocate it from the mpool * region, there's supposed to be extra space there for that purpose. */ if (LF_ISSET(DB_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, dbenv->reginfo, (MUTEX **)&dbp->mutexp)) != 0) + dbmp = dbenv->mp_handle; + if ((ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbp->mutexp, + MUTEX_ALLOC | MUTEX_THREAD)) != 0) return (ret); - if ((ret = __db_mutex_init( - dbenv, dbp->mutexp, 0, MUTEX_THREAD)) != 0) { - __db_mutex_free(dbenv, dbenv->reginfo, dbp->mutexp); - return (ret); - } } - /* Get a log file id. */ - if (LOGGING_ON(dbenv) && !IS_RECOVERING(dbenv) && + /* + * Set up a bookkeeping entry for this database in the log region, + * if such a region exists. 
Note that even if we're in recovery + * or a replication client, where we won't log registries, we'll + * still need an FNAME struct, so LOGGING_ON is the correct macro. + */ + if (LOGGING_ON(dbenv) && + (ret = __dbreg_setup(dbp, name, id)) != 0) + return (ret); + + /* + * If we're actively logging and our caller isn't a recovery function + * that already did so, assign this dbp a log fileid. + */ + if (DBENV_LOGGING(dbenv) && !F_ISSET(dbp, DB_AM_RECOVER) && #if !defined(DEBUG_ROP) !F_ISSET(dbp, DB_AM_RDONLY) && #endif - (ret = log_register(dbenv, dbp, name)) != 0) + (ret = __dbreg_new_id(dbp, txn)) != 0) return (ret); /* @@ -822,541 +598,69 @@ __db_dbenv_setup(dbp, name, flags) } /* - * __db_file_setup -- - * Setup the file or in-memory data. - * Read the database metadata and resolve it with our arguments. + * __db_close -- + * DB destructor. + * + * PUBLIC: int __db_close __P((DB *, u_int32_t)); */ -static int -__db_file_setup(dbp, name, flags, mode, meta_pgno, retflags) +int +__db_close(dbp, flags) DB *dbp; - const char *name; u_int32_t flags; - int mode; - db_pgno_t meta_pgno; - int *retflags; -{ - DB *mdb; - DBT namedbt; - DB_ENV *dbenv; - DB_FH *fhp, fh; - DB_LSN lsn; - DB_TXN *txn; - size_t nr; - u_int32_t magic, oflags; - int ret, retry_cnt, t_ret; - char *real_name, mbuf[DBMETASIZE]; - -#define IS_SUBDB_SETUP (meta_pgno != PGNO_BASE_MD) - - dbenv = dbp->dbenv; - dbp->meta_pgno = meta_pgno; - txn = NULL; - *retflags = 0; - - /* - * If we open a file handle and our caller is doing fcntl(2) locking, - * we can't close it because that would discard the caller's lock. - * Save it until we close the DB handle. - */ - if (LF_ISSET(DB_FCNTL_LOCKING)) { - if ((ret = __os_malloc(dbenv, sizeof(*fhp), NULL, &fhp)) != 0) - return (ret); - } else - fhp = &fh; - memset(fhp, 0, sizeof(*fhp)); - - /* - * If the file is in-memory, set up is simple. Otherwise, do the - * hard work of opening and reading the file. 
- * - * If we have a file name, try and read the first page, figure out - * what type of file it is, and initialize everything we can based - * on that file's meta-data page. - * - * !!! - * There's a reason we don't push this code down into the buffer cache. - * The problem is that there's no information external to the file that - * we can use as a unique ID. UNIX has dev/inode pairs, but they are - * not necessarily unique after reboot, if the file was mounted via NFS. - * Windows has similar problems, as the FAT filesystem doesn't maintain - * dev/inode numbers across reboot. So, we must get something from the - * file we can use to ensure that, even after a reboot, the file we're - * joining in the cache is the right file for us to join. The solution - * we use is to maintain a file ID that's stored in the database, and - * that's why we have to open and read the file before calling into the - * buffer cache. - * - * The secondary reason is that there's additional information that - * we want to have before instantiating a file in the buffer cache: - * the page size, file type (btree/hash), if swapping is required, - * and flags (DB_RDONLY, DB_CREATE, DB_TRUNCATE). We could handle - * needing this information by allowing it to be set for a file in - * the buffer cache even after the file has been opened, and, of - * course, supporting the ability to flush a file from the cache as - * necessary, e.g., if we guessed wrongly about the page size. Given - * that we have to read the file anyway to get the file ID, we might - * as well get the rest, too. - * - * Get the real file name. - */ - if (name == NULL) { - F_SET(dbp, DB_AM_INMEM); - - if (dbp->type == DB_UNKNOWN) { - __db_err(dbenv, - "DBTYPE of unknown without existing file"); - return (EINVAL); - } - real_name = NULL; - - /* Set the page size if we don't have one yet. 
*/ - if (dbp->pgsize == 0) - dbp->pgsize = DB_DEF_IOSIZE; - - /* - * If the file is a temporary file and we're doing locking, - * then we have to create a unique file ID. We can't use our - * normal dev/inode pair (or whatever this OS uses in place of - * dev/inode pairs) because no backing file will be created - * until the mpool cache is filled forcing the buffers to disk. - * Grab a random locker ID to use as a file ID. The created - * ID must never match a potential real file ID -- we know it - * won't because real file IDs contain a time stamp after the - * dev/inode pair, and we're simply storing a 4-byte value. - * - * !!! - * Store the locker in the file id structure -- we can get it - * from there as necessary, and it saves having two copies. - */ - if (LOCKING_ON(dbenv) && - (ret = lock_id(dbenv, (u_int32_t *)dbp->fileid)) != 0) - return (ret); - - return (0); - } - - /* Get the real backing file name. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) - return (ret); - - /* - * Open the backing file. We need to make sure that multiple processes - * attempting to create the file at the same time are properly ordered - * so that only one of them creates the "unique" file ID, so we open it - * O_EXCL and O_CREAT so two simultaneous attempts to create the region - * will return failure in one of the attempts. If we're the one that - * fails, simply retry without the O_CREAT flag, which will require the - * meta-data page exist. - */ - - /* Fill in the default file mode. */ - if (mode == 0) - mode = __db_omode("rwrw--"); - - oflags = 0; - if (LF_ISSET(DB_RDONLY)) - oflags |= DB_OSO_RDONLY; - if (LF_ISSET(DB_TRUNCATE)) - oflags |= DB_OSO_TRUNC; - - retry_cnt = 0; -open_retry: - *retflags = 0; - ret = 0; - if (!IS_SUBDB_SETUP && LF_ISSET(DB_CREATE)) { - if (dbp->open_txn != NULL) { - /* - * Start a child transaction to wrap this individual - * create. 
- */ - if ((ret = - txn_begin(dbenv, dbp->open_txn, &txn, 0)) != 0) - goto err_msg; - - memset(&namedbt, 0, sizeof(namedbt)); - namedbt.data = (char *)name; - namedbt.size = strlen(name) + 1; - if ((ret = __crdel_fileopen_log(dbenv, txn, - &lsn, DB_FLUSH, &namedbt, mode)) != 0) - goto err_msg; - } - DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, name); - if ((ret = __os_open(dbenv, real_name, - oflags | DB_OSO_CREATE | DB_OSO_EXCL, mode, fhp)) == 0) { - DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name); - - /* Commit the file create. */ - if (dbp->open_txn != NULL) { - if ((ret = txn_commit(txn, DB_TXN_SYNC)) != 0) - goto err_msg; - txn = NULL; - } - - /* - * We created the file. This means that if we later - * fail, we need to delete the file and if we're going - * to do that, we need to trash any pages in the - * memory pool. Since we only know here that we - * created the file, we're going to set the flag here - * and clear it later if we commit successfully. - */ - F_SET(dbp, DB_AM_DISCARD); - *retflags |= DB_FILE_SETUP_CREATE; - } else { - /* - * Abort the file create. If the abort fails, report - * the error returned by txn_abort(), rather than the - * open error, for no particular reason. - */ - if (dbp->open_txn != NULL) { - if ((t_ret = txn_abort(txn)) != 0) { - ret = t_ret; - goto err_msg; - } - txn = NULL; - } - - /* - * If we were not doing an exclusive open, try again - * without the create flag. - */ - if (ret == EEXIST && !LF_ISSET(DB_EXCL)) { - LF_CLR(DB_CREATE); - DB_TEST_RECOVERY(dbp, - DB_TEST_POSTOPEN, ret, name); - goto open_retry; - } - } - } else - ret = __os_open(dbenv, real_name, oflags, mode, fhp); - - /* - * Be quiet if we couldn't open the file because it didn't exist - * or we did not have permission, - * the customers don't like those messages appearing in the logs. - * Otherwise, complain loudly. 
- */ - if (ret != 0) { - if (ret == EACCES || ret == ENOENT) - goto err; - goto err_msg; - } - - /* Set the page size if we don't have one yet. */ - if (dbp->pgsize == 0) { - if (IS_SUBDB_SETUP) { - if ((ret = __db_master_open(dbp, - name, flags, mode, &mdb)) != 0) - goto err; - dbp->pgsize = mdb->pgsize; - (void)mdb->close(mdb, 0); - } else if ((ret = __db_set_pgsize(dbp, fhp, real_name)) != 0) - goto err; - } - - /* - * Seek to the metadata offset; if it's a master database open or a - * database without subdatabases, we're seeking to 0, but that's OK. - */ - if ((ret = __os_seek(dbenv, fhp, - dbp->pgsize, meta_pgno, 0, 0, DB_OS_SEEK_SET)) != 0) - goto err_msg; - - /* - * Read the metadata page. We read DBMETASIZE bytes, which is larger - * than any access method's metadata page and smaller than any disk - * sector. - */ - if ((ret = __os_read(dbenv, fhp, mbuf, sizeof(mbuf), &nr)) != 0) - goto err_msg; - - if (nr == sizeof(mbuf)) { - /* - * Figure out what access method we're dealing with, and then - * call access method specific code to check error conditions - * based on conflicts between the found file and application - * arguments. A found file overrides some user information -- - * we don't consider it an error, for example, if the user set - * an expected byte order and the found file doesn't match it. - */ - F_CLR(dbp, DB_AM_SWAP); - magic = ((DBMETA *)mbuf)->magic; - -swap_retry: switch (magic) { - case DB_BTREEMAGIC: - if ((ret = - __bam_metachk(dbp, name, (BTMETA *)mbuf)) != 0) - goto err; - break; - case DB_HASHMAGIC: - if ((ret = - __ham_metachk(dbp, name, (HMETA *)mbuf)) != 0) - goto err; - break; - case DB_QAMMAGIC: - if ((ret = - __qam_metachk(dbp, name, (QMETA *)mbuf)) != 0) - goto err; - break; - case 0: - /* - * There are two ways we can get a 0 magic number. - * If we're creating a subdatabase, then the magic - * number will be 0. 
We allocate a page as part of - * finding out what the base page number will be for - * the new subdatabase, but it's not initialized in - * any way. - * - * The second case happens if we are in recovery - * and we are going to recreate a database, it's - * possible that it's page was created (on systems - * where pages must be created explicitly to avoid - * holes in files) but is still 0. - */ - if (IS_SUBDB_SETUP) { /* Case 1 */ - if ((IS_RECOVERING(dbenv) - && F_ISSET((DB_LOG *) - dbenv->lg_handle, DBLOG_FORCE_OPEN)) - || ((DBMETA *)mbuf)->pgno != PGNO_INVALID) - goto empty; - - ret = EINVAL; - goto err; - } - /* Case 2 */ - if (IS_RECOVERING(dbenv)) { - *retflags |= DB_FILE_SETUP_ZERO; - goto empty; - } - goto bad_format; - default: - if (F_ISSET(dbp, DB_AM_SWAP)) - goto bad_format; - - M_32_SWAP(magic); - F_SET(dbp, DB_AM_SWAP); - goto swap_retry; - } - } else { - /* - * Only newly created files are permitted to fail magic - * number tests. - */ - if (nr != 0 || (!IS_RECOVERING(dbenv) && IS_SUBDB_SETUP)) - goto bad_format; - - /* Let the caller know that we had a 0-length file. */ - if (!LF_ISSET(DB_CREATE | DB_TRUNCATE)) - *retflags |= DB_FILE_SETUP_ZERO; - - /* - * The only way we can reach here with the DB_CREATE flag set - * is if we created the file. If that's not the case, then - * either (a) someone else created the file but has not yet - * written out the metadata page, or (b) we truncated the file - * (DB_TRUNCATE) leaving it zero-length. In the case of (a), - * we want to sleep and give the file creator time to write - * the metadata page. In the case of (b), we want to continue. - * - * !!! - * There's a race in the case of two processes opening the file - * with the DB_TRUNCATE flag set at roughly the same time, and - * they could theoretically hurt each other. Sure hope that's - * unlikely. 
- */ - if (!LF_ISSET(DB_CREATE | DB_TRUNCATE) && - !IS_RECOVERING(dbenv)) { - if (retry_cnt++ < 3) { - __os_sleep(dbenv, 1, 0); - goto open_retry; - } -bad_format: if (!IS_RECOVERING(dbenv)) - __db_err(dbenv, - "%s: unexpected file type or format", name); - ret = EINVAL; - goto err; - } - - DB_ASSERT (dbp->type != DB_UNKNOWN); - -empty: /* - * The file is empty, and that's OK. If it's not a subdatabase, - * though, we do need to generate a unique file ID for it. The - * unique file ID includes a timestamp so that we can't collide - * with any other files, even when the file IDs (dev/inode pair) - * are reused. - */ - if (!IS_SUBDB_SETUP) { - if (*retflags & DB_FILE_SETUP_ZERO) - memset(dbp->fileid, 0, DB_FILE_ID_LEN); - else if ((ret = __os_fileid(dbenv, - real_name, 1, dbp->fileid)) != 0) - goto err_msg; - } - } - - if (0) { -err_msg: __db_err(dbenv, "%s: %s", name, db_strerror(ret)); - } - - /* - * Abort any running transaction -- it can only exist if something - * went wrong. - */ -err: -DB_TEST_RECOVERY_LABEL - - /* - * If we opened a file handle and our caller is doing fcntl(2) locking, - * then we can't close it because that would discard the caller's lock. - * Otherwise, close the handle. - */ - if (F_ISSET(fhp, DB_FH_VALID)) { - if (ret == 0 && LF_ISSET(DB_FCNTL_LOCKING)) - dbp->saved_open_fhp = fhp; - else - if ((t_ret = __os_closehandle(fhp)) != 0 && ret == 0) - ret = t_ret; - } - - /* - * This must be done after the file is closed, since - * txn_abort() may remove the file, and an open file - * cannot be removed on a Windows platforms. - */ - if (txn != NULL) - (void)txn_abort(txn); - - if (real_name != NULL) - __os_freestr(real_name); - - return (ret); -} - -/* - * __db_set_pgsize -- - * Set the page size based on file information. 
- */ -static int -__db_set_pgsize(dbp, fhp, name) - DB *dbp; - DB_FH *fhp; - char *name; { DB_ENV *dbenv; - u_int32_t iopsize; - int ret; dbenv = dbp->dbenv; - /* - * Use the filesystem's optimum I/O size as the pagesize if a pagesize - * not specified. Some filesystems have 64K as their optimum I/O size, - * but as that results in fairly large default caches, we limit the - * default pagesize to 16K. - */ - if ((ret = __os_ioinfo(dbenv, name, fhp, NULL, NULL, &iopsize)) != 0) { - __db_err(dbenv, "%s: %s", name, db_strerror(ret)); - return (ret); - } - if (iopsize < 512) - iopsize = 512; - if (iopsize > 16 * 1024) - iopsize = 16 * 1024; - - /* - * Sheer paranoia, but we don't want anything that's not a power-of-2 - * (we rely on that for alignment of various types on the pages), and - * we want a multiple of the sector size as well. - */ - OS_ROUNDOFF(iopsize, 512); + PANIC_CHECK(dbenv); - dbp->pgsize = iopsize; - F_SET(dbp, DB_AM_PGDEF); + /* Validate arguments, but as a DB handle destructor, we can't fail. */ + if (flags != 0 && flags != DB_NOSYNC) + (void)__db_ferr(dbenv, "DB->close", 0); - return (0); + return (__db_close_i(dbp, NULL, flags)); } /* - * __db_close -- - * DB destructor. + * __db_close_i -- + * Internal DB destructor. * - * PUBLIC: int __db_close __P((DB *, u_int32_t)); + * PUBLIC: int __db_close_i __P((DB *, DB_TXN *, u_int32_t)); */ int -__db_close(dbp, flags) +__db_close_i(dbp, txn, flags) DB *dbp; + DB_TXN *txn; u_int32_t flags; { DB_ENV *dbenv; - DBC *dbc; int ret, t_ret; - ret = 0; - dbenv = dbp->dbenv; - PANIC_CHECK(dbenv); - - /* Validate arguments. */ - if ((ret = __db_closechk(dbp, flags)) != 0) - goto err; - - /* If never opened, or not currently open, it's easy. */ - if (!F_ISSET(dbp, DB_OPEN_CALLED)) - goto never_opened; - - /* Sync the underlying access method. 
*/ - if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && - (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0) - ret = t_ret; - - /* - * Go through the active cursors and call the cursor recycle routine, - * which resolves pending operations and moves the cursors onto the - * free list. Then, walk the free list and call the cursor destroy - * routine. - */ - while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) - if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0) - ret = t_ret; + ret = 0; /* - * Close any outstanding join cursors. Join cursors destroy - * themselves on close and have no separate destroy routine. + * Validate arguments, but as a DB handle destructor, we can't fail. + * + * Check for consistent transaction usage -- ignore errors. Only + * internal callers specify transactions, so it's a serious problem + * if we get error messages. */ - while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL) - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - - /* Remove this DB handle from the DB_ENV's dblist. */ - MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); - LIST_REMOVE(dbp, dblistlinks); - MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); - - /* Sync the memory pool. */ - if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && - (t_ret = memp_fsync(dbp->mpf)) != 0 && - t_ret != DB_INCOMPLETE && ret == 0) - ret = t_ret; + if (txn != NULL) + (void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0); - /* Close any handle we've been holding since the open. */ - if (dbp->saved_open_fhp != NULL && - F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) && - (t_ret = __os_closehandle(dbp->saved_open_fhp)) != 0 && ret == 0) + /* Refresh the structure and close any local environment. */ + if ((t_ret = __db_refresh(dbp, txn, flags)) != 0 && ret == 0) ret = t_ret; -never_opened: /* * Call the access specific close function. * * !!! 
- * Because of where the function is called in the close process, - * these routines can't do anything that would dirty pages or - * otherwise affect closing down the database. + * Because of where these functions are called in the DB handle close + * process, these routines can't do anything that would dirty pages or + * otherwise affect closing down the database. Specifically, we can't + * abort and recover any of the information they control. */ if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0) ret = t_ret; @@ -1365,17 +669,14 @@ never_opened: if ((t_ret = __qam_db_close(dbp)) != 0 && ret == 0) ret = t_ret; -err: - /* Refresh the structure and close any local environment. */ - if ((t_ret = __db_refresh(dbp)) != 0 && ret == 0) - ret = t_ret; - if (F_ISSET(dbenv, DB_ENV_DBLOCAL) && - --dbenv->dblocal_ref == 0 && + --dbenv->db_ref; + if (F_ISSET(dbenv, DB_ENV_DBLOCAL) && dbenv->db_ref == 0 && (t_ret = dbenv->close(dbenv, 0)) != 0 && ret == 0) ret = t_ret; + /* Free the database handle. */ memset(dbp, CLEAR_BYTE, sizeof(*dbp)); - __os_free(dbp, sizeof(*dbp)); + __os_free(dbenv, dbp); return (ret); } @@ -1383,653 +684,257 @@ err: /* * __db_refresh -- * Refresh the DB structure, releasing any allocated resources. + * This does most of the work of closing files now because refresh + * is what is used during abort processing (since we can't destroy + * the actual handle) and during abort processing, we may have a + * fully opened handle. + * + * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t)); */ -static int -__db_refresh(dbp) +int +__db_refresh(dbp, txn, flags) DB *dbp; + DB_TXN *txn; + u_int32_t flags; { - DB_ENV *dbenv; + DB *sdbp; DBC *dbc; + DB_ENV *dbenv; + DB_LOCKREQ lreq; + DB_MPOOL *dbmp; int ret, t_ret; ret = 0; dbenv = dbp->dbenv; + /* If never opened, or not currently open, it's easy. 
*/ + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) + goto never_opened; + /* - * Go through the active cursors and call the cursor recycle routine, - * which resolves pending operations and moves the cursors onto the - * free list. Then, walk the free list and call the cursor destroy - * routine. + * If we have any secondary indices, disassociate them from us. + * We don't bother with the mutex here; it only protects some + * of the ops that will make us core-dump mid-close anyway, and + * if you're trying to do something with a secondary *while* you're + * closing the primary, you deserve what you get. The disassociation + * is mostly done just so we can close primaries and secondaries in + * any order--but within one thread of control. */ - while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) - if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0) + for (sdbp = LIST_FIRST(&dbp->s_secondaries); + sdbp != NULL; sdbp = LIST_NEXT(sdbp, s_links)) { + LIST_REMOVE(sdbp, s_links); + if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0) ret = t_ret; - - dbp->type = 0; - - /* Close the memory pool file handle. */ - if (dbp->mpf != NULL) { - if (F_ISSET(dbp, DB_AM_DISCARD)) - (void)__memp_fremove(dbp->mpf); - if ((t_ret = memp_fclose(dbp->mpf)) != 0 && ret == 0) - ret = t_ret; - dbp->mpf = NULL; } - /* Discard the thread mutex. */ - if (dbp->mutexp != NULL) { - __db_mutex_free(dbenv, dbenv->reginfo, dbp->mutexp); - dbp->mutexp = NULL; - } - - /* Discard the log file id. */ - if (!IS_RECOVERING(dbenv) - && dbp->log_fileid != DB_LOGFILEID_INVALID) - (void)log_unregister(dbenv, dbp); - - F_CLR(dbp, DB_AM_DISCARD); - F_CLR(dbp, DB_AM_INMEM); - F_CLR(dbp, DB_AM_RDONLY); - F_CLR(dbp, DB_AM_SWAP); - F_CLR(dbp, DB_DBM_ERROR); - F_CLR(dbp, DB_OPEN_CALLED); - - return (ret); -} - -/* - * __db_remove - * Remove method for DB. 
- * - * PUBLIC: int __db_remove __P((DB *, const char *, const char *, u_int32_t)); - */ -int -__db_remove(dbp, name, subdb, flags) - DB *dbp; - const char *name, *subdb; - u_int32_t flags; -{ - DBT namedbt; - DB_ENV *dbenv; - DB_LOCK remove_lock; - DB_LSN newlsn; - int ret, t_ret, (*callback_func) __P((DB *, void *)); - char *backup, *real_back, *real_name; - void *cookie; - - dbenv = dbp->dbenv; - ret = 0; - backup = real_back = real_name = NULL; - - PANIC_CHECK(dbenv); /* - * Cannot use DB_ILLEGAL_AFTER_OPEN here because that returns - * and we cannot return, but must deal with the error and destroy - * the handle anyway. + * Sync the underlying access method. Do before closing the cursors + * because DB->sync allocates cursors in order to write Recno backing + * source text files. */ - if (F_ISSET(dbp, DB_OPEN_CALLED)) { - ret = __db_mi_open(dbp->dbenv, "remove", 1); - goto err_close; - } - - /* Validate arguments. */ - if ((ret = __db_removechk(dbp, flags)) != 0) - goto err_close; + if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && + (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0) + ret = t_ret; /* - * Subdatabases. + * Go through the active cursors and call the cursor recycle routine, + * which resolves pending operations and moves the cursors onto the + * free list. Then, walk the free list and call the cursor destroy + * routine. Note that any failure on a close is considered "really + * bad" and we just break out of the loop and force forward. */ - if (subdb != NULL) { - /* Subdatabases must be created in named files. 
*/ - if (name == NULL) { - __db_err(dbenv, - "multiple databases cannot be created in temporary files"); - goto err_close; + while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) + if ((t_ret = dbc->c_close(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; } - return (__db_subdb_remove(dbp, name, subdb)); - } - - if ((ret = dbp->open(dbp, - name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0)) != 0) - goto err_close; - - if (LOGGING_ON(dbenv) && (ret = __log_file_lock(dbp)) != 0) - goto err_close; - if ((ret = dbp->sync(dbp, 0)) != 0) - goto err_close; - - /* Start the transaction and log the delete. */ - if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0) - goto err_close; - - if (LOGGING_ON(dbenv)) { - memset(&namedbt, 0, sizeof(namedbt)); - namedbt.data = (char *)name; - namedbt.size = strlen(name) + 1; - - if ((ret = __crdel_delete_log(dbenv, - dbp->open_txn, &newlsn, DB_FLUSH, - dbp->log_fileid, &namedbt)) != 0) { - __db_err(dbenv, - "%s: %s", name, db_strerror(ret)); - goto err; + while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + if ((t_ret = __db_c_destroy(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; } - } - - /* Find the real name of the file. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) - goto err; /* - * XXX - * We don't bother to open the file and call __memp_fremove on the mpf. - * There is a potential race here. It is at least possible that, if - * the unique filesystem ID (dev/inode pair on UNIX) is reallocated - * within a second (the granularity of the fileID timestamp), a new - * file open will get the same fileID as the file being "removed". - * We may actually want to open the file and call __memp_fremove on - * the mpf to get around this. - */ - - /* Create name for backup file. 
*/ - if (TXN_ON(dbenv)) { - if ((ret = - __db_backup_name(dbenv, name, &backup, &newlsn)) != 0) - goto err; - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, backup, 0, NULL, &real_back)) != 0) - goto err; - } - - callback_func = __db_remove_callback; - cookie = real_back; - DB_TEST_RECOVERY(dbp, DB_TEST_PRERENAME, ret, name); - if (dbp->db_am_remove != NULL && - (ret = dbp->db_am_remove(dbp, - name, subdb, &newlsn, &callback_func, &cookie)) != 0) - goto err; - /* - * On Windows, the underlying file must be closed to perform a remove. - * Nothing later in __db_remove requires that it be open, and the - * dbp->close closes it anyway, so we just close it early. + * Close any outstanding join cursors. Join cursors destroy + * themselves on close and have no separate destroy routine. */ - (void)__memp_fremove(dbp->mpf); - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto err; - dbp->mpf = NULL; - - if (TXN_ON(dbenv)) - ret = __os_rename(dbenv, real_name, real_back); - else - ret = __os_unlink(dbenv, real_name); - - DB_TEST_RECOVERY(dbp, DB_TEST_POSTRENAME, ret, name); + while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL) + if ((t_ret = dbc->c_close(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } -err: -DB_TEST_RECOVERY_LABEL /* - * End the transaction, committing the transaction if we were - * successful, aborting otherwise. + * Sync the memory pool, even though we've already called DB->sync, + * because closing cursors can dirty pages by deleting items they + * referenced. 
*/ - if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp, &remove_lock, - ret == 0, callback_func, cookie)) != 0 && ret == 0) + if (!LF_ISSET(DB_NOSYNC) && !F_ISSET(dbp, DB_AM_DISCARD) && + (t_ret = dbp->mpf->sync(dbp->mpf)) != 0 && ret == 0) ret = t_ret; - /* FALLTHROUGH */ - -err_close: - if (real_back != NULL) - __os_freestr(real_back); - if (real_name != NULL) - __os_freestr(real_name); - if (backup != NULL) - __os_freestr(backup); - - /* We no longer have an mpool, so syncing would be disastrous. */ - if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0) + /* Close any handle we've been holding since the open. */ + if (dbp->saved_open_fhp != NULL && + F_ISSET(dbp->saved_open_fhp, DB_FH_VALID) && + (t_ret = __os_closehandle(dbenv, dbp->saved_open_fhp)) != 0 && + ret == 0) ret = t_ret; - return (ret); -} - -/* - * __db_subdb_remove -- - * Remove a subdatabase. - */ -static int -__db_subdb_remove(dbp, name, subdb) - DB *dbp; - const char *name, *subdb; -{ - DB *mdbp; - DBC *dbc; - DB_ENV *dbenv; - DB_LOCK remove_lock; - db_pgno_t meta_pgno; - int ret, t_ret; - - mdbp = NULL; - dbc = NULL; - dbenv = dbp->dbenv; - - /* Start the transaction. */ - if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0) - goto err_close; - +never_opened: /* - * Open the subdatabase. We can use the user's DB handle for this - * purpose, I think. + * We are not releasing the handle lock here because we're about + * to release all locks held by dbp->lid below. There are two + * ways that we can get in here with a handle_lock, but not a + * dbp->lid. The first is when our lid has been hijacked by a + * subdb. The second is when we are a Queue database in the midst + * of a rename. If the queue file hasn't actually been opened, we + * hijack the main dbp's locker id to do the open so we can get the + * extent files. In both cases, we needn't free the handle lock + * because it will be freed when the hijacked locker-id is freed. 
*/ - if ((ret = __db_open(dbp, name, subdb, DB_UNKNOWN, 0, 0)) != 0) - goto err; + DB_ASSERT(!LOCK_ISSET(dbp->handle_lock) || + dbp->lid != DB_LOCK_INVALIDID || + dbp->type == DB_QUEUE || + F_ISSET(dbp, DB_AM_SUBDB)); + + if (dbp->lid != DB_LOCK_INVALIDID) { + /* We may have pending trade operations on this dbp. */ + if (txn != NULL) + __txn_remlock(dbenv, txn, &dbp->handle_lock, dbp->lid); + + /* We may be holding the handle lock; release it. */ + lreq.op = DB_LOCK_PUT_ALL; + if ((t_ret = __lock_vec(dbenv, + dbp->lid, 0, &lreq, 1, NULL)) != 0 && ret == 0) + ret = t_ret; - /* Free up the pages in the subdatabase. */ - switch (dbp->type) { - case DB_BTREE: - case DB_RECNO: - if ((ret = __bam_reclaim(dbp, dbp->open_txn)) != 0) - goto err; - break; - case DB_HASH: - if ((ret = __ham_reclaim(dbp, dbp->open_txn)) != 0) - goto err; - break; - default: - ret = __db_unknown_type(dbp->dbenv, - "__db_subdb_remove", dbp->type); - goto err; + if ((t_ret = + dbenv->lock_id_free(dbenv, dbp->lid)) != 0 && ret == 0) + ret = t_ret; + dbp->lid = DB_LOCK_INVALIDID; + LOCK_INIT(dbp->handle_lock); } - /* - * Remove the entry from the main database and free the subdatabase - * metadata page. - */ - if ((ret = __db_master_open(dbp, name, 0, 0, &mdbp)) != 0) - goto err; - - if ((ret = __db_master_update(mdbp, - subdb, dbp->type, &meta_pgno, MU_REMOVE, NULL, 0)) != 0) - goto err; - -err: /* - * End the transaction, committing the transaction if we were - * successful, aborting otherwise. - */ - if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp, - &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) + /* Discard the locker ID allocated as the fileid. */ + if (F_ISSET(dbp, DB_AM_INMEM) && + LOCKING_ON(dbenv) && (t_ret = dbenv->lock_id_free( + dbenv, *(u_int32_t *)dbp->fileid)) != 0 && ret == 0) ret = t_ret; -err_close: - /* - * Close the user's DB handle -- do this LAST to avoid smashing the - * the transaction information. 
- */ - if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0) - ret = t_ret; - - if (mdbp != NULL && (t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0) - ret = t_ret; + dbp->type = DB_UNKNOWN; - return (ret); -} - -/* - * __db_rename - * Rename method for DB. - * - * PUBLIC: int __db_rename __P((DB *, - * PUBLIC: const char *, const char *, const char *, u_int32_t)); - */ -int -__db_rename(dbp, filename, subdb, newname, flags) - DB *dbp; - const char *filename, *subdb, *newname; - u_int32_t flags; -{ - DBT namedbt, newnamedbt; - DB_ENV *dbenv; - DB_LOCK remove_lock; - DB_LSN newlsn; - char *real_name, *real_newname; - int ret, t_ret; - - dbenv = dbp->dbenv; - ret = 0; - real_name = real_newname = NULL; - - PANIC_CHECK(dbenv); - /* - * Cannot use DB_ILLEGAL_AFTER_OPEN here because that returns - * and we cannot return, but must deal with the error and destroy - * the handle anyway. - */ - if (F_ISSET(dbp, DB_OPEN_CALLED)) { - ret = __db_mi_open(dbp->dbenv, "rename", 1); - goto err_close; + /* Discard the thread mutex. */ + if (dbp->mutexp != NULL) { + dbmp = dbenv->mp_handle; + __db_mutex_free(dbenv, dbmp->reginfo, dbp->mutexp); + dbp->mutexp = NULL; } - /* Validate arguments -- has same rules as remove. */ - if ((ret = __db_removechk(dbp, flags)) != 0) - goto err_close; + /* Discard any memory used to store returned data. */ + if (dbp->my_rskey.data != NULL) + __os_free(dbp->dbenv, dbp->my_rskey.data); + if (dbp->my_rkey.data != NULL) + __os_free(dbp->dbenv, dbp->my_rkey.data); + if (dbp->my_rdata.data != NULL) + __os_free(dbp->dbenv, dbp->my_rdata.data); + + /* For safety's sake; we may refresh twice. */ + memset(&dbp->my_rskey, 0, sizeof(DBT)); + memset(&dbp->my_rkey, 0, sizeof(DBT)); + memset(&dbp->my_rdata, 0, sizeof(DBT)); /* - * Subdatabases. + * Remove this DB handle from the DB_ENV's dblist, if it's been added. 
*/ - if (subdb != NULL) { - if (filename == NULL) { - __db_err(dbenv, - "multiple databases cannot be created in temporary files"); - goto err_close; - } - return (__db_subdb_rename(dbp, filename, subdb, newname)); - } - - if ((ret = dbp->open(dbp, - filename, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0)) != 0) - goto err_close; - - if (LOGGING_ON(dbenv) && (ret = __log_file_lock(dbp)) != 0) - goto err_close; - - if ((ret = dbp->sync(dbp, 0)) != 0) - goto err_close; - - /* Start the transaction and log the rename. */ - if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0) - goto err_close; - - if (LOGGING_ON(dbenv)) { - memset(&namedbt, 0, sizeof(namedbt)); - namedbt.data = (char *)filename; - namedbt.size = strlen(filename) + 1; - - memset(&newnamedbt, 0, sizeof(namedbt)); - newnamedbt.data = (char *)newname; - newnamedbt.size = strlen(newname) + 1; - - if ((ret = __crdel_rename_log(dbenv, dbp->open_txn, - &newlsn, 0, dbp->log_fileid, &namedbt, &newnamedbt)) != 0) { - __db_err(dbenv, "%s: %s", filename, db_strerror(ret)); - goto err; - } + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + if (dbp->dblistlinks.le_prev != NULL) + LIST_REMOVE(dbp, dblistlinks); + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + dbp->dblistlinks.le_prev = NULL; - if ((ret = __log_filelist_update(dbenv, dbp, - dbp->log_fileid, newname, NULL)) != 0) - goto err; + /* Close the memory pool file handle. */ + if (dbp->mpf != NULL) { + if ((t_ret = dbp->mpf->close(dbp->mpf, + F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 && + ret == 0) + ret = t_ret; + dbp->mpf = NULL; } - /* Find the real name of the file. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, filename, 0, NULL, &real_name)) != 0) - goto err; - - /* Find the real newname of the file. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, newname, 0, NULL, &real_newname)) != 0) - goto err; + if (LOGGING_ON(dbp->dbenv)) { + /* + * Discard the log file id, if any. 
We want to log the close + * if and only if this is not a recovery dbp. + */ + if (F_ISSET(dbp, DB_AM_RECOVER)) + (void)__dbreg_revoke_id(dbp, 0); + else + (void)__dbreg_close_id(dbp, txn); - /* - * It is an error to rename a file over one that already exists, - * as that wouldn't be transaction-safe. - */ - if (__os_exists(real_newname, NULL) == 0) { - ret = EEXIST; - __db_err(dbenv, "rename: file %s exists", real_newname); - goto err; + /* Discard the log FNAME. */ + (void)__dbreg_teardown(dbp); } - DB_TEST_RECOVERY(dbp, DB_TEST_PRERENAME, ret, filename); - if (dbp->db_am_rename != NULL && - (ret = dbp->db_am_rename(dbp, filename, subdb, newname)) != 0) - goto err; - /* - * We have to flush the cache for a couple of reasons. First, the - * underlying MPOOLFILE maintains a "name" that unrelated processes - * can use to open the file in order to flush pages, and that name - * is about to be wrong. Second, on Windows the unique file ID is - * generated from the file's name, not other file information as is - * the case on UNIX, and so a subsequent open of the old file name - * could conceivably result in a matching "unique" file ID. - */ - if ((ret = __memp_fremove(dbp->mpf)) != 0) - goto err; - - /* - * On Windows, the underlying file must be closed to perform a rename. - * Nothing later in __db_rename requires that it be open, and the call - * to dbp->close closes it anyway, so we just close it early. - */ - if ((ret = memp_fclose(dbp->mpf)) != 0) - goto err; - dbp->mpf = NULL; - - ret = __os_rename(dbenv, real_name, real_newname); - DB_TEST_RECOVERY(dbp, DB_TEST_POSTRENAME, ret, newname); - -DB_TEST_RECOVERY_LABEL -err: if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp, - &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) - ret = t_ret; - -err_close: - /* We no longer have an mpool, so syncing would be disastrous. 
*/ - dbp->close(dbp, DB_NOSYNC); - if (real_name != NULL) - __os_freestr(real_name); - if (real_newname != NULL) - __os_freestr(real_newname); - - return (ret); -} - -/* - * __db_subdb_rename -- - * Rename a subdatabase. - */ -static int -__db_subdb_rename(dbp, name, subdb, newname) - DB *dbp; - const char *name, *subdb, *newname; -{ - DB *mdbp; - DBC *dbc; - DB_ENV *dbenv; - DB_LOCK remove_lock; - int ret, t_ret; - - mdbp = NULL; - dbc = NULL; - dbenv = dbp->dbenv; - - /* Start the transaction. */ - if (TXN_ON(dbenv) && (ret = __db_metabegin(dbp, &remove_lock)) != 0) - goto err_close; - - /* - * Open the subdatabase. We can use the user's DB handle for this - * purpose, I think. - */ - if ((ret = __db_open(dbp, name, subdb, DB_UNKNOWN, 0, 0)) != 0) - goto err; - - /* - * Rename the entry in the main database. - */ - if ((ret = __db_master_open(dbp, name, 0, 0, &mdbp)) != 0) - goto err; - - if ((ret = __db_master_update(mdbp, - subdb, dbp->type, NULL, MU_RENAME, newname, 0)) != 0) - goto err; - -err: /* - * End the transaction, committing the transaction if we were - * successful, aborting otherwise. - */ - if (dbp->open_txn != NULL && (t_ret = __db_metaend(dbp, - &remove_lock, ret == 0, NULL, NULL)) != 0 && ret == 0) - ret = t_ret; - -err_close: - /* - * Close the user's DB handle -- do this LAST to avoid smashing the - * the transaction information. - */ - if ((t_ret = dbp->close(dbp, 0)) != 0 && ret == 0) - ret = t_ret; - - if (mdbp != NULL && (t_ret = mdbp->close(mdbp, 0)) != 0 && ret == 0) - ret = t_ret; - - return (ret); -} - -/* - * __db_metabegin -- - * - * Begin a meta-data operation. This involves doing any required locking, - * potentially beginning a transaction and then telling the caller if you - * did or did not begin the transaction. - * - * The writing flag indicates if the caller is actually allowing creates - * or doing deletes (i.e., if the caller is opening and not creating, then - * we don't need to do any of this). 
- * PUBLIC: int __db_metabegin __P((DB *, DB_LOCK *)); - */ -int -__db_metabegin(dbp, lockp) - DB *dbp; - DB_LOCK *lockp; -{ - DB_ENV *dbenv; - DBT dbplock; - u_int32_t locker, lockval; - int ret; - - dbenv = dbp->dbenv; - - lockp->off = LOCK_INVALID; + /* Clear out fields that normally get set during open. */ + memset(dbp->fileid, 0, sizeof(dbp->fileid)); + dbp->adj_fileid = 0; + dbp->meta_pgno = 0; + dbp->cur_lid = DB_LOCK_INVALIDID; + dbp->associate_lid = DB_LOCK_INVALIDID; + dbp->cl_id = 0; /* - * There is no single place where we can know that we are or are not - * going to be creating any files and/or subdatabases, so we will - * always begin a tranasaction when we start creating one. If we later - * discover that this was unnecessary, we will abort the transaction. - * Recovery is written so that if we log a file create, but then - * discover that we didn't have to do it, we recover correctly. The - * file recovery design document has details. - * - * We need to single thread all create and delete operations, so if we - * are running with locking, we must obtain a lock. We use lock_id to - * generate a unique locker id and use a handcrafted DBT as the object - * on which we are locking. + * If we are being refreshed with a txn specified, then we need + * to make sure that we clear out the lock handle field, because + * releasing all the locks for this transaction will release this + * lock and we don't want close to stumble upon this handle and + * try to close it. */ - if (LOCKING_ON(dbenv)) { - if ((ret = lock_id(dbenv, &locker)) != 0) - return (ret); - lockval = 0; - dbplock.data = &lockval; - dbplock.size = sizeof(lockval); - if ((ret = lock_get(dbenv, - locker, 0, &dbplock, DB_LOCK_WRITE, lockp)) != 0) - return (ret); - } - - return (txn_begin(dbenv, NULL, &dbp->open_txn, 0)); -} - -/* - * __db_metaend -- - * End a meta-data operation. 
- * PUBLIC: int __db_metaend __P((DB *, - * PUBLIC: DB_LOCK *, int, int (*)(DB *, void *), void *)); - */ -int -__db_metaend(dbp, lockp, commit, callback, cookie) - DB *dbp; - DB_LOCK *lockp; - int commit, (*callback) __P((DB *, void *)); - void *cookie; -{ - DB_ENV *dbenv; - int ret, t_ret; - - ret = 0; - dbenv = dbp->dbenv; - - /* End the transaction. */ - if (commit) { - if ((ret = txn_commit(dbp->open_txn, DB_TXN_SYNC)) == 0) { - /* - * Unlink any underlying file, we've committed the - * transaction. - */ - if (callback != NULL) - ret = callback(dbp, cookie); - } - } else if ((t_ret = txn_abort(dbp->open_txn)) && ret == 0) - ret = t_ret; + if (txn != NULL) + LOCK_INIT(dbp->handle_lock); - /* Release our lock. */ - if (lockp->off != LOCK_INVALID && - (t_ret = lock_put(dbenv, lockp)) != 0 && ret == 0) - ret = t_ret; + F_CLR(dbp, DB_AM_DBM_ERROR); + F_CLR(dbp, DB_AM_DISCARD); + F_CLR(dbp, DB_AM_INMEM); + F_CLR(dbp, DB_AM_RECOVER); + F_CLR(dbp, DB_AM_OPEN_CALLED); + F_CLR(dbp, DB_AM_RDONLY); + F_CLR(dbp, DB_AM_SWAP); return (ret); } /* * __db_log_page - * Log a meta-data or root page during a create operation. + * Log a meta-data or root page during a subdatabase create operation. * - * PUBLIC: int __db_log_page __P((DB *, - * PUBLIC: const char *, DB_LSN *, db_pgno_t, PAGE *)); + * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *)); */ int -__db_log_page(dbp, name, lsn, pgno, page) +__db_log_page(dbp, txn, lsn, pgno, page) DB *dbp; - const char *name; + DB_TXN *txn; DB_LSN *lsn; db_pgno_t pgno; PAGE *page; { - DBT name_dbt, page_dbt; + DBT page_dbt; DB_LSN new_lsn; int ret; - if (dbp->open_txn == NULL) + if (!LOGGING_ON(dbp->dbenv) || txn == NULL) return (0); memset(&page_dbt, 0, sizeof(page_dbt)); page_dbt.size = dbp->pgsize; page_dbt.data = page; - if (pgno == PGNO_BASE_MD) { - /* - * !!! - * Make sure that we properly handle a null name. 
The old - * Tcl sent us pathnames of the form ""; it may be the case - * that the new Tcl doesn't do that, so we can get rid of - * the second check here. - */ - memset(&name_dbt, 0, sizeof(name_dbt)); - name_dbt.data = (char *)name; - if (name == NULL || *name == '\0') - name_dbt.size = 0; - else - name_dbt.size = strlen(name) + 1; - ret = __crdel_metapage_log(dbp->dbenv, - dbp->open_txn, &new_lsn, DB_FLUSH, - dbp->log_fileid, &name_dbt, pgno, &page_dbt); - } else - ret = __crdel_metasub_log(dbp->dbenv, dbp->open_txn, - &new_lsn, 0, dbp->log_fileid, pgno, &page_dbt, lsn); + ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn); if (ret == 0) page->lsn = new_lsn; @@ -2041,50 +946,89 @@ __db_log_page(dbp, name, lsn, pgno, page) * Create the backup file name for a given file. * * PUBLIC: int __db_backup_name __P((DB_ENV *, - * PUBLIC: const char *, char **, DB_LSN *)); + * PUBLIC: const char *, DB_TXN *, char **)); */ #undef BACKUP_PREFIX #define BACKUP_PREFIX "__db." #undef MAX_LSN_TO_TEXT -#define MAX_LSN_TO_TEXT 21 +#define MAX_LSN_TO_TEXT 17 + int -__db_backup_name(dbenv, name, backup, lsn) +__db_backup_name(dbenv, name, txn, backup) DB_ENV *dbenv; const char *name; + DB_TXN *txn; char **backup; - DB_LSN *lsn; { + DB_LSN lsn; size_t len; int plen, ret; char *p, *retp; - len = strlen(name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 1; - - if ((ret = __os_malloc(dbenv, len, NULL, &retp)) != 0) - return (ret); - /* - * Create the name. Backup file names are of the form: + * Create the name. Backup file names are in one of two forms: * - * __db.name.0x[lsn-file].0x[lsn-offset] + * In a transactional env: __db.LSN(8).LSN(8) + * and + * in a non-transactional env: __db.FILENAME. * - * which guarantees uniqueness. + * If the transaction doesn't have a current LSN, we write + * a dummy log record to force it, so that we ensure that + * all tmp names are unique. * - * However, name may contain an env-relative path in it. - * In that case, put the __db. 
after the last portion of - * the pathname. + * In addition, the name passed may contain an env-relative path. + * In that case, put the __db. in the right place (in the last + * component of the pathname). */ - if ((p = __db_rpath(name)) == NULL) - snprintf(retp, len, - "%s%s.0x%x0x%x", BACKUP_PREFIX, name, - lsn->file, lsn->offset); - else { - plen = p - name + 1; + if (txn != NULL) { + if (IS_ZERO_LSN(txn->last_lsn)) { + /* + * Write dummy log record. The two choices for + * dummy log records are __db_noop_log and + * __db_debug_log; unfortunately __db_noop_log requires + * a valid dbp, and we aren't guaranteed to be able + * to pass one in here. + */ + if ((ret = __db_debug_log(dbenv, txn, &lsn, 0, + NULL, 0, NULL, NULL, 0)) != 0) + return (ret); + } else + lsn = txn->last_lsn; + } + + /* + * Part of the name may be a full path, so we need to make sure that + * we allocate enough space for it, even in the case where we don't + * use the entire filename for the backup name. + */ + len = strlen(name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT; + + if ((ret = __os_malloc(dbenv, len, &retp)) != 0) + return (ret); + + /* + * There are four cases here: + * 1. simple path w/out transaction + * 2. simple path + transaction + * 3. multi-component path w/out transaction + * 4. 
multi-component path + transaction + */ + if ((p = __db_rpath(name)) == NULL) { + if (txn == NULL) /* case 1 */ + snprintf(retp, len, "%s%s.", BACKUP_PREFIX, name); + else /* case 2 */ + snprintf(retp, len, + "%s%x.%x", BACKUP_PREFIX, lsn.file, lsn.offset); + } else { + plen = (int)(p - name) + 1; p++; - snprintf(retp, len, - "%.*s%s%s.0x%x0x%x", plen, name, BACKUP_PREFIX, p, - lsn->file, lsn->offset); + if (txn == NULL) /* case 3 */ + snprintf(retp, len, + "%.*s%s%s.", plen, name, BACKUP_PREFIX, p); + else /* case 4 */ + snprintf(retp, len, + "%.*s%x.%x.", plen, name, lsn.file, lsn.offset); } *backup = retp; @@ -2092,19 +1036,6 @@ __db_backup_name(dbenv, name, backup, lsn) } /* - * __db_remove_callback -- - * Callback function -- on file remove commit, it unlinks the backing - * file. - */ -static int -__db_remove_callback(dbp, cookie) - DB *dbp; - void *cookie; -{ - return (__os_unlink(dbp->dbenv, cookie)); -} - -/* * __dblist_get -- * Get the first element of dbenv->dblist with * dbp->adj_fileid matching adjid. @@ -2126,22 +1057,73 @@ __dblist_get(dbenv, adjid) return (dbp); } -#if CONFIG_TEST +/* + * __db_disassociate -- + * Destroy the association between a given secondary and its primary. + */ +static int +__db_disassociate(sdbp) + DB *sdbp; +{ + DBC *dbc; + int ret, t_ret; + + ret = 0; + + sdbp->s_callback = NULL; + sdbp->s_primary = NULL; + sdbp->get = sdbp->stored_get; + sdbp->close = sdbp->stored_close; + + /* + * Complain, but proceed, if we have any active cursors. (We're in + * the middle of a close, so there's really no turning back.) 
+ */ + if (sdbp->s_refcnt != 1 || + TAILQ_FIRST(&sdbp->active_queue) != NULL || + TAILQ_FIRST(&sdbp->join_queue) != NULL) { + __db_err(sdbp->dbenv, + "Closing a primary DB while a secondary DB has active cursors is unsafe"); + ret = EINVAL; + } + sdbp->s_refcnt = 0; + + while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) + if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0) + ret = t_ret; + + F_CLR(sdbp, DB_AM_SECONDARY); + return (ret); +} + +#if CONFIG_TEST /* * __db_testcopy * Create a copy of all backup files and our "main" DB. * - * PUBLIC: int __db_testcopy __P((DB *, const char *)); + * PUBLIC: #if CONFIG_TEST + * PUBLIC: int __db_testcopy __P((DB_ENV *, DB *, const char *)); + * PUBLIC: #endif */ int -__db_testcopy(dbp, name) +__db_testcopy(dbenv, dbp, name) + DB_ENV *dbenv; DB *dbp; const char *name; { - if (dbp->type == DB_QUEUE) + DB_MPOOLFILE *mpf; + + DB_ASSERT(dbp != NULL || name != NULL); + + if (name == NULL) { + mpf = dbp->mpf; + name = R_ADDR(mpf->dbmp->reginfo, mpf->mfp->path_off); + } + + if (dbp != NULL && dbp->type == DB_QUEUE) return (__qam_testdocopy(dbp, name)); else - return (__db_testdocopy(dbp, name)); + return (__db_testdocopy(dbenv, name)); } static int @@ -2154,7 +1136,7 @@ __qam_testdocopy(dbp, name) int ret; filelist = NULL; - if ((ret = __db_testdocopy(dbp, name)) != 0) + if ((ret = __db_testdocopy(dbp->dbenv, name)) != 0) return (ret); if (dbp->mpf != NULL && (ret = __qam_gen_filelist(dbp, &filelist)) != 0) @@ -2164,12 +1146,13 @@ __qam_testdocopy(dbp, name) return (0); dir = ((QUEUE *)dbp->q_internal)->dir; for (fp = filelist; fp->mpf != NULL; fp++) { - snprintf(buf, sizeof(buf), QUEUE_EXTENT, dir, name, fp->id); - if ((ret = __db_testdocopy(dbp, buf)) != 0) + snprintf(buf, sizeof(buf), + QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id); + if ((ret = __db_testdocopy(dbp->dbenv, buf)) != 0) return (ret); } - __os_free(filelist, 0); + __os_free(dbp->dbenv, filelist); return (0); } @@ -2179,8 +1162,8 @@ __qam_testdocopy(dbp, 
name) * */ static int -__db_testdocopy(dbp, name) - DB *dbp; +__db_testdocopy(dbenv, name) + DB_ENV *dbenv; const char *name; { size_t len; @@ -2188,8 +1171,8 @@ __db_testdocopy(dbp, name) char **namesp, *backup, *copy, *dir, *p, *real_name; real_name = NULL; /* Get the real backing file name. */ - if ((ret = __db_appname(dbp->dbenv, - DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) + if ((ret = __db_appname(dbenv, + DB_APP_DATA, name, 0, NULL, &real_name)) != 0) return (ret); copy = backup = NULL; @@ -2200,10 +1183,10 @@ __db_testdocopy(dbp, name) */ len = strlen(real_name) + strlen(BACKUP_PREFIX) + MAX_LSN_TO_TEXT + 9; - if ((ret = __os_malloc(dbp->dbenv, len, NULL, ©)) != 0) + if ((ret = __os_malloc(dbenv, len, ©)) != 0) goto out; - if ((ret = __os_malloc(dbp->dbenv, len, NULL, &backup)) != 0) + if ((ret = __os_malloc(dbenv, len, &backup)) != 0) goto out; /* @@ -2212,9 +1195,9 @@ __db_testdocopy(dbp, name) snprintf(copy, len, "%s.afterop", real_name); __db_makecopy(real_name, copy); - if ((ret = __os_strdup(dbp->dbenv, real_name, &dir)) != 0) + if ((ret = __os_strdup(dbenv, real_name, &dir)) != 0) goto out; - __os_freestr(real_name); + __os_free(dbenv, real_name); real_name = NULL; /* * Create the name. Backup file names are of the form: @@ -2234,7 +1217,7 @@ __db_testdocopy(dbp, name) p = __db_rpath(dir); if (p != NULL) *p = '\0'; - ret = __os_dirlist(dbp->dbenv, dir, &namesp, &dircnt); + ret = __os_dirlist(dbenv, dir, &namesp, &dircnt); #if DIAGNOSTIC /* * XXX @@ -2245,7 +1228,7 @@ __db_testdocopy(dbp, name) */ *p = '/'; #endif - __os_freestr(dir); + __os_free(dbenv, dir); if (ret != 0) goto out; for (i = 0; i < dircnt; i++) { @@ -2258,8 +1241,8 @@ __db_testdocopy(dbp, name) * know its LSN's. 
*/ if (strncmp(namesp[i], backup, strlen(backup)) == 0) { - if ((ret = __db_appname(dbp->dbenv, DB_APP_DATA, - NULL, namesp[i], 0, NULL, &real_name)) != 0) + if ((ret = __db_appname(dbenv, DB_APP_DATA, + namesp[i], 0, NULL, &real_name)) != 0) goto out; /* @@ -2268,25 +1251,25 @@ __db_testdocopy(dbp, name) * If so, just move on. */ if (strstr(real_name, ".afterop") != NULL) { - __os_freestr(real_name); + __os_free(dbenv, real_name); real_name = NULL; continue; } snprintf(copy, len, "%s.afterop", real_name); __db_makecopy(real_name, copy); - __os_freestr(real_name); + __os_free(dbenv, real_name); real_name = NULL; } } out: if (backup != NULL) - __os_freestr(backup); + __os_free(dbenv, backup); if (copy != NULL) - __os_freestr(copy); + __os_free(dbenv, copy); if (namesp != NULL) - __os_dirfree(namesp, dircnt); + __os_dirfree(dbenv, namesp, dircnt); if (real_name != NULL) - __os_freestr(real_name); + __os_free(dbenv, real_name); return (ret); } @@ -2301,7 +1284,7 @@ __db_makecopy(src, dest) memset(&rfh, 0, sizeof(rfh)); memset(&wfh, 0, sizeof(wfh)); - if (__os_malloc(NULL, 1024, NULL, &buf) != 0) + if (__os_malloc(NULL, 1024, &buf) != 0) return; if (__os_open(NULL, @@ -2313,13 +1296,13 @@ __db_makecopy(src, dest) for (;;) if (__os_read(NULL, &rfh, buf, 1024, &rcnt) < 0 || rcnt == 0 || - __os_write(NULL, &wfh, buf, rcnt, &wcnt) < 0 || wcnt != rcnt) + __os_write(NULL, &wfh, buf, rcnt, &wcnt) < 0) break; -err: __os_free(buf, 1024); +err: __os_free(NULL, buf); if (F_ISSET(&rfh, DB_FH_VALID)) - __os_closehandle(&rfh); + __os_closehandle(NULL, &rfh); if (F_ISSET(&wfh, DB_FH_VALID)) - __os_closehandle(&wfh); + __os_closehandle(NULL, &wfh); } #endif diff --git a/bdb/db/db.src b/bdb/db/db.src index b695e1360c5..414321fcbbd 100644 --- a/bdb/db/db.src +++ b/bdb/db/db.src @@ -1,13 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
* - * $Id: db.src,v 11.8 2000/02/17 20:24:07 bostic Exp $ + * $Id: db.src,v 11.18 2002/04/17 19:02:58 krinsky Exp $ */ -PREFIX db +PREFIX __db +DBPRIVATE INCLUDE #include "db_config.h" INCLUDE @@ -15,15 +16,17 @@ INCLUDE #ifndef NO_SYSTEM_INCLUDES INCLUDE #include <sys/types.h> INCLUDE INCLUDE #include <ctype.h> -INCLUDE #include <errno.h> INCLUDE #include <string.h> INCLUDE #endif INCLUDE INCLUDE #include "db_int.h" -INCLUDE #include "db_page.h" -INCLUDE #include "db_dispatch.h" -INCLUDE #include "db_am.h" -INCLUDE #include "txn.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/rep.h" +INCLUDE #include "dbinc/txn.h" INCLUDE /* @@ -44,33 +47,16 @@ INCLUDE */ BEGIN addrem 41 ARG opcode u_int32_t lu -ARG fileid int32_t ld -ARG pgno db_pgno_t lu +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu ARG indx u_int32_t lu -ARG nbytes size_t lu -DBT hdr DBT s +ARG nbytes u_int32_t lu +PGDBT hdr DBT s DBT dbt DBT s POINTER pagelsn DB_LSN * lu END /* - * split -- Handles the split of a duplicate page. - * - * opcode: defines whether we are splitting from or splitting onto - * fileid: file identifier of the file being modified. - * pgno: page number being split. - * pageimage: entire page contents. - * pagelsn: former lsn of the page. - */ -DEPRECATED split 42 -ARG opcode u_int32_t lu -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -DBT pageimage DBT s -POINTER pagelsn DB_LSN * lu -END - -/* * big -- Handles addition and deletion of big key/data items. * * opcode: identifies get/put. 
@@ -87,10 +73,10 @@ END */ BEGIN big 43 ARG opcode u_int32_t lu -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -ARG prev_pgno db_pgno_t lu -ARG next_pgno db_pgno_t lu +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu +WRLOCKNZ prev_pgno db_pgno_t lu +WRLOCKNZ next_pgno db_pgno_t lu DBT dbt DBT s POINTER pagelsn DB_LSN * lu POINTER prevlsn DB_LSN * lu @@ -106,8 +92,8 @@ END * lsn: the page's original lsn. */ BEGIN ovref 44 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu ARG adjust int32_t ld POINTER lsn DB_LSN * lu END @@ -125,33 +111,16 @@ END */ BEGIN relink 45 ARG opcode u_int32_t lu -ARG fileid int32_t ld -ARG pgno db_pgno_t lu +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu POINTER lsn DB_LSN * lu -ARG prev db_pgno_t lu +WRLOCKNZ prev db_pgno_t lu POINTER lsn_prev DB_LSN * lu -ARG next db_pgno_t lu +WRLOCKNZ next db_pgno_t lu POINTER lsn_next DB_LSN * lu END /* - * Addpage -- Handles adding a new duplicate page onto the end of - * an existing duplicate page. - * fileid: identifies the file being changed. - * pgno: page number to which a new page is being added. - * lsn: lsn of pgno - * nextpgno: new page number being added. - * nextlsn: lsn of nextpgno; - */ -DEPRECATED addpage 46 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -POINTER lsn DB_LSN * lu -ARG nextpgno db_pgno_t lu -POINTER nextlsn DB_LSN * lu -END - -/* * Debug -- log an operation upon entering an access method. * op: Operation (cursor, c_close, c_get, c_put, c_del, * get, put, delete). @@ -172,7 +141,55 @@ END * noop -- do nothing, but get an LSN. */ BEGIN noop 48 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu POINTER prevlsn DB_LSN * lu END + +/* + * pg_alloc: used to record allocating a new page. + * + * meta_lsn: the meta-data page's original lsn. + * meta_pgno the meta-data page number. + * page_lsn: the allocated page's original lsn. + * pgno: the page allocated. 
+ * ptype: the type of the page allocated. + * next: the next page on the free list. + */ +BEGIN pg_alloc 49 +DB fileid int32_t ld +POINTER meta_lsn DB_LSN * lu +WRLOCK meta_pgno db_pgno_t lu +POINTER page_lsn DB_LSN * lu +WRLOCK pgno db_pgno_t lu +ARG ptype u_int32_t lu +ARG next db_pgno_t lu +END + +/* + * pg_free: used to record freeing a page. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * meta_pgno: the meta-data page number. + * header: the header from the free'd page. + * next: the previous next pointer on the metadata page. + */ +BEGIN pg_free 50 +DB fileid int32_t ld +WRLOCK pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +WRLOCK meta_pgno db_pgno_t lu +PGDBT header DBT s +ARG next db_pgno_t lu +END + +/* + * cksum -- + * This log record is written when we're unable to checksum a page, + * before returning DB_RUNRECOVERY. This log record causes normal + * recovery to itself return DB_RUNRECOVERY, as only catastrophic + * recovery can fix things. + */ +BEGIN cksum 51 +END diff --git a/bdb/db/db_am.c b/bdb/db/db_am.c index 2d224566904..cf6ef18549b 100644 --- a/bdb/db/db_am.c +++ b/bdb/db/db_am.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 1999, 2000 + * Copyright (c) 1998-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp $"; +static const char revid[] = "$Id: db_am.c,v 11.96 2002/08/27 15:17:32 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,16 +18,22 @@ static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" -#include "lock.h" -#include "mp.h" -#include "txn.h" -#include "db_am.h" -#include "db_ext.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" + +static int __db_append_primary __P((DBC *, DBT *, DBT *)); +static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __db_secondary_close __P((DB *, u_int32_t)); + +#ifdef DEBUG +static int __db_cprint_item __P((DBC *)); +#endif /* * __db_cursor -- @@ -53,12 +59,22 @@ __db_cursor(dbp, txn, dbcp, flags) PANIC_CHECK(dbenv); DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor"); - /* Check for invalid flags. */ - if ((ret = __db_cursorchk(dbp, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + /* Validate arguments. */ + if ((ret = __db_cursorchk(dbp, flags)) != 0) return (ret); - if ((ret = - __db_icursor(dbp, txn, dbp->type, PGNO_INVALID, 0, dbcp)) != 0) + /* + * Check for consistent transaction usage. For now, assume that + * this cursor might be used for read operations only (in which + * case it may not require a txn). We'll check more stringently + * in c_del and c_put. (Note that this all means that the + * read-op txn tests have to be a subset of the write-op ones.) 
+ */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0) + return (ret); + + if ((ret = __db_icursor(dbp, + txn, dbp->type, PGNO_INVALID, 0, DB_LOCK_INVALIDID, dbcp)) != 0) return (ret); dbc = *dbcp; @@ -70,7 +86,7 @@ __db_cursor(dbp, txn, dbcp, flags) op = LF_ISSET(DB_OPFLAGS_MASK); mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE : ((op == DB_WRITECURSOR) ? DB_LOCK_IWRITE : DB_LOCK_READ); - if ((ret = lock_get(dbenv, dbc->locker, 0, + if ((ret = dbenv->lock_get(dbenv, dbc->locker, 0, &dbc->lock_dbt, mode, &dbc->mylock)) != 0) { (void)__db_c_close(dbc); return (ret); @@ -81,6 +97,9 @@ __db_cursor(dbp, txn, dbcp, flags) F_SET(dbc, DBC_WRITER); } + if (LF_ISSET(DB_DIRTY_READ) || + (txn != NULL && F_ISSET(txn, TXN_DIRTY_READ))) + F_SET(dbc, DBC_DIRTY_READ); return (0); } @@ -91,15 +110,16 @@ __db_cursor(dbp, txn, dbcp, flags) * initialize as a cursor. * * PUBLIC: int __db_icursor - * PUBLIC: __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, DBC **)); + * PUBLIC: __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, u_int32_t, DBC **)); */ int -__db_icursor(dbp, txn, dbtype, root, is_opd, dbcp) +__db_icursor(dbp, txn, dbtype, root, is_opd, lockerid, dbcp) DB *dbp; DB_TXN *txn; DBTYPE dbtype; db_pgno_t root; int is_opd; + u_int32_t lockerid; DBC **dbcp; { DBC *dbc, *adbc; @@ -120,7 +140,7 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp) dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) if (dbtype == dbc->dbtype) { TAILQ_REMOVE(&dbp->free_queue, dbc, links); - dbc->flags = 0; + F_CLR(dbc, ~DBC_OWN_LID); break; } MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); @@ -144,11 +164,35 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp) if (!DB_IS_THREADED(dbp) && (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) dbc->lid = adbc->lid; - else - if ((ret = lock_id(dbenv, &dbc->lid)) != 0) + else { + if ((ret = + dbenv->lock_id(dbenv, &dbc->lid)) != 0) goto err; + F_SET(dbc, DBC_OWN_LID); + } + + /* + * In CDB, secondary indices should share a lock file + * ID with the primary; otherwise 
we're susceptible to + * deadlocks. We also use __db_icursor rather + * than sdbp->cursor to create secondary update + * cursors in c_put and c_del; these won't + * acquire a new lock. + * + * !!! + * Since this is in the one-time cursor allocation + * code, we need to be sure to destroy, not just + * close, all cursors in the secondary when we + * associate. + */ + if (CDB_LOCKING(dbp->dbenv) && + F_ISSET(dbp, DB_AM_SECONDARY)) + memcpy(dbc->lock.fileid, + dbp->s_primary->fileid, DB_FILE_ID_LEN); + else + memcpy(dbc->lock.fileid, + dbp->fileid, DB_FILE_ID_LEN); - memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN); if (CDB_LOCKING(dbenv)) { if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB)) { /* @@ -198,18 +242,55 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp) /* Refresh the DBC structure. */ dbc->dbtype = dbtype; + RESET_RET_MEM(dbc); - if ((dbc->txn = txn) == NULL) - dbc->locker = dbc->lid; - else { + if ((dbc->txn = txn) == NULL) { + /* + * There are certain cases in which we want to create a + * new cursor with a particular locker ID that is known + * to be the same as (and thus not conflict with) an + * open cursor. + * + * The most obvious case is cursor duplication; when we + * call DBC->c_dup or __db_c_idup, we want to use the original + * cursor's locker ID. + * + * Another case is when updating secondary indices. Standard + * CDB locking would mean that we might block ourself: we need + * to open an update cursor in the secondary while an update + * cursor in the primary is open, and when the secondary and + * primary are subdatabases or we're using env-wide locking, + * this is disastrous. + * + * In these cases, our caller will pass a nonzero locker ID + * into this function. Use this locker ID instead of dbc->lid + * as the locker ID for our new cursor. 
+ */ + if (lockerid != DB_LOCK_INVALIDID) + dbc->locker = lockerid; + else + dbc->locker = dbc->lid; + } else { dbc->locker = txn->txnid; txn->cursors++; } + /* + * These fields change when we are used as a secondary index, so + * if the DB is a secondary, make sure they're set properly just + * in case we opened some cursors before we were associated. + * + * __db_c_get is used by all access methods, so this should be safe. + */ + if (F_ISSET(dbp, DB_AM_SECONDARY)) + dbc->c_get = __db_c_secondary_get; + if (is_opd) F_SET(dbc, DBC_OPD); if (F_ISSET(dbp, DB_AM_RECOVER)) F_SET(dbc, DBC_RECOVER); + if (F_ISSET(dbp, DB_AM_COMPENSATE)) + F_SET(dbc, DBC_COMPENSATE); /* Refresh the DBC internal structure. */ cp = dbc->internal; @@ -243,14 +324,14 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp) return (0); err: if (allocated) - __os_free(dbc, sizeof(*dbc)); + __os_free(dbp->dbenv, dbc); return (ret); } #ifdef DEBUG /* * __db_cprint -- - * Display the current cursor list. + * Display the cursor active and free queues. 
* * PUBLIC: int __db_cprint __P((DB *)); */ @@ -258,60 +339,76 @@ int __db_cprint(dbp) DB *dbp; { + DBC *dbc; + int ret, t_ret; + + ret = 0; + MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + fprintf(stderr, "Active queue:\n"); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) + if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0) + ret = t_ret; + fprintf(stderr, "Free queue:\n"); + for (dbc = TAILQ_FIRST(&dbp->free_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) + if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0) + ret = t_ret; + MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + + return (ret); +} + +static +int __db_cprint_item(dbc) + DBC *dbc; +{ static const FN fn[] = { { DBC_ACTIVE, "active" }, + { DBC_COMPENSATE, "compensate" }, { DBC_OPD, "off-page-dup" }, { DBC_RECOVER, "recover" }, { DBC_RMW, "read-modify-write" }, + { DBC_TRANSIENT, "transient" }, { DBC_WRITECURSOR, "write cursor" }, { DBC_WRITEDUP, "internally dup'ed write cursor" }, { DBC_WRITER, "short-term write cursor" }, { 0, NULL } }; - DBC *dbc; + DB *dbp; DBC_INTERNAL *cp; - char *s; + const char *s; - MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); - for (dbc = TAILQ_FIRST(&dbp->active_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - switch (dbc->dbtype) { - case DB_BTREE: - s = "btree"; - break; - case DB_HASH: - s = "hash"; - break; - case DB_RECNO: - s = "recno"; - break; - case DB_QUEUE: - s = "queue"; - break; - default: - DB_ASSERT(0); - return (1); - } - cp = dbc->internal; - fprintf(stderr, "%s/%#0lx: opd: %#0lx\n", - s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd)); - fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n", - P_TO_ULONG(dbc->txn), - (u_long)dbc->lid, (u_long)dbc->locker); - fprintf(stderr, "\troot: %lu page/index: %lu/%lu", - (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx); - __db_prflags(dbc->flags, fn, stderr); - fprintf(stderr, "\n"); - - if (dbp->type == DB_BTREE) - __bam_cprint(dbc); + dbp = dbc->dbp; + cp = 
dbc->internal; + + s = __db_dbtype_to_string(dbc->dbtype); + if (strcmp(s, "UNKNOWN TYPE") == 0) { + DB_ASSERT(0); + return (1); } - for (dbc = TAILQ_FIRST(&dbp->free_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) - fprintf(stderr, "free: %#0lx ", P_TO_ULONG(dbc)); + fprintf(stderr, "%s/%#0lx: opd: %#0lx\n", + s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd)); + + fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n", + P_TO_ULONG(dbc->txn), (u_long)dbc->lid, (u_long)dbc->locker); + + fprintf(stderr, "\troot: %lu page/index: %lu/%lu", + (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx); + + __db_prflags(dbc->flags, fn, stderr); fprintf(stderr, "\n"); - MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + switch (dbp->type) { + case DB_BTREE: + __bam_cprint(dbc); + break; + case DB_HASH: + __ham_cprint(dbc); + break; + default: + break; + } return (0); } #endif /* DEBUG */ @@ -345,7 +442,7 @@ __db_fd(dbp, fdp) return (0); } else { *fdp = -1; - __db_err(dbp->dbenv, "DB does not have a valid file handle."); + __db_err(dbp->dbenv, "DB does not have a valid file handle"); return (ENOENT); } } @@ -372,8 +469,16 @@ __db_get(dbp, txn, key, data, flags) if ((ret = __db_getchk(dbp, key, data, flags)) != 0) return (ret); + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0) + return (ret); + mode = 0; - if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) + if (LF_ISSET(DB_DIRTY_READ)) { + mode = DB_DIRTY_READ; + LF_CLR(DB_DIRTY_READ); + } + else if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) mode = DB_WRITELOCK; if ((ret = dbp->cursor(dbp, txn, &dbc, mode)) != 0) return (ret); @@ -387,11 +492,17 @@ __db_get(dbp, txn, key, data, flags) * going to close it right away. Thus, we can perform the get * without duplicating the cursor, saving some cycles in this * common case. 
+ * + * SET_RET_MEM indicates that if key and/or data have no DBT + * flags set and DB manages the returned-data memory, that memory + * will belong to this handle, not to the underlying cursor. */ F_SET(dbc, DBC_TRANSIENT); + SET_RET_MEM(dbc, dbp); - ret = dbc->c_get(dbc, key, data, - flags == 0 || flags == DB_RMW ? flags | DB_SET : flags); + if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0) + LF_SET(DB_SET); + ret = dbc->c_get(dbc, key, data, flags); if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) ret = t_ret; @@ -414,20 +525,39 @@ __db_put(dbp, txn, key, data, flags) { DBC *dbc; DBT tdata; - int ret, t_ret; + DB_ENV *dbenv; + int ret, t_ret, txn_local; - PANIC_CHECK(dbp->dbenv); + dbc = NULL; + dbenv = dbp->dbenv; + txn_local = 0; + + PANIC_CHECK(dbenv); DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put"); + /* Validate arguments. */ if ((ret = __db_putchk(dbp, key, data, - flags, F_ISSET(dbp, DB_AM_RDONLY), - F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0) + flags, F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0) return (ret); - DB_CHECK_TXN(dbp, txn); + /* Create local transaction as necessary. */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + LF_CLR(DB_AUTO_COMMIT); + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) - return (ret); + goto err; + + DEBUG_LWRITE(dbc, txn, "db_put", key, data, flags); + + SET_RET_MEM(dbc, dbp); /* * See the comment in __db_get(). @@ -441,9 +571,58 @@ __db_put(dbp, txn, key, data, flags) */ F_SET(dbc, DBC_TRANSIENT); - DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags); + switch (flags) { + case DB_APPEND: + /* + * If there is an append callback, the value stored in + * data->data may be replaced and then freed. 
To avoid + * passing a freed pointer back to the user, just operate + * on a copy of the data DBT. + */ + tdata = *data; - if (flags == DB_NOOVERWRITE) { + /* + * Append isn't a normal put operation; call the appropriate + * access method's append function. + */ + switch (dbp->type) { + case DB_QUEUE: + if ((ret = __qam_append(dbc, key, &tdata)) != 0) + goto err; + break; + case DB_RECNO: + if ((ret = __ram_append(dbc, key, &tdata)) != 0) + goto err; + break; + default: + /* The interface should prevent this. */ + DB_ASSERT(0); + ret = __db_ferr(dbenv, "__db_put", flags); + goto err; + } + + /* + * Secondary indices: since we've returned zero from + * an append function, we've just put a record, and done + * so outside __db_c_put. We know we're not a secondary-- + * the interface prevents puts on them--but we may be a + * primary. If so, update our secondary indices + * appropriately. + */ + DB_ASSERT(!F_ISSET(dbp, DB_AM_SECONDARY)); + + if (LIST_FIRST(&dbp->s_secondaries) != NULL) + ret = __db_append_primary(dbc, key, &tdata); + + /* + * The append callback, if one exists, may have allocated + * a new tdata.data buffer. If so, free it. + */ + FREE_IF_NEEDED(dbp, &tdata); + + /* No need for a cursor put; we're done. */ + goto err; + case DB_NOOVERWRITE: flags = 0; /* * Set DB_DBT_USERMEM, this might be a threaded application and @@ -460,16 +639,161 @@ __db_put(dbp, txn, key, data, flags) if ((ret = dbc->c_get(dbc, key, &tdata, DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0))) == 0) ret = DB_KEYEXIST; - else if (ret == DB_NOTFOUND) + else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) ret = 0; + break; + default: + /* Fall through to normal cursor put. */ + break; } if (ret == 0) ret = dbc->c_put(dbc, - key, data, flags == 0 ? DB_KEYLAST : flags); + key, data, flags == 0 ? DB_KEYLAST : flags); - if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) +err: /* Close the cursor. 
*/ + if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Commit for DB_AUTO_COMMIT. */ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + } + + return (ret); +} + +/* + * __db_delete -- + * Delete the items referenced by a key. + * + * PUBLIC: int __db_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); + */ +int +__db_delete(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DBC *dbc; + DBT data, lkey; + DB_ENV *dbenv; + u_int32_t f_init, f_next; + int ret, t_ret, txn_local; + + dbc = NULL; + dbenv = dbp->dbenv; + txn_local = 0; + + PANIC_CHECK(dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del"); + + /* Check for invalid flags. */ + if ((ret = __db_delchk(dbp, key, flags)) != 0) + return (ret); + + /* Create local transaction as necessary. */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + LF_CLR(DB_AUTO_COMMIT); + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + goto err; + + DEBUG_LWRITE(dbc, txn, "db_delete", key, NULL, flags); + + /* + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, so request a partial of length 0. + */ + memset(&lkey, 0, sizeof(lkey)); + F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL); + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* + * If locking (and we haven't already acquired CDB locks), set the + * read-modify-write flag. 
+ */ + f_init = DB_SET; + f_next = DB_NEXT_DUP; + if (STD_LOCKING(dbc)) { + f_init |= DB_RMW; + f_next |= DB_RMW; + } + + /* Walk through the set of key/data pairs, deleting as we go. */ + if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0) + goto err; + + /* + * Hash permits an optimization in DB->del: since on-page + * duplicates are stored in a single HKEYDATA structure, it's + * possible to delete an entire set of them at once, and as + * the HKEYDATA has to be rebuilt and re-put each time it + * changes, this is much faster than deleting the duplicates + * one by one. Thus, if we're not pointing at an off-page + * duplicate set, and we're not using secondary indices (in + * which case we'd have to examine the items one by one anyway), + * let hash do this "quick delete". + * + * !!! + * Note that this is the only application-executed delete call in + * Berkeley DB that does not go through the __db_c_del function. + * If anything other than the delete itself (like a secondary index + * update) has to happen there in a particular situation, the + * conditions here should be modified not to call __ham_quick_delete. + * The ordinary AM-independent alternative will work just fine with + * a hash; it'll just be slower. + */ + if (dbp->type == DB_HASH) { + if (LIST_FIRST(&dbp->s_secondaries) == NULL && + !F_ISSET(dbp, DB_AM_SECONDARY) && + dbc->internal->opd == NULL) { + ret = __ham_quick_delete(dbc); + goto err; + } + } + + for (;;) { + if ((ret = dbc->c_del(dbc, 0)) != 0) + goto err; + if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } + } + +err: /* Discard the cursor. */ + if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0) ret = t_ret; + /* Commit for DB_AUTO_COMMIT. 
*/ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + } + return (ret); } @@ -505,7 +829,443 @@ __db_sync(dbp, flags) return (0); /* Flush any dirty pages from the cache to the backing file. */ - if ((t_ret = memp_fsync(dbp->mpf)) != 0 && ret == 0) + if ((t_ret = dbp->mpf->sync(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_associate -- + * Associate another database as a secondary index to this one. + * + * PUBLIC: int __db_associate __P((DB *, DB_TXN *, DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t)); + */ +int +__db_associate(dbp, txn, sdbp, callback, flags) + DB *dbp, *sdbp; + DB_TXN *txn; + int (*callback) __P((DB *, const DBT *, const DBT *, DBT *)); + u_int32_t flags; +{ + DB_ENV *dbenv; + DBC *pdbc, *sdbc; + DBT skey, key, data; + int build, ret, t_ret, txn_local; + + dbenv = dbp->dbenv; + + PANIC_CHECK(dbenv); + + txn_local = 0; + pdbc = NULL; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + memset(&skey, 0, sizeof(DBT)); + + if ((ret = __db_associatechk(dbp, sdbp, callback, flags)) != 0) + return (ret); + + /* + * Create a local transaction as necessary, check for consistent + * transaction usage, and, if we have no transaction but do have + * locking on, acquire a locker id for the handle lock acquisition. + */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + } else if (txn != NULL && !TXN_ON(dbenv)) + return (__db_not_txn_env(dbenv)); + + /* + * Check that if an open transaction is in progress, we're in it, + * for other common transaction errors, and for concurrent associates. 
+ */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + return (ret); + + sdbp->s_callback = callback; + sdbp->s_primary = dbp; + + sdbp->stored_get = sdbp->get; + sdbp->get = __db_secondary_get; + + sdbp->stored_close = sdbp->close; + sdbp->close = __db_secondary_close; + + /* + * Secondary cursors may have the primary's lock file ID, so we + * need to make sure that no older cursors are lying around + * when we make the transition. + */ + if (TAILQ_FIRST(&sdbp->active_queue) != NULL || + TAILQ_FIRST(&sdbp->join_queue) != NULL) { + __db_err(dbenv, + "Databases may not become secondary indices while cursors are open"); + ret = EINVAL; + goto err; + } + while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) + if ((ret = __db_c_destroy(sdbc)) != 0) + goto err; + + F_SET(sdbp, DB_AM_SECONDARY); + + /* + * Check to see if the secondary is empty--and thus if we should + * build it--before we link it in and risk making it show up in + * other threads. + */ + build = 0; + if (LF_ISSET(DB_CREATE)) { + if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0) + goto err; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + /* + * We don't care about key or data; we're just doing + * an existence check. + */ + F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM); + F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = sdbc->c_real_get(sdbc, &key, &data, + (STD_LOCKING(sdbc) ? DB_RMW : 0) | + DB_FIRST)) == DB_NOTFOUND) { + build = 1; + ret = 0; + } + + /* + * Secondary cursors have special refcounting close + * methods. Be careful. + */ + if ((t_ret = __db_c_close(sdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + } + + /* + * Add the secondary to the list on the primary. Do it here + * so that we see any updates that occur while we're walking + * the primary. + */ + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + + /* See __db_s_next for an explanation of secondary refcounting. 
*/ + DB_ASSERT(sdbp->s_refcnt == 0); + sdbp->s_refcnt = 1; + LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + + if (build) { + /* + * We loop through the primary, putting each item we + * find into the new secondary. + * + * If we're using CDB, opening these two cursors puts us + * in a bit of a locking tangle: CDB locks are done on the + * primary, so that we stay deadlock-free, but that means + * that updating the secondary while we have a read cursor + * open on the primary will self-block. To get around this, + * we force the primary cursor to use the same locker ID + * as the secondary, so they won't conflict. This should + * be harmless even if we're not using CDB. + */ + if ((ret = sdbp->cursor(sdbp, txn, &sdbc, + CDB_LOCKING(sdbp->dbenv) ? DB_WRITECURSOR : 0)) != 0) + goto err; + if ((ret = __db_icursor(dbp, + txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) + goto err; + + /* Lock out other threads, now that we have a locker ID. */ + dbp->associate_lid = sdbc->locker; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + while ((ret = pdbc->c_get(pdbc, &key, &data, DB_NEXT)) == 0) { + memset(&skey, 0, sizeof(DBT)); + if ((ret = callback(sdbp, &key, &data, &skey)) != 0) { + if (ret == DB_DONOTINDEX) + continue; + else + goto err; + } + if ((ret = sdbc->c_put(sdbc, + &skey, &key, DB_UPDATE_SECONDARY)) != 0) { + FREE_IF_NEEDED(sdbp, &skey); + goto err; + } + + FREE_IF_NEEDED(sdbp, &skey); + } + if (ret == DB_NOTFOUND) + ret = 0; + + if ((ret = sdbc->c_close(sdbc)) != 0) + goto err; + } + +err: if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + + dbp->associate_lid = DB_LOCK_INVALIDID; + + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + } + + return (ret); +} + +/* + * __db_pget -- + * Return a primary key/data pair given a secondary key. 
+ * + * PUBLIC: int __db_pget __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__db_pget(dbp, txn, skey, pkey, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DBC *dbc; + int ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget"); + + if ((ret = __db_pgetchk(dbp, skey, pkey, data, flags)) != 0) + return (ret); + + if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + return (ret); + SET_RET_MEM(dbc, dbp); + + /* + * The underlying cursor pget will fill in a default DBT for null + * pkeys, and use the cursor's returned-key memory internally to + * store any intermediate primary keys. However, we've just set + * the returned-key memory to the DB handle's key memory, which + * is unsafe to use if the DB handle is threaded. If the pkey + * argument is NULL, use the DBC-owned returned-key memory + * instead; it'll go away when we close the cursor before we + * return, but in this case that's just fine, as we're not + * returning the primary key. + */ + if (pkey == NULL) + dbc->rkey = &dbc->my_rkey; + + DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags); + + /* + * The cursor is just a perfectly ordinary secondary database + * cursor. Call its c_pget() method to do the dirty work. + */ + if (flags == 0 || flags == DB_RMW) + flags |= DB_SET; + ret = dbc->c_pget(dbc, skey, pkey, data, flags); + + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_secondary_get -- + * This wrapper function for DB->pget() is the DB->get() function + * on a database which has been made into a secondary index. 
+ */ +static int +__db_secondary_get(sdbp, txn, skey, data, flags) + DB *sdbp; + DB_TXN *txn; + DBT *skey, *data; + u_int32_t flags; +{ + + DB_ASSERT(F_ISSET(sdbp, DB_AM_SECONDARY)); + return (sdbp->pget(sdbp, txn, skey, NULL, data, flags)); +} + +/* + * __db_secondary_close -- + * Wrapper function for DB->close() which we use on secondaries to + * manage refcounting and make sure we don't close them underneath + * a primary that is updating. + */ +static int +__db_secondary_close(sdbp, flags) + DB *sdbp; + u_int32_t flags; +{ + DB *primary; + int doclose; + + doclose = 0; + primary = sdbp->s_primary; + + MUTEX_THREAD_LOCK(primary->dbenv, primary->mutexp); + /* + * Check the refcount--if it was at 1 when we were called, no + * thread is currently updating this secondary through the primary, + * so it's safe to close it for real. + * + * If it's not safe to do the close now, we do nothing; the + * database will actually be closed when the refcount is decremented, + * which can happen in either __db_s_next or __db_s_done. + */ + DB_ASSERT(sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + /* We don't want to call close while the mutex is held. */ + doclose = 1; + } + MUTEX_THREAD_UNLOCK(primary->dbenv, primary->mutexp); + + /* + * sdbp->close is this function; call the real one explicitly if + * need be. + */ + return (doclose ? __db_close(sdbp, flags) : 0); +} + +/* + * __db_append_primary -- + * Perform the secondary index updates necessary to put(DB_APPEND) + * a record to a primary database. + */ +static int +__db_append_primary(dbc, key, data) + DBC *dbc; + DBT *key, *data; +{ + DB *dbp, *sdbp; + DBC *sdbc, *pdbc; + DBT oldpkey, pkey, pdata, skey; + int cmp, ret, t_ret; + + dbp = dbc->dbp; + sdbp = NULL; + ret = 0; + + /* + * Worrying about partial appends seems a little like worrying + * about Linear A character encodings. But we support those + * too if your application understands them. 
+ */ + pdbc = NULL; + if (F_ISSET(data, DB_DBT_PARTIAL) || F_ISSET(key, DB_DBT_PARTIAL)) { + /* + * The dbc we were passed is all set to pass things + * back to the user; we can't safely do a call on it. + * Dup the cursor, grab the real data item (we don't + * care what the key is--we've been passed it directly), + * and use that instead of the data DBT we were passed. + * + * Note that we can get away with this simple get because + * an appended item is by definition new, and the + * correctly-constructed full data item from this partial + * put is on the page waiting for us. + */ + if ((ret = __db_c_idup(dbc, &pdbc, DB_POSITIONI)) != 0) + return (ret); + memset(&pkey, 0, sizeof(DBT)); + memset(&pdata, 0, sizeof(DBT)); + + if ((ret = pdbc->c_get(pdbc, &pkey, &pdata, DB_CURRENT)) != 0) + goto err; + + key = &pkey; + data = &pdata; + } + + /* + * Loop through the secondary indices, putting a new item in + * each that points to the appended item. + * + * This is much like the loop in "step 3" in __db_c_put, so + * I'm not commenting heavily here; it was unclean to excerpt + * just that section into a common function, but the basic + * overview is the same here. + */ + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + memset(&skey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, key, data, &skey)) != 0) { + if (ret == DB_DONOTINDEX) + continue; + else + goto err; + } + + if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type, + PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) { + FREE_IF_NEEDED(sdbp, &skey); + goto err; + } + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Since we know we have a new primary key, it can't be a + * duplicate duplicate in the secondary. It can be a + * duplicate in a secondary that doesn't support duplicates, + * however, so we need to be careful to avoid an overwrite + * (which would corrupt our index). 
+ */ + if (!F_ISSET(sdbp, DB_AM_DUP)) { + memset(&oldpkey, 0, sizeof(DBT)); + F_SET(&oldpkey, DB_DBT_MALLOC); + ret = sdbc->c_real_get(sdbc, &skey, &oldpkey, + DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0)); + if (ret == 0) { + cmp = __bam_defcmp(sdbp, &oldpkey, key); + /* + * XXX + * This needs to use the right free function + * as soon as this is possible. + */ + __os_ufree(sdbp->dbenv, + oldpkey.data); + if (cmp != 0) { + __db_err(sdbp->dbenv, "%s%s", + "Append results in a non-unique secondary key in", + " an index not configured to support duplicates"); + ret = EINVAL; + goto err1; + } + } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) + goto err1; + } + + ret = sdbc->c_put(sdbc, &skey, key, DB_UPDATE_SECONDARY); + +err1: FREE_IF_NEEDED(sdbp, &skey); + + if ((t_ret = sdbc->c_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0) + goto err; + } + +err: if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0) ret = t_ret; return (ret); } diff --git a/bdb/db/db_cam.c b/bdb/db/db_cam.c index 708d4cbda4d..4de3467d4aa 100644 --- a/bdb/db/db_cam.c +++ b/bdb/db/db_cam.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000 + * Copyright (c) 2000-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_cam.c,v 11.52 2001/01/18 15:11:16 bostic Exp $"; +static const char revid[] = "$Id: db_cam.c,v 11.114 2002/09/03 15:44:46 krinsky Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,17 +18,18 @@ static const char revid[] = "$Id: db_cam.c,v 11.52 2001/01/18 15:11:16 bostic Ex #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "lock.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" -#include "txn.h" -#include "db_ext.h" - +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" + +static int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *)); static int __db_c_cleanup __P((DBC *, DBC *, int)); -static int __db_c_idup __P((DBC *, DBC **, u_int32_t)); +static int __db_c_del_secondary __P((DBC *)); +static int __db_c_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t)); static int __db_wrlock_err __P((DB_ENV *)); #define CDB_LOCKING_INIT(dbp, dbc) \ @@ -43,9 +44,9 @@ static int __db_wrlock_err __P((DB_ENV *)); return (__db_wrlock_err(dbp->dbenv)); \ \ if (F_ISSET(dbc, DBC_WRITECURSOR) && \ - (ret = lock_get((dbp)->dbenv, (dbc)->locker, \ - DB_LOCK_UPGRADE, &(dbc)->lock_dbt, DB_LOCK_WRITE, \ - &(dbc)->mylock)) != 0) \ + (ret = (dbp)->dbenv->lock_get((dbp)->dbenv, \ + (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \ + DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \ return (ret); \ } #define CDB_LOCKING_DONE(dbp, dbc) \ @@ -63,9 +64,8 @@ static int __db_wrlock_err __P((DB_ENV *)); F_ISSET((dbc_o), DBC_WRITECURSOR | DBC_WRITEDUP)) { \ memcpy(&(dbc_n)->mylock, &(dbc_o)->mylock, \ sizeof((dbc_o)->mylock)); \ - (dbc_n)->locker = (dbc_o)->locker; \ - /* This lock isn't ours to put--just discard it on close. */ \ - F_SET((dbc_n), DBC_WRITEDUP); \ + /* This lock isn't ours to put--just discard it on close. 
*/ \ + F_SET((dbc_n), DBC_WRITEDUP); \ } /* @@ -81,12 +81,14 @@ __db_c_close(dbc) DB *dbp; DBC *opd; DBC_INTERNAL *cp; + DB_ENV *dbenv; int ret, t_ret; dbp = dbc->dbp; + dbenv = dbp->dbenv; ret = 0; - PANIC_CHECK(dbp->dbenv); + PANIC_CHECK(dbenv); /* * If the cursor is already closed we have a serious problem, and we @@ -95,7 +97,7 @@ __db_c_close(dbc) */ if (!F_ISSET(dbc, DBC_ACTIVE)) { if (dbp != NULL) - __db_err(dbp->dbenv, "Closing closed cursor"); + __db_err(dbenv, "Closing already-closed cursor"); DB_ASSERT(0); return (EINVAL); @@ -113,11 +115,9 @@ __db_c_close(dbc) * !!! * Cursors must be removed from the active queue before calling the * access specific cursor close routine, btree depends on having that - * order of operations. It must also happen before any action that - * can fail and cause __db_c_close to return an error, or else calls - * here from __db_close may loop indefinitely. + * order of operations. */ - MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); if (opd != NULL) { F_CLR(opd, DBC_ACTIVE); @@ -126,7 +126,7 @@ __db_c_close(dbc) F_CLR(dbc, DBC_ACTIVE); TAILQ_REMOVE(&dbp->active_queue, dbc, links); - MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); /* Call the access specific cursor close routine. */ if ((t_ret = @@ -137,17 +137,20 @@ __db_c_close(dbc) * Release the lock after calling the access method specific close * routine, a Btree cursor may have had pending deletes. */ - if (CDB_LOCKING(dbc->dbp->dbenv)) { + if (CDB_LOCKING(dbenv)) { /* * If DBC_WRITEDUP is set, the cursor is an internally * duplicated write cursor and the lock isn't ours to put. + * + * Also, be sure not to free anything if mylock.off is + * INVALID; in some cases, such as idup'ed read cursors + * and secondary update cursors, a cursor in a CDB + * environment may not have a lock at all. 
*/ - if (!F_ISSET(dbc, DBC_WRITEDUP) && - dbc->mylock.off != LOCK_INVALID) { - if ((t_ret = lock_put(dbc->dbp->dbenv, - &dbc->mylock)) != 0 && ret == 0) + if (!F_ISSET(dbc, DBC_WRITEDUP) && LOCK_ISSET(dbc->mylock)) { + if ((t_ret = dbenv->lock_put( + dbenv, &dbc->mylock)) != 0 && ret == 0) ret = t_ret; - dbc->mylock.off = LOCK_INVALID; } /* For safety's sake, since this is going on the free queue. */ @@ -159,7 +162,7 @@ __db_c_close(dbc) dbc->txn->cursors--; /* Move the cursor(s) to the free queue. */ - MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); if (opd != NULL) { if (dbc->txn != NULL) dbc->txn->cursors--; @@ -167,7 +170,7 @@ __db_c_close(dbc) opd = NULL; } TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links); - MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); return (ret); } @@ -183,27 +186,37 @@ __db_c_destroy(dbc) DBC *dbc; { DB *dbp; - DBC_INTERNAL *cp; - int ret; + DB_ENV *dbenv; + int ret, t_ret; dbp = dbc->dbp; - cp = dbc->internal; + dbenv = dbp->dbenv; /* Remove the cursor from the free queue. */ - MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); TAILQ_REMOVE(&dbp->free_queue, dbc, links); - MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); /* Free up allocated memory. */ - if (dbc->rkey.data != NULL) - __os_free(dbc->rkey.data, dbc->rkey.ulen); - if (dbc->rdata.data != NULL) - __os_free(dbc->rdata.data, dbc->rdata.ulen); + if (dbc->my_rskey.data != NULL) + __os_free(dbenv, dbc->my_rskey.data); + if (dbc->my_rkey.data != NULL) + __os_free(dbenv, dbc->my_rkey.data); + if (dbc->my_rdata.data != NULL) + __os_free(dbenv, dbc->my_rdata.data); /* Call the access specific cursor destroy routine. */ ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc); - __os_free(dbc, sizeof(*dbc)); + /* + * Release the lock id for this cursor. 
+ */ + if (LOCKING_ON(dbenv) && + F_ISSET(dbc, DBC_OWN_LID) && + (t_ret = dbenv->lock_id_free(dbenv, dbc->lid)) != 0 && ret == 0) + ret = t_ret; + + __os_free(dbenv, dbc); return (ret); } @@ -256,7 +269,7 @@ __db_c_count(dbc, recnop, flags) break; default: return (__db_unknown_type(dbp->dbenv, - "__db_c_count", dbp->type)); + "__db_c_count", dbp->type)); } return (0); } @@ -286,11 +299,13 @@ __db_c_del(dbc, flags) dbp = dbc->dbp; PANIC_CHECK(dbp->dbenv); - DB_CHECK_TXN(dbp, dbc->txn); /* Check for invalid flags. */ - if ((ret = __db_cdelchk(dbp, flags, - F_ISSET(dbp, DB_AM_RDONLY), IS_INITIALIZED(dbc))) != 0) + if ((ret = __db_cdelchk(dbp, flags, IS_INITIALIZED(dbc))) != 0) + return (ret); + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0) return (ret); DEBUG_LWRITE(dbc, dbc->txn, "db_c_del", NULL, NULL, flags); @@ -298,6 +313,27 @@ __db_c_del(dbc, flags) CDB_LOCKING_INIT(dbp, dbc); /* + * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set + * (which it only is if we're being called from a primary update), + * then we need to call through to the primary and delete the item. + * + * Note that this will delete the current item; we don't need to + * delete it ourselves as well, so we can just goto done. + */ + if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) { + ret = __db_c_del_secondary(dbc); + goto done; + } + + /* + * If we are a primary and have secondary indices, go through + * and delete any secondary keys that point at the current record. + */ + if (LIST_FIRST(&dbp->s_secondaries) != NULL && + (ret = __db_c_del_primary(dbc)) != 0) + goto done; + + /* * Off-page duplicate trees are locked in the primary tree, that is, * we acquire a write lock in the primary tree and no locks in the * off-page dup tree. 
If the del operation is done in an off-page @@ -310,7 +346,7 @@ __db_c_del(dbc, flags) if ((ret = dbc->c_am_writelock(dbc)) == 0) ret = opd->c_am_del(opd); - CDB_LOCKING_DONE(dbp, dbc); +done: CDB_LOCKING_DONE(dbp, dbc); return (ret); } @@ -362,7 +398,7 @@ __db_c_dup(dbc_orig, dbcp, flags) if (CDB_LOCKING(dbenv) && flags != DB_POSITIONI) { DB_ASSERT(!F_ISSET(dbc_orig, DBC_WRITER | DBC_WRITECURSOR)); - if ((ret = lock_get(dbenv, dbc_n->locker, 0, + if ((ret = dbenv->lock_get(dbenv, dbc_n->locker, 0, &dbc_n->lock_dbt, DB_LOCK_READ, &dbc_n->mylock)) != 0) { (void)__db_c_close(dbc_n); return (ret); @@ -380,6 +416,8 @@ __db_c_dup(dbc_orig, dbcp, flags) dbc_n->internal->opd = dbc_nopd; } + /* Copy the dirty read flag to the new cursor. */ + F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ)); return (0); err: if (dbc_n != NULL) @@ -393,8 +431,10 @@ err: if (dbc_n != NULL) /* * __db_c_idup -- * Internal version of __db_c_dup. + * + * PUBLIC: int __db_c_idup __P((DBC *, DBC **, u_int32_t)); */ -static int +int __db_c_idup(dbc_orig, dbcp, flags) DBC *dbc_orig, **dbcp; u_int32_t flags; @@ -408,17 +448,16 @@ __db_c_idup(dbc_orig, dbcp, flags) dbc_n = *dbcp; if ((ret = __db_icursor(dbp, dbc_orig->txn, dbc_orig->dbtype, - dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD), &dbc_n)) != 0) + dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD), + dbc_orig->locker, &dbc_n)) != 0) return (ret); - dbc_n->locker = dbc_orig->locker; - /* If the user wants the cursor positioned, do it here. */ if (flags == DB_POSITION || flags == DB_POSITIONI) { int_n = dbc_n->internal; int_orig = dbc_orig->internal; - dbc_n->flags = dbc_orig->flags; + dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID; int_n->indx = int_orig->indx; int_n->pgno = int_orig->pgno; @@ -449,6 +488,9 @@ __db_c_idup(dbc_orig, dbcp, flags) /* Now take care of duping the CDB information. */ CDB_LOCKING_COPY(dbp, dbc_orig, dbc_n); + /* Copy the dirty read flag to the new cursor. 
*/ + F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ)); + *dbcp = dbc_n; return (0); @@ -460,12 +502,13 @@ err: (void)dbc_n->c_close(dbc_n); * __db_c_newopd -- * Create a new off-page duplicate cursor. * - * PUBLIC: int __db_c_newopd __P((DBC *, db_pgno_t, DBC **)); + * PUBLIC: int __db_c_newopd __P((DBC *, db_pgno_t, DBC *, DBC **)); */ int -__db_c_newopd(dbc_parent, root, dbcp) +__db_c_newopd(dbc_parent, root, oldopd, dbcp) DBC *dbc_parent; db_pgno_t root; + DBC *oldopd; DBC **dbcp; { DB *dbp; @@ -476,14 +519,44 @@ __db_c_newopd(dbc_parent, root, dbcp) dbp = dbc_parent->dbp; dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE; + /* + * On failure, we want to default to returning the old off-page dup + * cursor, if any; our caller can't be left with a dangling pointer + * to a freed cursor. On error the only allowable behavior is to + * close the cursor (and the old OPD cursor it in turn points to), so + * this should be safe. + */ + *dbcp = oldopd; + if ((ret = __db_icursor(dbp, - dbc_parent->txn, dbtype, root, 1, &opd)) != 0) + dbc_parent->txn, dbtype, root, 1, dbc_parent->locker, &opd)) != 0) return (ret); + /* !!! + * If the parent is a DBC_WRITER, this won't copy anything. That's + * not actually a problem--we only need lock information in an + * off-page dup cursor in order to upgrade at cursor close time + * if we've done a delete, but WRITERs don't need to upgrade. + */ CDB_LOCKING_COPY(dbp, dbc_parent, opd); *dbcp = opd; + /* + * Check to see if we already have an off-page dup cursor that we've + * passed in. If we do, close it. It'd be nice to use it again + * if it's a cursor belonging to the right tree, but if we're doing + * a cursor-relative operation this might not be safe, so for now + * we'll take the easy way out and always close and reopen. 
+ * + * Note that under no circumstances do we want to close the old + * cursor without returning a valid new one; we don't want to + * leave the main cursor in our caller with a non-NULL pointer + * to a freed off-page dup cursor. + */ + if (oldopd != NULL && (ret = oldopd->c_close(oldopd)) != 0) + return (ret); + return (0); } @@ -502,8 +575,9 @@ __db_c_get(dbc_arg, key, data, flags) DB *dbp; DBC *dbc, *dbc_n, *opd; DBC_INTERNAL *cp, *cp_n; + DB_MPOOLFILE *mpf; db_pgno_t pgno; - u_int32_t tmp_flags, tmp_rmw; + u_int32_t multi, tmp_dirty, tmp_flags, tmp_rmw; u_int8_t type; int ret, t_ret; @@ -517,6 +591,7 @@ __db_c_get(dbc_arg, key, data, flags) * functions. */ dbp = dbc_arg->dbp; + mpf = dbp->mpf; dbc_n = NULL; opd = NULL; @@ -531,6 +606,12 @@ __db_c_get(dbc_arg, key, data, flags) tmp_rmw = LF_ISSET(DB_RMW); LF_CLR(DB_RMW); + tmp_dirty = LF_ISSET(DB_DIRTY_READ); + LF_CLR(DB_DIRTY_READ); + + multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY); + LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY); + DEBUG_LREAD(dbc_arg, dbc_arg->txn, "db_c_get", flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); @@ -538,8 +619,18 @@ __db_c_get(dbc_arg, key, data, flags) * Return a cursor's record number. It has nothing to do with the * cursor get code except that it was put into the interface. 
*/ - if (flags == DB_GET_RECNO) - return (__bam_c_rget(dbc_arg, data, flags | tmp_rmw)); + if (flags == DB_GET_RECNO) { + if (tmp_rmw) + F_SET(dbc_arg, DBC_RMW); + if (tmp_dirty) + F_SET(dbc_arg, DBC_DIRTY_READ); + ret = __bam_c_rget(dbc_arg, data); + if (tmp_rmw) + F_CLR(dbc_arg, DBC_RMW); + if (tmp_dirty) + F_CLR(dbc_arg, DBC_DIRTY_READ); + return (ret); + } if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) CDB_LOCKING_INIT(dbp, dbc_arg); @@ -564,8 +655,8 @@ __db_c_get(dbc_arg, key, data, flags) if ((ret = __db_c_idup(cp->opd, &opd, DB_POSITIONI)) != 0) return (ret); - switch (ret = opd->c_am_get( - opd, key, data, flags, NULL)) { + switch (ret = + opd->c_am_get(opd, key, data, flags, NULL)) { case 0: goto done; case DB_NOTFOUND: @@ -605,21 +696,49 @@ __db_c_get(dbc_arg, key, data, flags) break; } + if (tmp_dirty) + F_SET(dbc_arg, DBC_DIRTY_READ); + /* * If this cursor is going to be closed immediately, we don't * need to take precautions to clean it up on error. */ if (F_ISSET(dbc_arg, DBC_TRANSIENT)) dbc_n = dbc_arg; - else if ((ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags)) != 0) - goto err; + else { + ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags); + if (tmp_dirty) + F_CLR(dbc_arg, DBC_DIRTY_READ); + + if (ret != 0) + goto err; + COPY_RET_MEM(dbc_arg, dbc_n); + } if (tmp_rmw) F_SET(dbc_n, DBC_RMW); + + switch (multi) { + case DB_MULTIPLE: + F_SET(dbc_n, DBC_MULTIPLE); + break; + case DB_MULTIPLE_KEY: + F_SET(dbc_n, DBC_MULTIPLE_KEY); + break; + case DB_MULTIPLE | DB_MULTIPLE_KEY: + F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); + break; + case 0: + break; + } + pgno = PGNO_INVALID; ret = dbc_n->c_am_get(dbc_n, key, data, flags, &pgno); if (tmp_rmw) F_CLR(dbc_n, DBC_RMW); + if (tmp_dirty) + F_CLR(dbc_arg, DBC_DIRTY_READ); + F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); if (ret != 0) goto err; @@ -630,7 +749,8 @@ __db_c_get(dbc_arg, key, data, flags) * a new cursor and call the underlying function. 
*/ if (pgno != PGNO_INVALID) { - if ((ret = __db_c_newopd(dbc_arg, pgno, &cp_n->opd)) != 0) + if ((ret = __db_c_newopd(dbc_arg, + pgno, cp_n->opd, &cp_n->opd)) != 0) goto err; switch (flags) { @@ -648,10 +768,9 @@ __db_c_get(dbc_arg, key, data, flags) tmp_flags = DB_LAST; break; case DB_GET_BOTH: - tmp_flags = DB_GET_BOTH; - break; case DB_GET_BOTHC: - tmp_flags = DB_GET_BOTHC; + case DB_GET_BOTH_RANGE: + tmp_flags = flags; break; default: ret = @@ -680,19 +799,66 @@ done: /* cp_n = dbc_n == NULL ? dbc_arg->internal : dbc_n->internal; if (!F_ISSET(key, DB_DBT_ISSET)) { if (cp_n->page == NULL && (ret = - memp_fget(dbp->mpf, &cp_n->pgno, 0, &cp_n->page)) != 0) + mpf->get(mpf, &cp_n->pgno, 0, &cp_n->page)) != 0) goto err; if ((ret = __db_ret(dbp, cp_n->page, cp_n->indx, - key, &dbc_arg->rkey.data, &dbc_arg->rkey.ulen)) != 0) + key, &dbc_arg->rkey->data, &dbc_arg->rkey->ulen)) != 0) goto err; } - dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n; - if (!F_ISSET(data, DB_DBT_ISSET)) { + if (multi != 0) { + /* + * Even if fetching from the OPD cursor we need a duplicate + * primary cursor if we are going after multiple keys. + */ + if (dbc_n == NULL) { + /* + * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor, + * so it's safe to just use dbc_arg, unless dbc_arg + * has an open OPD cursor whose state might need to + * be preserved. + */ + if ((!(multi & DB_MULTIPLE_KEY) && + dbc_arg->internal->opd == NULL) || + F_ISSET(dbc_arg, DBC_TRANSIENT)) + dbc_n = dbc_arg; + else { + if ((ret = __db_c_idup(dbc_arg, + &dbc_n, DB_POSITIONI)) != 0) + goto err; + if ((ret = dbc_n->c_am_get(dbc_n, + key, data, DB_CURRENT, &pgno)) != 0) + goto err; + } + cp_n = dbc_n->internal; + } + + /* + * If opd is set then we dupped the opd that we came in with. + * When we return we may have a new opd if we went to another + * key. 
+ */ + if (opd != NULL) { + DB_ASSERT(cp_n->opd == NULL); + cp_n->opd = opd; + opd = NULL; + } + + /* + * Bulk get doesn't use __db_retcopy, so data.size won't + * get set up unless there is an error. Assume success + * here. This is the only call to c_am_bulk, and it avoids + * setting it exactly the same everywhere. If we have an + * ENOMEM error, it'll get overwritten with the needed value. + */ + data->size = data->ulen; + ret = dbc_n->c_am_bulk(dbc_n, data, flags | multi); + } else if (!F_ISSET(data, DB_DBT_ISSET)) { + dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n; type = TYPE(dbc->internal->page); ret = __db_ret(dbp, dbc->internal->page, dbc->internal->indx + (type == P_LBTREE || type == P_HASH ? O_INDX : 0), - data, &dbc_arg->rdata.data, &dbc_arg->rdata.ulen); + data, &dbc_arg->rdata->data, &dbc_arg->rdata->ulen); } err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */ @@ -701,9 +867,8 @@ err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */ /* Cleanup and cursor resolution. */ if (opd != NULL) { - if ((t_ret = - __db_c_cleanup(dbc_arg->internal->opd, - opd, ret)) != 0 && ret == 0) + if ((t_ret = __db_c_cleanup( + dbc_arg->internal->opd, opd, ret)) != 0 && ret == 0) ret = t_ret; } @@ -728,11 +893,12 @@ __db_c_put(dbc_arg, key, data, flags) DBT *key, *data; u_int32_t flags; { - DB *dbp; - DBC *dbc_n, *opd; + DB *dbp, *sdbp; + DBC *dbc_n, *oldopd, *opd, *sdbc, *pdbc; + DBT olddata, oldpkey, oldskey, newdata, pkey, save_skey, skey, temp; db_pgno_t pgno; - u_int32_t tmp_flags; - int ret, t_ret; + int cmp, have_oldrec, ispartial, nodel, re_pad, ret, rmw, t_ret; + u_int32_t re_len, size, tmp_flags; /* * Cursor Cleanup Note: @@ -744,16 +910,30 @@ __db_c_put(dbc_arg, key, data, flags) * functions. */ dbp = dbc_arg->dbp; - dbc_n = NULL; + sdbp = NULL; + pdbc = dbc_n = NULL; + memset(&newdata, 0, sizeof(DBT)); PANIC_CHECK(dbp->dbenv); - DB_CHECK_TXN(dbp, dbc_arg->txn); /* Check for invalid flags. 
*/ - if ((ret = __db_cputchk(dbp, key, data, flags, - F_ISSET(dbp, DB_AM_RDONLY), IS_INITIALIZED(dbc_arg))) != 0) + if ((ret = __db_cputchk(dbp, + key, data, flags, IS_INITIALIZED(dbc_arg))) != 0) + return (ret); + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, dbc_arg->txn, dbc_arg->locker, 0)) != 0) return (ret); + /* + * Putting to secondary indices is forbidden; when we need + * to internally update one, we'll call this with a private + * synonym for DB_KEYLAST, DB_UPDATE_SECONDARY, which does + * the right thing but won't return an error from cputchk(). + */ + if (flags == DB_UPDATE_SECONDARY) + flags = DB_KEYLAST; + DEBUG_LWRITE(dbc_arg, dbc_arg->txn, "db_c_put", flags == DB_KEYFIRST || flags == DB_KEYLAST || flags == DB_NODUPDATA ? key : NULL, data, flags); @@ -761,6 +941,439 @@ __db_c_put(dbc_arg, key, data, flags) CDB_LOCKING_INIT(dbp, dbc_arg); /* + * Check to see if we are a primary and have secondary indices. + * If we are not, we save ourselves a good bit of trouble and + * just skip to the "normal" put. + */ + if (LIST_FIRST(&dbp->s_secondaries) == NULL) + goto skip_s_update; + + /* + * We have at least one secondary which we may need to update. + * + * There is a rather vile locking issue here. Secondary gets + * will always involve acquiring a read lock in the secondary, + * then acquiring a read lock in the primary. Ideally, we + * would likewise perform puts by updating all the secondaries + * first, then doing the actual put in the primary, to avoid + * deadlock (since having multiple threads doing secondary + * gets and puts simultaneously is probably a common case). + * + * However, if this put is a put-overwrite--and we have no way to + * tell in advance whether it will be--we may need to delete + * an outdated secondary key. In order to find that old + * secondary key, we need to get the record we're overwriting, + * before we overwrite it. 
+ * + * (XXX: It would be nice to avoid this extra get, and have the + * underlying put routines somehow pass us the old record + * since they need to traverse the tree anyway. I'm saving + * this optimization for later, as it's a lot of work, and it + * would be hard to fit into this locking paradigm anyway.) + * + * The simple thing to do would be to go get the old record before + * we do anything else. Unfortunately, though, doing so would + * violate our "secondary, then primary" lock acquisition + * ordering--even in the common case where no old primary record + * exists, we'll still acquire and keep a lock on the page where + * we're about to do the primary insert. + * + * To get around this, we do the following gyrations, which + * hopefully solve this problem in the common case: + * + * 1) If this is a c_put(DB_CURRENT), go ahead and get the + * old record. We already hold the lock on this page in + * the primary, so no harm done, and we'll need the primary + * key (which we weren't passed in this case) to do any + * secondary puts anyway. + * + * 2) If we're doing a partial put, we need to perform the + * get on the primary key right away, since we don't have + * the whole datum that the secondary key is based on. + * We may also need to pad out the record if the primary + * has a fixed record length. + * + * 3) Loop through the secondary indices, putting into each a + * new secondary key that corresponds to the new record. + * + * 4) If we haven't done so in (1) or (2), get the old primary + * key/data pair. If one does not exist--the common case--we're + * done with secondary indices, and can go straight on to the + * primary put. + * + * 5) If we do have an old primary key/data pair, however, we need + * to loop through all the secondaries a second time and delete + * the old secondary in each. 
+ */ + memset(&pkey, 0, sizeof(DBT)); + memset(&olddata, 0, sizeof(DBT)); + have_oldrec = nodel = 0; + + /* + * Primary indices can't have duplicates, so only DB_CURRENT, + * DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags + * should have been caught by the checking routine, but + * add a sprinkling of paranoia. + */ + DB_ASSERT(flags == DB_CURRENT || + flags == DB_KEYFIRST || flags == DB_KEYLAST); + + /* + * We'll want to use DB_RMW in a few places, but it's only legal + * when locking is on. + */ + rmw = STD_LOCKING(dbc_arg) ? DB_RMW : 0; + + if (flags == DB_CURRENT) { /* Step 1. */ + /* + * This is safe to do on the cursor we already have; + * error or no, it won't move. + * + * We use DB_RMW for all of these gets because we'll be + * writing soon enough in the "normal" put code. In + * transactional databases we'll hold those write locks + * even if we close the cursor we're reading with. + */ + ret = dbc_arg->c_get(dbc_arg, + &pkey, &olddata, rmw | DB_CURRENT); + if (ret == DB_KEYEMPTY) { + nodel = 1; /* + * We know we don't need a delete + * in the secondary. + */ + have_oldrec = 1; /* We've looked for the old record. */ + } else if (ret != 0) + goto err; + else + have_oldrec = 1; + + } else { + /* So we can just use &pkey everywhere instead of key. */ + pkey.data = key->data; + pkey.size = key->size; + } + + /* + * Check for partial puts (step 2). + */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if (!have_oldrec && !nodel) { + /* + * We're going to have to search the tree for the + * specified key. Dup a cursor (so we have the same + * locking info) and do a c_get. + */ + if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0) + goto err; + + /* We should have gotten DB_CURRENT in step 1. 
*/ + DB_ASSERT(flags != DB_CURRENT); + + ret = pdbc->c_get(pdbc, + &pkey, &olddata, rmw | DB_SET); + if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { + nodel = 1; + ret = 0; + } + if ((t_ret = pdbc->c_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + + have_oldrec = 1; + } + + /* + * Now build the new datum from olddata and the partial + * data we were given. + */ + if ((ret = + __db_buildpartial(dbp, &olddata, data, &newdata)) != 0) + goto err; + ispartial = 1; + } else + ispartial = 0; + + /* + * Handle fixed-length records. If the primary database has + * fixed-length records, we need to pad out the datum before + * we pass it into the callback function; we always index the + * "real" record. + */ + if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) || + (dbp->type == DB_QUEUE)) { + if (dbp->type == DB_QUEUE) { + re_len = ((QUEUE *)dbp->q_internal)->re_len; + re_pad = ((QUEUE *)dbp->q_internal)->re_pad; + } else { + re_len = ((BTREE *)dbp->bt_internal)->re_len; + re_pad = ((BTREE *)dbp->bt_internal)->re_pad; + } + + size = ispartial ? newdata.size : data->size; + if (size > re_len) { + __db_err(dbp->dbenv, + "Length improper for fixed length record %lu", + (u_long)size); + ret = EINVAL; + goto err; + } else if (size < re_len) { + /* + * If we're not doing a partial put, copy + * data->data into newdata.data, then pad out + * newdata.data. + * + * If we're doing a partial put, the data + * we want are already in newdata.data; we + * just need to pad. + * + * Either way, realloc is safe. + */ + if ((ret = __os_realloc(dbp->dbenv, re_len, + &newdata.data)) != 0) + goto err; + if (!ispartial) + memcpy(newdata.data, data->data, size); + memset((u_int8_t *)newdata.data + size, re_pad, + re_len - size); + newdata.size = re_len; + ispartial = 1; + } + } + + /* + * Loop through the secondaries. (Step 3.) + * + * Note that __db_s_first and __db_s_next will take care of + * thread-locking and refcounting issues. 
+ */ + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Call the callback for this secondary, to get the + * appropriate secondary key. + */ + memset(&skey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, + &pkey, ispartial ? &newdata : data, &skey)) != 0) { + if (ret == DB_DONOTINDEX) + /* + * The callback returned a null value--don't + * put this key in the secondary. Just + * move on to the next one--we'll handle + * any necessary deletes in step 5. + */ + continue; + else + goto err; + } + + /* + * Save the DBT we just got back from the callback function + * off; we want to pass its value into c_get functions + * that may stomp on a buffer the callback function + * allocated. + */ + memset(&save_skey, 0, sizeof(DBT)); /* Paranoia. */ + save_skey = skey; + + /* + * Open a cursor in this secondary. + * + * Use the same locker ID as our primary cursor, so that + * we're guaranteed that the locks don't conflict (e.g. in CDB + * or if we're subdatabases that share and want to lock a + * metadata page). + */ + if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type, + PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0) + goto err; + + /* + * If we're in CDB, updates will fail since the new cursor + * isn't a writer. However, we hold the WRITE lock in the + * primary and will for as long as our new cursor lasts, + * and the primary and secondary share a lock file ID, + * so it's safe to consider this a WRITER. The close + * routine won't try to put anything because we don't + * really have a lock. + */ + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * There are three cases here-- + * 1) The secondary supports sorted duplicates. + * If we attempt to put a secondary/primary pair + * that already exists, that's a duplicate duplicate, + * and c_put will return DB_KEYEXIST (see __db_duperr). 
+ * This will leave us with exactly one copy of the + * secondary/primary pair, and this is just right--we'll + * avoid deleting it later, as the old and new secondaries + * will match (since the old secondary is the dup dup + * that's already there). + * 2) The secondary supports duplicates, but they're not + * sorted. We need to avoid putting a duplicate + * duplicate, because the matching old and new secondaries + * will prevent us from deleting anything and we'll + * wind up with two secondary records that point to the + * same primary key. Do a c_get(DB_GET_BOTH); if + * that returns 0, skip the put. + * 3) The secondary doesn't support duplicates at all. + * In this case, secondary keys must be unique; if + * another primary key already exists for this + * secondary key, we have to either overwrite it or + * not put this one, and in either case we've + * corrupted the secondary index. Do a c_get(DB_SET). + * If the secondary/primary pair already exists, do + * nothing; if the secondary exists with a different + * primary, return an error; and if the secondary + * does not exist, put it. + */ + if (!F_ISSET(sdbp, DB_AM_DUP)) { + /* Case 3. */ + memset(&oldpkey, 0, sizeof(DBT)); + F_SET(&oldpkey, DB_DBT_MALLOC); + ret = sdbc->c_real_get(sdbc, + &skey, &oldpkey, rmw | DB_SET); + if (ret == 0) { + cmp = __bam_defcmp(sdbp, &oldpkey, &pkey); + __os_ufree(sdbp->dbenv, oldpkey.data); + if (cmp != 0) { + __db_err(sdbp->dbenv, "%s%s", + "Put results in a non-unique secondary key in an ", + "index not configured to support duplicates"); + ret = EINVAL; + goto skipput; + } + } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) + goto skipput; + } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) + /* Case 2. */ + if ((ret = sdbc->c_real_get(sdbc, + &skey, &pkey, rmw | DB_GET_BOTH)) == 0) + goto skipput; + + ret = sdbc->c_put(sdbc, &skey, &pkey, DB_UPDATE_SECONDARY); + + /* + * We don't know yet whether this was a put-overwrite that + * in fact changed nothing. 
If it was, we may get DB_KEYEXIST. + * This is not an error. + */ + if (ret == DB_KEYEXIST) + ret = 0; + +skipput: FREE_IF_NEEDED(sdbp, &save_skey) + + if ((t_ret = sdbc->c_close(sdbc)) != 0) + ret = t_ret; + + if (ret != 0) + goto err; + } + if (ret != 0) + goto err; + + /* If still necessary, go get the old primary key/data. (Step 4.) */ + if (!have_oldrec) { + /* See the comments in step 2. This is real familiar. */ + if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0) + goto err; + DB_ASSERT(flags != DB_CURRENT); + pkey.data = key->data; + pkey.size = key->size; + ret = pdbc->c_get(pdbc, &pkey, &olddata, rmw | DB_SET); + if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { + nodel = 1; + ret = 0; + } + if ((t_ret = pdbc->c_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + have_oldrec = 1; + } + + /* + * If we don't follow this goto, we do in fact have an old record + * we may need to go delete. (Step 5). + */ + if (nodel) + goto skip_s_update; + + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Call the callback for this secondary to get the + * old secondary key. + */ + memset(&oldskey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, + &pkey, &olddata, &oldskey)) != 0) { + if (ret == DB_DONOTINDEX) + /* + * The callback returned a null value--there's + * nothing to delete. Go on to the next + * secondary. + */ + continue; + else + goto err; + } + if ((ret = sdbp->s_callback(sdbp, + &pkey, ispartial ? &newdata : data, &skey)) != 0 && + ret != DB_DONOTINDEX) + goto err; + + /* + * If there is no new secondary key, or if the old secondary + * key is different from the new secondary key, then + * we need to delete the old one. + * + * Note that bt_compare is (and must be) set no matter + * what access method we're in. 
+ */ + sdbc = NULL; + if (ret == DB_DONOTINDEX || + ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, + &oldskey, &skey) != 0) { + if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type, + PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0) + goto err; + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Don't let c_get(DB_GET_BOTH) stomp on + * any secondary key value that the callback + * function may have allocated. Use a temp + * DBT instead. + */ + memset(&temp, 0, sizeof(DBT)); + temp.data = oldskey.data; + temp.size = oldskey.size; + if ((ret = sdbc->c_real_get(sdbc, + &temp, &pkey, rmw | DB_GET_BOTH)) == 0) + ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY); + } + + FREE_IF_NEEDED(sdbp, &skey); + FREE_IF_NEEDED(sdbp, &oldskey); + if (sdbc != NULL && (t_ret = sdbc->c_close(sdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + } + + /* Secondary index updates are now done. On to the "real" stuff. */ + +skip_s_update: + /* * If we have an off-page duplicates cursor, and the operation applies * to it, perform the operation. Duplicate the cursor and call the * underlying function. @@ -826,8 +1439,12 @@ __db_c_put(dbc_arg, key, data, flags) * a new cursor and call the underlying function. */ if (pgno != PGNO_INVALID) { - if ((ret = __db_c_newopd(dbc_arg, pgno, &opd)) != 0) + oldopd = dbc_n->internal->opd; + if ((ret = __db_c_newopd(dbc_arg, pgno, oldopd, &opd)) != 0) { + dbc_n->internal->opd = opd; goto err; + } + dbc_n->internal->opd = opd; if ((ret = opd->c_am_put( @@ -840,8 +1457,15 @@ err: /* Cleanup and cursor resolution. */ if ((t_ret = __db_c_cleanup(dbc_arg, dbc_n, ret)) != 0 && ret == 0) ret = t_ret; + /* If newdata was used, free its buffer. 
*/ + if (newdata.data != NULL) + __os_free(dbp->dbenv, newdata.data); + CDB_LOCKING_DONE(dbp, dbc_arg); + if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0) + return (t_ret); + return (ret); } @@ -855,7 +1479,20 @@ __db_duperr(dbp, flags) DB *dbp; u_int32_t flags; { - if (flags != DB_NODUPDATA) + + /* + * If we run into this error while updating a secondary index, + * don't yell--there's no clean way to pass DB_NODUPDATA in along + * with DB_UPDATE_SECONDARY, but we may run into this problem + * in a normal, non-error course of events. + * + * !!! + * If and when we ever permit duplicate duplicates in sorted-dup + * databases, we need to either change the secondary index code + * to check for dup dups, or we need to maintain the implicit + * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set. + */ + if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY)) __db_err(dbp->dbenv, "Duplicate data items are not supported with sorted data"); return (DB_KEYEXIST); @@ -873,60 +1510,55 @@ __db_c_cleanup(dbc, dbc_n, failed) DB *dbp; DBC *opd; DBC_INTERNAL *internal; + DB_MPOOLFILE *mpf; int ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; internal = dbc->internal; ret = 0; /* Discard any pages we're holding. */ if (internal->page != NULL) { - if ((t_ret = - memp_fput(dbp->mpf, internal->page, 0)) != 0 && ret == 0) + if ((t_ret = mpf->put(mpf, internal->page, 0)) != 0 && ret == 0) ret = t_ret; internal->page = NULL; } opd = internal->opd; if (opd != NULL && opd->internal->page != NULL) { - if ((t_ret = memp_fput(dbp->mpf, - opd->internal->page, 0)) != 0 && ret == 0) + if ((t_ret = + mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0) ret = t_ret; opd->internal->page = NULL; } /* - * If dbc_n is NULL, there's no internal cursor swapping to be - * done and no dbc_n to close--we probably did the entire - * operation on an offpage duplicate cursor. Just return. 
- */ - if (dbc_n == NULL) - return (ret); - - /* - * If dbc is marked DBC_TRANSIENT, we're inside a DB->{put/get} + * If dbc_n is NULL, there's no internal cursor swapping to be done + * and no dbc_n to close--we probably did the entire operation on an + * offpage duplicate cursor. Just return. + * + * If dbc and dbc_n are the same, we're either inside a DB->{put/get} * operation, and as an optimization we performed the operation on - * the main cursor rather than on a duplicated one. Assert - * that dbc_n == dbc (i.e., that we really did skip the - * duplication). Then just do nothing--even if there was - * an error, we're about to close the cursor, and the fact that we - * moved it isn't a user-visible violation of our "cursor - * stays put on error" rule. - */ - if (F_ISSET(dbc, DBC_TRANSIENT)) { - DB_ASSERT(dbc == dbc_n); + * the main cursor rather than on a duplicated one, or we're in a + * bulk get that can't have moved the cursor (DB_MULTIPLE with the + * initial c_get operation on an off-page dup cursor). Just + * return--either we know we didn't move the cursor, or we're going + * to close it before we return to application code, so we're sure + * not to visibly violate the "cursor stays put on error" rule. 
+ */ + if (dbc_n == NULL || dbc == dbc_n) return (ret); - } if (dbc_n->internal->page != NULL) { - if ((t_ret = memp_fput(dbp->mpf, - dbc_n->internal->page, 0)) != 0 && ret == 0) + if ((t_ret = + mpf->put(mpf, dbc_n->internal->page, 0)) != 0 && ret == 0) ret = t_ret; dbc_n->internal->page = NULL; } opd = dbc_n->internal->opd; if (opd != NULL && opd->internal->page != NULL) { - if ((t_ret = memp_fput(dbp->mpf, - opd->internal->page, 0)) != 0 && ret == 0) + if ((t_ret = + mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0) ret = t_ret; opd->internal->page = NULL; } @@ -963,6 +1595,316 @@ __db_c_cleanup(dbc, dbc_n, failed) } /* + * __db_c_secondary_get -- + * This wrapper function for DBC->c_pget() is the DBC->c_get() function + * for a secondary index cursor. + * + * PUBLIC: int __db_c_secondary_get __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__db_c_secondary_get(dbc, skey, data, flags) + DBC *dbc; + DBT *skey, *data; + u_int32_t flags; +{ + + DB_ASSERT(F_ISSET(dbc->dbp, DB_AM_SECONDARY)); + return (dbc->c_pget(dbc, skey, NULL, data, flags)); +} + +/* + * __db_c_pget -- + * Get a primary key/data pair through a secondary index. + * + * PUBLIC: int __db_c_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__db_c_pget(dbc, skey, pkey, data, flags) + DBC *dbc; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DBC *pdbc; + DBT *save_rdata, nullpkey; + int pkeymalloc, ret, save_pkey_flags, t_ret; + + sdbp = dbc->dbp; + pdbp = sdbp->s_primary; + pkeymalloc = t_ret = 0; + + PANIC_CHECK(sdbp->dbenv); + if ((ret = __db_cpgetchk(sdbp, + skey, pkey, data, flags, IS_INITIALIZED(dbc))) != 0) + return (ret); + + /* + * The challenging part of this function is getting the behavior + * right for all the various permutations of DBT flags. The + * next several blocks handle the various cases we need to + * deal with specially. + */ + + /* + * We may be called with a NULL pkey argument, if we've been + * wrapped by a 2-DBT get call. 
If so, we need to use our + * own DBT. + */ + if (pkey == NULL) { + memset(&nullpkey, 0, sizeof(DBT)); + pkey = &nullpkey; + } + + /* + * DB_GET_RECNO is a special case, because we're interested not in + * the primary key/data pair, but rather in the primary's record + * number. + */ + if ((flags & DB_OPFLAGS_MASK) == DB_GET_RECNO) + return (__db_c_pget_recno(dbc, pkey, data, flags)); + + /* + * If the DBTs we've been passed don't have any of the + * user-specified memory management flags set, we want to make sure + * we return values using the DBTs dbc->rskey, dbc->rkey, and + * dbc->rdata, respectively. + * + * There are two tricky aspects to this: first, we need to pass + * skey and pkey *in* to the initial c_get on the secondary key, + * since either or both may be looked at by it (depending on the + * get flag). Second, we must not use a normal DB->get call + * on the secondary, even though that's what we want to accomplish, + * because the DB handle may be free-threaded. Instead, + * we open a cursor, then take steps to ensure that we actually use + * the rkey/rdata from the *secondary* cursor. + * + * We accomplish all this by passing in the DBTs we started out + * with to the c_get, but having swapped the contents of rskey and + * rkey, respectively, into rkey and rdata; __db_ret will treat + * them like the normal key/data pair in a c_get call, and will + * realloc them as need be (this is "step 1"). Then, for "step 2", + * we swap back rskey/rkey/rdata to normal, and do a get on the primary + * with the secondary dbc appointed as the owner of the returned-data + * memory. + * + * Note that in step 2, we copy the flags field in case we need to + * pass down a DB_DBT_PARTIAL or other flag that is compatible with + * letting DB do the memory management. + */ + /* Step 1. */ + save_rdata = dbc->rdata; + dbc->rdata = dbc->rkey; + dbc->rkey = dbc->rskey; + + /* + * It is correct, though slightly sick, to attempt a partial get + * of a primary key. 
However, if we do so here, we'll never find the + * primary record; clear the DB_DBT_PARTIAL field of pkey just + * for the duration of the next call. + */ + save_pkey_flags = pkey->flags; + F_CLR(pkey, DB_DBT_PARTIAL); + + /* + * Now we can go ahead with the meat of this call. First, get the + * primary key from the secondary index. (What exactly we get depends + * on the flags, but the underlying cursor get will take care of the + * dirty work.) + */ + if ((ret = dbc->c_real_get(dbc, skey, pkey, flags)) != 0) { + /* Restore rskey/rkey/rdata and return. */ + pkey->flags = save_pkey_flags; + dbc->rskey = dbc->rkey; + dbc->rkey = dbc->rdata; + dbc->rdata = save_rdata; + goto err; + } + + /* Restore pkey's flags in case we stomped the PARTIAL flag. */ + pkey->flags = save_pkey_flags; + + /* + * Restore the cursor's rskey, rkey, and rdata DBTs. If DB + * is handling the memory management, we now have newly + * reallocated buffers and ulens in rkey and rdata which we want + * to put in rskey and rkey. save_rdata contains the old value + * of dbc->rdata. + */ + dbc->rskey = dbc->rkey; + dbc->rkey = dbc->rdata; + dbc->rdata = save_rdata; + + /* + * Now we're ready for "step 2". If either or both of pkey and + * data do not have memory management flags set--that is, if DB is + * managing their memory--we need to swap around the rkey/rdata + * structures so that we don't wind up trying to use memory managed + * by the primary database cursor, which we'll close before we return. + * + * !!! + * If you're carefully following the bouncing ball, you'll note + * that in the DB-managed case, the buffer hanging off of pkey is + * the same as dbc->rkey->data. This is just fine; we may well + * realloc and stomp on it when we return, if we're going a + * DB_GET_BOTH and need to return a different partial or key + * (depending on the comparison function), but this is safe. + * + * !!! 
+ * We need to use __db_icursor here rather than simply calling + * pdbp->cursor, because otherwise, if we're in CDB, we'll + * allocate a new locker ID and leave ourselves open to deadlocks. + * (Even though we're only acquiring read locks, we'll still block + * if there are any waiters.) + */ + if ((ret = __db_icursor(pdbp, + dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) + goto err; + + /* + * We're about to use pkey a second time. If DB_DBT_MALLOC + * is set on it, we'll leak the memory we allocated the first time. + * Thus, set DB_DBT_REALLOC instead so that we reuse that memory + * instead of leaking it. + * + * !!! + * This assumes that the user must always specify a compatible + * realloc function if a malloc function is specified. I think + * this is a reasonable requirement. + */ + if (F_ISSET(pkey, DB_DBT_MALLOC)) { + F_CLR(pkey, DB_DBT_MALLOC); + F_SET(pkey, DB_DBT_REALLOC); + pkeymalloc = 1; + } + + /* + * Do the actual get. Set DBC_TRANSIENT since we don't care + * about preserving the position on error, and it's faster. + * SET_RET_MEM so that the secondary DBC owns any returned-data + * memory. + */ + F_SET(pdbc, DBC_TRANSIENT); + SET_RET_MEM(pdbc, dbc); + ret = pdbc->c_get(pdbc, pkey, data, DB_SET); + + /* + * If the item wasn't found in the primary, this is a bug; + * our secondary has somehow gotten corrupted, and contains + * elements that don't correspond to anything in the primary. + * Complain. + */ + if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(pdbp); + + /* Now close the primary cursor. */ + t_ret = pdbc->c_close(pdbc); + +err: if (pkeymalloc) { + /* + * If pkey had a MALLOC flag, we need to restore it; + * otherwise, if the user frees the buffer but reuses + * the DBT without NULL'ing its data field or changing + * the flags, we may drop core. + */ + F_CLR(pkey, DB_DBT_REALLOC); + F_SET(pkey, DB_DBT_MALLOC); + } + return (t_ret == 0 ? 
ret : t_ret); +} + +/* + * __db_c_pget_recno -- + * Perform a DB_GET_RECNO c_pget on a secondary index. Returns + * the secondary's record number in the pkey field and the primary's + * in the data field. + */ +static int +__db_c_pget_recno(sdbc, pkey, data, flags) + DBC *sdbc; + DBT *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DB_ENV *dbenv; + DBC *pdbc; + DBT discardme, primary_key; + db_recno_t oob; + u_int32_t rmw; + int ret, t_ret; + + sdbp = sdbc->dbp; + pdbp = sdbp->s_primary; + dbenv = sdbp->dbenv; + pdbc = NULL; + ret = t_ret = 0; + + rmw = LF_ISSET(DB_RMW); + + memset(&discardme, 0, sizeof(DBT)); + F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + oob = RECNO_OOB; + + /* + * If the primary is an rbtree, we want its record number, whether + * or not the secondary is one too. Fetch the recno into "data". + * + * If it's not an rbtree, return RECNO_OOB in "data". + */ + if (F_ISSET(pdbp, DB_AM_RECNUM)) { + /* + * Get the primary key, so we can find the record number + * in the primary. (We're uninterested in the secondary key.) + */ + memset(&primary_key, 0, sizeof(DBT)); + F_SET(&primary_key, DB_DBT_MALLOC); + if ((ret = sdbc->c_real_get(sdbc, + &discardme, &primary_key, rmw | DB_CURRENT)) != 0) + return (ret); + + /* + * Open a cursor on the primary, set it to the right record, + * and fetch its recno into "data". + * + * (See __db_c_pget for a comment on the use of __db_icursor.) + * + * SET_RET_MEM so that the secondary DBC owns any returned-data + * memory. 
+ */ + if ((ret = __db_icursor(pdbp, sdbc->txn, + pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) + goto perr; + SET_RET_MEM(pdbc, sdbc); + if ((ret = pdbc->c_get(pdbc, + &primary_key, &discardme, rmw | DB_SET)) != 0) + goto perr; + + ret = pdbc->c_get(pdbc, &discardme, data, rmw | DB_GET_RECNO); + +perr: __os_ufree(sdbp->dbenv, primary_key.data); + if (pdbc != NULL && + (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + } else if ((ret = __db_retcopy(dbenv, data, &oob, + sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0) + return (ret); + + /* + * If the secondary is an rbtree, we want its record number, whether + * or not the primary is one too. Fetch the recno into "pkey". + * + * If it's not an rbtree, return RECNO_OOB in "pkey". + */ + if (F_ISSET(sdbp, DB_AM_RECNUM)) + return (sdbc->c_real_get(sdbc, &discardme, pkey, flags)); + else + return (__db_retcopy(dbenv, pkey, &oob, + sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen)); +} + +/* * __db_wrlock_err -- do not have a write lock. */ static int @@ -972,3 +1914,373 @@ __db_wrlock_err(dbenv) __db_err(dbenv, "Write attempted on read-only cursor"); return (EPERM); } + +/* + * __db_c_del_secondary -- + * Perform a delete operation on a secondary index: call through + * to the primary and delete the primary record that this record + * points to. + * + * Note that deleting the primary record will call c_del on all + * the secondaries, including this one; thus, it is not necessary + * to execute both this function and an actual delete. + * + */ +static int +__db_c_del_secondary(dbc) + DBC *dbc; +{ + DB *pdbp; + DBC *pdbc; + DBT skey, pkey; + int ret, t_ret; + + memset(&skey, 0, sizeof(DBT)); + memset(&pkey, 0, sizeof(DBT)); + + /* + * Get the current item that we're pointing at. + * We don't actually care about the secondary key, just + * the primary. 
+ */ + F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = dbc->c_real_get(dbc, + &skey, &pkey, DB_CURRENT)) != 0) + return (ret); + + /* + * Create a cursor on the primary with our locker ID, + * so that when it calls back, we don't conflict. + * + * We create a cursor explicitly because there's no + * way to specify the same locker ID if we're using + * locking but not transactions if we use the DB->del + * interface. This shouldn't be any less efficient + * anyway. + */ + pdbp = dbc->dbp->s_primary; + if ((ret = __db_icursor(pdbp, dbc->txn, + pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) + return (ret); + + /* + * See comment in __db_c_put--if we're in CDB, + * we already hold the locks we need, and we need to flag + * the cursor as a WRITER so we don't run into errors + * when we try to delete. + */ + if (CDB_LOCKING(pdbp->dbenv)) { + DB_ASSERT(pdbc->mylock.off == LOCK_INVALID); + F_SET(pdbc, DBC_WRITER); + } + + /* + * Set the new cursor to the correct primary key. Then + * delete it. We don't really care about the datum; + * just reuse our skey DBT. + * + * If the primary get returns DB_NOTFOUND, something is amiss-- + * every record in the secondary should correspond to some record + * in the primary. + */ + if ((ret = pdbc->c_get(pdbc, &pkey, &skey, + (STD_LOCKING(dbc) ? DB_RMW : 0) | DB_SET)) == 0) + ret = pdbc->c_del(pdbc, 0); + else if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(pdbp); + + if ((t_ret = pdbc->c_close(pdbc)) != 0 && ret != 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_c_del_primary -- + * Perform a delete operation on a primary index. Loop through + * all the secondary indices which correspond to this primary + * database, and delete any secondary keys that point at the current + * record. 
+ * + * PUBLIC: int __db_c_del_primary __P((DBC *)); + */ +int +__db_c_del_primary(dbc) + DBC *dbc; +{ + DB *dbp, *sdbp; + DBC *sdbc; + DBT data, pkey, skey, temp; + int ret, t_ret; + + dbp = dbc->dbp; + + /* + * If we're called at all, we have at least one secondary. + * (Unfortunately, we can't assert this without grabbing the mutex.) + * Get the current record so that we can construct appropriate + * secondary keys as needed. + */ + memset(&pkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + if ((ret = dbc->c_get(dbc, &pkey, &data, DB_CURRENT)) != 0) + return (ret); + + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Get the secondary key for this secondary and the current + * item. + */ + memset(&skey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) { + /* + * If the current item isn't in this index, we + * have no work to do. Proceed. + */ + if (ret == DB_DONOTINDEX) + continue; + + /* We had a substantive error. Bail. */ + FREE_IF_NEEDED(sdbp, &skey); + goto done; + } + + /* Open a secondary cursor. */ + if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type, + PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) + goto done; + /* See comment above and in __db_c_put. */ + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Set the secondary cursor to the appropriate item. + * Delete it. + * + * We want to use DB_RMW if locking is on; it's only + * legal then, though. + * + * !!! + * Don't stomp on any callback-allocated buffer in skey + * when we do a c_get(DB_GET_BOTH); use a temp DBT instead. + */ + memset(&temp, 0, sizeof(DBT)); + temp.data = skey.data; + temp.size = skey.size; + if ((ret = sdbc->c_real_get(sdbc, &temp, &pkey, + (STD_LOCKING(dbc) ? 
DB_RMW : 0) | DB_GET_BOTH)) == 0) + ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY); + + FREE_IF_NEEDED(sdbp, &skey); + + if ((t_ret = sdbc->c_close(sdbc)) != 0 || ret != 0) { + if (ret == 0) + ret = t_ret; + goto done; + } + } + +done: if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0) + return (t_ret); + return (ret); +} + +/* + * __db_s_first -- + * Get the first secondary, if any are present, from the primary. + * + * PUBLIC: DB *__db_s_first __P((DB *)); + */ +DB * +__db_s_first(pdbp) + DB *pdbp; +{ + DB *sdbp; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + sdbp = LIST_FIRST(&pdbp->s_secondaries); + + /* See __db_s_next. */ + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + return (sdbp); +} + +/* + * __db_s_next -- + * Get the next secondary in the list. + * + * PUBLIC: int __db_s_next __P((DB **)); + */ +int +__db_s_next(sdbpp) + DB **sdbpp; +{ + DB *sdbp, *pdbp, *closeme; + int ret; + + /* + * Secondary indices are kept in a linked list, s_secondaries, + * off each primary DB handle. If a primary is free-threaded, + * this list may only be traversed or modified while the primary's + * thread mutex is held. + * + * The tricky part is that we don't want to hold the thread mutex + * across the full set of secondary puts necessary for each primary + * put, or we'll wind up essentially single-threading all the puts + * to the handle; the secondary puts will each take about as + * long as the primary does, and may require I/O. So we instead + * hold the thread mutex only long enough to follow one link to the + * next secondary, and then we release it before performing the + * actual secondary put. + * + * The only danger here is that we might legitimately close a + * secondary index in one thread while another thread is performing + * a put and trying to update that same secondary index. To + * prevent this from happening, we refcount the secondary handles. 
+ * If close is called on a secondary index handle while we're putting + * to it, it won't really be closed--the refcount will simply drop, + * and we'll be responsible for closing it here. + */ + sdbp = *sdbpp; + pdbp = sdbp->s_primary; + closeme = NULL; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + DB_ASSERT(sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + closeme = sdbp; + } + sdbp = LIST_NEXT(sdbp, s_links); + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + *sdbpp = sdbp; + + /* + * closeme->close() is a wrapper; call __db_close explicitly. + */ + ret = closeme != NULL ? __db_close(closeme, 0) : 0; + return (ret); +} + +/* + * __db_s_done -- + * Properly decrement the refcount on a secondary database handle we're + * using, without calling __db_s_next. + * + * PUBLIC: int __db_s_done __P((DB *)); + */ +int +__db_s_done(sdbp) + DB *sdbp; +{ + DB *pdbp; + int doclose; + + pdbp = sdbp->s_primary; + doclose = 0; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + DB_ASSERT(sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + doclose = 1; + } + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + return (doclose ? __db_close(sdbp, 0) : 0); +} + +/* + * __db_buildpartial -- + * Build the record that will result after a partial put is applied to + * an existing record. + * + * This should probably be merged with __bam_build, but that requires + * a little trickery if we plan to keep the overflow-record optimization + * in that function. 
+ */ +static int +__db_buildpartial(dbp, oldrec, partial, newrec) + DB *dbp; + DBT *oldrec, *partial, *newrec; +{ + int ret; + u_int8_t *buf; + u_int32_t len, nbytes; + + DB_ASSERT(F_ISSET(partial, DB_DBT_PARTIAL)); + + memset(newrec, 0, sizeof(DBT)); + + nbytes = __db_partsize(oldrec->size, partial); + newrec->size = nbytes; + + if ((ret = __os_malloc(dbp->dbenv, nbytes, &buf)) != 0) + return (ret); + newrec->data = buf; + + /* Nul or pad out the buffer, for any part that isn't specified. */ + memset(buf, + F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad : + 0, nbytes); + + /* Copy in any leading data from the original record. */ + memcpy(buf, oldrec->data, + partial->doff > oldrec->size ? oldrec->size : partial->doff); + + /* Copy the data from partial. */ + memcpy(buf + partial->doff, partial->data, partial->size); + + /* Copy any trailing data from the original record. */ + len = partial->doff + partial->dlen; + if (oldrec->size > len) + memcpy(buf + partial->doff + partial->size, + (u_int8_t *)oldrec->data + len, oldrec->size - len); + + return (0); +} + +/* + * __db_partsize -- + * Given the number of bytes in an existing record and a DBT that + * is about to be partial-put, calculate the size of the record + * after the put. + * + * This code is called from __bam_partsize. + * + * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *)); + */ +u_int32_t +__db_partsize(nbytes, data) + u_int32_t nbytes; + DBT *data; +{ + + /* + * There are really two cases here: + * + * Case 1: We are replacing some bytes that do not exist (i.e., they + * are past the end of the record). In this case the number of bytes + * we are replacing is irrelevant and all we care about is how many + * bytes we are going to add from offset. So, the new record length + * is going to be the size of the new bytes (size) plus wherever those + * new bytes begin (doff). + * + * Case 2: All the bytes we are replacing exist. 
Therefore, the new + * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) + * plus the bytes we are adding (size). + */ + if (nbytes < data->doff + data->dlen) /* Case 1 */ + return (data->doff + data->size); + + return (nbytes + data->size - data->dlen); /* Case 2 */ +} diff --git a/bdb/db/db_conv.c b/bdb/db/db_conv.c index df60be06790..f731c82d85e 100644 --- a/bdb/db/db_conv.c +++ b/bdb/db/db_conv.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ /* @@ -40,7 +40,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_conv.c,v 11.11 2000/11/30 00:58:31 ubell Exp $"; +static const char revid[] = "$Id: db_conv.c,v 11.38 2002/08/15 03:00:13 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -50,12 +50,14 @@ static const char revid[] = "$Id: db_conv.c,v 11.11 2000/11/30 00:58:31 ubell Ex #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "db_am.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" /* * __db_pgin -- @@ -70,15 +72,135 @@ __db_pgin(dbenv, pg, pp, cookie) void *pp; DBT *cookie; { + DB dummydb, *dbp; DB_PGINFO *pginfo; + DB_CIPHER *db_cipher; + DB_LSN not_used; + PAGE *pagep; + size_t pg_off, pg_len, sum_len; + int is_hmac, ret; + u_int8_t *chksum, *iv; pginfo = (DB_PGINFO *)cookie->data; + pagep = (PAGE *)pp; - switch (((PAGE *)pp)->type) { - case P_HASH: + ret = is_hmac = 0; + chksum = iv = NULL; + memset(&dummydb, 0, sizeof(DB)); + dbp = &dummydb; + dummydb.flags = pginfo->flags; + db_cipher = (DB_CIPHER *)dbenv->crypto_handle; + switch (pagep->type) { case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* 
+ * If checksumming is set on the meta-page, we must set + * it in the dbp. + */ + if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + if (((DBMETA *)pp)->encrypt_alg != 0 || + F_ISSET(dbp, DB_AM_ENCRYPT)) + is_hmac = 1; + /* + * !!! + * For all meta pages it is required that the chksum + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + chksum = ((BTMETA *)pp)->chksum; + sum_len = DBMETASIZE; + break; + case P_INVALID: + /* + * We assume that we've read a file hole if we have + * a zero LSN, zero page number and P_INVALID. Otherwise + * we have an invalid page that might contain real data. + */ + if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) { + sum_len = 0; + break; + } + /* FALLTHROUGH */ + default: + chksum = P_CHKSUM(dbp, pagep); + sum_len = pginfo->db_pagesize; + /* + * If we are reading in a non-meta page, then if we have + * a db_cipher then we are using hmac. + */ + is_hmac = CRYPTO_ON(dbenv) ? 1 : 0; + break; + } + + /* + * We expect a checksum error if there was a configuration problem. + * If there is no configuration problem and we don't get a match, + * it's fatal: panic the system. + */ + if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) + switch (ret = __db_check_chksum( + dbenv, db_cipher, chksum, pp, sum_len, is_hmac)) { + case 0: + break; + case -1: + if (DBENV_LOGGING(dbenv)) + __db_cksum_log( + dbenv, NULL, &not_used, DB_FLUSH); + __db_err(dbenv, + "checksum error: catastrophic recovery required"); + return (__db_panic(dbenv, DB_RUNRECOVERY)); + default: + return (ret); + } + + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + DB_ASSERT(db_cipher != NULL); + DB_ASSERT(F_ISSET(dbp, DB_AM_CHKSUM)); + + pg_off = P_OVERHEAD(dbp); + DB_ASSERT(db_cipher->adj_size(pg_off) == 0); + + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the iv + * be at the same location. 
Use BTMETA to get to it + * for any meta type. + */ + iv = ((BTMETA *)pp)->iv; + pg_len = DBMETASIZE; + break; + case P_INVALID: + if (IS_ZERO_LSN(LSN(pagep)) && + pagep->pgno == PGNO_INVALID) { + pg_len = 0; + break; + } + /* FALLTHROUGH */ + default: + iv = P_IV(dbp, pagep); + pg_len = pginfo->db_pagesize; + break; + } + if (pg_len != 0 && (ret = db_cipher->decrypt(dbenv, + db_cipher->data, iv, ((u_int8_t *)pagep) + pg_off, + pg_len - pg_off)) != 0) + return (ret); + } + switch (pagep->type) { case P_INVALID: - return (__ham_pgin(dbenv, pg, pp, cookie)); + if (pginfo->type == DB_QUEUE) + return (__qam_pgin_out(dbenv, pg, pp, cookie)); + else + return (__ham_pgin(dbenv, dbp, pg, pp, cookie)); + case P_HASH: + case P_HASHMETA: + return (__ham_pgin(dbenv, dbp, pg, pp, cookie)); case P_BTREEMETA: case P_IBTREE: case P_IRECNO: @@ -86,14 +208,14 @@ __db_pgin(dbenv, pg, pp, cookie) case P_LDUP: case P_LRECNO: case P_OVERFLOW: - return (__bam_pgin(dbenv, pg, pp, cookie)); + return (__bam_pgin(dbenv, dbp, pg, pp, cookie)); case P_QAMMETA: case P_QAMDATA: return (__qam_pgin_out(dbenv, pg, pp, cookie)); default: break; } - return (__db_unknown_type(dbenv, "__db_pgin", ((PAGE *)pp)->type)); + return (__db_pgfmt(dbenv, pg)); } /* @@ -109,15 +231,33 @@ __db_pgout(dbenv, pg, pp, cookie) void *pp; DBT *cookie; { + DB dummydb, *dbp; + DB_CIPHER *db_cipher; DB_PGINFO *pginfo; + PAGE *pagep; + size_t pg_off, pg_len, sum_len; + int ret; + u_int8_t *chksum, *iv, *key; pginfo = (DB_PGINFO *)cookie->data; + pagep = (PAGE *)pp; - switch (((PAGE *)pp)->type) { + chksum = iv = key = NULL; + memset(&dummydb, 0, sizeof(DB)); + dbp = &dummydb; + dummydb.flags = pginfo->flags; + ret = 0; + switch (pagep->type) { + case P_INVALID: + if (pginfo->type == DB_QUEUE) + ret = __qam_pgin_out(dbenv, pg, pp, cookie); + else + ret = __ham_pgout(dbenv, dbp, pg, pp, cookie); + break; case P_HASH: case P_HASHMETA: - case P_INVALID: - return (__ham_pgout(dbenv, pg, pp, cookie)); + ret = __ham_pgout(dbenv, 
dbp, pg, pp, cookie); + break; case P_BTREEMETA: case P_IBTREE: case P_IRECNO: @@ -125,14 +265,73 @@ __db_pgout(dbenv, pg, pp, cookie) case P_LDUP: case P_LRECNO: case P_OVERFLOW: - return (__bam_pgout(dbenv, pg, pp, cookie)); + ret = __bam_pgout(dbenv, dbp, pg, pp, cookie); + break; case P_QAMMETA: case P_QAMDATA: - return (__qam_pgin_out(dbenv, pg, pp, cookie)); - default: + ret = __qam_pgin_out(dbenv, pg, pp, cookie); break; + default: + return (__db_pgfmt(dbenv, pg)); + } + if (ret) + return (ret); + + db_cipher = (DB_CIPHER *)dbenv->crypto_handle; + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + + DB_ASSERT(db_cipher != NULL); + DB_ASSERT(F_ISSET(dbp, DB_AM_CHKSUM)); + + pg_off = P_OVERHEAD(dbp); + DB_ASSERT(db_cipher->adj_size(pg_off) == 0); + + key = db_cipher->mac_key; + + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the iv + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + iv = ((BTMETA *)pp)->iv; + pg_len = DBMETASIZE; + break; + default: + iv = P_IV(dbp, pagep); + pg_len = pginfo->db_pagesize; + break; + } + if ((ret = db_cipher->encrypt(dbenv, db_cipher->data, + iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0) + return (ret); + } + if (F_ISSET(dbp, DB_AM_CHKSUM)) { + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the chksum + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + chksum = ((BTMETA *)pp)->chksum; + sum_len = DBMETASIZE; + break; + default: + chksum = P_CHKSUM(dbp, pagep); + sum_len = pginfo->db_pagesize; + break; + } + __db_chksum(pp, sum_len, key, chksum); } - return (__db_unknown_type(dbenv, "__db_pgout", ((PAGE *)pp)->type)); + return (0); } /* @@ -169,11 +368,13 @@ __db_metaswap(pg) * __db_byteswap -- * Byteswap a page. 
* - * PUBLIC: int __db_byteswap __P((DB_ENV *, db_pgno_t, PAGE *, size_t, int)); + * PUBLIC: int __db_byteswap + * PUBLIC: __P((DB_ENV *, DB *, db_pgno_t, PAGE *, size_t, int)); */ int -__db_byteswap(dbenv, pg, h, pagesize, pgin) +__db_byteswap(dbenv, dbp, pg, h, pagesize, pgin) DB_ENV *dbenv; + DB *dbp; db_pgno_t pg; PAGE *h; size_t pagesize; @@ -183,11 +384,12 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) BKEYDATA *bk; BOVERFLOW *bo; RINTERNAL *ri; - db_indx_t i, len, tmp; + db_indx_t i, *inp, len, tmp; u_int8_t *p, *end; COMPQUIET(pg, 0); + inp = P_INP(dbp, h); if (pgin) { M_32_SWAP(h->lsn.file); M_32_SWAP(h->lsn.offset); @@ -202,14 +404,14 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) case P_HASH: for (i = 0; i < NUM_ENT(h); i++) { if (pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); - switch (HPAGE_TYPE(h, i)) { + switch (HPAGE_TYPE(dbp, h, i)) { case H_KEYDATA: break; case H_DUPLICATE: - len = LEN_HKEYDATA(h, pagesize, i); - p = HKEYDATA_DATA(P_ENTRY(h, i)); + len = LEN_HKEYDATA(dbp, h, pagesize, i); + p = HKEYDATA_DATA(P_ENTRY(dbp, h, i)); for (end = p + len; p < end;) { if (pgin) { P_16_SWAP(p); @@ -226,11 +428,11 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) } break; case H_OFFDUP: - p = HOFFPAGE_PGNO(P_ENTRY(h, i)); + p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i)); SWAP32(p); /* pgno */ break; case H_OFFPAGE: - p = HOFFPAGE_PGNO(P_ENTRY(h, i)); + p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i)); SWAP32(p); /* pgno */ SWAP32(p); /* tlen */ break; @@ -246,14 +448,14 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) */ if (!pgin) for (i = 0; i < NUM_ENT(h); i++) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); break; case P_LBTREE: case P_LDUP: case P_LRECNO: for (i = 0; i < NUM_ENT(h); i++) { if (pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); /* * In the case of on-page duplicates, key information @@ -261,17 +463,17 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) */ if (h->type == P_LBTREE && i > 1) { if (pgin) { - if (h->inp[i] == h->inp[i - 2]) + if (inp[i] == inp[i - 2]) 
continue; } else { - M_16_SWAP(h->inp[i]); - if (h->inp[i] == h->inp[i - 2]) + M_16_SWAP(inp[i]); + if (inp[i] == inp[i - 2]) continue; - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); } } - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); switch (B_TYPE(bk->type)) { case B_KEYDATA: M_16_SWAP(bk->len); @@ -285,15 +487,15 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) } if (!pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); } break; case P_IBTREE: for (i = 0; i < NUM_ENT(h); i++) { if (pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); - bi = GET_BINTERNAL(h, i); + bi = GET_BINTERNAL(dbp, h, i); M_16_SWAP(bi->len); M_32_SWAP(bi->pgno); M_32_SWAP(bi->nrecs); @@ -310,20 +512,20 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) } if (!pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); } break; case P_IRECNO: for (i = 0; i < NUM_ENT(h); i++) { if (pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); - ri = GET_RINTERNAL(h, i); + ri = GET_RINTERNAL(dbp, h, i); M_32_SWAP(ri->pgno); M_32_SWAP(ri->nrecs); if (!pgin) - M_16_SWAP(h->inp[i]); + M_16_SWAP(inp[i]); } break; case P_OVERFLOW: @@ -331,7 +533,7 @@ __db_byteswap(dbenv, pg, h, pagesize, pgin) /* Nothing to do. */ break; default: - return (__db_unknown_type(dbenv, "__db_byteswap", h->type)); + return (__db_pgfmt(dbenv, pg)); } if (!pgin) { diff --git a/bdb/db/db_dispatch.c b/bdb/db/db_dispatch.c index c9beac401a7..2cf29ec2f33 100644 --- a/bdb/db/db_dispatch.c +++ b/bdb/db/db_dispatch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ /* @@ -39,7 +39,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bostic Exp $"; +static const char revid[] = "$Id: db_dispatch.c,v 11.121 2002/09/07 17:36:31 ubell Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -51,16 +51,24 @@ static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bost #endif #include "db_int.h" -#include "db_page.h" -#include "db_dispatch.h" -#include "db_am.h" -#include "log_auto.h" -#include "txn.h" -#include "txn_auto.h" -#include "log.h" - -static int __db_txnlist_find_internal __P((void *, db_txnlist_type, - u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int)); +#include "dbinc/db_page.h" +#include "dbinc/hash.h" +#include "dbinc/log.h" +#include "dbinc/fop.h" +#include "dbinc/rep.h" +#include "dbinc/txn.h" + +static int __db_limbo_fix __P((DB *, + DB_TXN *, DB_TXNLIST *, db_pgno_t *, DBMETA *)); +static int __db_limbo_bucket __P((DB_ENV *, DB_TXN *, DB_TXNLIST *)); +static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *)); +static int __db_lock_move __P((DB_ENV *, + u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *)); +static int __db_default_getpgnos __P((DB_ENV *, DB_LSN *lsnp, void *)); +static int __db_txnlist_find_internal __P((DB_ENV *, void *, db_txnlist_type, + u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int)); +static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *, + int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t)); /* * __db_dispatch -- @@ -71,16 +79,21 @@ static int __db_txnlist_find_internal __P((void *, db_txnlist_type, * scripts in the tools directory). An application using a different * recovery paradigm will supply a different dispatch function to txn_open. 
* - * PUBLIC: int __db_dispatch __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + * PUBLIC: int __db_dispatch __P((DB_ENV *, + * PUBLIC: int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)), + * PUBLIC: size_t, DBT *, DB_LSN *, db_recops, void *)); */ int -__db_dispatch(dbenv, db, lsnp, redo, info) +__db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info) DB_ENV *dbenv; /* The environment. */ + int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t dtabsize; /* Size of the dtab. */ DBT *db; /* The log record upon which to dispatch. */ DB_LSN *lsnp; /* The lsn of the record being dispatched. */ db_recops redo; /* Redo this op (or undo it). */ void *info; { + DB_LSN prev_lsn; u_int32_t rectype, txnid; int make_call, ret; @@ -88,6 +101,9 @@ __db_dispatch(dbenv, db, lsnp, redo, info) memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid)); make_call = ret = 0; + /* If we don't have a dispatch table, it's hard to dispatch. */ + DB_ASSERT(dtab != NULL); + /* * If we find a record that is in the user's number space and they * have specified a recovery routine, let them handle it. If they @@ -96,17 +112,29 @@ __db_dispatch(dbenv, db, lsnp, redo, info) */ switch (redo) { case DB_TXN_ABORT: - /* - * XXX - * db_printlog depends on DB_TXN_ABORT not examining the TXN - * list. If that ever changes, fix db_printlog too. - */ + case DB_TXN_APPLY: + case DB_TXN_PRINT: make_call = 1; break; case DB_TXN_OPENFILES: - if (rectype == DB_log_register) - return (dbenv->dtab[rectype](dbenv, - db, lsnp, redo, info)); + /* + * We collect all the transactions that have + * "begin" records, those with no previous LSN, + * so that we do not abort partial transactions. + * These are known to be undone, otherwise the + * log would not have been freeable. 
+ */ + memcpy(&prev_lsn, (u_int8_t *)db->data + + sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn)); + if (txnid != 0 && prev_lsn.file == 0 && (ret = + __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0) + return (ret); + + /* FALLTHROUGH */ + case DB_TXN_POPENFILES: + if (rectype == DB___dbreg_register || + rectype == DB___txn_ckp || rectype == DB___txn_recycle) + return (dtab[rectype](dbenv, db, lsnp, redo, info)); break; case DB_TXN_BACKWARD_ROLL: /* @@ -117,43 +145,146 @@ __db_dispatch(dbenv, db, lsnp, redo, info) * we've never seen it, then we call the appropriate recovery * routine. * - * We need to always undo DB_db_noop records, so that we + * We need to always undo DB___db_noop records, so that we * properly handle any aborts before the file was closed. */ - if (rectype == DB_log_register || - rectype == DB_txn_ckp || rectype == DB_db_noop - || rectype == DB_txn_child || (txnid != 0 && - (ret = __db_txnlist_find(info, txnid)) != 0)) { + switch(rectype) { + case DB___txn_regop: + case DB___txn_recycle: + case DB___txn_ckp: + case DB___db_noop: + case DB___fop_file_remove: + case DB___txn_child: make_call = 1; - if (ret == DB_NOTFOUND && rectype != DB_txn_regop && - rectype != DB_txn_xa_regop && (ret = - __db_txnlist_add(dbenv, info, txnid, 1)) != 0) - return (ret); + break; + + case DB___dbreg_register: + if (txnid == 0) { + make_call = 1; + break; + } + /* FALLTHROUGH */ + default: + if (txnid != 0 && (ret = + __db_txnlist_find(dbenv, + info, txnid)) != TXN_COMMIT && ret != TXN_IGNORE) { + /* + * If not found then, this is an incomplete + * abort. + */ + if (ret == TXN_NOTFOUND) + return (__db_txnlist_add(dbenv, + info, txnid, TXN_IGNORE, lsnp)); + make_call = 1; + if (ret == TXN_OK && + (ret = __db_txnlist_update(dbenv, + info, txnid, + rectype == DB___txn_xa_regop ? 
+ TXN_PREPARE : TXN_ABORT, NULL)) != 0) + return (ret); + } } break; case DB_TXN_FORWARD_ROLL: /* * In the forward pass, if we haven't seen the transaction, - * do nothing, else recovery it. + * do nothing, else recover it. * - * We need to always redo DB_db_noop records, so that we + * We need to always redo DB___db_noop records, so that we * properly handle any commits after the file was closed. */ - if (rectype == DB_log_register || - rectype == DB_txn_ckp || - rectype == DB_db_noop || - __db_txnlist_find(info, txnid) == 0) + switch(rectype) { + case DB___txn_recycle: + case DB___txn_ckp: + case DB___db_noop: make_call = 1; + break; + + default: + if (txnid != 0 && (ret = __db_txnlist_find(dbenv, + info, txnid)) == TXN_COMMIT) + make_call = 1; + else if (ret != TXN_IGNORE && + (rectype == DB___ham_metagroup || + rectype == DB___ham_groupalloc || + rectype == DB___db_pg_alloc)) { + /* + * Because we cannot undo file extensions + * all allocation records must be reprocessed + * during rollforward in case the file was + * just created. It may not have been + * present during the backward pass. + */ + make_call = 1; + redo = DB_TXN_BACKWARD_ALLOC; + } else if (rectype == DB___dbreg_register) { + /* + * This may be a transaction dbreg_register. + * If it is, we only make the call on a COMMIT, + * which we checked above. If it's not, then we + * should always make the call, because we need + * the file open information. + */ + if (txnid == 0) + make_call = 1; + } + } break; + case DB_TXN_GETPGNOS: + /* + * If this is one of DB's own log records, we simply + * dispatch. + */ + if (rectype < DB_user_BEGIN) { + make_call = 1; + break; + } + + /* + * If we're still here, this is a custom record in an + * application that's doing app-specific logging. 
Such a + * record doesn't have a getpgno function for the user + * dispatch function to call--the getpgnos functions return + * which pages replication needs to lock using the TXN_RECS + * structure, which is private and not something we want to + * document. + * + * Thus, we leave any necessary locking for the app's + * recovery function to do during the upcoming + * DB_TXN_APPLY. Fill in default getpgnos info (we need + * a stub entry for every log record that will get + * DB_TXN_APPLY'd) and return success. + */ + return (__db_default_getpgnos(dbenv, lsnp, info)); default: return (__db_unknown_flag(dbenv, "__db_dispatch", redo)); } + /* + * The switch statement uses ret to receive the return value of + * __db_txnlist_find, which returns a large number of different + * statuses, none of which we will be returning. For safety, + * let's reset this here in case we ever do a "return(ret)" + * below in the future. + */ + ret = 0; if (make_call) { - if (rectype >= DB_user_BEGIN && dbenv->tx_recover != NULL) - return (dbenv->tx_recover(dbenv, db, lsnp, redo)); - else - return (dbenv->dtab[rectype](dbenv, db, lsnp, redo, info)); + if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL) + return (dbenv->app_dispatch(dbenv, db, lsnp, redo)); + else { + /* + * The size of the dtab table argument is the same as + * the standard table, use the standard table's size + * as our sanity check. 
+ */ + if (rectype > dtabsize || dtab[rectype] == NULL) { + __db_err(dbenv, + "Illegal record type %lu in log", + (u_long)rectype); + return (EINVAL); + } + return (dtab[rectype](dbenv, db, lsnp, redo, info)); + } } return (0); @@ -163,75 +294,100 @@ __db_dispatch(dbenv, db, lsnp, redo, info) * __db_add_recovery -- * * PUBLIC: int __db_add_recovery __P((DB_ENV *, - * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t)); + * PUBLIC: int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *, + * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t)); */ int -__db_add_recovery(dbenv, func, ndx) +__db_add_recovery(dbenv, dtab, dtabsize, func, ndx) DB_ENV *dbenv; + int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t *dtabsize; int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); u_int32_t ndx; { - u_int32_t i, nsize; + size_t i, nsize; int ret; /* Check if we have to grow the table. */ - if (ndx >= dbenv->dtab_size) { + if (ndx >= *dtabsize) { nsize = ndx + 40; - if ((ret = __os_realloc(dbenv, - nsize * sizeof(dbenv->dtab[0]), NULL, &dbenv->dtab)) != 0) + if ((ret = + __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0) return (ret); - for (i = dbenv->dtab_size; i < nsize; ++i) - dbenv->dtab[i] = NULL; - dbenv->dtab_size = nsize; + for (i = *dtabsize; i < nsize; ++i) + (*dtab)[i] = NULL; + *dtabsize = nsize; } - dbenv->dtab[ndx] = func; + (*dtab)[ndx] = func; return (0); } /* - * __deprecated_recover -- - * Stub routine for deprecated recovery functions. 
- * - * PUBLIC: int __deprecated_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__deprecated_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - COMPQUIET(dbenv, NULL); - COMPQUIET(dbtp, NULL); - COMPQUIET(lsnp, NULL); - COMPQUIET(op, 0); - COMPQUIET(info, NULL); - return (EINVAL); -} - -/* * __db_txnlist_init -- * Initialize transaction linked list. * - * PUBLIC: int __db_txnlist_init __P((DB_ENV *, void *)); + * PUBLIC: int __db_txnlist_init __P((DB_ENV *, + * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, void *)); */ int -__db_txnlist_init(dbenv, retp) +__db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp) DB_ENV *dbenv; + u_int32_t low_txn, hi_txn; + DB_LSN *trunc_lsn; void *retp; { DB_TXNHEAD *headp; - int ret; + u_int32_t tmp; + int ret, size; - if ((ret = __os_malloc(dbenv, sizeof(DB_TXNHEAD), NULL, &headp)) != 0) + /* + * Size a hash table. + * If low is zero then we are being called during rollback + * and we need only one slot. + * Hi may be lower than low if we have recycled txnids. + * The numbers here are guesses about txn density, we can afford + * to look at a few entries in each slot. + */ + if (low_txn == 0) + size = 1; + else { + if (hi_txn < low_txn) { + tmp = hi_txn; + hi_txn = low_txn; + low_txn = tmp; + } + tmp = hi_txn - low_txn; + /* See if we wrapped around. 
*/ + if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2) + tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn); + size = tmp / 5; + if (size < 100) + size = 100; + } + if ((ret = __os_malloc(dbenv, + sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0) return (ret); - LIST_INIT(&headp->head); - headp->maxid = 0; - headp->generation = 1; + memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head)); + headp->maxid = hi_txn; + headp->generation = 0; + headp->nslots = size; + headp->gen_alloc = 8; + if ((ret = __os_malloc(dbenv, headp->gen_alloc * + sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) { + __os_free(dbenv, headp); + return (ret); + } + headp->gen_array[0].generation = 0; + headp->gen_array[0].txn_min = TXN_MINIMUM; + headp->gen_array[0].txn_max = TXN_MAXIMUM; + if (trunc_lsn != NULL) + headp->trunc_lsn = *trunc_lsn; + else + ZERO_LSN(headp->trunc_lsn); + ZERO_LSN(headp->maxlsn); + ZERO_LSN(headp->ckplsn); *(void **)retp = headp; return (0); @@ -241,132 +397,86 @@ __db_txnlist_init(dbenv, retp) * __db_txnlist_add -- * Add an element to our transaction linked list. 
* - * PUBLIC: int __db_txnlist_add __P((DB_ENV *, void *, u_int32_t, int32_t)); + * PUBLIC: int __db_txnlist_add __P((DB_ENV *, + * PUBLIC: void *, u_int32_t, int32_t, DB_LSN *)); */ int -__db_txnlist_add(dbenv, listp, txnid, aborted) +__db_txnlist_add(dbenv, listp, txnid, status, lsn) DB_ENV *dbenv; void *listp; u_int32_t txnid; - int32_t aborted; + int32_t status; + DB_LSN *lsn; { DB_TXNHEAD *hp; DB_TXNLIST *elp; int ret; - if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0) + if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0) return (ret); hp = (DB_TXNHEAD *)listp; - LIST_INSERT_HEAD(&hp->head, elp, links); + LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links); elp->type = TXNLIST_TXNID; elp->u.t.txnid = txnid; - elp->u.t.aborted = aborted; + elp->u.t.status = status; + elp->u.t.generation = hp->generation; if (txnid > hp->maxid) hp->maxid = txnid; - elp->u.t.generation = hp->generation; + if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) + hp->maxlsn = *lsn; + + DB_ASSERT(lsn == NULL || + status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0); return (0); } + /* * __db_txnlist_remove -- * Remove an element from our transaction linked list. * - * PUBLIC: int __db_txnlist_remove __P((void *, u_int32_t)); + * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, void *, u_int32_t)); */ int -__db_txnlist_remove(listp, txnid) +__db_txnlist_remove(dbenv, listp, txnid) + DB_ENV *dbenv; void *listp; u_int32_t txnid; { DB_TXNLIST *entry; - return (__db_txnlist_find_internal(listp, - TXNLIST_TXNID, txnid, NULL, &entry, 1)); -} - -/* __db_txnlist_close -- - * - * Call this when we close a file. It allows us to reconcile whether - * we have done any operations on this file with whether the file appears - * to have been deleted. If you never do any operations on a file, then - * we assume it's OK to appear deleted. 
- * - * PUBLIC: int __db_txnlist_close __P((void *, int32_t, u_int32_t)); - */ - -int -__db_txnlist_close(listp, lid, count) - void *listp; - int32_t lid; - u_int32_t count; -{ - DB_TXNHEAD *hp; - DB_TXNLIST *p; - - hp = (DB_TXNHEAD *)listp; - for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) { - if (p->type == TXNLIST_DELETE) - if (lid == p->u.d.fileid && - !F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED)) { - p->u.d.count += count; - return (0); - } - } - - return (0); + return (__db_txnlist_find_internal(dbenv, + listp, TXNLIST_TXNID, txnid, + NULL, &entry, 1) == TXN_NOTFOUND ? TXN_NOTFOUND : TXN_OK); } /* - * __db_txnlist_delete -- - * - * Record that a file was missing or deleted. If the deleted - * flag is set, then we've encountered a delete of a file, else we've - * just encountered a file that is missing. The lid is the log fileid - * and is only meaningful if deleted is not equal to 0. + * __db_txnlist_ckp -- + * Used to record the maximum checkpoint that will be retained + * after recovery. Typically this is simply the max checkpoint, but + * if we are doing client replication recovery or timestamp-based + * recovery, we are going to virtually truncate the log and we need + * to retain the last checkpoint before the truncation point. 
* - * PUBLIC: int __db_txnlist_delete __P((DB_ENV *, - * PUBLIC: void *, char *, u_int32_t, int)); + * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, void *, DB_LSN *)); */ -int -__db_txnlist_delete(dbenv, listp, name, lid, deleted) +void +__db_txnlist_ckp(dbenv, listp, ckp_lsn) DB_ENV *dbenv; void *listp; - char *name; - u_int32_t lid; - int deleted; + DB_LSN *ckp_lsn; { DB_TXNHEAD *hp; - DB_TXNLIST *p; - int ret; - hp = (DB_TXNHEAD *)listp; - for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) { - if (p->type == TXNLIST_DELETE) - if (strcmp(name, p->u.d.fname) == 0) { - if (deleted) - F_SET(&p->u.d, TXNLIST_FLAG_DELETED); - else - F_CLR(&p->u.d, TXNLIST_FLAG_CLOSED); - return (0); - } - } - - /* Need to add it. */ - if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &p)) != 0) - return (ret); - LIST_INSERT_HEAD(&hp->head, p, links); + COMPQUIET(dbenv, NULL); - p->type = TXNLIST_DELETE; - p->u.d.flags = 0; - if (deleted) - F_SET(&p->u.d, TXNLIST_FLAG_DELETED); - p->u.d.fileid = lid; - p->u.d.count = 0; - ret = __os_strdup(dbenv, name, &p->u.d.fname); + hp = (DB_TXNHEAD *)listp; - return (ret); + if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) && + log_compare(&hp->maxlsn, ckp_lsn) >= 0) + hp->ckplsn = *ckp_lsn; } /* @@ -383,99 +493,156 @@ __db_txnlist_end(dbenv, listp) { DB_TXNHEAD *hp; DB_TXNLIST *p; - DB_LOG *lp; + int i; - hp = (DB_TXNHEAD *)listp; - lp = (DB_LOG *)dbenv->lg_handle; - while (hp != NULL && (p = LIST_FIRST(&hp->head)) != NULL) { - LIST_REMOVE(p, links); - switch (p->type) { - case TXNLIST_DELETE: - /* - * If we have a file that is not deleted and has - * some operations, we flag the warning. Since - * the file could still be open, we need to check - * the actual log table as well. 
- */ - if ((!F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) && - p->u.d.count != 0) || - (!F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) && - p->u.d.fileid != (int32_t) TXNLIST_INVALID_ID && - p->u.d.fileid < lp->dbentry_cnt && - lp->dbentry[p->u.d.fileid].count != 0)) - __db_err(dbenv, "warning: %s: %s", - p->u.d.fname, db_strerror(ENOENT)); - __os_freestr(p->u.d.fname); - break; - case TXNLIST_LSN: - __os_free(p->u.l.lsn_array, - p->u.l.maxn * sizeof(DB_LSN)); - break; - default: - /* Possibly an incomplete DB_TXNLIST; just free it. */ - break; + if ((hp = (DB_TXNHEAD *)listp) == NULL) + return; + + for (i = 0; i < hp->nslots; i++) + while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) { + LIST_REMOVE(p, links); + switch (p->type) { + case TXNLIST_LSN: + __os_free(dbenv, p->u.l.lsn_array); + break; + default: + /* + * Possibly an incomplete DB_TXNLIST; just + * free it. + */ + break; + } + __os_free(dbenv, p); } - __os_free(p, sizeof(DB_TXNLIST)); - } - __os_free(listp, sizeof(DB_TXNHEAD)); + + if (hp->gen_array != NULL) + __os_free(dbenv, hp->gen_array); + __os_free(dbenv, listp); } /* * __db_txnlist_find -- * Checks to see if a txnid with the current generation is in the - * txnid list. This returns DB_NOTFOUND if the item isn't in the - * list otherwise it returns (like __db_txnlist_find_internal) a - * 1 or 0 indicating if the transaction is aborted or not. A txnid - * of 0 means the record was generated while not in a transaction. + * txnid list. This returns TXN_NOTFOUND if the item isn't in the + * list otherwise it returns (like __db_txnlist_find_internal) + * the status of the transaction. A txnid of 0 means the record + * was generated while not in a transaction. 
* - * PUBLIC: int __db_txnlist_find __P((void *, u_int32_t)); + * PUBLIC: int __db_txnlist_find __P((DB_ENV *, void *, u_int32_t)); */ int -__db_txnlist_find(listp, txnid) +__db_txnlist_find(dbenv, listp, txnid) + DB_ENV *dbenv; void *listp; u_int32_t txnid; { DB_TXNLIST *entry; if (txnid == 0) - return (DB_NOTFOUND); - return (__db_txnlist_find_internal(listp, - TXNLIST_TXNID, txnid, NULL, &entry, 0)); + return (TXN_NOTFOUND); + return (__db_txnlist_find_internal(dbenv, listp, + TXNLIST_TXNID, txnid, NULL, &entry, 0)); +} + +/* + * __db_txnlist_update -- + * Change the status of an existing transaction entry. + * Returns TXN_NOTFOUND if no such entry exists. + * + * PUBLIC: int __db_txnlist_update __P((DB_ENV *, + * PUBLIC: void *, u_int32_t, u_int32_t, DB_LSN *)); + */ +int +__db_txnlist_update(dbenv, listp, txnid, status, lsn) + DB_ENV *dbenv; + void *listp; + u_int32_t txnid; + u_int32_t status; + DB_LSN *lsn; +{ + DB_TXNHEAD *hp; + DB_TXNLIST *elp; + int ret; + + if (txnid == 0) + return (TXN_NOTFOUND); + hp = (DB_TXNHEAD *)listp; + ret = __db_txnlist_find_internal(dbenv, + listp, TXNLIST_TXNID, txnid, NULL, &elp, 0); + + if (ret == TXN_NOTFOUND) + return (ret); + elp->u.t.status = status; + + if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) + hp->maxlsn = *lsn; + + return (ret); } /* * __db_txnlist_find_internal -- - * Find an entry on the transaction list. - * If the entry is not there or the list pointeris not initialized - * we return DB_NOTFOUND. If the item is found, we return the aborted - * status (1 for aborted, 0 for not aborted). Currently we always call - * this with an initialized list pointer but checking for NULL keeps it general. + * Find an entry on the transaction list. If the entry is not there or + * the list pointer is not initialized we return TXN_NOTFOUND. If the + * item is found, we return the status. Currently we always call this + * with an initialized list pointer but checking for NULL keeps it general. 
*/ static int -__db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete) +__db_txnlist_find_internal(dbenv, listp, type, txnid, uid, txnlistp, delete) + DB_ENV *dbenv; void *listp; db_txnlist_type type; - u_int32_t txnid; + u_int32_t txnid; u_int8_t uid[DB_FILE_ID_LEN]; DB_TXNLIST **txnlistp; int delete; { DB_TXNHEAD *hp; DB_TXNLIST *p; - int ret; + int32_t generation; + u_int32_t hash; + struct __db_headlink *head; + int i, ret; if ((hp = (DB_TXNHEAD *)listp) == NULL) - return (DB_NOTFOUND); + return (TXN_NOTFOUND); + + switch (type) { + case TXNLIST_TXNID: + hash = txnid; + /* Find the most recent generation containing this ID */ + for (i = 0; i <= hp->generation; i++) + /* The range may wrap around the end. */ + if (hp->gen_array[i].txn_min < + hp->gen_array[i].txn_max ? + (txnid >= hp->gen_array[i].txn_min && + txnid <= hp->gen_array[i].txn_max) : + (txnid >= hp->gen_array[i].txn_min || + txnid <= hp->gen_array[i].txn_max)) + break; + DB_ASSERT(i <= hp->generation); + generation = hp->gen_array[i].generation; + break; + case TXNLIST_PGNO: + memcpy(&hash, uid, sizeof(hash)); + generation = 0; + break; + default: + DB_ASSERT(0); + return (EINVAL); + } + + head = &hp->head[DB_TXNLIST_MASK(hp, hash)]; - for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) { + for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) { if (p->type != type) continue; switch (type) { case TXNLIST_TXNID: if (p->u.t.txnid != txnid || - hp->generation != p->u.t.generation) + generation != p->u.t.generation) continue; - ret = p->u.t.aborted; + ret = p->u.t.status; break; case TXNLIST_PGNO: @@ -490,42 +657,67 @@ __db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete) } if (delete == 1) { LIST_REMOVE(p, links); - __os_free(p, sizeof(DB_TXNLIST)); - } else if (p != LIST_FIRST(&hp->head)) { + __os_free(dbenv, p); + } else if (p != LIST_FIRST(head)) { /* Move it to head of list. 
*/ LIST_REMOVE(p, links); - LIST_INSERT_HEAD(&hp->head, p, links); + LIST_INSERT_HEAD(head, p, links); } *txnlistp = p; return (ret); } - return (DB_NOTFOUND); + return (TXN_NOTFOUND); } /* * __db_txnlist_gen -- * Change the current generation number. * - * PUBLIC: void __db_txnlist_gen __P((void *, int)); + * PUBLIC: int __db_txnlist_gen __P((DB_ENV *, + * PUBLIC: void *, int, u_int32_t, u_int32_t)); */ -void -__db_txnlist_gen(listp, incr) +int +__db_txnlist_gen(dbenv, listp, incr, min, max) + DB_ENV *dbenv; void *listp; int incr; + u_int32_t min, max; { DB_TXNHEAD *hp; + int ret; /* - * During recovery generation numbers keep track of how many "restart" - * checkpoints we've seen. Restart checkpoints occur whenever we take - * a checkpoint and there are no outstanding transactions. When that - * happens, we can reset transaction IDs back to 1. It always happens - * at recovery and it prevents us from exhausting the transaction IDs - * name space. + * During recovery generation numbers keep track of "restart" + * checkpoints and recycle records. Restart checkpoints occur + * whenever we take a checkpoint and there are no outstanding + * transactions. When that happens, we can reset transaction IDs + * back to TXNID_MINIMUM. Currently we only do the reset + * at the end of recovery. Recycle records occur when txnids + * are exhausted during runtime. A free range of ids is identified + * and logged. This code maintains a stack of ranges. A txnid + * is given the generation number of the first range it falls into + * in the stack. 
*/ hp = (DB_TXNHEAD *)listp; hp->generation += incr; + if (incr < 0) + memmove(hp->gen_array, &hp->gen_array[1], + (hp->generation + 1) * sizeof(hp->gen_array[0])); + else { + if (hp->generation >= hp->gen_alloc) { + hp->gen_alloc *= 2; + if ((ret = __os_realloc(dbenv, hp->gen_alloc * + sizeof(hp->gen_array[0]), &hp->gen_array)) != 0) + return (ret); + } + memmove(&hp->gen_array[1], &hp->gen_array[0], + hp->generation * sizeof(hp->gen_array[0])); + hp->gen_array[0].generation = hp->generation; + hp->gen_array[0].txn_min = min; + hp->gen_array[0].txn_max = max; + } + return (0); } #define TXN_BUBBLE(AP, MAX) { \ @@ -542,10 +734,10 @@ __db_txnlist_gen(listp, incr) /* * __db_txnlist_lsnadd -- - * Add to or re-sort the transaction list lsn entry. - * Note that since this is used during an abort, the __txn_undo - * code calls into the "recovery" subsystem explicitly, and there - * is only a single TXNLIST_LSN entry on the list. + * Add to or re-sort the transaction list lsn entry. Note that since this + * is used during an abort, the __txn_undo code calls into the "recovery" + * subsystem explicitly, and there is only a single TXNLIST_LSN entry on + * the list. 
* * PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, void *, DB_LSN *, u_int32_t)); */ @@ -562,19 +754,19 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags) hp = (DB_TXNHEAD *)listp; - for (elp = LIST_FIRST(&hp->head); + for (elp = LIST_FIRST(&hp->head[0]); elp != NULL; elp = LIST_NEXT(elp, links)) if (elp->type == TXNLIST_LSN) break; if (elp == NULL) - return (EINVAL); + return (DB_SURPRISE_KID); if (LF_ISSET(TXNLIST_NEW)) { if (elp->u.l.ntxns >= elp->u.l.maxn) { if ((ret = __os_realloc(dbenv, 2 * elp->u.l.maxn * sizeof(DB_LSN), - NULL, &elp->u.l.lsn_array)) != 0) + &elp->u.l.lsn_array)) != 0) return (ret); elp->u.l.maxn *= 2; } @@ -584,9 +776,9 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags) elp->u.l.lsn_array[0] = *lsnp; /* - * If we just added a new entry and there may be NULL - * entries, so we have to do a complete bubble sort, - * not just trickle a changed entry around. + * If we just added a new entry and there may be NULL entries, so we + * have to do a complete bubble sort, not just trickle a changed entry + * around. */ for (i = 0; i < (!LF_ISSET(TXNLIST_NEW) ? 1 : elp->u.l.ntxns); i++) TXN_BUBBLE(elp->u.l.lsn_array, elp->u.l.ntxns); @@ -597,35 +789,6 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags) } /* - * __db_txnlist_lsnhead -- - * Return a pointer to the beginning of the lsn_array. - * - * PUBLIC: int __db_txnlist_lsnhead __P((void *, DB_LSN **)); - */ -int -__db_txnlist_lsnhead(listp, lsnpp) - void *listp; - DB_LSN **lsnpp; -{ - DB_TXNHEAD *hp; - DB_TXNLIST *elp; - - hp = (DB_TXNHEAD *)listp; - - for (elp = LIST_FIRST(&hp->head); - elp != NULL; elp = LIST_NEXT(elp, links)) - if (elp->type == TXNLIST_LSN) - break; - - if (elp == NULL) - return (EINVAL); - - *lsnpp = &elp->u.l.lsn_array[0]; - - return (0); -} - -/* * __db_txnlist_lsninit -- * Initialize a transaction list with an lsn array entry. 
* @@ -642,12 +805,12 @@ __db_txnlist_lsninit(dbenv, hp, lsnp) elp = NULL; - if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0) + if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0) goto err; - LIST_INSERT_HEAD(&hp->head, elp, links); + LIST_INSERT_HEAD(&hp->head[0], elp, links); if ((ret = __os_malloc(dbenv, - 12 * sizeof(DB_LSN), NULL, &elp->u.l.lsn_array)) != 0) + 12 * sizeof(DB_LSN), &elp->u.l.lsn_array)) != 0) goto err; elp->type = TXNLIST_LSN; elp->u.l.maxn = 12; @@ -662,8 +825,7 @@ err: __db_txnlist_end(dbenv, hp); /* * __db_add_limbo -- add pages to the limbo list. - * Get the file information and call pgnoadd - * for each page. + * Get the file information and call pgnoadd for each page. * * PUBLIC: int __db_add_limbo __P((DB_ENV *, * PUBLIC: void *, int32_t, db_pgno_t, int32_t)); @@ -681,7 +843,7 @@ __db_add_limbo(dbenv, info, fileid, pgno, count) int ret; dblp = dbenv->lg_handle; - if ((ret = __log_lid_to_fname(dblp, fileid, &fnp)) != 0) + if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0) return (ret); do { @@ -698,201 +860,429 @@ __db_add_limbo(dbenv, info, fileid, pgno, count) /* * __db_do_the_limbo -- move pages from limbo to free. * - * If we are in recovery we add things to the free list without - * logging becasue we want to incrementaly apply logs that - * may be generated on another copy of this environment. - * Otherwise we just call __db_free to put the pages on - * the free list and log the activity. + * Limbo processing is what ensures that we correctly handle and + * recover from page allocations. During recovery, for each database, + * we process each in-question allocation, link them into the free list + * and then write out the new meta-data page that contains the pointer + * to the new beginning of the free list. On an abort, we use our + * standard __db_free mechanism in a compensating transaction which logs + * the specific modifications to the free list. 
+ * + * If we run out of log space during an abort, then we can't write the + * compensating transaction, so we abandon the idea of a compensating + * transaction, and go back to processing how we do during recovery. + * The reason that this is not the norm is that it's expensive: it requires + * that we flush any database with an in-question allocation. Thus if + * a compensating transaction fails, we never try to restart it. + * + * Since files may be open and closed within transactions (in particular, + * the master database for subdatabases), we must be prepared to open + * files during this process. If there is a compensating transaction, we + * can open the files in that transaction. If this was an abort and there + * is no compensating transaction, then we've got to perform these opens + * in the context of the aborting transaction so that we do not deadlock. + * During recovery, there's no locking, so this isn't an issue. * - * PUBLIC: int __db_do_the_limbo __P((DB_ENV *, DB_TXNHEAD *)); + * What you want to keep in mind when reading this is that there are two + * algorithms going on here: if ctxn == NULL, then we're either in recovery + * or our compensating transaction has failed and we're doing the + * "create list and write meta-data page" algorithm. Otherwise, we're in + * an abort and doing the "use compensating transaction" algorithm. + * + * PUBLIC: int __db_do_the_limbo __P((DB_ENV *, + * PUBLIC: DB_TXN *, DB_TXN *, DB_TXNHEAD *)); */ int -__db_do_the_limbo(dbenv, hp) +__db_do_the_limbo(dbenv, ptxn, txn, hp) DB_ENV *dbenv; + DB_TXN *ptxn, *txn; DB_TXNHEAD *hp; { - DB *dbp; - DBC *dbc; - DBMETA *meta; - DB_TXN *txn; DB_TXNLIST *elp; - PAGE *pagep; - db_pgno_t last_pgno, pgno; - int i, in_recover, put_page, ret, t_ret; + int h, ret; - dbp = NULL; - dbc = NULL; - txn = NULL; ret = 0; + /* + * The slots correspond to hash buckets. We've hashed the + * fileids into hash buckets and need to pick up all affected + * files.
(There will only be a single slot for an abort.) + */ + for (h = 0; h < hp->nslots; h++) { + if ((elp = LIST_FIRST(&hp->head[h])) == NULL) + continue; + if (ptxn != NULL) { + if ((ret = + __db_limbo_move(dbenv, ptxn, txn, elp)) != 0) + goto err; + } else if ((ret = __db_limbo_bucket(dbenv, txn, elp)) != 0) + goto err; + } + +err: if (ret != 0) { + __db_err(dbenv, "Fatal error in abort of an allocation"); + ret = __db_panic(dbenv, ret); + } - /* Are we in recovery? */ - in_recover = F_ISSET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + return (ret); +} - for (elp = LIST_FIRST(&hp->head); - elp != NULL; elp = LIST_NEXT(elp, links)) { +/* Limbo support routines. */ + +/* + * __db_lock_move -- + * Move a lock from child to parent. + */ +static int +__db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn) + DB_ENV *dbenv; + u_int8_t *fileid; + db_pgno_t pgno; + db_lockmode_t mode; + DB_TXN *ptxn, *txn; +{ + DBT lock_dbt; + DB_LOCK lock; + DB_LOCK_ILOCK lock_obj; + DB_LOCKREQ req; + int ret; + + lock_obj.pgno = pgno; + memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN); + lock_obj.type = DB_PAGE_LOCK; + + memset(&lock_dbt, 0, sizeof(lock_dbt)); + lock_dbt.data = &lock_obj; + lock_dbt.size = sizeof(lock_obj); + + if ((ret = dbenv->lock_get(dbenv, + txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) { + memset(&req, 0, sizeof(req)); + req.lock = lock; + req.op = DB_LOCK_TRADE; + + ret = dbenv->lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL); + } + return (ret); +} + +/* + * __db_limbo_move + * Move just the metapage lock to the parent. 
+ */ +static int +__db_limbo_move(dbenv, ptxn, txn, elp) + DB_ENV *dbenv; + DB_TXN *ptxn, *txn; + DB_TXNLIST *elp; +{ + int ret; + + for (; elp != NULL; elp = LIST_NEXT(elp, links)) { + if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1) + continue; + if ((ret = __db_lock_move(dbenv, elp->u.p.uid, + PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0) + return (ret); + elp->u.p.locked = 1; + } + + return (0); +} +/* + * __db_limbo_bucket + * Perform limbo processing for a single hash bucket in the txnlist. + * txn is the transaction aborting in the case of an abort and ctxn is the + * compensating transaction. + */ + +#define T_RESTORED(txn) ((txn) != NULL && F_ISSET(txn, TXN_RESTORED)) +static int +__db_limbo_bucket(dbenv, txn, elp) + DB_ENV *dbenv; + DB_TXN *txn; + DB_TXNLIST *elp; +{ + DB *dbp; + DB_MPOOLFILE *mpf; + DBMETA *meta; + DB_TXN *ctxn, *t; + db_pgno_t last_pgno, pgno; + int dbp_created, in_retry, ret, t_ret; + + ctxn = NULL; + in_retry = 0; + meta = NULL; + mpf = NULL; + ret = 0; + for (; elp != NULL; elp = LIST_NEXT(elp, links)) { if (elp->type != TXNLIST_PGNO) continue; +retry: dbp_created = 0; + + /* + * Pick the transaction in which to potentially + * log compensations. + */ + if (!in_retry && !IS_RECOVERING(dbenv) && !T_RESTORED(txn) + && (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0) + return (ret); + + /* + * Either use the compensating transaction or + * the one passed in, which will be null if recovering. + */ + t = ctxn == NULL ? txn : ctxn; + + /* First try to get a dbp by fileid. */ + ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0); + + /* + * File is being destroyed. No need to worry about + * dealing with recovery of allocations. + */ + if (ret == DB_DELETED || + (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD))) + goto next; - if (in_recover) { + if (ret != 0) { if ((ret = db_create(&dbp, dbenv, 0)) != 0) goto err; /* - * It is ok if the file is nolonger there. 
+ * This tells the system not to lock, which is always + * OK, whether this is an abort or recovery. */ + F_SET(dbp, DB_AM_COMPENSATE); + dbp_created = 1; + + /* It is ok if the file is nolonger there. */ dbp->type = DB_UNKNOWN; - ret = __db_dbopen(dbp, - elp->u.p.fname, 0, __db_omode("rw----"), 0); + ret = __db_dbopen(dbp, t, elp->u.p.fname, NULL, + DB_ODDFILESIZE, __db_omode("rw----"), PGNO_BASE_MD); + if (ret == ENOENT) + goto next; + } + + /* + * Verify that we are opening the same file that we were + * referring to when we wrote this log record. + */ + if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0) + goto next; + + mpf = dbp->mpf; + last_pgno = PGNO_INVALID; + + if (ctxn == NULL) { + pgno = PGNO_BASE_MD; + if ((ret = + mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0) + goto err; + last_pgno = meta->free; + } + + ret = __db_limbo_fix(dbp, ctxn, elp, &last_pgno, meta); + /* + * If we were doing compensating transactions, then we are + * going to hope this error was due to running out of space. + * We'll change modes (into the sync the file mode) and keep + * trying. If we weren't doing compensating transactions, + * then this is a real error and we're sunk. + */ + if (ret != 0) { + if (ret == DB_RUNRECOVERY || ctxn == NULL) + goto err; + in_retry = 1; + goto retry; + } + + if (ctxn != NULL) { + ret = ctxn->commit(ctxn, DB_TXN_NOSYNC); + ctxn = NULL; + if (ret != 0) + goto retry; + goto next; + } + + /* + * This is where we handle the case where we're explicitly + * putting together a free list. We need to decide whether + * we have to write the meta-data page, and if we do, then + * we need to sync it as well. + */ + if (last_pgno == meta->free) { + /* No change to page; just put the page back. */ + if ((ret = mpf->put(mpf, meta, 0)) != 0) + goto err; + meta = NULL; } else { /* - * If we are in transaction undo, then we know - * the fileid is still correct. 
+ * These changes are unlogged so we cannot have the + * metapage pointing at pages that are not on disk. + * Therefore, we flush the new free list, then update + * the metapage. We have to put the meta-data page + * first so that it isn't pinned when we try to sync. */ + if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn)) + __db_err(dbenv, "Flushing free list to disk"); + if ((ret = mpf->put(mpf, meta, 0)) != 0) + goto err; + meta = NULL; + dbp->sync(dbp, 0); + pgno = PGNO_BASE_MD; if ((ret = - __db_fileid_to_db(dbenv, &dbp, - elp->u.p.fileid, 0)) != 0 && ret != DB_DELETED) + mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0) + goto err; + meta->free = last_pgno; + if ((ret = mpf->put(mpf, meta, DB_MPOOL_DIRTY)) != 0) goto err; - /* File is being destroyed. */ - if (F_ISSET(dbp, DB_AM_DISCARD)) - ret = DB_DELETED; + meta = NULL; } + +next: /* - * Verify that we are opening the same file that we were - * referring to when we wrote this log record. + * If we get here, either we have processed the list + * or the db file has been deleted or could no be opened. */ - if (ret == 0 && - memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) == 0) { - last_pgno = PGNO_INVALID; - if (in_recover) { - pgno = PGNO_BASE_MD; - if ((ret = memp_fget(dbp->mpf, - &pgno, 0, (PAGE **)&meta)) != 0) - goto err; - last_pgno = meta->free; - /* - * Check to see if the head of the free - * list is any of the pages we are about - * to link in. We could have crashed - * after linking them in and before writing - * a checkpoint. - * It may not be the last one since - * any page may get reallocated before here. 
- */ - if (last_pgno != PGNO_INVALID) - for (i = 0; i < elp->u.p.nentries; i++) - if (last_pgno - == elp->u.p.pgno_array[i]) - goto done_it; - } + if (ctxn != NULL && + (t_ret = ctxn->abort(ctxn)) != 0 && ret == 0) + ret = t_ret; - for (i = 0; i < elp->u.p.nentries; i++) { - pgno = elp->u.p.pgno_array[i]; - if ((ret = memp_fget(dbp->mpf, - &pgno, DB_MPOOL_CREATE, &pagep)) != 0) - goto err; + if (dbp_created && + (t_ret = __db_close_i(dbp, txn, 0)) != 0 && ret == 0) + ret = t_ret; + dbp = NULL; + __os_free(dbenv, elp->u.p.fname); + __os_free(dbenv, elp->u.p.pgno_array); + if (ret == ENOENT) + ret = 0; + else if (ret != 0) + goto err; + } - put_page = 1; - if (IS_ZERO_LSN(LSN(pagep))) { - P_INIT(pagep, dbp->pgsize, - pgno, PGNO_INVALID, - last_pgno, 0, P_INVALID); - - if (in_recover) { - LSN(pagep) = LSN(meta); - last_pgno = pgno; - } else { - /* - * Starting the transaction - * is postponed until we know - * we have something to do. - */ - if (txn == NULL && - (ret = txn_begin(dbenv, - NULL, &txn, 0)) != 0) - goto err; - - if (dbc == NULL && - (ret = dbp->cursor(dbp, - txn, &dbc, 0)) != 0) - goto err; - /* Turn off locking. */ - F_SET(dbc, DBC_COMPENSATE); - - /* __db_free puts the page. */ - if ((ret = - __db_free(dbc, pagep)) != 0) - goto err; - put_page = 0; - } - } +err: if (meta != NULL) + (void)mpf->put(mpf, meta, 0); + return (ret); +} - if (put_page == 1 && - (ret = memp_fput(dbp->mpf, - pagep, DB_MPOOL_DIRTY)) != 0) - goto err; - } - if (in_recover) { - if (last_pgno == meta->free) { -done_it: +/* + * __db_limbo_fix -- + * Process a single limbo entry which describes all the page allocations + * for a single file. 
+ */ +static int +__db_limbo_fix(dbp, ctxn, elp, lastp, meta) + DB *dbp; + DB_TXN *ctxn; + DB_TXNLIST *elp; + db_pgno_t *lastp; + DBMETA *meta; +{ + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *freep, *pagep; + db_pgno_t next, pgno; + int i, put_page, ret, t_ret; + + /* + * Loop through the entries for this txnlist element and + * either link them into the free list or write a compensating + * record for each. + */ + put_page = 0; + ret = 0; + mpf = dbp->mpf; + dbc = NULL; + + for (i = 0; i < elp->u.p.nentries; i++) { + pgno = elp->u.p.pgno_array[i]; + + if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto err; + put_page = 1; + + if (IS_ZERO_LSN(LSN(pagep))) { + if (ctxn == NULL) { + /* + * If this is a fatal recovery which + * spans a previous crash this page may + * be on the free list already. + */ + for (next = *lastp; next != 0; ) { + if (next == pgno) + break; + if ((ret = mpf->get(mpf, + &next, 0, &freep)) != 0) + goto err; + next = NEXT_PGNO(freep); if ((ret = - memp_fput(dbp->mpf, meta, 0)) != 0) + mpf->put(mpf, freep, 0)) != 0) goto err; - } else { - /* - * Flush the new free list then - * update the metapage. This is - * unlogged so we cannot have the - * metapage pointing at pages that - * are not on disk. - */ - dbp->sync(dbp, 0); - meta->free = last_pgno; - if ((ret = memp_fput(dbp->mpf, - meta, DB_MPOOL_DIRTY)) != 0) + } + + if (next != pgno) { + P_INIT(pagep, dbp->pgsize, pgno, + PGNO_INVALID, *lastp, 0, P_INVALID); + LSN(pagep) = LSN(meta); + *lastp = pgno; + } + } else { + P_INIT(pagep, dbp->pgsize, pgno, + PGNO_INVALID, *lastp, 0, P_INVALID); + if (dbc == NULL && (ret = + dbp->cursor(dbp, ctxn, &dbc, 0)) != 0) goto err; + /* + * If the dbp is compensating (because we + * opened it), the dbc will automatically be + * marked compensating, but in case we didn't + * do the open, we have to mark it explicitly. 
+ */ + F_SET(dbc, DBC_COMPENSATE); + ret = __db_free(dbc, pagep); + put_page = 0; + /* + * On any error, we hope that the error was + * caused due to running out of space, and we + * switch modes, doing the processing where we + * sync out files instead of doing compensating + * transactions. If this was a real error and + * not out of space, we assume that some other + * call will fail real soon. + */ + if (ret != 0) { + /* Assume that this is out of space. */ + (void)dbc->c_close(dbc); + dbc = NULL; + goto err; } } - if (dbc != NULL && (ret = dbc->c_close(dbc)) != 0) - goto err; - dbc = NULL; } - if (in_recover && (t_ret = dbp->close(dbp, 0)) != 0 && ret == 0) - ret = t_ret; - dbp = NULL; - __os_free(elp->u.p.fname, 0); - __os_free(elp->u.p.pgno_array, 0); - if (ret == ENOENT) - ret = 0; - else if (ret != 0) + + if (put_page == 1) { + ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY); + put_page = 0; + } + if (ret != 0) goto err; } - if (txn != NULL) { - ret = txn_commit(txn, 0); - txn = NULL; - } -err: - if (dbc != NULL) - (void)dbc->c_close(dbc); - if (in_recover && dbp != NULL) - (void)dbp->close(dbp, 0); - if (txn != NULL) - (void)txn_abort(txn); +err: if (put_page && + (t_ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); - } -#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */ +#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */ /* * __db_txnlist_pgnoadd -- - * Find the txnlist entry for a file and add this pgno, - * or add the list entry for the file and then add the pgno. - * - * PUBLIC: int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *, - * PUBLIC: int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t)); + * Find the txnlist entry for a file and add this pgno, or add the list + * entry for the file and then add the pgno. 
*/ -int +static int __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno) DB_ENV *dbenv; DB_TXNHEAD *hp; @@ -902,34 +1292,39 @@ __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno) db_pgno_t pgno; { DB_TXNLIST *elp; + u_int32_t hash; int len, ret; elp = NULL; - if (__db_txnlist_find_internal(hp, TXNLIST_PGNO, 0, uid, &elp, 0) != 0) { + if (__db_txnlist_find_internal(dbenv, hp, + TXNLIST_PGNO, 0, uid, &elp, 0) != 0) { if ((ret = - __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0) + __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0) goto err; - LIST_INSERT_HEAD(&hp->head, elp, links); + memcpy(&hash, uid, sizeof(hash)); + LIST_INSERT_HEAD( + &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links); elp->u.p.fileid = fileid; memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN); - len = strlen(fname) + 1; - if ((ret = __os_malloc(dbenv, len, NULL, &elp->u.p.fname)) != 0) + len = (int)strlen(fname) + 1; + if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0) goto err; memcpy(elp->u.p.fname, fname, len); elp->u.p.maxentry = 0; + elp->u.p.locked = 0; elp->type = TXNLIST_PGNO; if ((ret = __os_malloc(dbenv, - 8 * sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0) + 8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0) goto err; elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO; elp->u.p.nentries = 0; } else if (elp->u.p.nentries == elp->u.p.maxentry) { elp->u.p.maxentry <<= 1; if ((ret = __os_realloc(dbenv, elp->u.p.maxentry * - sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0) + sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0) goto err; } @@ -941,6 +1336,36 @@ err: __db_txnlist_end(dbenv, hp); return (ret); } +/* + * __db_default_getpgnos -- + * Fill in default getpgnos information for an application-specific + * log record. 
+ */ +static int +__db_default_getpgnos(dbenv, lsnp, summary) + DB_ENV *dbenv; + DB_LSN *lsnp; + void *summary; +{ + TXN_RECS *t; + int ret; + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} + #ifdef DEBUG /* * __db_txnlist_print -- @@ -954,25 +1379,21 @@ __db_txnlist_print(listp) { DB_TXNHEAD *hp; DB_TXNLIST *p; + int i; + char *stats[] = { "ok", "commit", "prepare", "abort", "notfound", + "ignore", "expected", "unexpected" }; hp = (DB_TXNHEAD *)listp; printf("Maxid: %lu Generation: %lu\n", (u_long)hp->maxid, (u_long)hp->generation); - for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) { + for (i = 0; i < hp->nslots; i++) + for (p = LIST_FIRST(&hp->head[i]); p != NULL; p = LIST_NEXT(p, links)) { switch (p->type) { case TXNLIST_TXNID: - printf("TXNID: %lu(%lu)\n", - (u_long)p->u.t.txnid, (u_long)p->u.t.generation); - break; - case TXNLIST_DELETE: - printf("FILE: %s id=%d ops=%d %s %s\n", - p->u.d.fname, p->u.d.fileid, p->u.d.count, - F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) ? - "(deleted)" : "(missing)", - F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) ? - "(closed)" : "(open)"); - + printf("TXNID: %lx(%lu): %s\n", + (u_long)p->u.t.txnid, (u_long)p->u.t.generation, + stats[p->u.t.status]); break; default: printf("Unrecognized type: %d\n", p->type); diff --git a/bdb/db/db_dup.c b/bdb/db/db_dup.c index 6d8b2df9518..2d33d79153f 100644 --- a/bdb/db/db_dup.c +++ b/bdb/db/db_dup.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_dup.c,v 11.18 2000/11/30 00:58:32 ubell Exp $"; +static const char revid[] = "$Id: db_dup.c,v 11.32 2002/08/08 03:57:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,12 +18,10 @@ static const char revid[] = "$Id: db_dup.c,v 11.18 2000/11/30 00:58:32 ubell Exp #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "btree.h" -#include "hash.h" -#include "lock.h" -#include "db_am.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/db_am.h" /* * __db_ditem -- @@ -39,19 +37,20 @@ __db_ditem(dbc, pagep, indx, nbytes) { DB *dbp; DBT ldbt; - db_indx_t cnt, offset; + db_indx_t cnt, *inp, offset; int ret; u_int8_t *from; dbp = dbc->dbp; - if (DB_LOGGING(dbc)) { - ldbt.data = P_ENTRY(pagep, indx); + if (DBC_LOGGING(dbc)) { + ldbt.data = P_ENTRY(dbp, pagep, indx); ldbt.size = nbytes; - if ((ret = __db_addrem_log(dbp->dbenv, dbc->txn, - &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep), + if ((ret = __db_addrem_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_REM_DUP, PGNO(pagep), (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0) return (ret); - } + } else + LSN_NOT_LOGGED(LSN(pagep)); /* * If there's only a single item on the page, we don't have to @@ -63,24 +62,26 @@ __db_ditem(dbc, pagep, indx, nbytes) return (0); } + inp = P_INP(dbp, pagep); /* * Pack the remaining key/data items at the end of the page. Use * memmove(3), the regions may overlap. */ from = (u_int8_t *)pagep + HOFFSET(pagep); - memmove(from + nbytes, from, pagep->inp[indx] - HOFFSET(pagep)); + DB_ASSERT((int)inp[indx] - HOFFSET(pagep) >= 0); + memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep)); HOFFSET(pagep) += nbytes; /* Adjust the indices' offsets. 
*/ - offset = pagep->inp[indx]; + offset = inp[indx]; for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt) - if (pagep->inp[cnt] < offset) - pagep->inp[cnt] += nbytes; + if (inp[cnt] < offset) + inp[cnt] += nbytes; /* Shift the indices down. */ --NUM_ENT(pagep); if (indx != NUM_ENT(pagep)) - memmove(&pagep->inp[indx], &pagep->inp[indx + 1], + memmove(&inp[indx], &inp[indx + 1], sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); return (0); @@ -104,11 +105,13 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data) DB *dbp; BKEYDATA bk; DBT thdr; + db_indx_t *inp; int ret; u_int8_t *p; - if (nbytes > P_FREESPACE(pagep)) { - DB_ASSERT(nbytes <= P_FREESPACE(pagep)); + dbp = dbc->dbp; + if (nbytes > P_FREESPACE(dbp, pagep)) { + DB_ASSERT(nbytes <= P_FREESPACE(dbp, pagep)); return (EINVAL); } /* @@ -128,12 +131,13 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data) * the passed in header sizes must be adjusted for the structure's * placeholder for the trailing variable-length data field. */ - dbp = dbc->dbp; - if (DB_LOGGING(dbc)) - if ((ret = __db_addrem_log(dbp->dbenv, dbc->txn, - &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep), + if (DBC_LOGGING(dbc)) { + if ((ret = __db_addrem_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_ADD_DUP, PGNO(pagep), (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0) return (ret); + } else + LSN_NOT_LOGGED(LSN(pagep)); if (hdr == NULL) { B_TSET(bk.type, B_KEYDATA, 0); @@ -143,16 +147,17 @@ __db_pitem(dbc, pagep, indx, nbytes, hdr, data) thdr.size = SSZA(BKEYDATA, data); hdr = &thdr; } + inp = P_INP(dbp, pagep); /* Adjust the index table, then put the item on the page. 
*/ if (indx != NUM_ENT(pagep)) - memmove(&pagep->inp[indx + 1], &pagep->inp[indx], + memmove(&inp[indx + 1], &inp[indx], sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); HOFFSET(pagep) -= nbytes; - pagep->inp[indx] = HOFFSET(pagep); + inp[indx] = HOFFSET(pagep); ++NUM_ENT(pagep); - p = P_ENTRY(pagep, indx); + p = P_ENTRY(dbp, pagep, indx); memcpy(p, hdr->data, hdr->size); if (data != NULL) memcpy(p + hdr->size, data->data, data->size); @@ -177,13 +182,16 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) PAGE *np, *pp; DB_LOCK npl, ppl; DB_LSN *nlsnp, *plsnp, ret_lsn; + DB_MPOOLFILE *mpf; int ret; - ret = 0; + dbp = dbc->dbp; np = pp = NULL; - npl.off = ppl.off = LOCK_INVALID; + LOCK_INIT(npl); + LOCK_INIT(ppl); nlsnp = plsnp = NULL; - dbp = dbc->dbp; + mpf = dbp->mpf; + ret = 0; /* * Retrieve and lock the one/two pages. For a remove, we may need @@ -194,9 +202,8 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) if (needlock && (ret = __db_lget(dbc, 0, pagep->next_pgno, DB_LOCK_WRITE, 0, &npl)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, - &pagep->next_pgno, 0, &np)) != 0) { - (void)__db_pgerr(dbp, pagep->next_pgno); + if ((ret = mpf->get(mpf, &pagep->next_pgno, 0, &np)) != 0) { + __db_pgerr(dbp, pagep->next_pgno, ret); goto err; } nlsnp = &np->lsn; @@ -205,28 +212,27 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) if (needlock && (ret = __db_lget(dbc, 0, pagep->prev_pgno, DB_LOCK_WRITE, 0, &ppl)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, - &pagep->prev_pgno, 0, &pp)) != 0) { - (void)__db_pgerr(dbp, pagep->next_pgno); + if ((ret = mpf->get(mpf, &pagep->prev_pgno, 0, &pp)) != 0) { + __db_pgerr(dbp, pagep->next_pgno, ret); goto err; } plsnp = &pp->lsn; } /* Log the change. 
*/ - if (DB_LOGGING(dbc)) { - if ((ret = __db_relink_log(dbp->dbenv, dbc->txn, - &ret_lsn, 0, add_rem, dbp->log_fileid, - pagep->pgno, &pagep->lsn, - pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0) + if (DBC_LOGGING(dbc)) { + if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0, add_rem, + pagep->pgno, &pagep->lsn, pagep->prev_pgno, plsnp, + pagep->next_pgno, nlsnp)) != 0) goto err; - if (np != NULL) - np->lsn = ret_lsn; - if (pp != NULL) - pp->lsn = ret_lsn; - if (add_rem == DB_REM_PAGE) - pagep->lsn = ret_lsn; - } + } else + LSN_NOT_LOGGED(ret_lsn); + if (np != NULL) + np->lsn = ret_lsn; + if (pp != NULL) + pp->lsn = ret_lsn; + if (add_rem == DB_REM_PAGE) + pagep->lsn = ret_lsn; /* * Modify and release the two pages. @@ -242,10 +248,10 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) else np->prev_pgno = pagep->prev_pgno; if (new_next == NULL) - ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); + ret = mpf->put(mpf, np, DB_MPOOL_DIRTY); else { *new_next = np; - ret = memp_fset(dbp->mpf, np, DB_MPOOL_DIRTY); + ret = mpf->set(mpf, np, DB_MPOOL_DIRTY); } if (ret != 0) goto err; @@ -256,7 +262,7 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) if (pp != NULL) { pp->next_pgno = pagep->next_pgno; - if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0) + if ((ret = mpf->put(mpf, pp, DB_MPOOL_DIRTY)) != 0) goto err; if (needlock) (void)__TLPUT(dbc, ppl); @@ -264,12 +270,12 @@ __db_relink(dbc, add_rem, pagep, new_next, needlock) return (0); err: if (np != NULL) - (void)memp_fput(dbp->mpf, np, 0); - if (needlock && npl.off != LOCK_INVALID) + (void)mpf->put(mpf, np, 0); + if (needlock) (void)__TLPUT(dbc, npl); if (pp != NULL) - (void)memp_fput(dbp->mpf, pp, 0); - if (needlock && ppl.off != LOCK_INVALID) + (void)mpf->put(mpf, pp, 0); + if (needlock) (void)__TLPUT(dbc, ppl); return (ret); } diff --git a/bdb/db/db_iface.c b/bdb/db/db_iface.c index 3548a2527bb..b518c3b14b2 100644 --- a/bdb/db/db_iface.c +++ b/bdb/db/db_iface.c @@ -1,55 +1,69 @@ /*- * 
See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_iface.c,v 11.34 2001/01/11 18:19:51 bostic Exp $"; +static const char revid[] = "$Id: db_iface.c,v 11.77 2002/08/08 03:57:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> - -#include <errno.h> #endif #include "db_int.h" -#include "db_page.h" -#include "db_am.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" static int __db_curinval __P((const DB_ENV *)); +static int __db_fnl __P((const DB_ENV *, const char *)); static int __db_rdonly __P((const DB_ENV *, const char *)); static int __dbt_ferr __P((const DB *, const char *, const DBT *, int)); /* + * A database should be required to be readonly if it's been explicitly + * specified as such or if we're a client in a replicated environment and + * we don't have the special "client-writer" designation. + */ +#define IS_READONLY(dbp) \ + (F_ISSET(dbp, DB_AM_RDONLY) || \ + (F_ISSET((dbp)->dbenv, DB_ENV_REP_CLIENT) && \ + !F_ISSET((dbp), DB_AM_CL_WRITER))) + +/* * __db_cursorchk -- * Common cursor argument checking routine. * - * PUBLIC: int __db_cursorchk __P((const DB *, u_int32_t, int)); + * PUBLIC: int __db_cursorchk __P((const DB *, u_int32_t)); */ int -__db_cursorchk(dbp, flags, isrdonly) +__db_cursorchk(dbp, flags) const DB *dbp; u_int32_t flags; - int isrdonly; { + /* DB_DIRTY_READ is the only valid bit-flag and requires locking. */ + if (LF_ISSET(DB_DIRTY_READ)) { + if (!LOCKING_ON(dbp->dbenv)) + return (__db_fnl(dbp->dbenv, "DB->cursor")); + LF_CLR(DB_DIRTY_READ); + } + /* Check for invalid function flags. 
*/ switch (flags) { case 0: break; case DB_WRITECURSOR: - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "DB->cursor")); if (!CDB_LOCKING(dbp->dbenv)) return (__db_ferr(dbp->dbenv, "DB->cursor", 0)); break; case DB_WRITELOCK: - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "DB->cursor")); break; default: @@ -90,22 +104,25 @@ __db_ccountchk(dbp, flags, isvalid) * __db_cdelchk -- * Common cursor delete argument checking routine. * - * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int)); + * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int)); */ int -__db_cdelchk(dbp, flags, isrdonly, isvalid) +__db_cdelchk(dbp, flags, isvalid) const DB *dbp; u_int32_t flags; - int isrdonly, isvalid; + int isvalid; { /* Check for changes to a read-only tree. */ - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "c_del")); /* Check for invalid function flags. */ switch (flags) { case 0: break; + case DB_UPDATE_SECONDARY: + DB_ASSERT(F_ISSET(dbp, DB_AM_SECONDARY)); + break; default: return (__db_ferr(dbp->dbenv, "DBcursor->c_del", 0)); } @@ -130,7 +147,7 @@ __db_cgetchk(dbp, key, data, flags, isvalid) u_int32_t flags; int isvalid; { - int ret; + int dirty, multi, ret; /* * Check for read-modify-write validity. DB_RMW doesn't make sense @@ -140,44 +157,68 @@ __db_cgetchk(dbp, key, data, flags, isvalid) * If this changes, confirm that DB does not itself set the DB_RMW * flag in a path where CDB may have been configured. 
*/ - if (LF_ISSET(DB_RMW)) { - if (!LOCKING_ON(dbp->dbenv)) { - __db_err(dbp->dbenv, - "the DB_RMW flag requires locking"); - return (EINVAL); - } - LF_CLR(DB_RMW); + dirty = 0; + if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) { + if (!LOCKING_ON(dbp->dbenv)) + return (__db_fnl(dbp->dbenv, "DBcursor->c_get")); + if (LF_ISSET(DB_DIRTY_READ)) + dirty = 1; + LF_CLR(DB_DIRTY_READ | DB_RMW); + } + + multi = 0; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + multi = 1; + if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY)) + goto multi_err; + LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY); } /* Check for invalid function flags. */ switch (flags) { case DB_CONSUME: case DB_CONSUME_WAIT: + if (dirty) { + __db_err(dbp->dbenv, + "DB_DIRTY_READ is not supported with DB_CONSUME or DB_CONSUME_WAIT"); + return (EINVAL); + } if (dbp->type != DB_QUEUE) goto err; break; case DB_CURRENT: case DB_FIRST: case DB_GET_BOTH: - case DB_LAST: + case DB_GET_BOTH_RANGE: case DB_NEXT: case DB_NEXT_DUP: case DB_NEXT_NODUP: - case DB_PREV: - case DB_PREV_NODUP: case DB_SET: case DB_SET_RANGE: break; + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + if (multi) +multi_err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 1)); + break; case DB_GET_BOTHC: if (dbp->type == DB_QUEUE) goto err; break; case DB_GET_RECNO: - if (!F_ISSET(dbp, DB_BT_RECNUM)) + /* + * The one situation in which this might be legal with a + * non-RECNUM dbp is if dbp is a secondary and its primary is + * DB_AM_RECNUM. 
+ */ + if (!F_ISSET(dbp, DB_AM_RECNUM) && + (!F_ISSET(dbp, DB_AM_SECONDARY) || + !F_ISSET(dbp->s_primary, DB_AM_RECNUM))) goto err; break; case DB_SET_RECNO: - if (!F_ISSET(dbp, DB_BT_RECNUM)) + if (!F_ISSET(dbp, DB_AM_RECNUM)) goto err; break; default: @@ -190,11 +231,24 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0)); if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) return (ret); + if (multi && !F_ISSET(data, DB_DBT_USERMEM)) { + __db_err(dbp->dbenv, + "DB_MULTIPLE(_KEY) requires that DB_DBT_USERMEM be set"); + return (EINVAL); + } + if (multi && + (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL))) { + __db_err(dbp->dbenv, + "DB_DBT_PARTIAL forbidden with DB_MULTIPLE(_KEY)"); + return (EINVAL); + } + /* - * The cursor must be initialized for DB_CURRENT or DB_NEXT_DUP, - * return EINVAL for an invalid cursor, otherwise 0. + * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO and + * DB_NEXT_DUP. Return EINVAL for an invalid cursor, otherwise 0. */ - if (isvalid || (flags != DB_CURRENT && flags != DB_NEXT_DUP)) + if (isvalid || (flags != DB_CURRENT && + flags != DB_GET_RECNO && flags != DB_NEXT_DUP)) return (0); return (__db_curinval(dbp->dbenv)); @@ -205,24 +259,35 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0)); * Common cursor put argument checking routine. * * PUBLIC: int __db_cputchk __P((const DB *, - * PUBLIC: const DBT *, DBT *, u_int32_t, int, int)); + * PUBLIC: const DBT *, DBT *, u_int32_t, int)); */ int -__db_cputchk(dbp, key, data, flags, isrdonly, isvalid) +__db_cputchk(dbp, key, data, flags, isvalid) const DB *dbp; const DBT *key; DBT *data; u_int32_t flags; - int isrdonly, isvalid; + int isvalid; { int key_flags, ret; key_flags = 0; /* Check for changes to a read-only tree. */ - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "c_put")); + /* Check for puts on a secondary. 
*/ + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + if (flags == DB_UPDATE_SECONDARY) + flags = DB_KEYLAST; + else { + __db_err(dbp->dbenv, + "DBcursor->c_put forbidden on secondary indices"); + return (EINVAL); + } + } + /* Check for invalid function flags. */ switch (flags) { case DB_AFTER: @@ -238,7 +303,7 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid) case DB_QUEUE: /* Not permitted. */ goto err; case DB_RECNO: /* Only with mutable record numbers. */ - if (!F_ISSET(dbp, DB_RE_RENUMBER)) + if (!F_ISSET(dbp, DB_AM_RENUMBER)) goto err; key_flags = 1; break; @@ -259,8 +324,6 @@ __db_cputchk(dbp, key, data, flags, isrdonly, isvalid) /* FALLTHROUGH */ case DB_KEYFIRST: case DB_KEYLAST: - if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) - goto err; key_flags = 1; break; default: @@ -285,48 +348,153 @@ err: return (__db_ferr(dbp->dbenv, "DBcursor->c_put", 0)); } /* - * __db_closechk -- - * DB->close flag check. + * __db_pgetchk -- + * DB->pget flag check. * - * PUBLIC: int __db_closechk __P((const DB *, u_int32_t)); + * PUBLIC: int __db_pgetchk __P((const DB *, const DBT *, DBT *, DBT *, + * PUBLIC: u_int32_t)); */ int -__db_closechk(dbp, flags) +__db_pgetchk(dbp, skey, pkey, data, flags) const DB *dbp; + const DBT *skey; + DBT *pkey, *data; u_int32_t flags; { - /* Check for invalid function flags. */ + int ret; + u_int32_t save_flags; + + save_flags = flags; + + if (!F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_err(dbp->dbenv, + "DB->pget may only be used on secondary indices"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + __db_err(dbp->dbenv, + "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"); + return (EINVAL); + } + + /* DB_CONSUME makes no sense on a secondary index. */ + LF_CLR(DB_RMW); switch (flags) { - case 0: - case DB_NOSYNC: + case DB_CONSUME: + case DB_CONSUME_WAIT: + return (__db_ferr(dbp->dbenv, "DB->pget", 0)); + default: + /* __db_getchk will catch the rest. 
*/ + break; + } + + /* + * We allow the pkey field to be NULL, so that we can make the + * two-DBT get calls into wrappers for the three-DBT ones. + */ + if (pkey != NULL && + (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0) + return (ret); + + /* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */ + if (pkey == NULL && flags == DB_GET_BOTH) { + __db_err(dbp->dbenv, + "DB_GET_BOTH on a secondary index requires a primary key"); + return (EINVAL); + } + + return (__db_getchk(dbp, skey, data, save_flags)); +} + +/* + * __db_cpgetchk -- + * Secondary-index cursor get argument checking routine. + * + * PUBLIC: int __db_cpgetchk __P((const DB *, + * PUBLIC: DBT *, DBT *, DBT *, u_int32_t, int)); + */ +int +__db_cpgetchk(dbp, skey, pkey, data, flags, isvalid) + const DB *dbp; + DBT *skey, *pkey, *data; + u_int32_t flags; + int isvalid; +{ + int ret; + u_int32_t save_flags; + + save_flags = flags; + + if (!F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_err(dbp->dbenv, + "DBcursor->c_pget may only be used on secondary indices"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + __db_err(dbp->dbenv, + "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"); + return (EINVAL); + } + + LF_CLR(DB_RMW); + switch (flags) { + case DB_CONSUME: + case DB_CONSUME_WAIT: + /* DB_CONSUME makes no sense on a secondary index. */ + return (__db_ferr(dbp->dbenv, "DBcursor->c_pget", 0)); + case DB_GET_BOTH: + /* DB_GET_BOTH is "get both the primary and the secondary". */ + if (pkey == NULL) { + __db_err(dbp->dbenv, + "DB_GET_BOTH requires both a secondary and a primary key"); + return (EINVAL); + } break; default: - return (__db_ferr(dbp->dbenv, "DB->close", 0)); + /* __db_cgetchk will catch the rest. */ + break; } - return (0); + /* + * We allow the pkey field to be NULL, so that we can make the + * two-DBT get calls into wrappers for the three-DBT ones. 
+ */ + if (pkey != NULL && + (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0) + return (ret); + + /* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */ + if (pkey == NULL && flags == DB_GET_BOTH) { + __db_err(dbp->dbenv, + "DB_GET_BOTH on a secondary index requires a primary key"); + return (EINVAL); + } + + return (__db_cgetchk(dbp, skey, data, save_flags, isvalid)); } /* * __db_delchk -- * Common delete argument checking routine. * - * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); + * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t)); */ int -__db_delchk(dbp, key, flags, isrdonly) +__db_delchk(dbp, key, flags) const DB *dbp; DBT *key; u_int32_t flags; - int isrdonly; { COMPQUIET(key, NULL); /* Check for changes to a read-only tree. */ - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "delete")); /* Check for invalid function flags. */ + LF_CLR(DB_AUTO_COMMIT); switch (flags) { case 0: break; @@ -350,7 +518,7 @@ __db_getchk(dbp, key, data, flags) DBT *data; u_int32_t flags; { - int ret; + int dirty, multi, ret; /* * Check for read-modify-write validity. DB_RMW doesn't make sense @@ -360,13 +528,21 @@ __db_getchk(dbp, key, data, flags) * If this changes, confirm that DB does not itself set the DB_RMW * flag in a path where CDB may have been configured. */ - if (LF_ISSET(DB_RMW)) { - if (!LOCKING_ON(dbp->dbenv)) { - __db_err(dbp->dbenv, - "the DB_RMW flag requires locking"); - return (EINVAL); - } - LF_CLR(DB_RMW); + dirty = 0; + if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) { + if (!LOCKING_ON(dbp->dbenv)) + return (__db_fnl(dbp->dbenv, "DB->get")); + if (LF_ISSET(DB_DIRTY_READ)) + dirty = 1; + LF_CLR(DB_DIRTY_READ | DB_RMW); + } + + multi = 0; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + if (LF_ISSET(DB_MULTIPLE_KEY)) + goto multi_err; + multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0; + LF_CLR(DB_MULTIPLE); } /* Check for invalid function flags. 
*/ @@ -375,24 +551,48 @@ __db_getchk(dbp, key, data, flags) case DB_GET_BOTH: break; case DB_SET_RECNO: - if (!F_ISSET(dbp, DB_BT_RECNUM)) + if (!F_ISSET(dbp, DB_AM_RECNUM)) goto err; break; case DB_CONSUME: case DB_CONSUME_WAIT: + if (dirty) { + __db_err(dbp->dbenv, + "DB_DIRTY_READ is not supported with DB_CONSUME or DB_CONSUME_WAIT"); + return (EINVAL); + } + if (multi) +multi_err: return (__db_ferr(dbp->dbenv, "DB->get", 1)); if (dbp->type == DB_QUEUE) break; - /* Fall through */ + /* FALLTHROUGH */ default: err: return (__db_ferr(dbp->dbenv, "DB->get", 0)); } - /* Check for invalid key/data flags. */ + /* + * Check for invalid key/data flags. + * + * XXX: Dave Krinsky + * Remember to modify this when we fix the flag-returning problem. + */ if ((ret = __dbt_ferr(dbp, "key", key, flags == DB_SET_RECNO)) != 0) return (ret); if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0) return (ret); + if (multi && !F_ISSET(data, DB_DBT_USERMEM)) { + __db_err(dbp->dbenv, + "DB_MULTIPLE requires that DB_DBT_USERMEM be set"); + return (EINVAL); + } + if (multi && + (F_ISSET(key, DB_DBT_PARTIAL) || F_ISSET(data, DB_DBT_PARTIAL))) { + __db_err(dbp->dbenv, + "DB_DBT_PARTIAL forbidden with DB_MULTIPLE(_KEY)"); + return (EINVAL); + } + return (0); } @@ -449,13 +649,11 @@ __db_joingetchk(dbp, key, flags) u_int32_t flags; { - if (LF_ISSET(DB_RMW)) { - if (!LOCKING_ON(dbp->dbenv)) { - __db_err(dbp->dbenv, - "the DB_RMW flag requires locking"); - return (EINVAL); - } - LF_CLR(DB_RMW); + if (LF_ISSET(DB_DIRTY_READ | DB_RMW)) { + if (!LOCKING_ON(dbp->dbenv)) + return (__db_fnl(dbp->dbenv, "DBcursor->c_get")); + + LF_CLR(DB_DIRTY_READ | DB_RMW); } switch (flags) { @@ -491,23 +689,32 @@ __db_joingetchk(dbp, key, flags) * Common put argument checking routine. 
* * PUBLIC: int __db_putchk - * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); + * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int)); */ int -__db_putchk(dbp, key, data, flags, isrdonly, isdup) +__db_putchk(dbp, key, data, flags, isdup) const DB *dbp; DBT *key; const DBT *data; u_int32_t flags; - int isrdonly, isdup; + int isdup; { - int ret; + int ret, returnkey; + + returnkey = 0; /* Check for changes to a read-only tree. */ - if (isrdonly) + if (IS_READONLY(dbp)) return (__db_rdonly(dbp->dbenv, "put")); + /* Check for puts on a secondary. */ + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_err(dbp->dbenv, "DB->put forbidden on secondary indices"); + return (EINVAL); + } + /* Check for invalid function flags. */ + LF_CLR(DB_AUTO_COMMIT); switch (flags) { case 0: case DB_NOOVERWRITE: @@ -515,6 +722,7 @@ __db_putchk(dbp, key, data, flags, isrdonly, isdup) case DB_APPEND: if (dbp->type != DB_RECNO && dbp->type != DB_QUEUE) goto err; + returnkey = 1; break; case DB_NODUPDATA: if (F_ISSET(dbp, DB_AM_DUPSORT)) @@ -525,7 +733,7 @@ err: return (__db_ferr(dbp->dbenv, "DB->put", 0)); } /* Check for invalid key/data flags. */ - if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + if ((ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0) return (ret); if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) return (ret); @@ -541,28 +749,6 @@ err: return (__db_ferr(dbp->dbenv, "DB->put", 0)); } /* - * __db_removechk -- - * DB->remove flag check. - * - * PUBLIC: int __db_removechk __P((const DB *, u_int32_t)); - */ -int -__db_removechk(dbp, flags) - const DB *dbp; - u_int32_t flags; -{ - /* Check for invalid function flags. */ - switch (flags) { - case 0: - break; - default: - return (__db_ferr(dbp->dbenv, "DB->remove", 0)); - } - - return (0); -} - -/* * __db_statchk -- * Common stat argument checking routine. * @@ -576,12 +762,13 @@ __db_statchk(dbp, flags) /* Check for invalid function flags. 
*/ switch (flags) { case 0: - case DB_CACHED_COUNTS: + case DB_FAST_STAT: + case DB_CACHED_COUNTS: /* Deprecated and undocumented. */ break; - case DB_RECORDCOUNT: + case DB_RECORDCOUNT: /* Deprecated and undocumented. */ if (dbp->type == DB_RECNO) break; - if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_BT_RECNUM)) + if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM)) break; goto err; default: @@ -636,9 +823,9 @@ __dbt_ferr(dbp, name, dbt, check_thread) * database and then specify that same DBT as a key to a primary * database, without having to clear flags. */ - if ((ret = __db_fchk(dbenv, name, dbt->flags, - DB_DBT_MALLOC | DB_DBT_DUPOK | - DB_DBT_REALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0) + if ((ret = __db_fchk(dbenv, name, dbt->flags, DB_DBT_APPMALLOC | + DB_DBT_MALLOC | DB_DBT_DUPOK | DB_DBT_REALLOC | DB_DBT_USERMEM | + DB_DBT_PARTIAL)) != 0) return (ret); switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM)) { case 0: @@ -674,6 +861,20 @@ __db_rdonly(dbenv, name) } /* + * __db_fnl -- + * Common flag-needs-locking message. + */ +static int +__db_fnl(dbenv, name) + const DB_ENV *dbenv; + const char *name; +{ + __db_err(dbenv, + "%s: the DB_DIRTY_READ and DB_RMW flags require locking", name); + return (EINVAL); +} + +/* * __db_curinval * Report that a cursor is in an invalid state. */ @@ -685,3 +886,98 @@ __db_curinval(dbenv) "Cursor position must be set before performing this operation"); return (EINVAL); } + +/* + * __db_secondary_corrupt -- + * Report that a secondary index appears corrupt, as it has a record + * that does not correspond to a record in the primary. + * + * PUBLIC: int __db_secondary_corrupt __P((DB *)); + */ +int +__db_secondary_corrupt(dbp) + DB *dbp; +{ + + __db_err(dbp->dbenv, + "Secondary index corrupt: item in secondary not found in primary"); + return (DB_SECONDARY_BAD); +} + +/* + * __db_associatechk -- + * Argument checking routine for DB->associate(). 
+ * + * PUBLIC: int __db_associatechk __P((DB *, DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t)); + */ +int +__db_associatechk(dbp, sdbp, callback, flags) + DB *dbp, *sdbp; + int (*callback) __P((DB *, const DBT *, const DBT *, DBT *)); + u_int32_t flags; +{ + DB_ENV *dbenv; + + dbenv = dbp->dbenv; + + if (F_ISSET(sdbp, DB_AM_SECONDARY)) { + __db_err(dbenv, + "Secondary index handles may not be re-associated"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_err(dbenv, + "Secondary indices may not be used as primary databases"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_DUP)) { + __db_err(dbenv, + "Primary databases may not be configured with duplicates"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_RENUMBER)) { + __db_err(dbenv, + "Renumbering recno databases may not be used as primary databases"); + return (EINVAL); + } + if (callback == NULL && + (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) { + __db_err(dbenv, + "Callback function may be NULL only when database handles are read-only"); + return (EINVAL); + } + + return (__db_fchk(dbenv, + "DB->associate", flags, DB_CREATE | DB_AUTO_COMMIT)); +} + +/* + * __db_txn_auto -- + * Handle DB_AUTO_COMMIT initialization. + * + * PUBLIC: int __db_txn_auto __P((DB *, DB_TXN **)); + */ +int +__db_txn_auto(dbp, txnidp) + DB *dbp; + DB_TXN **txnidp; +{ + DB_ENV *dbenv; + + dbenv = dbp->dbenv; + + if (*txnidp != NULL) { + __db_err(dbenv, + "DB_AUTO_COMMIT may not be specified along with a transaction handle"); + return (EINVAL); + } + + if (!TXN_ON(dbenv)) { + __db_err(dbenv, + "DB_AUTO_COMMIT may not be specified in non-transactional environment"); + return (EINVAL); + } + + return (dbenv->txn_begin(dbenv, NULL, txnidp, 0)); +} diff --git a/bdb/db/db_join.c b/bdb/db/db_join.c index 881dedde0fc..6281b1a8383 100644 --- a/bdb/db/db_join.c +++ b/bdb/db/db_join.c @@ -1,14 +1,14 @@ -/*- +/* * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1998, 1999, 2000 + * Copyright (c) 1998-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_join.c,v 11.31 2000/12/20 22:41:54 krinsky Exp $"; +static const char revid[] = "$Id: db_join.c,v 11.55 2002/08/08 03:57:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,16 +19,17 @@ static const char revid[] = "$Id: db_join.c,v 11.31 2000/12/20 22:41:54 krinsky #endif #include "db_int.h" -#include "db_page.h" -#include "db_join.h" -#include "db_am.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_join.h" +#include "dbinc/btree.h" static int __db_join_close __P((DBC *)); static int __db_join_cmp __P((const void *, const void *)); static int __db_join_del __P((DBC *, u_int32_t)); static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t)); +static int __db_join_primget __P((DB *, + DB_TXN *, u_int32_t, DBT *, DBT *, u_int32_t)); static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t)); /* @@ -84,7 +85,8 @@ __db_join(primary, curslist, dbcp, flags) DBC *dbc; JOIN_CURSOR *jc; int ret; - u_int32_t i, ncurs, nslots; + u_int32_t i; + size_t ncurs, nslots; COMPQUIET(nslots, 0); @@ -104,11 +106,13 @@ __db_join(primary, curslist, dbcp, flags) 1, sizeof(JOIN_CURSOR), &jc)) != 0) goto err; - if ((ret = __os_malloc(dbenv, 256, NULL, &jc->j_key.data)) != 0) + if ((ret = __os_malloc(dbenv, 256, &jc->j_key.data)) != 0) goto err; jc->j_key.ulen = 256; F_SET(&jc->j_key, DB_DBT_USERMEM); + F_SET(&jc->j_rdata, DB_DBT_REALLOC); + for (jc->j_curslist = curslist; *jc->j_curslist != NULL; jc->j_curslist++) ; @@ -184,7 +188,7 @@ __db_join(primary, curslist, dbcp, flags) jc->j_fdupcurs[i] = NULL; jc->j_exhausted[i] = 0; } - jc->j_ncurs = ncurs; + jc->j_ncurs = (u_int32_t)ncurs; /* * If DB_JOIN_NOSORT is not 
set, optimize secondary cursors by @@ -226,20 +230,20 @@ __db_join(primary, curslist, dbcp, flags) err: if (jc != NULL) { if (jc->j_curslist != NULL) - __os_free(jc->j_curslist, nslots * sizeof(DBC *)); + __os_free(dbenv, jc->j_curslist); if (jc->j_workcurs != NULL) { if (jc->j_workcurs[0] != NULL) - __os_free(jc->j_workcurs[0], sizeof(DBC)); - __os_free(jc->j_workcurs, nslots * sizeof(DBC *)); + __os_free(dbenv, jc->j_workcurs[0]); + __os_free(dbenv, jc->j_workcurs); } if (jc->j_fdupcurs != NULL) - __os_free(jc->j_fdupcurs, nslots * sizeof(DBC *)); + __os_free(dbenv, jc->j_fdupcurs); if (jc->j_exhausted != NULL) - __os_free(jc->j_exhausted, nslots * sizeof(u_int8_t)); - __os_free(jc, sizeof(JOIN_CURSOR)); + __os_free(dbenv, jc->j_exhausted); + __os_free(dbenv, jc); } if (dbc != NULL) - __os_free(dbc, sizeof(DBC)); + __os_free(dbenv, dbc); return (ret); } @@ -279,8 +283,8 @@ __db_join_get(dbc, key_arg, data_arg, flags) DB *dbp; DBC *cp; JOIN_CURSOR *jc; - int ret; - u_int32_t i, j, operation; + int db_manage_data, ret; + u_int32_t i, j, operation, opmods; dbp = dbc->dbp; jc = (JOIN_CURSOR *)dbc->internal; @@ -289,6 +293,12 @@ __db_join_get(dbc, key_arg, data_arg, flags) operation = LF_ISSET(DB_OPFLAGS_MASK); + /* !!! + * If the set of flags here changes, check that __db_join_primget + * is updated to handle them properly. + */ + opmods = LF_ISSET(DB_RMW | DB_DIRTY_READ); + if ((ret = __db_joingetchk(dbp, key_arg, flags)) != 0) return (ret); @@ -319,13 +329,14 @@ __db_join_get(dbc, key_arg, data_arg, flags) goto samekey; F_CLR(jc, JOIN_RETRY); -retry: ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0], - &jc->j_key, key_n, jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT); +retry: ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0], + &jc->j_key, key_n, + opmods | (jc->j_exhausted[0] ? 
DB_NEXT_DUP : DB_CURRENT)); if (ret == ENOMEM) { jc->j_key.ulen <<= 1; if ((ret = __os_realloc(dbp->dbenv, - jc->j_key.ulen, NULL, &jc->j_key.data)) != 0) + jc->j_key.ulen, &jc->j_key.data)) != 0) goto mem_err; goto retry; } @@ -379,7 +390,7 @@ retry: ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0], retry2: cp = jc->j_workcurs[i]; if ((ret = __db_join_getnext(cp, &jc->j_key, key_n, - jc->j_exhausted[i])) == DB_NOTFOUND) { + jc->j_exhausted[i], opmods)) == DB_NOTFOUND) { /* * jc->j_workcurs[i] has no more of the datum we're * interested in. Go back one cursor and get @@ -475,7 +486,7 @@ retry2: cp = jc->j_workcurs[i]; if (ret == ENOMEM) { jc->j_key.ulen <<= 1; if ((ret = __os_realloc(dbp->dbenv, jc->j_key.ulen, - NULL, &jc->j_key.data)) != 0) { + &jc->j_key.data)) != 0) { mem_err: __db_err(dbp->dbenv, "Allocation failed for join key, len = %lu", (u_long)jc->j_key.ulen); @@ -523,8 +534,8 @@ samekey: /* * Get the key we tried and failed to return last time; * it should be the current datum of all the secondary cursors. */ - if ((ret = jc->j_workcurs[0]->c_get(jc->j_workcurs[0], - &jc->j_key, key_n, DB_CURRENT)) != 0) + if ((ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0], + &jc->j_key, key_n, DB_CURRENT | opmods)) != 0) return (ret); F_CLR(jc, JOIN_RETRY); } @@ -532,36 +543,28 @@ samekey: /* /* * ret == 0; we have a key to return. * - * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to - * copy it back into the dbt we were given for the key; - * call __db_retcopy. - * - * Otherwise, assert that we do not in fact need to copy anything - * and simply proceed. + * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key + * back into the dbt we were given for the key; call __db_retcopy. + * Otherwise, assert that we do not need to copy anything and proceed. 
*/ - if (F_ISSET(key_arg, DB_DBT_USERMEM) || - F_ISSET(key_arg, DB_DBT_MALLOC)) { + DB_ASSERT(F_ISSET( + key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC) || key_n == key_arg); + + if (F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC) && + (ret = __db_retcopy(dbp->dbenv, + key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) { /* - * We need to copy the key back into our original - * datum. Do so. + * The retcopy failed, most commonly because we have a user + * buffer for the key which is too small. Set things up to + * retry next time, and return. */ - if ((ret = __db_retcopy(dbp, - key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) { - /* - * The retcopy failed, most commonly because we - * have a user buffer for the key which is too small. - * Set things up to retry next time, and return. - */ - F_SET(jc, JOIN_RETRY); - return (ret); - } - } else - DB_ASSERT(key_n == key_arg); + F_SET(jc, JOIN_RETRY); + return (ret); + } /* - * If DB_JOIN_ITEM is - * set, we return it; otherwise we do the lookup in the - * primary and then return. + * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup + * in the primary and then return. * * Note that we use key_arg here; it is safe (and appropriate) * to do so. @@ -569,14 +572,45 @@ samekey: /* if (operation == DB_JOIN_ITEM) return (0); - if ((ret = jc->j_primary->get(jc->j_primary, - jc->j_curslist[0]->txn, key_arg, data_arg, 0)) != 0) - /* - * The get on the primary failed, most commonly because we're - * using a user buffer that's not big enough. Flag our - * failure so we can return the same key next time. - */ - F_SET(jc, JOIN_RETRY); + /* + * If data_arg->flags == 0--that is, if DB is managing the + * data DBT's memory--it's not safe to just pass the DBT + * through to the primary get call, since we don't want that + * memory to belong to the primary DB handle (and if the primary + * is free-threaded, it can't anyway). + * + * Instead, use memory that is managed by the join cursor, in + * jc->j_rdata. 
+ */ + if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM)) + db_manage_data = 1; + else + db_manage_data = 0; + if ((ret = __db_join_primget(jc->j_primary, + jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_arg, + db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) { + if (ret == DB_NOTFOUND) + /* + * If ret == DB_NOTFOUND, the primary and secondary + * are out of sync; every item in each secondary + * should correspond to something in the primary, + * or we shouldn't have done the join this way. + * Wail. + */ + ret = __db_secondary_corrupt(jc->j_primary); + else + /* + * The get on the primary failed for some other + * reason, most commonly because we're using a user + * buffer that's not big enough. Flag our failure + * so we can return the same key next time. + */ + F_SET(jc, JOIN_RETRY); + } + if (db_manage_data && ret == 0) { + data_arg->data = jc->j_rdata.data; + data_arg->size = jc->j_rdata.size; + } return (ret); } @@ -586,12 +620,14 @@ __db_join_close(dbc) DBC *dbc; { DB *dbp; + DB_ENV *dbenv; JOIN_CURSOR *jc; int ret, t_ret; u_int32_t i; jc = (JOIN_CURSOR *)dbc->internal; dbp = dbc->dbp; + dbenv = dbp->dbenv; ret = t_ret = 0; /* @@ -599,11 +635,11 @@ __db_join_close(dbc) * must happen before any action that can fail and return, or else * __db_close may loop indefinitely. */ - MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); TAILQ_REMOVE(&dbp->join_queue, dbc, links); - MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); - PANIC_CHECK(dbc->dbp->dbenv); + PANIC_CHECK(dbenv); /* * Close any open scratch cursors. 
In each case, there may @@ -625,13 +661,15 @@ __db_join_close(dbc) ret = t_ret; } - __os_free(jc->j_exhausted, 0); - __os_free(jc->j_curslist, 0); - __os_free(jc->j_workcurs, 0); - __os_free(jc->j_fdupcurs, 0); - __os_free(jc->j_key.data, jc->j_key.ulen); - __os_free(jc, sizeof(JOIN_CURSOR)); - __os_free(dbc, sizeof(DBC)); + __os_free(dbenv, jc->j_exhausted); + __os_free(dbenv, jc->j_curslist); + __os_free(dbenv, jc->j_workcurs); + __os_free(dbenv, jc->j_fdupcurs); + __os_free(dbenv, jc->j_key.data); + if (jc->j_rdata.data != NULL) + __os_ufree(dbenv, jc->j_rdata.data); + __os_free(dbenv, jc); + __os_free(dbenv, dbc); return (ret); } @@ -652,10 +690,10 @@ __db_join_close(dbc) * If no matching datum exists, returns DB_NOTFOUND, else 0. */ static int -__db_join_getnext(dbc, key, data, exhausted) +__db_join_getnext(dbc, key, data, exhausted, opmods) DBC *dbc; DBT *key, *data; - u_int32_t exhausted; + u_int32_t exhausted, opmods; { int ret, cmp; DB *dbp; @@ -667,10 +705,14 @@ __db_join_getnext(dbc, key, data, exhausted) switch (exhausted) { case 0: + /* + * We don't want to step on data->data; use a new + * DBT and malloc so we don't step on dbc's rdata memory. + */ memset(&ldata, 0, sizeof(DBT)); - /* We don't want to step on data->data; malloc. */ F_SET(&ldata, DB_DBT_MALLOC); - if ((ret = dbc->c_get(dbc, key, &ldata, DB_CURRENT)) != 0) + if ((ret = dbc->c_real_get(dbc, + key, &ldata, opmods | DB_CURRENT)) != 0) break; cmp = func(dbp, data, &ldata); if (cmp == 0) { @@ -679,10 +721,10 @@ __db_join_getnext(dbc, key, data, exhausted) * it into data, then free the buffer we malloc'ed * above. */ - if ((ret = __db_retcopy(dbp, data, ldata.data, + if ((ret = __db_retcopy(dbp->dbenv, data, ldata.data, ldata.size, &data->data, &data->size)) != 0) return (ret); - __os_free(ldata.data, 0); + __os_ufree(dbp->dbenv, ldata.data); return (0); } @@ -691,10 +733,10 @@ __db_join_getnext(dbc, key, data, exhausted) * dups. 
We just forget about ldata and free * its buffer--data contains the value we're searching for. */ - __os_free(ldata.data, 0); + __os_ufree(dbp->dbenv, ldata.data); /* FALLTHROUGH */ case 1: - ret = dbc->c_get(dbc, key, data, DB_GET_BOTHC); + ret = dbc->c_real_get(dbc, key, data, opmods | DB_GET_BOTHC); break; default: ret = EINVAL; @@ -708,7 +750,6 @@ __db_join_getnext(dbc, key, data, exhausted) * __db_join_cmp -- * Comparison function for sorting DBCs in cardinality order. */ - static int __db_join_cmp(a, b) const void *a, *b; @@ -728,3 +769,54 @@ __db_join_cmp(a, b) return (counta - countb); } + +/* + * __db_join_primget -- + * Perform a DB->get in the primary, being careful not to use a new + * locker ID if we're doing CDB locking. + */ +static int +__db_join_primget(dbp, txn, lockerid, key, data, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t lockerid; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + int dirty, ret, rmw, t_ret; + + /* + * The only allowable flags here are the two flags copied into + * "opmods" in __db_join_get, DB_RMW and DB_DIRTY_READ. The former + * is an op on the c_get call, the latter on the cursor call. + * It's a DB bug if we allow any other flags down in here. + */ + rmw = LF_ISSET(DB_RMW); + dirty = LF_ISSET(DB_DIRTY_READ); + LF_CLR(DB_RMW | DB_DIRTY_READ); + DB_ASSERT(flags == 0); + + if ((ret = __db_icursor(dbp, + txn, dbp->type, PGNO_INVALID, 0, lockerid, &dbc)) != 0) + return (ret); + + if (dirty || + (txn != NULL && F_ISSET(txn, TXN_DIRTY_READ))) + F_SET(dbc, DBC_DIRTY_READ); + F_SET(dbc, DBC_TRANSIENT); + + /* + * This shouldn't be necessary, thanks to the fact that join cursors + * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's + * sake, we mirror what __db_get does. 
+ */ + SET_RET_MEM(dbc, dbp); + + ret = dbc->c_get(dbc, key, data, DB_SET | rmw); + + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} diff --git a/bdb/db/db_meta.c b/bdb/db/db_meta.c index 5b57c369454..015ef5c8fc7 100644 --- a/bdb/db/db_meta.c +++ b/bdb/db/db_meta.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_meta.c,v 11.26 2001/01/16 21:57:19 ubell Exp $"; +static const char revid[] = "$Id: db_meta.c,v 11.61 2002/08/08 03:57:48 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -53,12 +53,37 @@ static const char revid[] = "$Id: db_meta.c,v 11.26 2001/01/16 21:57:19 ubell Ex #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "lock.h" -#include "txn.h" -#include "db_am.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/db_am.h" + +static void __db_init_meta __P((void *, u_int32_t, db_pgno_t, u_int32_t)); + +/* + * __db_init_meta -- + * Helper function for __db_new that initializes the important fields in + * a meta-data page (used instead of P_INIT). We need to make sure that we + * retain the page number and LSN of the existing page. 
+ */ +static void +__db_init_meta(p, pgsize, pgno, pgtype) + void *p; + u_int32_t pgsize; + db_pgno_t pgno; + u_int32_t pgtype; +{ + DB_LSN save_lsn; + DBMETA *meta; + + meta = (DBMETA *)p; + save_lsn = meta->lsn; + memset(meta, 0, sizeof(DBMETA)); + meta->lsn = save_lsn; + meta->pagesize = pgsize; + meta->pgno = pgno; + meta->type = (u_int8_t)pgtype; +} /* * __db_new -- @@ -75,60 +100,110 @@ __db_new(dbc, type, pagepp) DBMETA *meta; DB *dbp; DB_LOCK metalock; + DB_LSN lsn; + DB_MPOOLFILE *mpf; PAGE *h; - db_pgno_t pgno; - int ret; + db_pgno_t pgno, newnext; + int meta_flags, extend, ret; - dbp = dbc->dbp; meta = NULL; + meta_flags = 0; + dbp = dbc->dbp; + mpf = dbp->mpf; h = NULL; + newnext = PGNO_INVALID; pgno = PGNO_BASE_MD; if ((ret = __db_lget(dbc, LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0) goto err; - if (meta->free == PGNO_INVALID) { - if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_NEW, &h)) != 0) - goto err; - ZERO_LSN(h->lsn); - h->pgno = pgno; + pgno = meta->last_pgno + 1; + ZERO_LSN(lsn); + extend = 1; } else { pgno = meta->free; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) goto err; - meta->free = h->next_pgno; - (void)memp_fset(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); + + /* + * We want to take the first page off the free list and + * then set meta->free to the that page's next_pgno, but + * we need to log the change first. + */ + newnext = h->next_pgno; + lsn = h->lsn; + extend = 0; } - DB_ASSERT(TYPE(h) == P_INVALID); + /* + * Log the allocation before fetching the new page. If we + * don't have room in the log then we don't want to tell + * mpool to extend the file. 
+ */ + if (DBC_LOGGING(dbc)) { + if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0, + &LSN(meta), PGNO_BASE_MD, &lsn, pgno, + (u_int32_t)type, newnext)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(meta)); - if (TYPE(h) != P_INVALID) - return (__db_panic(dbp->dbenv, EINVAL)); + meta_flags = DB_MPOOL_DIRTY; + meta->free = newnext; - /* Log the change. */ - if (DB_LOGGING(dbc)) { - if ((ret = __db_pg_alloc_log(dbp->dbenv, - dbc->txn, &LSN(meta), 0, dbp->log_fileid, - &LSN(meta), &h->lsn, h->pgno, - (u_int32_t)type, meta->free)) != 0) + if (extend == 1) { + meta->last_pgno++; + if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_NEW, &h)) != 0) goto err; - LSN(h) = LSN(meta); + ZERO_LSN(h->lsn); + h->pgno = pgno; + DB_ASSERT(pgno == meta->last_pgno); } + LSN(h) = LSN(meta); + + DB_ASSERT(TYPE(h) == P_INVALID); + + if (TYPE(h) != P_INVALID) + return (__db_panic(dbp->dbenv, EINVAL)); - (void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); + (void)mpf->put(mpf, (PAGE *)meta, DB_MPOOL_DIRTY); (void)__TLPUT(dbc, metalock); - P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); + switch (type) { + case P_BTREEMETA: + case P_HASHMETA: + case P_QAMMETA: + __db_init_meta(h, dbp->pgsize, h->pgno, type); + break; + default: + P_INIT(h, dbp->pgsize, + h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); + break; + } + + /* + * If dirty reads are enabled and we are in a transaction, we could + * abort this allocation after the page(s) pointing to this + * one have their locks downgraded. This would permit dirty readers + * to access this page which is ok, but they must be off the + * page when we abort. This will also prevent updates happening + * to this page until we commit. 
+ */ + if (F_ISSET(dbc->dbp, DB_AM_DIRTY) && dbc->txn != NULL) { + if ((ret = __db_lget(dbc, 0, + h->pgno, DB_LOCK_WWRITE, 0, &metalock)) != 0) + goto err; + } *pagepp = h; return (0); err: if (h != NULL) - (void)memp_fput(dbp->mpf, h, 0); + (void)mpf->put(mpf, h, 0); if (meta != NULL) - (void)memp_fput(dbp->mpf, meta, 0); + (void)mpf->put(mpf, meta, meta_flags); (void)__TLPUT(dbc, metalock); return (ret); } @@ -148,11 +223,13 @@ __db_free(dbc, h) DB *dbp; DBT ldbt; DB_LOCK metalock; + DB_MPOOLFILE *mpf; db_pgno_t pgno; u_int32_t dirty_flag; int ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; /* * Retrieve the metadata page and insert the page at the head of @@ -163,43 +240,44 @@ __db_free(dbc, h) dirty_flag = 0; pgno = PGNO_BASE_MD; if ((ret = __db_lget(dbc, - LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) { + if ((ret = mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0) { (void)__TLPUT(dbc, metalock); goto err; } DB_ASSERT(h->pgno != meta->free); /* Log the change. */ - if (DB_LOGGING(dbc)) { + if (DBC_LOGGING(dbc)) { memset(&ldbt, 0, sizeof(ldbt)); ldbt.data = h; - ldbt.size = P_OVERHEAD; - if ((ret = __db_pg_free_log(dbp->dbenv, - dbc->txn, &LSN(meta), 0, dbp->log_fileid, h->pgno, - &LSN(meta), &ldbt, meta->free)) != 0) { - (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); + ldbt.size = P_OVERHEAD(dbp); + if ((ret = __db_pg_free_log(dbp, + dbc->txn, &LSN(meta), 0, h->pgno, + &LSN(meta), PGNO_BASE_MD, &ldbt, meta->free)) != 0) { + (void)mpf->put(mpf, (PAGE *)meta, 0); (void)__TLPUT(dbc, metalock); - return (ret); + goto err; } - LSN(h) = LSN(meta); - } + } else + LSN_NOT_LOGGED(LSN(meta)); + LSN(h) = LSN(meta); P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, meta->free, 0, P_INVALID); meta->free = h->pgno; /* Discard the metadata page. 
*/ - if ((t_ret = memp_fput(dbp->mpf, - (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0) + if ((t_ret = + mpf->put(mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) ret = t_ret; /* Discard the caller's page reference. */ dirty_flag = DB_MPOOL_DIRTY; -err: if ((t_ret = memp_fput(dbp->mpf, h, dirty_flag)) != 0 && ret == 0) +err: if ((t_ret = mpf->put(mpf, h, dirty_flag)) != 0 && ret == 0) ret = t_ret; /* @@ -227,44 +305,63 @@ __db_lprint(dbc) if (LOCKING_ON(dbp->dbenv)) { req.op = DB_LOCK_DUMP; - lock_vec(dbp->dbenv, dbc->locker, 0, &req, 1, NULL); + dbp->dbenv->lock_vec(dbp->dbenv, dbc->locker, 0, &req, 1, NULL); } return (0); } #endif /* + * Implement the rules for transactional locking. We can release the previous + * lock if we are not in a transaction or COUPLE_ALWAYS is specifed (used in + * record locking). If we are doing dirty reads then we can release read locks + * and down grade write locks. + */ +#define DB_PUT_ACTION(dbc, action, lockp) \ + (((action == LCK_COUPLE || action == LCK_COUPLE_ALWAYS) && \ + LOCK_ISSET(*lockp)) ? \ + (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS || \ + (F_ISSET(dbc, DBC_DIRTY_READ) && \ + (lockp)->mode == DB_LOCK_DIRTY)) ? LCK_COUPLE : \ + (F_ISSET((dbc)->dbp, DB_AM_DIRTY) && \ + (lockp)->mode == DB_LOCK_WRITE) ? LCK_DOWNGRADE : 0 : 0) + +/* * __db_lget -- * The standard lock get call. 
* * PUBLIC: int __db_lget __P((DBC *, - * PUBLIC: int, db_pgno_t, db_lockmode_t, int, DB_LOCK *)); + * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *)); */ int -__db_lget(dbc, flags, pgno, mode, lkflags, lockp) +__db_lget(dbc, action, pgno, mode, lkflags, lockp) DBC *dbc; - int flags, lkflags; + int action; db_pgno_t pgno; db_lockmode_t mode; + u_int32_t lkflags; DB_LOCK *lockp; { DB *dbp; DB_ENV *dbenv; DB_LOCKREQ couple[2], *reqp; - int ret; + DB_TXN *txn; + int has_timeout, ret; dbp = dbc->dbp; dbenv = dbp->dbenv; + txn = dbc->txn; /* * We do not always check if we're configured for locking before * calling __db_lget to acquire the lock. */ - if (CDB_LOCKING(dbenv) - || !LOCKING_ON(dbenv) || F_ISSET(dbc, DBC_COMPENSATE) - || (!LF_ISSET(LCK_ROLLBACK) && F_ISSET(dbc, DBC_RECOVER)) - || (!LF_ISSET(LCK_ALWAYS) && F_ISSET(dbc, DBC_OPD))) { - lockp->off = LOCK_INVALID; + if (CDB_LOCKING(dbenv) || + !LOCKING_ON(dbenv) || F_ISSET(dbc, DBC_COMPENSATE) || + (F_ISSET(dbc, DBC_RECOVER) && + (action != LCK_ROLLBACK || F_ISSET(dbenv, DB_ENV_REP_CLIENT))) || + (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) { + LOCK_INIT(*lockp); return (0); } @@ -282,27 +379,73 @@ __db_lget(dbc, flags, pgno, mode, lkflags, lockp) if (DB_NONBLOCK(dbc)) lkflags |= DB_LOCK_NOWAIT; - /* - * If the object not currently locked, acquire the lock and return, - * otherwise, lock couple. - */ - if (LF_ISSET(LCK_COUPLE)) { - couple[0].op = DB_LOCK_GET; + if (F_ISSET(dbc, DBC_DIRTY_READ) && mode == DB_LOCK_READ) + mode = DB_LOCK_DIRTY; + + has_timeout = txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT); + + switch (DB_PUT_ACTION(dbc, action, lockp)) { + case LCK_COUPLE: +lck_couple: couple[0].op = has_timeout? 
DB_LOCK_GET_TIMEOUT : DB_LOCK_GET; couple[0].obj = &dbc->lock_dbt; couple[0].mode = mode; - couple[1].op = DB_LOCK_PUT; - couple[1].lock = *lockp; + if (action == LCK_COUPLE_ALWAYS) + action = LCK_COUPLE; + UMRW_SET(couple[0].timeout); + if (has_timeout) + couple[0].timeout = txn->lock_timeout; + if (action == LCK_COUPLE) { + couple[1].op = DB_LOCK_PUT; + couple[1].lock = *lockp; + } - ret = lock_vec(dbenv, - dbc->locker, lkflags, couple, 2, &reqp); + ret = dbenv->lock_vec(dbenv, dbc->locker, + lkflags, couple, action == LCK_COUPLE ? 2 : 1, &reqp); if (ret == 0 || reqp == &couple[1]) *lockp = couple[0].lock; - } else { - ret = lock_get(dbenv, + break; + case LCK_DOWNGRADE: + if ((ret = dbenv->lock_downgrade( + dbenv, lockp, DB_LOCK_WWRITE, 0)) != 0) + return (ret); + /* FALL THROUGH */ + default: + if (has_timeout) + goto lck_couple; + ret = dbenv->lock_get(dbenv, dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp); + break; + } + + return (ret); +} + +/* + * __db_lput -- + * The standard lock put call. + * + * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *)); + */ +int +__db_lput(dbc, lockp) + DBC *dbc; + DB_LOCK *lockp; +{ + DB_ENV *dbenv; + int ret; - if (ret != 0) - lockp->off = LOCK_INVALID; + dbenv = dbc->dbp->dbenv; + + switch (DB_PUT_ACTION(dbc, LCK_COUPLE, lockp)) { + case LCK_COUPLE: + ret = dbenv->lock_put(dbenv, lockp); + break; + case LCK_DOWNGRADE: + ret = __lock_downgrade(dbenv, lockp, DB_LOCK_WWRITE, 0); + break; + default: + ret = 0; + break; } return (ret); diff --git a/bdb/db/db_method.c b/bdb/db/db_method.c index 01568a6e144..14712180df0 100644 --- a/bdb/db/db_method.c +++ b/bdb/db/db_method.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2000 + * Copyright (c) 1999-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_method.c,v 11.36 2000/12/21 09:17:04 krinsky Exp $"; +static const char revid[] = "$Id: db_method.c,v 11.78 2002/07/02 19:26:55 sue Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -21,50 +21,56 @@ static const char revid[] = "$Id: db_method.c,v 11.36 2000/12/21 09:17:04 krinsk #include <string.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_page.h" -#include "db_am.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" -#include "xa.h" -#include "xa_ext.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" +#include "dbinc/xa.h" +#include "dbinc_auto/xa_ext.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" #ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#include "dbinc_auto/db_server.h" +#include "dbinc_auto/rpc_client_ext.h" #endif -static int __db_get_byteswapped __P((DB *)); -static DBTYPE - __db_get_type __P((DB *)); +static int __db_get_byteswapped __P((DB *, int *)); +static int __db_get_type __P((DB *, DBTYPE *dbtype)); static int __db_init __P((DB *, u_int32_t)); static int __db_key_range __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)); +static int __db_set_alloc __P((DB *, void *(*)(size_t), + void *(*)(void *, size_t), void (*)(void *))); static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t))); static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int)); +static int __db_set_cache_priority __P((DB *, DB_CACHE_PRIORITY)); static int __db_set_dup_compare __P((DB *, int (*)(DB *, const DBT *, const DBT *))); -static void __db_set_errcall __P((DB *, void (*)(const char *, char *))); -static void __db_set_errfile __P((DB *, FILE *)); +static int __db_set_encrypt __P((DB *, const char *, u_int32_t)); static int __db_set_feedback __P((DB *, void 
(*)(DB *, int, int))); static int __db_set_flags __P((DB *, u_int32_t)); -static int __db_set_lorder __P((DB *, int)); -static int __db_set_malloc __P((DB *, void *(*)(size_t))); static int __db_set_pagesize __P((DB *, u_int32_t)); -static int __db_set_realloc __P((DB *, void *(*)(void *, size_t))); -static void __db_set_errpfx __P((DB *, const char *)); static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int))); +static void __db_set_errcall __P((DB *, void (*)(const char *, char *))); +static void __db_set_errfile __P((DB *, FILE *)); +static void __db_set_errpfx __P((DB *, const char *)); +static int __db_stat_fail __P((DB *, void *, u_int32_t)); static void __dbh_err __P((DB *, int, const char *, ...)); static void __dbh_errx __P((DB *, const char *, ...)); +#ifdef HAVE_RPC +static int __dbcl_init __P((DB *, DB_ENV *, u_int32_t)); +#endif + /* * db_create -- * DB constructor. + * + * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t)); */ int db_create(dbpp, dbenv, flags) @@ -102,27 +108,25 @@ db_create(dbpp, dbenv, flags) if ((ret = __os_calloc(dbenv, 1, sizeof(*dbp), &dbp)) != 0) return (ret); #ifdef HAVE_RPC - if (dbenv != NULL && dbenv->cl_handle != NULL) + if (dbenv != NULL && RPC_ON(dbenv)) ret = __dbcl_init(dbp, dbenv, flags); else #endif ret = __db_init(dbp, flags); if (ret != 0) { - __os_free(dbp, sizeof(*dbp)); + __os_free(dbenv, dbp); return (ret); } /* If we don't have an environment yet, allocate a local one. 
*/ if (dbenv == NULL) { if ((ret = db_env_create(&dbenv, 0)) != 0) { - __os_free(dbp, sizeof(*dbp)); + __os_free(dbenv, dbp); return (ret); } - dbenv->dblocal_ref = 0; F_SET(dbenv, DB_ENV_DBLOCAL); } - if (F_ISSET(dbenv, DB_ENV_DBLOCAL)) - ++dbenv->dblocal_ref; + ++dbenv->db_ref; dbp->dbenv = dbenv; @@ -141,18 +145,21 @@ __db_init(dbp, flags) { int ret; - dbp->log_fileid = DB_LOGFILEID_INVALID; + dbp->lid = DB_LOCK_INVALIDID; + LOCK_INIT(dbp->handle_lock); TAILQ_INIT(&dbp->free_queue); TAILQ_INIT(&dbp->active_queue); TAILQ_INIT(&dbp->join_queue); + LIST_INIT(&dbp->s_secondaries); FLD_SET(dbp->am_ok, DB_OK_BTREE | DB_OK_HASH | DB_OK_QUEUE | DB_OK_RECNO); + dbp->associate = __db_associate; dbp->close = __db_close; dbp->cursor = __db_cursor; - dbp->del = NULL; /* !!! Must be set by access method. */ + dbp->del = __db_delete; dbp->err = __dbh_err; dbp->errx = __dbh_errx; dbp->fd = __db_fd; @@ -162,26 +169,30 @@ __db_init(dbp, flags) dbp->join = __db_join; dbp->key_range = __db_key_range; dbp->open = __db_open; + dbp->pget = __db_pget; dbp->put = __db_put; dbp->remove = __db_remove; dbp->rename = __db_rename; + dbp->truncate = __db_truncate; + dbp->set_alloc = __db_set_alloc; dbp->set_append_recno = __db_set_append_recno; dbp->set_cachesize = __db_set_cachesize; + dbp->set_cache_priority = __db_set_cache_priority; dbp->set_dup_compare = __db_set_dup_compare; + dbp->set_encrypt = __db_set_encrypt; dbp->set_errcall = __db_set_errcall; dbp->set_errfile = __db_set_errfile; dbp->set_errpfx = __db_set_errpfx; dbp->set_feedback = __db_set_feedback; dbp->set_flags = __db_set_flags; dbp->set_lorder = __db_set_lorder; - dbp->set_malloc = __db_set_malloc; dbp->set_pagesize = __db_set_pagesize; dbp->set_paniccall = __db_set_paniccall; - dbp->set_realloc = __db_set_realloc; - dbp->stat = NULL; /* !!! Must be set by access method. */ + dbp->stat = __db_stat_fail; dbp->sync = __db_sync; dbp->upgrade = __db_upgrade; dbp->verify = __db_verify; + /* Access method specific. 
*/ if ((ret = __bam_db_create(dbp)) != 0) return (ret); @@ -244,16 +255,7 @@ __dbh_err(dbp, error, fmt, va_alist) va_dcl #endif { - va_list ap; - -#ifdef __STDC__ - va_start(ap, fmt); -#else - va_start(ap); -#endif - __db_real_err(dbp->dbenv, error, 1, 1, fmt, ap); - - va_end(ap); + DB_REAL_ERR(dbp->dbenv, error, 1, 1, fmt); } /* @@ -270,16 +272,7 @@ __dbh_errx(dbp, fmt, va_alist) va_dcl #endif { - va_list ap; - -#ifdef __STDC__ - va_start(ap, fmt); -#else - va_start(ap); -#endif - __db_real_err(dbp->dbenv, 0, 0, 1, fmt, ap); - - va_end(ap); + DB_REAL_ERR(dbp->dbenv, 0, 0, 1, fmt); } /* @@ -287,25 +280,29 @@ __dbh_errx(dbp, fmt, va_alist) * Return if database requires byte swapping. */ static int -__db_get_byteswapped(dbp) +__db_get_byteswapped(dbp, isswapped) DB *dbp; + int *isswapped; { DB_ILLEGAL_BEFORE_OPEN(dbp, "get_byteswapped"); - return (F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0); + *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0; + return (0); } /* * __db_get_type -- * Return type of underlying database. */ -static DBTYPE -__db_get_type(dbp) +static int +__db_get_type(dbp, dbtype) DB *dbp; + DBTYPE *dbtype; { DB_ILLEGAL_BEFORE_OPEN(dbp, "get_type"); - return (dbp->type); + *dbtype = dbp->type; + return (0); } /* @@ -366,6 +363,26 @@ __db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache) } /* + * __db_set_cache_priority -- + * Set cache priority for pages from this file. + */ +static int +__db_set_cache_priority(dbp, priority) + DB *dbp; + DB_CACHE_PRIORITY priority; +{ + /* + * If an underlying DB_MPOOLFILE exists, call it. Otherwise, save + * the information away until DB->open is called. + */ + if (dbp->mpf == NULL) { + dbp->priority = priority; + return (0); + } + return (dbp->mpf->set_priority(dbp->mpf, priority)); +} + +/* * __db_set_dup_compare -- * Set duplicate comparison routine. 
*/ @@ -374,14 +391,50 @@ __db_set_dup_compare(dbp, func) DB *dbp; int (*func) __P((DB *, const DBT *, const DBT *)); { + int ret; + DB_ILLEGAL_AFTER_OPEN(dbp, "dup_compare"); DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + if ((ret = dbp->set_flags(dbp, DB_DUPSORT)) != 0) + return (ret); + dbp->dup_compare = func; return (0); } +/* + * __db_set_encrypt -- + * Set database passwd. + */ +static int +__db_set_encrypt(dbp, passwd, flags) + DB *dbp; + const char *passwd; + u_int32_t flags; +{ + DB_CIPHER *db_cipher; + int ret; + + DB_ILLEGAL_IN_ENV(dbp, "set_encrypt"); + DB_ILLEGAL_AFTER_OPEN(dbp, "set_encrypt"); + + if ((ret = dbp->dbenv->set_encrypt(dbp->dbenv, passwd, flags)) != 0) + return (ret); + + /* + * In a real env, this gets initialized with the region. In a local + * env, we must do it here. + */ + db_cipher = (DB_CIPHER *)dbp->dbenv->crypto_handle; + if (!F_ISSET(db_cipher, CIPHER_ANY) && + (ret = db_cipher->init(dbp->dbenv, db_cipher)) != 0) + return (ret); + + return (dbp->set_flags(dbp, DB_ENCRYPT)); +} + static void __db_set_errcall(dbp, errcall) DB *dbp; @@ -430,6 +483,21 @@ __db_set_flags(dbp, flags) * * The queue access method takes no flags. */ + if (LF_ISSET(DB_ENCRYPT)) { + if (!CRYPTO_ON(dbp->dbenv)) { + __db_err(dbp->dbenv, + "Database environment not configured for encryption"); + return (EINVAL); + } + F_SET(dbp, DB_AM_ENCRYPT); + F_SET(dbp, DB_AM_CHKSUM); + LF_CLR(DB_ENCRYPT); + } + if (LF_ISSET(DB_CHKSUM_SHA1)) { + F_SET(dbp, DB_AM_CHKSUM); + LF_CLR(DB_CHKSUM_SHA1); + } + if ((ret = __bam_set_flags(dbp, &flags)) != 0) return (ret); if ((ret = __ram_set_flags(dbp, &flags)) != 0) @@ -438,7 +506,13 @@ __db_set_flags(dbp, flags) return (flags == 0 ? 0 : __db_ferr(dbp->dbenv, "DB->set_flags", 0)); } -static int +/* + * __db_set_lorder -- + * Set whether lorder is swapped or not. 
+ * + * PUBLIC: int __db_set_lorder __P((DB *, int)); + */ +int __db_set_lorder(dbp, db_lorder) DB *dbp; int db_lorder; @@ -463,14 +537,17 @@ __db_set_lorder(dbp, db_lorder) } static int -__db_set_malloc(dbp, func) +__db_set_alloc(dbp, mal_func, real_func, free_func) DB *dbp; - void *(*func) __P((size_t)); + void *(*mal_func) __P((size_t)); + void *(*real_func) __P((void *, size_t)); + void (*free_func) __P((void *)); { - DB_ILLEGAL_AFTER_OPEN(dbp, "set_malloc"); + DB_ILLEGAL_IN_ENV(dbp, "set_alloc"); + DB_ILLEGAL_AFTER_OPEN(dbp, "set_alloc"); - dbp->db_malloc = func; - return (0); + return (dbp->dbenv->set_alloc(dbp->dbenv, + mal_func, real_func, free_func)); } static int @@ -495,7 +572,7 @@ __db_set_pagesize(dbp, db_pagesize) * We don't want anything that's not a power-of-2, as we rely on that * for alignment of various types on the pages. */ - if ((u_int32_t)1 << __db_log2(db_pagesize) != db_pagesize) { + if (!POWER_OF_TWO(db_pagesize)) { __db_err(dbp->dbenv, "page sizes must be a power-of-2"); return (EINVAL); } @@ -511,44 +588,44 @@ __db_set_pagesize(dbp, db_pagesize) } static int -__db_set_realloc(dbp, func) +__db_set_paniccall(dbp, paniccall) DB *dbp; - void *(*func) __P((void *, size_t)); + void (*paniccall) __P((DB_ENV *, int)); { - DB_ILLEGAL_AFTER_OPEN(dbp, "set_realloc"); - - dbp->db_realloc = func; - return (0); + return (dbp->dbenv->set_paniccall(dbp->dbenv, paniccall)); } static int -__db_set_paniccall(dbp, paniccall) +__db_stat_fail(dbp, sp, flags) DB *dbp; - void (*paniccall) __P((DB_ENV *, int)); + void *sp; + u_int32_t flags; { - return (dbp->dbenv->set_paniccall(dbp->dbenv, paniccall)); + COMPQUIET(sp, NULL); + COMPQUIET(flags, 0); + + /* + * DB->stat isn't initialized until the actual DB->open call, + * but we don't want to core dump. + */ + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat"); + + /* NOTREACHED */ + return (EINVAL); } #ifdef HAVE_RPC /* * __dbcl_init -- * Initialize a DB structure on the server. 
- * - * PUBLIC: #ifdef HAVE_RPC - * PUBLIC: int __dbcl_init __P((DB *, DB_ENV *, u_int32_t)); - * PUBLIC: #endif */ -int +static int __dbcl_init(dbp, dbenv, flags) DB *dbp; DB_ENV *dbenv; u_int32_t flags; { - CLIENT *cl; - __db_create_reply *replyp; - __db_create_msg req; - int ret; - TAILQ_INIT(&dbp->free_queue); TAILQ_INIT(&dbp->active_queue); /* !!! @@ -556,6 +633,7 @@ __dbcl_init(dbp, dbenv, flags) * not used in RPC clients. See the comment in __dbcl_db_join_ret(). */ + dbp->associate = __dbcl_db_associate; dbp->close = __dbcl_db_close; dbp->cursor = __dbcl_db_cursor; dbp->del = __dbcl_db_del; @@ -563,31 +641,34 @@ __dbcl_init(dbp, dbenv, flags) dbp->errx = __dbh_errx; dbp->fd = __dbcl_db_fd; dbp->get = __dbcl_db_get; - dbp->get_byteswapped = __dbcl_db_swapped; + dbp->get_byteswapped = __db_get_byteswapped; dbp->get_type = __db_get_type; dbp->join = __dbcl_db_join; dbp->key_range = __dbcl_db_key_range; - dbp->open = __dbcl_db_open; + dbp->open = __dbcl_db_open_wrap; + dbp->pget = __dbcl_db_pget; dbp->put = __dbcl_db_put; dbp->remove = __dbcl_db_remove; dbp->rename = __dbcl_db_rename; + dbp->set_alloc = __dbcl_db_alloc; dbp->set_append_recno = __dbcl_db_set_append_recno; dbp->set_cachesize = __dbcl_db_cachesize; - dbp->set_dup_compare = NULL; + dbp->set_cache_priority = __dbcl_db_cache_priority; + dbp->set_dup_compare = __dbcl_db_dup_compare; + dbp->set_encrypt = __dbcl_db_encrypt; dbp->set_errcall = __db_set_errcall; dbp->set_errfile = __db_set_errfile; dbp->set_errpfx = __db_set_errpfx; dbp->set_feedback = __dbcl_db_feedback; dbp->set_flags = __dbcl_db_flags; dbp->set_lorder = __dbcl_db_lorder; - dbp->set_malloc = __dbcl_db_malloc; dbp->set_pagesize = __dbcl_db_pagesize; dbp->set_paniccall = __dbcl_db_panic; - dbp->set_q_extentsize = __dbcl_db_extentsize; - dbp->set_realloc = __dbcl_db_realloc; dbp->stat = __dbcl_db_stat; dbp->sync = __dbcl_db_sync; + dbp->truncate = __dbcl_db_truncate; dbp->upgrade = __dbcl_db_upgrade; + dbp->verify = __dbcl_db_verify; /* * 
Set all the method specific functions to client funcs as well. @@ -599,31 +680,12 @@ __dbcl_init(dbp, dbenv, flags) dbp->set_h_ffactor = __dbcl_db_h_ffactor; dbp->set_h_hash = __dbcl_db_h_hash; dbp->set_h_nelem = __dbcl_db_h_nelem; + dbp->set_q_extentsize = __dbcl_db_extentsize; dbp->set_re_delim = __dbcl_db_re_delim; dbp->set_re_len = __dbcl_db_re_len; dbp->set_re_pad = __dbcl_db_re_pad; dbp->set_re_source = __dbcl_db_re_source; -/* - dbp->set_q_extentsize = __dbcl_db_q_extentsize; -*/ - - cl = (CLIENT *)dbenv->cl_handle; - req.flags = flags; - req.envpcl_id = dbenv->cl_id; - - /* - * CALL THE SERVER - */ - replyp = __db_db_create_1(&req, cl); - if (replyp == NULL) { - __db_err(dbenv, clnt_sperror(cl, "Berkeley DB")); - return (DB_NOSERVER); - } - if ((ret = replyp->status) != 0) - return (ret); - - dbp->cl_id = replyp->dbpcl_id; - return (0); + return (__dbcl_db_create(dbp, dbenv, flags)); } #endif diff --git a/bdb/db/db_open.c b/bdb/db/db_open.c new file mode 100644 index 00000000000..f6f96cda547 --- /dev/null +++ b/bdb/db/db_open.c @@ -0,0 +1,705 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2002 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_open.c,v 11.215 2002/08/15 15:27:52 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/fop.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +static int __db_openchk __P((DB *, + DB_TXN *, const char *, const char *, DBTYPE, u_int32_t)); + +/* + * __db_open -- + * Main library interface to the DB access methods. + * + * PUBLIC: int __db_open __P((DB *, DB_TXN *, + * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int)); + */ +int +__db_open(dbp, txn, name, subdb, type, flags, mode) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb; + DBTYPE type; + u_int32_t flags; + int mode; +{ + DB_ENV *dbenv; + int remove_master, remove_me, ret, t_ret, txn_local; + + dbenv = dbp->dbenv; + remove_me = remove_master = txn_local = 0; + + PANIC_CHECK(dbenv); + + if ((ret = __db_openchk(dbp, txn, name, subdb, type, flags)) != 0) + return (ret); + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. + */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + } else + if (txn != NULL && !TXN_ON(dbenv)) + return (__db_not_txn_env(dbenv)); + + /* + * If the environment was configured with threads, the DB handle + * must also be free-threaded, so we force the DB_THREAD flag on. 
+ * (See SR #2033 for why this is a requirement--recovery needs + * to be able to grab a dbp using __db_fileid_to_dbp, and it has + * no way of knowing which dbp goes with which thread, so whichever + * one it finds has to be usable in any of them.) + */ + if (F_ISSET(dbenv, DB_ENV_THREAD)) + LF_SET(DB_THREAD); + + /* Convert any DB->open flags. */ + if (LF_ISSET(DB_RDONLY)) + F_SET(dbp, DB_AM_RDONLY); + if (LF_ISSET(DB_DIRTY_READ)) + F_SET(dbp, DB_AM_DIRTY); + + /* Fill in the type. */ + dbp->type = type; + + /* + * If we're opening a subdatabase, we have to open (and potentially + * create) the main database, and then get (and potentially store) + * our base page number in that database. Then, we can finally open + * the subdatabase. + */ + if ((ret = __db_dbopen( + dbp, txn, name, subdb, flags, mode, PGNO_BASE_MD)) != 0) + goto err; + + /* + * You can open the database that describes the subdatabases in the + * rest of the file read-only. The content of each key's data is + * unspecified and applications should never be adding new records + * or updating existing records. However, during recovery, we need + * to open these databases R/W so we can redo/undo changes in them. + * Likewise, we need to open master databases read/write during + * rename and remove so we can be sure they're fully sync'ed, so + * we provide an override flag for the purpose. + */ + if (subdb == NULL && !IS_RECOVERING(dbenv) && !LF_ISSET(DB_RDONLY) && + !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) { + __db_err(dbenv, + "files containing multiple databases may only be opened read-only"); + ret = EINVAL; + goto err; + } + +err: /* If we were successful, don't discard the file on close. */ + if (ret == 0) + /* If we were successful, don't discard the file on close. */ + F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR); + else { + /* + * If we are not transactional, we need to remove the + * databases/subdatabases. 
If we are transactional, then + * the abort of the child transaction should take care of + * cleaning them up. + */ + remove_me = txn == NULL && F_ISSET(dbp, DB_AM_CREATED); + remove_master = txn == NULL && F_ISSET(dbp, DB_AM_CREATED_MSTR); + + /* + * If we had an error, it may have happened before or after + * we actually logged the open. If it happened before, then + * abort won't know anything about it and won't close or + * refresh the dbp, so we need to do it explicitly. + */ + (void)__db_refresh(dbp, txn, DB_NOSYNC); + } + + /* Remove anyone we created. */ + if (remove_master || (subdb == NULL && remove_me)) + /* Remove file. */ + (void)dbenv->dbremove(dbenv, txn, name, NULL, 0); + else if (remove_me) + /* Remove subdatabase. */ + (void)dbenv->dbremove(dbenv, txn, name, subdb, 0); + + /* Commit for DB_AUTO_COMMIT. */ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + } + + return (ret); +} + +/* + * __db_dbopen -- + * Open a database. This routine gets called in three different ways. + * 1. It can be called to open a file/database. In this case, subdb will + * be NULL and meta_pgno will be PGNO_BASE_MD. + * 2. It can be called to open a subdatabase during normal operation. In + * this case, name and subname will both be non-NULL and meta_pgno will + * be PGNO_BAS_MD (also PGNO_INVALID). + * 3. It can be called during recovery to open a subdatabase in which case + * name will be non-NULL, subname mqy be NULL and meta-pgno will be + * a valid pgno (i.e., not PGNO_BASE_MD). 
+ * + * PUBLIC: int __db_dbopen __P((DB *, DB_TXN *, + * PUBLIC: const char *, const char *, u_int32_t, int, db_pgno_t)); + */ +int +__db_dbopen(dbp, txn, name, subdb, flags, mode, meta_pgno) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb; + u_int32_t flags; + int mode; + db_pgno_t meta_pgno; +{ + DB_ENV *dbenv; + int ret; + u_int32_t id; + + dbenv = dbp->dbenv; + id = TXN_INVALID; + if (txn != NULL) + F_SET(dbp, DB_AM_TXN); + + DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, name); + /* + * If name is NULL, it's always a create, so make sure that we + * have a type specified. It would be nice if this checking + * were done in __db_open where most of the interface checking + * is done, but this interface (__db_dbopen) is used by the + * recovery and limbo system, so we need to safeguard this + * interface as well. + */ + if (name == NULL) { + F_SET(dbp, DB_AM_INMEM); + + if (dbp->type == DB_UNKNOWN) { + __db_err(dbenv, + "DBTYPE of unknown without existing file"); + return (EINVAL); + } + + if (dbp->pgsize == 0) + dbp->pgsize = DB_DEF_IOSIZE; + + /* + * If the file is a temporary file and we're doing locking, + * then we have to create a unique file ID. We can't use our + * normal dev/inode pair (or whatever this OS uses in place of + * dev/inode pairs) because no backing file will be created + * until the mpool cache is filled forcing the buffers to disk. + * Grab a random locker ID to use as a file ID. The created + * ID must never match a potential real file ID -- we know it + * won't because real file IDs contain a time stamp after the + * dev/inode pair, and we're simply storing a 4-byte value. + * + * !!! + * Store the locker in the file id structure -- we can get it + * from there as necessary, and it saves having two copies. + */ + if (LOCKING_ON(dbenv) && (ret = dbenv->lock_id(dbenv, + (u_int32_t *)dbp->fileid)) != 0) + return (ret); + } else if (subdb == NULL && meta_pgno == PGNO_BASE_MD) { + /* Open/create the underlying file. Acquire locks. 
*/ + if ((ret = + __fop_file_setup(dbp, txn, name, mode, flags, &id)) != 0) + return (ret); + } else { + if ((ret = __fop_subdb_setup(dbp, + txn, name, subdb, mode, flags)) != 0) + return (ret); + meta_pgno = dbp->meta_pgno; + } + + /* + * If we created the file, set the truncate flag for the mpool. This + * isn't for anything we've done, it's protection against stupid user + * tricks: if the user deleted a file behind Berkeley DB's back, we + * may still have pages in the mpool that match the file's "unique" ID. + * + * Note that if we're opening a subdatabase, we don't want to set + * the TRUNCATE flag even if we just created the file--we already + * opened and updated the master using access method interfaces, + * so we don't want to get rid of any pages that are in the mpool. + * If we created the file when we opened the master, we already hit + * this check in a non-subdb context then. + */ + if (subdb == NULL && F_ISSET(dbp, DB_AM_CREATED)) + LF_SET(DB_TRUNCATE); + + /* Set up the underlying environment. */ + if ((ret = __db_dbenv_setup(dbp, txn, name, id, flags)) != 0) + return (ret); + + /* + * Set the open flag. We use it to mean that the dbp has gone + * through mpf setup, including dbreg_register. Also, below, + * the underlying access method open functions may want to do + * things like acquire cursors, so the open flag has to be set + * before calling them. + */ + F_SET(dbp, DB_AM_OPEN_CALLED); + + /* + * For unnamed files, we need to actually create the file now + * that the mpool is open. 
+ */ + if (name == NULL && (ret = __db_new_file(dbp, txn, NULL, NULL)) != 0) + return (ret); + + switch (dbp->type) { + case DB_BTREE: + ret = __bam_open(dbp, txn, name, meta_pgno, flags); + break; + case DB_HASH: + ret = __ham_open(dbp, txn, name, meta_pgno, flags); + break; + case DB_RECNO: + ret = __ram_open(dbp, txn, name, meta_pgno, flags); + break; + case DB_QUEUE: + ret = __qam_open(dbp, txn, name, meta_pgno, mode, flags); + break; + case DB_UNKNOWN: + return (__db_unknown_type(dbenv, "__db_dbopen", dbp->type)); + } + if (ret != 0) + goto err; + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, name); + + /* + * Unnamed files don't need handle locks, so we only have to check + * for a handle lock downgrade or lockevent in the case of named + * files. + */ + if (!F_ISSET(dbp, DB_AM_RECOVER) && + name != NULL && LOCK_ISSET(dbp->handle_lock)) { + if (txn != NULL) { + ret = __txn_lockevent(dbenv, + txn, dbp, &dbp->handle_lock, dbp->lid); + } else if (LOCKING_ON(dbenv)) + /* Trade write handle lock for read handle lock. */ + ret = __lock_downgrade(dbenv, + &dbp->handle_lock, DB_LOCK_READ, 0); + } +DB_TEST_RECOVERY_LABEL +err: + return (ret); +} + +/* + * __db_new_file -- + * Create a new database file. + * + * PUBLIC: int __db_new_file __P((DB *, DB_TXN *, DB_FH *, const char *)); + */ +int +__db_new_file(dbp, txn, fhp, name) + DB *dbp; + DB_TXN *txn; + DB_FH *fhp; + const char *name; +{ + int ret; + + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_new_file(dbp, txn, fhp, name); + break; + case DB_HASH: + ret = __ham_new_file(dbp, txn, fhp, name); + break; + case DB_QUEUE: + ret = __qam_new_file(dbp, txn, fhp, name); + break; + default: + __db_err(dbp->dbenv, + "%s: Invalid type %d specified", name, dbp->type); + ret = EINVAL; + break; + } + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name); + /* Sync the file in preparation for moving it into place. 
*/ + if (ret == 0 && fhp != NULL) + ret = __os_fsync(dbp->dbenv, fhp); + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name); + +DB_TEST_RECOVERY_LABEL + return (ret); +} + +/* + * __db_init_subdb -- + * Initialize the dbp for a subdb. + * + * PUBLIC: int __db_init_subdb __P((DB *, DB *, const char *, DB_TXN *)); + */ +int +__db_init_subdb(mdbp, dbp, name, txn) + DB *mdbp, *dbp; + const char *name; + DB_TXN *txn; +{ + DBMETA *meta; + DB_MPOOLFILE *mpf; + int ret, t_ret; + + ret = 0; + if (!F_ISSET(dbp, DB_AM_CREATED)) { + /* Subdb exists; read meta-data page and initialize. */ + mpf = mdbp->mpf; + if ((ret = mpf->get(mpf, &dbp->meta_pgno, 0, &meta)) != 0) + goto err; + ret = __db_meta_setup(mdbp->dbenv, dbp, name, meta, 0, 0); + if ((t_ret = mpf->put(mpf, meta, 0)) != 0 && ret == 0) + ret = t_ret; + /* + * If __db_meta_setup found that the meta-page hadn't + * been written out during recovery, we can just return. + */ + if (ret == ENOENT) + ret = 0; + goto err; + } + + /* Handle the create case here. */ + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_new_subdb(mdbp, dbp, txn); + break; + case DB_HASH: + ret = __ham_new_subdb(mdbp, dbp, txn); + break; + case DB_QUEUE: + ret = EINVAL; + break; + default: + __db_err(dbp->dbenv, + "Invalid subdatabase type %d specified", dbp->type); + return (EINVAL); + } + +err: return (ret); +} + +/* + * __db_chk_meta -- + * Take a buffer containing a meta-data page and check it for a checksum + * (and verify the checksum if necessary) and possibly decrypt it. + * + * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch. + * + * PUBLIC: int __db_chk_meta __P((DB_ENV *, DB *, DBMETA *, int)); + */ +int +__db_chk_meta(dbenv, dbp, meta, do_metachk) + DB_ENV *dbenv; + DB *dbp; + DBMETA *meta; + int do_metachk; +{ + int is_hmac, ret; + u_int8_t *chksum; + + ret = 0; + + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) { + if (dbp != NULL) + F_SET(dbp, DB_AM_CHKSUM); + + is_hmac = meta->encrypt_alg == 0 ? 
0 : 1; + chksum = ((BTMETA *)meta)->chksum; + if (do_metachk && ((ret = __db_check_chksum(dbenv, + (DB_CIPHER *)dbenv->crypto_handle, chksum, meta, + DBMETASIZE, is_hmac)) != 0)) + return (ret); + } + +#ifdef HAVE_CRYPTO + ret = __crypto_decrypt_meta(dbenv, dbp, (u_int8_t *)meta, do_metachk); +#endif + return (ret); +} + +/* + * __db_meta_setup -- + * + * Take a buffer containing a meta-data page and figure out if it's + * valid, and if so, initialize the dbp from the meta-data page. + * + * PUBLIC: int __db_meta_setup __P((DB_ENV *, + * PUBLIC: DB *, const char *, DBMETA *, u_int32_t, int)); + */ +int +__db_meta_setup(dbenv, dbp, name, meta, oflags, do_metachk) + DB_ENV *dbenv; + DB *dbp; + const char *name; + DBMETA *meta; + u_int32_t oflags; + int do_metachk; +{ + u_int32_t flags, magic; + int ret; + + ret = 0; + + /* + * Figure out what access method we're dealing with, and then + * call access method specific code to check error conditions + * based on conflicts between the found file and application + * arguments. A found file overrides some user information -- + * we don't consider it an error, for example, if the user set + * an expected byte order and the found file doesn't match it. + */ + F_CLR(dbp, DB_AM_SWAP); + magic = meta->magic; + +swap_retry: + switch (magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_QAMMAGIC: + case DB_RENAMEMAGIC: + break; + case 0: + /* + * The only time this should be 0 is if we're in the + * midst of opening a subdb during recovery and that + * subdatabase had its meta-data page allocated, but + * not yet initialized. 
+ */ + if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(dbenv) && + F_ISSET((DB_LOG *) dbenv->lg_handle, DBLOG_FORCE_OPEN)) || + meta->pgno != PGNO_INVALID)) + return (ENOENT); + + goto bad_format; + default: + if (F_ISSET(dbp, DB_AM_SWAP)) + goto bad_format; + + M_32_SWAP(magic); + F_SET(dbp, DB_AM_SWAP); + goto swap_retry; + } + + /* + * We can only check the meta page if we are sure we have a meta page. + * If it is random data, then this check can fail. So only now can we + * checksum and decrypt. Don't distinguish between configuration and + * checksum match errors here, because we haven't opened the database + * and even a checksum error isn't a reason to panic the environment. + */ + if ((ret = __db_chk_meta(dbenv, dbp, meta, do_metachk)) != 0) { + if (ret == -1) { + __db_err(dbenv, + "%s: metadata page checksum error", name); + ret = EINVAL; + } + goto bad_format; + } + + switch (magic) { + case DB_BTREEMAGIC: + flags = meta->flags; + if (F_ISSET(dbp, DB_AM_SWAP)) + M_32_SWAP(flags); + if (LF_ISSET(BTM_RECNO)) + dbp->type = DB_RECNO; + else + dbp->type = DB_BTREE; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __bam_metachk(dbp, name, (BTMETA *)meta)) != 0) + return (ret); + break; + case DB_HASHMAGIC: + dbp->type = DB_HASH; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __ham_metachk(dbp, name, (HMETA *)meta)) != 0) + return (ret); + break; + case DB_QAMMAGIC: + dbp->type = DB_QUEUE; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __qam_metachk(dbp, name, (QMETA *)meta)) != 0) + return (ret); + break; + case DB_RENAMEMAGIC: + F_SET(dbp, DB_AM_IN_RENAME); + break; + } + return (0); + +bad_format: + __db_err(dbenv, "%s: unexpected file type or format", name); + return (ret); +} + +/* + * __db_openchk -- + * Interface error checking for open calls. 
+ */ +static int +__db_openchk(dbp, txn, name, subdb, type, flags) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb; + DBTYPE type; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret; + u_int32_t ok_flags; + + dbenv = dbp->dbenv; + + /* Validate arguments. */ +#define OKFLAGS \ + (DB_AUTO_COMMIT | DB_CREATE | DB_DIRTY_READ | DB_EXCL | \ + DB_FCNTL_LOCKING | DB_NOMMAP | DB_RDONLY | DB_RDWRMASTER | \ + DB_THREAD | DB_TRUNCATE | DB_WRITEOPEN) + if ((ret = __db_fchk(dbenv, "DB->open", flags, OKFLAGS)) != 0) + return (ret); + if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE)) + return (__db_ferr(dbenv, "DB->open", 1)); + if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE)) + return (__db_ferr(dbenv, "DB->open", 1)); + +#ifdef HAVE_VXWORKS + if (LF_ISSET(DB_TRUNCATE)) { + __db_err(dbenv, "DB_TRUNCATE unsupported in VxWorks"); + return (__db_eopnotsup(dbenv)); + } +#endif + switch (type) { + case DB_UNKNOWN: + if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) { + __db_err(dbenv, + "%s: DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE", + name); + return (EINVAL); + } + ok_flags = 0; + break; + case DB_BTREE: + ok_flags = DB_OK_BTREE; + break; + case DB_HASH: + ok_flags = DB_OK_HASH; + break; + case DB_QUEUE: + ok_flags = DB_OK_QUEUE; + break; + case DB_RECNO: + ok_flags = DB_OK_RECNO; + break; + default: + __db_err(dbenv, "unknown type: %lu", (u_long)type); + return (EINVAL); + } + if (ok_flags) + DB_ILLEGAL_METHOD(dbp, ok_flags); + + /* The environment may have been created, but never opened. */ + if (!F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_OPEN_CALLED)) { + __db_err(dbenv, "environment not yet opened"); + return (EINVAL); + } + + /* + * Historically, you could pass in an environment that didn't have a + * mpool, and DB would create a private one behind the scenes. This + * no longer works. 
+ */ + if (!F_ISSET(dbenv, DB_ENV_DBLOCAL) && !MPOOL_ON(dbenv)) { + __db_err(dbenv, "environment did not include a memory pool"); + return (EINVAL); + } + + /* + * You can't specify threads during DB->open if subsystems in the + * environment weren't configured with them. + */ + if (LF_ISSET(DB_THREAD) && + !F_ISSET(dbenv, DB_ENV_DBLOCAL | DB_ENV_THREAD)) { + __db_err(dbenv, "environment not created using DB_THREAD"); + return (EINVAL); + } + + /* DB_TRUNCATE is not transaction recoverable. */ + if (LF_ISSET(DB_TRUNCATE) && txn != NULL) { + __db_err(dbenv, + "DB_TRUNCATE illegal with transaction specified"); + return (EINVAL); + } + + /* Subdatabase checks. */ + if (subdb != NULL) { + /* Subdatabases must be created in named files. */ + if (name == NULL) { + __db_err(dbenv, + "multiple databases cannot be created in temporary files"); + return (EINVAL); + } + + /* Truncate is a physical file operation */ + if (LF_ISSET(DB_TRUNCATE)) { + __db_err(dbenv, + "DB_TRUNCATE illegal with multiple databases"); + return (EINVAL); + } + + /* QAM can't be done as a subdatabase. */ + if (type == DB_QUEUE) { + __db_err(dbenv, "Queue databases must be one-per-file"); + return (EINVAL); + } + } + + return (0); +} diff --git a/bdb/db/db_overflow.c b/bdb/db/db_overflow.c index 54f0a03aafe..27dcb41a2ff 100644 --- a/bdb/db/db_overflow.c +++ b/bdb/db/db_overflow.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_overflow.c,v 11.21 2000/11/30 00:58:32 ubell Exp $"; +static const char revid[] = "$Id: db_overflow.c,v 11.46 2002/08/08 03:57:48 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -53,9 +53,9 @@ static const char revid[] = "$Id: db_overflow.c,v 11.21 2000/11/30 00:58:32 ubel #endif #include "db_int.h" -#include "db_page.h" -#include "db_am.h" -#include "db_verify.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/db_verify.h" /* * Big key/data code. @@ -83,6 +83,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) u_int32_t *bpsz; { DB_ENV *dbenv; + DB_MPOOLFILE *mpf; PAGE *h; db_indx_t bytes; u_int32_t curoff, needed, start; @@ -90,6 +91,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) int ret; dbenv = dbp->dbenv; + mpf = dbp->mpf; /* * Check if the buffer is big enough; if it is not and we are @@ -99,7 +101,12 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) */ if (F_ISSET(dbt, DB_DBT_PARTIAL)) { start = dbt->doff; - needed = dbt->dlen; + if (start > tlen) + needed = 0; + else if (dbt->dlen > tlen - start) + needed = tlen - start; + else + needed = dbt->dlen; } else { start = 0; needed = tlen; @@ -112,15 +119,13 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) return (ENOMEM); } } else if (F_ISSET(dbt, DB_DBT_MALLOC)) { - if ((ret = __os_malloc(dbenv, - needed, dbp->db_malloc, &dbt->data)) != 0) + if ((ret = __os_umalloc(dbenv, needed, &dbt->data)) != 0) return (ret); } else if (F_ISSET(dbt, DB_DBT_REALLOC)) { - if ((ret = __os_realloc(dbenv, - needed, dbp->db_realloc, &dbt->data)) != 0) + if ((ret = __os_urealloc(dbenv, needed, &dbt->data)) != 0) return (ret); } else if (*bpsz == 0 || *bpsz < needed) { - if ((ret = __os_realloc(dbenv, needed, NULL, bpp)) != 0) + if ((ret = __os_realloc(dbenv, needed, bpp)) != 0) return (ret); *bpsz = needed; dbt->data = *bpp; @@ -133,13 +138,12 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) */ dbt->size = 
needed; for (curoff = 0, p = dbt->data; pgno != PGNO_INVALID && needed > 0;) { - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { - (void)__db_pgerr(dbp, pgno); + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) return (ret); - } + /* Check if we need any bytes from this page. */ if (curoff + OV_LEN(h) >= start) { - src = (u_int8_t *)h + P_OVERHEAD; + src = (u_int8_t *)h + P_OVERHEAD(dbp); bytes = OV_LEN(h); if (start > curoff) { src += start - curoff; @@ -153,7 +157,7 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) } curoff += OV_LEN(h); pgno = h->next_pgno; - memp_fput(dbp->mpf, h, 0); + (void)mpf->put(mpf, h, 0); } return (0); } @@ -171,13 +175,14 @@ __db_poff(dbc, dbt, pgnop) db_pgno_t *pgnop; { DB *dbp; - PAGE *pagep, *lastp; - DB_LSN new_lsn, null_lsn; DBT tmp_dbt; + DB_LSN new_lsn, null_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep, *lastp; db_indx_t pagespace; u_int32_t sz; u_int8_t *p; - int ret; + int ret, t_ret; /* * Allocate pages and copy the key/data item into them. Calculate the @@ -185,8 +190,10 @@ __db_poff(dbc, dbt, pgnop) * item. */ dbp = dbc->dbp; - pagespace = P_MAXSPACE(dbp->pgsize); + mpf = dbp->mpf; + pagespace = P_MAXSPACE(dbp, dbp->pgsize); + ret = 0; lastp = NULL; for (p = dbt->data, sz = dbt->size; sz > 0; p += pagespace, sz -= pagespace) { @@ -203,30 +210,36 @@ __db_poff(dbc, dbt, pgnop) * have a partial record. */ if ((ret = __db_new(dbc, P_OVERFLOW, &pagep)) != 0) - return (ret); - if (DB_LOGGING(dbc)) { + break; + if (DBC_LOGGING(dbc)) { tmp_dbt.data = p; tmp_dbt.size = pagespace; ZERO_LSN(null_lsn); - if ((ret = __db_big_log(dbp->dbenv, dbc->txn, - &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid, - PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID, + if ((ret = __db_big_log(dbp, dbc->txn, + &new_lsn, 0, DB_ADD_BIG, PGNO(pagep), + lastp ? PGNO(lastp) : PGNO_INVALID, PGNO_INVALID, &tmp_dbt, &LSN(pagep), lastp == NULL ? 
&null_lsn : &LSN(lastp), - &null_lsn)) != 0) - return (ret); + &null_lsn)) != 0) { + if (lastp != NULL) + (void)mpf->put(mpf, + lastp, DB_MPOOL_DIRTY); + lastp = pagep; + break; + } + } else + LSN_NOT_LOGGED(new_lsn); - /* Move lsn onto page. */ - if (lastp) - LSN(lastp) = new_lsn; - LSN(pagep) = new_lsn; - } + /* Move LSN onto page. */ + if (lastp != NULL) + LSN(lastp) = new_lsn; + LSN(pagep) = new_lsn; P_INIT(pagep, dbp->pgsize, PGNO(pagep), PGNO_INVALID, PGNO_INVALID, 0, P_OVERFLOW); OV_LEN(pagep) = pagespace; OV_REF(pagep) = 1; - memcpy((u_int8_t *)pagep + P_OVERHEAD, p, pagespace); + memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace); /* * If this is the first entry, update the user's info. @@ -238,12 +251,14 @@ __db_poff(dbc, dbt, pgnop) else { lastp->next_pgno = PGNO(pagep); pagep->prev_pgno = PGNO(lastp); - (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY); + (void)mpf->put(mpf, lastp, DB_MPOOL_DIRTY); } lastp = pagep; } - (void)memp_fput(dbp->mpf, lastp, DB_MPOOL_DIRTY); - return (0); + if (lastp != NULL && + (t_ret = mpf->put(mpf, lastp, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + return (ret); } /* @@ -259,23 +274,29 @@ __db_ovref(dbc, pgno, adjust) int32_t adjust; { DB *dbp; + DB_MPOOLFILE *mpf; PAGE *h; int ret; dbp = dbc->dbp; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { - (void)__db_pgerr(dbp, pgno); + mpf = dbp->mpf; + + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) { + __db_pgerr(dbp, pgno, ret); return (ret); } - if (DB_LOGGING(dbc)) - if ((ret = __db_ovref_log(dbp->dbenv, dbc->txn, - &LSN(h), 0, dbp->log_fileid, h->pgno, adjust, - &LSN(h))) != 0) + if (DBC_LOGGING(dbc)) { + if ((ret = __db_ovref_log(dbp, + dbc->txn, &LSN(h), 0, h->pgno, adjust, &LSN(h))) != 0) { + (void)mpf->put(mpf, h, 0); return (ret); + } + } else + LSN_NOT_LOGGED(LSN(h)); OV_REF(h) += adjust; - (void)memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + (void)mpf->put(mpf, h, DB_MPOOL_DIRTY); return (0); } @@ -293,13 +314,16 @@ __db_doff(dbc, pgno) DB *dbp; PAGE 
*pagep; DB_LSN null_lsn; + DB_MPOOLFILE *mpf; DBT tmp_dbt; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; + do { - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) { - (void)__db_pgerr(dbp, pgno); + if ((ret = mpf->get(mpf, &pgno, 0, &pagep)) != 0) { + __db_pgerr(dbp, pgno, ret); return (ret); } @@ -309,20 +333,24 @@ __db_doff(dbc, pgno) * decrement the reference count and return. */ if (OV_REF(pagep) > 1) { - (void)memp_fput(dbp->mpf, pagep, 0); + (void)mpf->put(mpf, pagep, 0); return (__db_ovref(dbc, pgno, -1)); } - if (DB_LOGGING(dbc)) { - tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD; + if (DBC_LOGGING(dbc)) { + tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp); tmp_dbt.size = OV_LEN(pagep); ZERO_LSN(null_lsn); - if ((ret = __db_big_log(dbp->dbenv, dbc->txn, - &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid, - PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep), - &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0) + if ((ret = __db_big_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_REM_BIG, + PGNO(pagep), PREV_PGNO(pagep), + NEXT_PGNO(pagep), &tmp_dbt, + &LSN(pagep), &null_lsn, &null_lsn)) != 0) { + (void)mpf->put(mpf, pagep, 0); return (ret); - } + } + } else + LSN_NOT_LOGGED(LSN(pagep)); pgno = pagep->next_pgno; if ((ret = __db_free(dbc, pagep)) != 0) return (ret); @@ -352,13 +380,16 @@ __db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp) u_int32_t tlen; int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; { - PAGE *pagep; DBT local_dbt; + DB_MPOOLFILE *mpf; + PAGE *pagep; void *buf; u_int32_t bufsize, cmp_bytes, key_left; u_int8_t *p1, *p2; int ret; + mpf = dbp->mpf; + /* * If there is a user-specified comparison function, build a * contiguous copy of the key, and call it. @@ -373,27 +404,27 @@ __db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp) return (ret); /* Pass the key as the first argument */ *cmpp = cmpfunc(dbp, dbt, &local_dbt); - __os_free(buf, bufsize); + __os_free(dbp->dbenv, buf); return (0); } /* While there are both keys to compare. 
*/ for (*cmpp = 0, p1 = dbt->data, key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &pagep)) != 0) return (ret); cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left; tlen -= cmp_bytes; key_left -= cmp_bytes; - for (p2 = - (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2) + for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); + cmp_bytes-- > 0; ++p1, ++p2) if (*p1 != *p2) { *cmpp = (long)*p1 - (long)*p2; break; } pgno = NEXT_PGNO(pagep); - if ((ret = memp_fput(dbp->mpf, pagep, 0)) != 0) + if ((ret = mpf->put(mpf, pagep, 0)) != 0) return (ret); if (*cmpp != 0) return (0); @@ -440,7 +471,7 @@ __db_vrfy_overflow(dbp, vdp, h, pgno, flags) pip->refcount = OV_REF(h); if (pip->refcount < 1) { EPRINT((dbp->dbenv, - "Overflow page %lu has zero reference count", + "Page %lu: overflow page has zero reference count", (u_long)pgno)); isbad = 1; } @@ -448,7 +479,7 @@ __db_vrfy_overflow(dbp, vdp, h, pgno, flags) /* Just store for now. */ pip->olen = HOFFSET(h); -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0) ret = t_ret; return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); } @@ -495,7 +526,7 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags) if (pip->type != P_OVERFLOW) { EPRINT((dbp->dbenv, - "Overflow page %lu of invalid type", + "Page %lu: overflow page of invalid type %lu", (u_long)pgno, (u_long)pip->type)); ret = DB_VERIFY_BAD; goto err; /* Unsafe to continue. 
*/ @@ -504,7 +535,8 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags) prev = pip->prev_pgno; if (prev != PGNO_INVALID) { EPRINT((dbp->dbenv, - "First overflow page %lu has a prev_pgno", (u_long)pgno)); + "Page %lu: first page in overflow chain has a prev_pgno %lu", + (u_long)pgno, (u_long)prev)); isbad = 1; } @@ -543,7 +575,7 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags) */ if ((u_int32_t)p > refcount) { EPRINT((dbp->dbenv, - "Page %lu encountered twice in overflow traversal", + "Page %lu: encountered twice in overflow traversal", (u_long)pgno)); ret = DB_VERIFY_BAD; goto err; @@ -571,19 +603,20 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags) if (!IS_VALID_PGNO(next)) { DB_ASSERT(0); EPRINT((dbp->dbenv, - "Overflow page %lu has bad next_pgno", - (u_long)pgno)); + "Page %lu: bad next_pgno %lu on overflow page", + (u_long)pgno, (u_long)next)); ret = DB_VERIFY_BAD; goto err; } - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 || + if ((ret = __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0 || (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0) return (ret); if (pip->prev_pgno != pgno) { EPRINT((dbp->dbenv, - "Overflow page %lu has bogus prev_pgno value", - (u_long)next)); + "Page %lu: bad prev_pgno %lu on overflow page (should be %lu)", + (u_long)next, (u_long)pip->prev_pgno, + (u_long)pgno)); isbad = 1; /* * It's safe to continue because we have separate @@ -597,10 +630,11 @@ __db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags) if (tlen > 0) { isbad = 1; EPRINT((dbp->dbenv, - "Overflow item incomplete on page %lu", (u_long)pgno)); + "Page %lu: overflow item incomplete", (u_long)pgno)); } -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = + __db_vrfy_putpageinfo(dbp->dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); } @@ -622,13 +656,15 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags) void **buf; u_int32_t flags; { + DB_MPOOLFILE *mpf; PAGE *h; - int ret, err_ret; + int ret, t_ret; u_int32_t bytesgot, bytes; u_int8_t *src, *dest; - ret = DB_VERIFY_BAD; - err_ret = 0; + mpf = dbp->mpf; + h = NULL; + ret = t_ret = 0; bytesgot = bytes = 0; while ((pgno != PGNO_INVALID) && (IS_VALID_PGNO(pgno))) { @@ -639,7 +675,7 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags) if ((ret = __db_salvage_markdone(vdp, pgno)) != 0) break; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) break; /* @@ -651,14 +687,14 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags) break; } - src = (u_int8_t *)h + P_OVERHEAD; + src = (u_int8_t *)h + P_OVERHEAD(dbp); bytes = OV_LEN(h); - if (bytes + P_OVERHEAD > dbp->pgsize) - bytes = dbp->pgsize - P_OVERHEAD; + if (bytes + P_OVERHEAD(dbp) > dbp->pgsize) + bytes = dbp->pgsize - P_OVERHEAD(dbp); if ((ret = __os_realloc(dbp->dbenv, - bytesgot + bytes, 0, buf)) != 0) + bytesgot + bytes, buf)) != 0) break; dest = (u_int8_t *)*buf + bytesgot; @@ -667,15 +703,24 @@ __db_safe_goff(dbp, vdp, pgno, dbt, buf, flags) memcpy(dest, src, bytes); pgno = NEXT_PGNO(h); - /* Not much we can do here--we don't want to quit. */ - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) - err_ret = ret; + + if ((ret = mpf->put(mpf, h, 0)) != 0) + break; + h = NULL; } - if (ret == 0) { + /* + * If we're being aggressive, salvage a partial datum if there + * was an error somewhere along the way. + */ + if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) { dbt->size = bytesgot; dbt->data = *buf; } - return ((err_ret != 0 && ret == 0) ? err_ret : ret); + /* If we broke out on error, don't leave pages pinned. 
*/ + if (h != NULL && (t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); } diff --git a/bdb/db/db_pr.c b/bdb/db/db_pr.c index cb977cadfda..235e7187f7c 100644 --- a/bdb/db/db_pr.c +++ b/bdb/db/db_pr.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_pr.c,v 11.46 2001/01/22 17:25:06 krinsky Exp $"; +static const char revid[] = "$Id: db_pr.c,v 11.84 2002/09/10 02:45:20 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -21,34 +21,24 @@ static const char revid[] = "$Id: db_pr.c,v 11.46 2001/01/22 17:25:06 krinsky Ex #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" -#include "db_am.h" -#include "db_verify.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/db_verify.h" static int __db_bmeta __P((DB *, FILE *, BTMETA *, u_int32_t)); static int __db_hmeta __P((DB *, FILE *, HMETA *, u_int32_t)); static void __db_meta __P((DB *, DBMETA *, FILE *, FN const *, u_int32_t)); -static const char *__db_dbtype_to_string __P((DB *)); -static void __db_prdb __P((DB *, FILE *, u_int32_t)); -static FILE *__db_prinit __P((FILE *)); -static void __db_proff __P((void *)); -static int __db_prtree __P((DB *, u_int32_t)); -static void __db_psize __P((DB *)); +static const char *__db_pagetype_to_string __P((u_int32_t)); +static void __db_prdb __P((DB *, FILE *)); +static void __db_proff __P((void *, FILE *)); +static int __db_prtree __P((DB *, FILE *, u_int32_t)); static int __db_qmeta __P((DB *, FILE *, QMETA *, u_int32_t)); /* - * 64K is the maximum page size, so by default we check for offsets larger - * than that, and, where possible, we refine 
the test. - */ -#define PSIZE_BOUNDARY (64 * 1024 + 1) -static size_t set_psize = PSIZE_BOUNDARY; - -static FILE *set_fp; /* Output file descriptor. */ - -/* * __db_loadme -- * A nice place to put a breakpoint. * @@ -57,7 +47,9 @@ static FILE *set_fp; /* Output file descriptor. */ void __db_loadme() { - getpid(); + u_int32_t id; + + __os_id(&id); } /* @@ -71,21 +63,9 @@ __db_dump(dbp, op, name) DB *dbp; char *op, *name; { - FILE *fp, *save_fp; + FILE *fp; u_int32_t flags; - - COMPQUIET(save_fp, NULL); - - if (set_psize == PSIZE_BOUNDARY) - __db_psize(dbp); - - if (name != NULL) { - if ((fp = fopen(name, "w")) == NULL) - return (__os_get_errno()); - save_fp = set_fp; - set_fp = fp; - } else - fp = __db_prinit(NULL); + int ret; for (flags = 0; *op != '\0'; ++op) switch (*op) { @@ -101,60 +81,93 @@ __db_dump(dbp, op, name) return (EINVAL); } - __db_prdb(dbp, fp, flags); + if (name == NULL) + fp = stdout; + else { + if ((fp = fopen(name, "w")) == NULL) + return (__os_get_errno()); + } + + __db_prdb(dbp, fp); fprintf(fp, "%s\n", DB_LINE); - (void)__db_prtree(dbp, flags); + ret = __db_prtree(dbp, fp, flags); fflush(fp); - - if (name != NULL) { + if (name != NULL) fclose(fp); - set_fp = save_fp; - } - return (0); + + return (ret); } /* - * __db_prdb -- - * Print out the DB structure information. + * __db_inmemdbflags -- + * Call a callback for printing or other handling of strings associated + * with whatever in-memory DB structure flags are set. 
+ * + * PUBLIC: void __db_inmemdbflags __P((u_int32_t, void *, + * PUBLIC: void (*)(u_int32_t, const FN *, void *))); */ -static void -__db_prdb(dbp, fp, flags) - DB *dbp; - FILE *fp; +void +__db_inmemdbflags(flags, cookie, callback) u_int32_t flags; + void *cookie; + void (*callback) __P((u_int32_t, const FN *, void *)); { static const FN fn[] = { + { DB_AM_CHKSUM, "checksumming" }, + { DB_AM_CL_WRITER, "client replica writer" }, + { DB_AM_COMPENSATE, "created by compensating transaction" }, + { DB_AM_CREATED, "database created" }, + { DB_AM_CREATED_MSTR, "encompassing file created" }, + { DB_AM_DBM_ERROR, "dbm/ndbm error" }, + { DB_AM_DELIMITER, "variable length" }, + { DB_AM_DIRTY, "dirty reads" }, { DB_AM_DISCARD, "discard cached pages" }, { DB_AM_DUP, "duplicates" }, + { DB_AM_DUPSORT, "sorted duplicates" }, + { DB_AM_ENCRYPT, "encrypted" }, + { DB_AM_FIXEDLEN, "fixed-length records" }, { DB_AM_INMEM, "in-memory" }, + { DB_AM_IN_RENAME, "file is being renamed" }, + { DB_AM_OPEN_CALLED, "DB->open called" }, + { DB_AM_PAD, "pad value" }, { DB_AM_PGDEF, "default page size" }, { DB_AM_RDONLY, "read-only" }, - { DB_AM_SUBDB, "multiple-databases" }, + { DB_AM_RECNUM, "Btree record numbers" }, + { DB_AM_RECOVER, "opened for recovery" }, + { DB_AM_RENUMBER, "renumber" }, + { DB_AM_REVSPLITOFF, "no reverse splits" }, + { DB_AM_SECONDARY, "secondary" }, + { DB_AM_SNAPSHOT, "load on open" }, + { DB_AM_SUBDB, "subdatabases" }, { DB_AM_SWAP, "needswap" }, - { DB_BT_RECNUM, "btree:recnum" }, - { DB_BT_REVSPLIT, "btree:no reverse split" }, - { DB_DBM_ERROR, "dbm/ndbm error" }, - { DB_OPEN_CALLED, "DB->open called" }, - { DB_RE_DELIMITER, "recno:delimiter" }, - { DB_RE_FIXEDLEN, "recno:fixed-length" }, - { DB_RE_PAD, "recno:pad" }, - { DB_RE_RENUMBER, "recno:renumber" }, - { DB_RE_SNAPSHOT, "recno:snapshot" }, + { DB_AM_TXN, "transactional" }, + { DB_AM_VERIFYING, "verifier" }, { 0, NULL } }; + + callback(flags, fn, cookie); +} + +/* + * __db_prdb -- + * Print out the DB 
structure information. + */ +static void +__db_prdb(dbp, fp) + DB *dbp; + FILE *fp; +{ BTREE *bt; HASH *h; QUEUE *q; - COMPQUIET(flags, 0); - fprintf(fp, "In-memory DB structure:\n%s: %#lx", - __db_dbtype_to_string(dbp), (u_long)dbp->flags); - __db_prflags(dbp->flags, fn, fp); + __db_dbtype_to_string(dbp->type), (u_long)dbp->flags); + __db_inmemdbflags(dbp->flags, fp, __db_prflags); fprintf(fp, "\n"); switch (dbp->type) { @@ -166,7 +179,7 @@ __db_prdb(dbp, fp, flags) fprintf(fp, "bt_maxkey: %lu bt_minkey: %lu\n", (u_long)bt->bt_maxkey, (u_long)bt->bt_minkey); fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n", - (u_long)bt->bt_compare, (u_long)bt->bt_prefix); + P_TO_ULONG(bt->bt_compare), P_TO_ULONG(bt->bt_prefix)); fprintf(fp, "bt_lpgno: %lu\n", (u_long)bt->bt_lpgno); if (dbp->type == DB_RECNO) { fprintf(fp, @@ -183,7 +196,7 @@ __db_prdb(dbp, fp, flags) fprintf(fp, "meta_pgno: %lu\n", (u_long)h->meta_pgno); fprintf(fp, "h_ffactor: %lu\n", (u_long)h->h_ffactor); fprintf(fp, "h_nelem: %lu\n", (u_long)h->h_nelem); - fprintf(fp, "h_hash: %#lx\n", (u_long)h->h_hash); + fprintf(fp, "h_hash: %#lx\n", P_TO_ULONG(h->h_hash)); break; case DB_QUEUE: q = dbp->q_internal; @@ -204,39 +217,34 @@ __db_prdb(dbp, fp, flags) * Print out the entire tree. */ static int -__db_prtree(dbp, flags) +__db_prtree(dbp, fp, flags) DB *dbp; + FILE *fp; u_int32_t flags; { + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t i, last; int ret; - if (set_psize == PSIZE_BOUNDARY) - __db_psize(dbp); + mpf = dbp->mpf; - if (dbp->type == DB_QUEUE) { - ret = __db_prqueue(dbp, flags); - goto done; - } - - /* Find out the page number of the last page in the database. */ - if ((ret = memp_fget(dbp->mpf, &last, DB_MPOOL_LAST, &h)) != 0) - return (ret); - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) - return (ret); + if (dbp->type == DB_QUEUE) + return (__db_prqueue(dbp, fp, flags)); - /* Dump each page. */ + /* + * Find out the page number of the last page in the database, then + * dump each page. 
+ */ + mpf->last_pgno(mpf, &last); for (i = 0; i <= last; ++i) { - if ((ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &i, 0, &h)) != 0) return (ret); - (void)__db_prpage(dbp, h, flags); - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + (void)__db_prpage(dbp, h, fp, flags); + if ((ret = mpf->put(mpf, h, 0)) != 0) return (ret); } -done: - (void)fflush(__db_prinit(NULL)); return (0); } @@ -252,13 +260,15 @@ __db_meta(dbp, dbmeta, fp, fn, flags) FN const *fn; u_int32_t flags; { + DB_MPOOLFILE *mpf; PAGE *h; - int cnt; db_pgno_t pgno; u_int8_t *p; - int ret; + int cnt, ret; const char *sep; + mpf = dbp->mpf; + fprintf(fp, "\tmagic: %#lx\n", (u_long)dbmeta->magic); fprintf(fp, "\tversion: %lu\n", (u_long)dbmeta->version); fprintf(fp, "\tpagesize: %lu\n", (u_long)dbmeta->pagesize); @@ -275,14 +285,14 @@ __db_meta(dbp, dbmeta, fp, fn, flags) fprintf(fp, "\tfree list: %lu", (u_long)dbmeta->free); for (pgno = dbmeta->free, cnt = 0, sep = ", "; pgno != PGNO_INVALID;) { - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) { fprintf(fp, "Unable to retrieve free-list page: %lu: %s\n", (u_long)pgno, db_strerror(ret)); break; } pgno = h->next_pgno; - (void)memp_fput(dbp->mpf, h, 0); + (void)mpf->put(mpf, h, 0); fprintf(fp, "%s%lu", sep, (u_long)pgno); if (++cnt % 10 == 0) { fprintf(fp, "\n"); @@ -292,6 +302,7 @@ __db_meta(dbp, dbmeta, fp, fn, flags) sep = ", "; } fprintf(fp, "\n"); + fprintf(fp, "\tlast_pgno: %lu\n", (u_long)dbmeta->last_pgno); } if (fn != NULL) { @@ -404,26 +415,28 @@ __db_qmeta(dbp, fp, h, flags) * __db_prnpage * -- Print out a specific page. 
* - * PUBLIC: int __db_prnpage __P((DB *, db_pgno_t)); + * PUBLIC: int __db_prnpage __P((DB *, db_pgno_t, FILE *)); */ int -__db_prnpage(dbp, pgno) +__db_prnpage(dbp, pgno, fp) DB *dbp; db_pgno_t pgno; + FILE *fp; { + DB_MPOOLFILE *mpf; PAGE *h; - int ret; + int ret, t_ret; - if (set_psize == PSIZE_BOUNDARY) - __db_psize(dbp); + mpf = dbp->mpf; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) return (ret); - ret = __db_prpage(dbp, h, DB_PR_PAGE); - (void)fflush(__db_prinit(NULL)); + ret = __db_prpage(dbp, h, fp, DB_PR_PAGE); + + if ((t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0) + ret = t_ret; - (void)memp_fput(dbp->mpf, h, 0); return (ret); } @@ -431,32 +444,29 @@ __db_prnpage(dbp, pgno) * __db_prpage * -- Print out a page. * - * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t)); + * PUBLIC: int __db_prpage __P((DB *, PAGE *, FILE *, u_int32_t)); */ int -__db_prpage(dbp, h, flags) +__db_prpage(dbp, h, fp, flags) DB *dbp; PAGE *h; + FILE *fp; u_int32_t flags; { BINTERNAL *bi; BKEYDATA *bk; - BTREE *t; - FILE *fp; HOFFPAGE a_hkd; QAMDATA *qp, *qep; RINTERNAL *ri; - db_indx_t dlen, len, i; + db_indx_t dlen, len, i, *inp; db_pgno_t pgno; db_recno_t recno; + u_int32_t pagesize, qlen; + u_int8_t *ep, *hk, *p; int deleted, ret; const char *s; - u_int32_t qlen; - u_int8_t *ep, *hk, *p; void *sp; - fp = __db_prinit(NULL); - /* * If we're doing recovery testing and this page is P_INVALID, * assume it's a page that's on the free list, and don't display it. @@ -471,6 +481,14 @@ __db_prpage(dbp, h, flags) return (1); } + /* + * !!! + * Find out the page size. We don't want to do it the "right" way, + * by reading the value from the meta-data page, that's going to be + * slow. Reach down into the mpool region. + */ + pagesize = (u_int32_t)dbp->mpf->mfp->stat.st_pagesize; + /* Page number, page type. 
*/ fprintf(fp, "page %lu: %s level: %lu", (u_long)h->pgno, s, (u_long)h->level); @@ -500,7 +518,7 @@ __db_prpage(dbp, h, flags) qlen = ((QUEUE *)dbp->q_internal)->re_len; recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1; i = 0; - qep = (QAMDATA *)((u_int8_t *)h + set_psize - qlen); + qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen); for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep; recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) { if (!F_ISSET(qp, QAM_SET)) @@ -508,9 +526,9 @@ __db_prpage(dbp, h, flags) fprintf(fp, "%s", F_ISSET(qp, QAM_VALID) ? "\t" : " D"); - fprintf(fp, "[%03lu] %4lu ", - (u_long)recno, (u_long)qp - (u_long)h); - __db_pr(qp->data, qlen); + fprintf(fp, "[%03lu] %4lu ", (u_long)recno, + (u_long)((u_int8_t *)qp - (u_int8_t *)h)); + __db_pr(qp->data, qlen, fp); } return (0); } @@ -520,8 +538,6 @@ __db_prpage(dbp, h, flags) fprintf(fp, " (lsn.file: %lu lsn.offset: %lu)\n", (u_long)LSN(h).file, (u_long)LSN(h).offset); - t = dbp->bt_internal; - s = "\t"; if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) { fprintf(fp, "%sprev: %4lu next: %4lu", @@ -530,7 +546,7 @@ __db_prpage(dbp, h, flags) } if (TYPE(h) == P_OVERFLOW) { fprintf(fp, "%sref cnt: %4lu ", s, (u_long)OV_REF(h)); - __db_pr((u_int8_t *)h + P_OVERHEAD, OV_LEN(h)); + __db_pr((u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h), fp); return (0); } fprintf(fp, "%sentries: %4lu", s, (u_long)NUM_ENT(h)); @@ -540,12 +556,14 @@ __db_prpage(dbp, h, flags) return (0); ret = 0; + inp = P_INP(dbp, h); for (i = 0; i < NUM_ENT(h); i++) { - if (P_ENTRY(h, i) - (u_int8_t *)h < P_OVERHEAD || - (size_t)(P_ENTRY(h, i) - (u_int8_t *)h) >= set_psize) { + if ((db_alignp_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) < + (db_alignp_t)(P_OVERHEAD(dbp)) || + (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) { fprintf(fp, "ILLEGAL PAGE OFFSET: indx: %lu of %lu\n", - (u_long)i, (u_long)h->inp[i]); + (u_long)i, (u_long)inp[i]); ret = EINVAL; continue; } @@ -554,17 +572,17 @@ __db_prpage(dbp, h, flags) case P_HASH: case P_IBTREE: 
case P_IRECNO: - sp = P_ENTRY(h, i); + sp = P_ENTRY(dbp, h, i); break; case P_LBTREE: - sp = P_ENTRY(h, i); + sp = P_ENTRY(dbp, h, i); deleted = i % 2 == 0 && - B_DISSET(GET_BKEYDATA(h, i + O_INDX)->type); + B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type); break; case P_LDUP: case P_LRECNO: - sp = P_ENTRY(h, i); - deleted = B_DISSET(GET_BKEYDATA(h, i)->type); + sp = P_ENTRY(dbp, h, i); + deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type); break; default: fprintf(fp, @@ -573,7 +591,7 @@ __db_prpage(dbp, h, flags) continue; } fprintf(fp, "%s", deleted ? " D" : "\t"); - fprintf(fp, "[%03lu] %4lu ", (u_long)i, (u_long)h->inp[i]); + fprintf(fp, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]); switch (TYPE(h)) { case P_HASH: hk = sp; @@ -592,7 +610,7 @@ __db_prpage(dbp, h, flags) * set. */ if (i != 0) - len = LEN_HKEYDATA(h, 0, i); + len = LEN_HKEYDATA(dbp, h, 0, i); else len = 1; @@ -602,13 +620,14 @@ __db_prpage(dbp, h, flags) memcpy(&dlen, p, sizeof(db_indx_t)); p += sizeof(db_indx_t); fprintf(fp, "\t\t"); - __db_pr(p, dlen); + __db_pr(p, dlen, fp); p += sizeof(db_indx_t) + dlen; } break; case H_KEYDATA: __db_pr(HKEYDATA_DATA(hk), - LEN_HKEYDATA(h, i == 0 ? set_psize : 0, i)); + LEN_HKEYDATA(dbp, h, i == 0 ? 
+ pagesize : 0, i), fp); break; case H_OFFPAGE: memcpy(&a_hkd, hk, HOFFPAGE_SIZE); @@ -625,11 +644,11 @@ __db_prpage(dbp, h, flags) (u_long)bi->type); switch (B_TYPE(bi->type)) { case B_KEYDATA: - __db_pr(bi->data, bi->len); + __db_pr(bi->data, bi->len, fp); break; case B_DUPLICATE: case B_OVERFLOW: - __db_proff(bi->data); + __db_proff(bi->data, fp); break; default: fprintf(fp, "ILLEGAL BINTERNAL TYPE: %lu\n", @@ -649,11 +668,11 @@ __db_prpage(dbp, h, flags) bk = sp; switch (B_TYPE(bk->type)) { case B_KEYDATA: - __db_pr(bk->data, bk->len); + __db_pr(bk->data, bk->len, fp); break; case B_DUPLICATE: case B_OVERFLOW: - __db_proff(bk); + __db_proff(bk, fp); break; default: fprintf(fp, @@ -673,19 +692,17 @@ __db_prpage(dbp, h, flags) * __db_pr -- * Print out a data element. * - * PUBLIC: void __db_pr __P((u_int8_t *, u_int32_t)); + * PUBLIC: void __db_pr __P((u_int8_t *, u_int32_t, FILE *)); */ void -__db_pr(p, len) +__db_pr(p, len, fp) u_int8_t *p; u_int32_t len; -{ FILE *fp; +{ u_int lastch; int i; - fp = __db_prinit(NULL); - fprintf(fp, "len: %3lu", (u_long)len); lastch = '.'; if (len != 0) { @@ -744,6 +761,13 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp) handle, callback, vdp, 0); F_CLR(vdp, SALVAGE_PRINTHEADER); F_SET(vdp, SALVAGE_PRINTFOOTER); + + /* + * Even if the printable flag wasn't set by our immediate + * caller, it may be set on a salvage-wide basis. + */ + if (F_ISSET(vdp, SALVAGE_PRINTABLE)) + checkprint = 1; } /* @@ -760,12 +784,12 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp) * in a platform-independent way. So we use the numeral in * straight ASCII. */ - __ua_memcpy(&recno, dbtp->data, sizeof(recno)); + (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno)); snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno); /* If we're printing data as hex, print keys as hex too. 
*/ if (!checkprint) { - for (len = strlen(buf), p = buf, hp = hbuf; + for (len = (u_int32_t)strlen(buf), p = buf, hp = hbuf; len-- > 0; ++p) { *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4]; *hp++ = hex[*p & 0x0f]; @@ -810,14 +834,12 @@ __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp) * Print out an off-page element. */ static void -__db_proff(vp) +__db_proff(vp, fp) void *vp; -{ FILE *fp; +{ BOVERFLOW *bo; - fp = __db_prinit(NULL); - bo = vp; switch (B_TYPE(bo->type)) { case B_OVERFLOW: @@ -834,18 +856,25 @@ __db_proff(vp) * __db_prflags -- * Print out flags values. * - * PUBLIC: void __db_prflags __P((u_int32_t, const FN *, FILE *)); + * PUBLIC: void __db_prflags __P((u_int32_t, const FN *, void *)); */ void -__db_prflags(flags, fn, fp) +__db_prflags(flags, fn, vfp) u_int32_t flags; FN const *fn; - FILE *fp; + void *vfp; { + FILE *fp; const FN *fnp; int found; const char *sep; + /* + * We pass the FILE * through a void * so that we can use + * this function as as a callback. + */ + fp = (FILE *)vfp; + sep = " ("; for (found = 0, fnp = fn; fnp->mask != 0; ++fnp) if (LF_ISSET(fnp->mask)) { @@ -858,62 +887,21 @@ __db_prflags(flags, fn, fp) } /* - * __db_prinit -- - * Initialize tree printing routines. - */ -static FILE * -__db_prinit(fp) - FILE *fp; -{ - if (set_fp == NULL) - set_fp = fp == NULL ? stdout : fp; - return (set_fp); -} - -/* - * __db_psize -- - * Get the page size. - */ -static void -__db_psize(dbp) - DB *dbp; -{ - DBMETA *mp; - db_pgno_t pgno; - - set_psize = PSIZE_BOUNDARY - 1; - - pgno = PGNO_BASE_MD; - if (memp_fget(dbp->mpf, &pgno, 0, &mp) != 0) - return; - - switch (mp->magic) { - case DB_BTREEMAGIC: - case DB_HASHMAGIC: - case DB_QAMMAGIC: - set_psize = mp->pagesize; - break; - } - (void)memp_fput(dbp->mpf, mp, 0); -} - -/* * __db_dbtype_to_string -- * Return the name of the database type. 
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE)); */ -static const char * -__db_dbtype_to_string(dbp) - DB *dbp; +const char * +__db_dbtype_to_string(type) + DBTYPE type; { - switch (dbp->type) { + switch (type) { case DB_BTREE: return ("btree"); case DB_HASH: return ("hash"); - break; case DB_RECNO: return ("recno"); - break; case DB_QUEUE: return ("queue"); default: @@ -925,10 +913,8 @@ __db_dbtype_to_string(dbp) /* * __db_pagetype_to_string -- * Return the name of the specified page type. - * - * PUBLIC: const char *__db_pagetype_to_string __P((u_int32_t)); */ -const char * +static const char * __db_pagetype_to_string(type) u_int32_t type; { @@ -1000,6 +986,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) DB_ENV *dbenv; DB_HASH_STAT *hsp; DB_QUEUE_STAT *qsp; + DBT dbt; VRFY_PAGEINFO *pip; char *buf; int buflen, ret, t_ret; @@ -1021,10 +1008,16 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) * If we've been passed a verifier statistics object, use * that; we're being called in a context where dbp->stat * is unsafe. + * + * Also, the verifier may set the pflag on a per-salvage basis. + * If so, respect that. */ if (vdp != NULL) { if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0) return (ret); + + if (F_ISSET(vdp, SALVAGE_PRINTABLE)) + pflag = 1; } else pip = NULL; @@ -1071,16 +1064,22 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) /* * 64 bytes is long enough, as a minimum bound, for any of the - * fields besides subname. Subname can be anything, and so - * 64 + subname is big enough for all the things we need to print here. + * fields besides subname. Subname uses __db_prdbt and therefore + * does not need buffer space here. */ - buflen = 64 + ((subname != NULL) ? 
strlen(subname) : 0); - if ((ret = __os_malloc(dbenv, buflen, NULL, &buf)) != 0) + buflen = 64; + if ((ret = __os_malloc(dbenv, buflen, &buf)) != 0) goto err; if (subname != NULL) { - snprintf(buf, buflen, "database=%s\n", subname); + snprintf(buf, buflen, "database="); if ((ret = callback(handle, buf)) != 0) goto err; + memset(&dbt, 0, sizeof(dbt)); + dbt.data = subname; + dbt.size = (u_int32_t)strlen(subname); + if ((ret = __db_prdbt(&dbt, + 1, NULL, handle, callback, 0, NULL)) != 0) + goto err; } switch (dbtype) { case DB_BTREE: @@ -1106,11 +1105,11 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) } break; } - if ((ret = dbp->stat(dbp, &btsp, NULL, 0)) != 0) { + if ((ret = dbp->stat(dbp, &btsp, 0)) != 0) { dbp->err(dbp, ret, "DB->stat"); goto err; } - if (F_ISSET(dbp, DB_BT_RECNUM)) + if (F_ISSET(dbp, DB_AM_RECNUM)) if ((ret = callback(handle, "recnum=1\n")) != 0) goto err; if (btsp->bt_maxkey != 0) { @@ -1144,7 +1143,7 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) } break; } - if ((ret = dbp->stat(dbp, &hsp, NULL, 0)) != 0) { + if ((ret = dbp->stat(dbp, &hsp, 0)) != 0) { dbp->err(dbp, ret, "DB->stat"); goto err; } @@ -1154,10 +1153,9 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) if ((ret = callback(handle, buf)) != 0) goto err; } - if (hsp->hash_nelem != 0 || hsp->hash_nkeys != 0) { - snprintf(buf, buflen, "h_nelem=%lu\n", - hsp->hash_nelem > hsp->hash_nkeys ? 
- (u_long)hsp->hash_nelem : (u_long)hsp->hash_nkeys); + if (hsp->hash_nkeys != 0) { + snprintf(buf, buflen, + "h_nelem=%lu\n", (u_long)hsp->hash_nkeys); if ((ret = callback(handle, buf)) != 0) goto err; } @@ -1172,15 +1170,24 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) goto err; break; } - if ((ret = dbp->stat(dbp, &qsp, NULL, 0)) != 0) { + if ((ret = dbp->stat(dbp, &qsp, 0)) != 0) { dbp->err(dbp, ret, "DB->stat"); goto err; } snprintf(buf, buflen, "re_len=%lu\n", (u_long)qsp->qs_re_len); - if (qsp->qs_re_pad != 0 && qsp->qs_re_pad != ' ') - snprintf(buf, buflen, "re_pad=%#x\n", qsp->qs_re_pad); if ((ret = callback(handle, buf)) != 0) goto err; + if (qsp->qs_re_pad != 0 && qsp->qs_re_pad != ' ') { + snprintf(buf, buflen, "re_pad=%#x\n", qsp->qs_re_pad); + if ((ret = callback(handle, buf)) != 0) + goto err; + } + if (qsp->qs_extentsize != 0) { + snprintf(buf, buflen, + "extentsize=%lu\n", (u_long)qsp->qs_extentsize); + if ((ret = callback(handle, buf)) != 0) + goto err; + } break; case DB_RECNO: if ((ret = callback(handle, "type=recno\n")) != 0) @@ -1198,14 +1205,14 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) } break; } - if ((ret = dbp->stat(dbp, &btsp, NULL, 0)) != 0) { + if ((ret = dbp->stat(dbp, &btsp, 0)) != 0) { dbp->err(dbp, ret, "DB->stat"); goto err; } - if (F_ISSET(dbp, DB_RE_RENUMBER)) + if (F_ISSET(dbp, DB_AM_RENUMBER)) if ((ret = callback(handle, "renumber=1\n")) != 0) goto err; - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { snprintf(buf, buflen, "re_len=%lu\n", (u_long)btsp->bt_re_len); if ((ret = callback(handle, buf)) != 0) @@ -1233,6 +1240,9 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) goto err; /* We should handle page size. 
XXX */ } else { + if (F_ISSET(dbp, DB_AM_CHKSUM)) + if ((ret = callback(handle, "chksum=1\n")) != 0) + goto err; if (F_ISSET(dbp, DB_AM_DUP)) if ((ret = callback(handle, "duplicates=1\n")) != 0) goto err; @@ -1253,16 +1263,16 @@ __db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno) ret = callback(handle, "HEADER=END\n"); err: if (pip != NULL && - (t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + (t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; if (btsp != NULL) - __os_free(btsp, 0); + __os_ufree(dbenv, btsp); if (hsp != NULL) - __os_free(hsp, 0); + __os_ufree(dbenv, hsp); if (qsp != NULL) - __os_free(qsp, 0); + __os_ufree(dbenv, qsp); if (buf != NULL) - __os_free(buf, buflen); + __os_free(dbenv, buf); return (ret); } diff --git a/bdb/db/db_rec.c b/bdb/db/db_rec.c index 998d074290d..303ab2fe1d4 100644 --- a/bdb/db/db_rec.c +++ b/bdb/db/db_rec.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_rec.c,v 11.10 2000/08/03 15:32:19 ubell Exp $"; +static const char revid[] = "$Id: db_rec.c,v 11.35 2002/08/08 03:57:49 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,9 +18,9 @@ static const char revid[] = "$Id: db_rec.c,v 11.10 2000/08/03 15:32:19 ubell Exp #endif #include "db_int.h" -#include "db_page.h" -#include "log.h" -#include "hash.h" +#include "dbinc/db_page.h" +#include "dbinc/log.h" +#include "dbinc/hash.h" /* * PUBLIC: int __db_addrem_recover @@ -45,11 +45,12 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info) u_int32_t change; int cmp_n, cmp_p, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__db_addrem_print); REC_INTRO(__db_addrem_read, 1); - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) { /* * We are undoing and the page doesn't exist. That @@ -59,7 +60,7 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info) */ goto done; } else - if ((ret = memp_fget(mpf, + if ((ret = mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } @@ -95,13 +96,16 @@ __db_addrem_recover(dbenv, dbtp, lsnp, op, info) LSN(pagep) = argp->pagelsn; } - if ((ret = memp_fput(mpf, pagep, change)) != 0) + if ((ret = mpf->put(mpf, pagep, change)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; } /* @@ -124,11 +128,12 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info) u_int32_t change; int cmp_n, cmp_p, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__db_big_print); REC_INTRO(__db_big_read, 1); - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) { /* * We are undoing and the page doesn't exist. 
That @@ -139,7 +144,7 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info) ret = 0; goto ppage; } else - if ((ret = memp_fget(mpf, + if ((ret = mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } @@ -161,7 +166,7 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info) argp->next_pgno, 0, P_OVERFLOW); OV_LEN(pagep) = argp->dbt.size; OV_REF(pagep) = 1; - memcpy((u_int8_t *)pagep + P_OVERHEAD, argp->dbt.data, + memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data, argp->dbt.size); PREV_PGNO(pagep) = argp->prev_pgno; change = DB_MPOOL_DIRTY; @@ -177,13 +182,21 @@ __db_big_recover(dbenv, dbtp, lsnp, op, info) if (change) LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; - if ((ret = memp_fput(mpf, pagep, change)) != 0) + if ((ret = mpf->put(mpf, pagep, change)) != 0) goto out; + pagep = NULL; + + /* + * We only delete a whole chain of overflow. + * Each page is handled individually + */ + if (argp->opcode == DB_REM_BIG) + goto done; /* Now check the previous page. */ ppage: if (argp->prev_pgno != PGNO_INVALID) { change = 0; - if ((ret = memp_fget(mpf, &argp->prev_pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->prev_pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) { /* * We are undoing and the page doesn't exist. @@ -195,7 +208,7 @@ ppage: if (argp->prev_pgno != PGNO_INVALID) { ret = 0; goto npage; } else - if ((ret = memp_fget(mpf, &argp->prev_pgno, + if ((ret = mpf->get(mpf, &argp->prev_pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } @@ -204,28 +217,27 @@ ppage: if (argp->prev_pgno != PGNO_INVALID) { cmp_p = log_compare(&LSN(pagep), &argp->prevlsn); CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->prevlsn); - if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) || - (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) { + if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) { /* Redo add, undo delete. 
*/ NEXT_PGNO(pagep) = argp->pgno; change = DB_MPOOL_DIRTY; - } else if ((cmp_n == 0 && - DB_UNDO(op) && argp->opcode == DB_ADD_BIG) || - (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) { + } else if (cmp_n == 0 && + DB_UNDO(op) && argp->opcode == DB_ADD_BIG) { /* Redo delete, undo add. */ NEXT_PGNO(pagep) = argp->next_pgno; change = DB_MPOOL_DIRTY; } if (change) LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn; - if ((ret = memp_fput(mpf, pagep, change)) != 0) + if ((ret = mpf->put(mpf, pagep, change)) != 0) goto out; } + pagep = NULL; /* Now check the next page. Can only be set on a delete. */ npage: if (argp->next_pgno != PGNO_INVALID) { change = 0; - if ((ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->next_pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) { /* * We are undoing and the page doesn't exist. @@ -235,7 +247,7 @@ npage: if (argp->next_pgno != PGNO_INVALID) { */ goto done; } else - if ((ret = memp_fget(mpf, &argp->next_pgno, + if ((ret = mpf->get(mpf, &argp->next_pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } @@ -252,21 +264,25 @@ npage: if (argp->next_pgno != PGNO_INVALID) { } if (change) LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn; - if ((ret = memp_fput(mpf, pagep, change)) != 0) + if ((ret = mpf->put(mpf, pagep, change)) != 0) goto out; } + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; } /* * __db_ovref_recover -- * Recovery function for __db_ovref(). 
* - * PUBLIC: int __db_ovref_recover __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + * PUBLIC: int __db_ovref_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); */ int __db_ovref_recover(dbenv, dbtp, lsnp, op, info) @@ -283,14 +299,15 @@ __db_ovref_recover(dbenv, dbtp, lsnp, op, info) PAGE *pagep; int cmp, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__db_ovref_print); REC_INTRO(__db_ovref_read, 1); - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + __db_pgerr(file_dbp, argp->pgno, ret); goto out; } @@ -310,13 +327,16 @@ __db_ovref_recover(dbenv, dbtp, lsnp, op, info) pagep->lsn = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; } /* @@ -341,6 +361,7 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info) PAGE *pagep; int cmp_n, cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__db_relink_print); REC_INTRO(__db_relink_read, 1); @@ -351,9 +372,9 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info) * the current page is the result of a split and is being recovered * elsewhere, so all we need do is recover the next page. */ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_REDO(op)) { - (void)__db_pgerr(file_dbp, argp->pgno); + __db_pgerr(file_dbp, argp->pgno, ret); goto out; } goto next2; @@ -376,12 +397,13 @@ __db_relink_recover(dbenv, dbtp, lsnp, op, info) pagep->lsn = argp->lsn; modified = 1; } -next1: if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) +next1: if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; -next2: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) { +next2: if ((ret = mpf->get(mpf, &argp->next, 0, &pagep)) != 0) { if (DB_REDO(op)) { - (void)__db_pgerr(file_dbp, argp->next); + __db_pgerr(file_dbp, argp->next, ret); goto out; } goto prev; @@ -409,14 +431,15 @@ next2: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) { else pagep->lsn = *lsnp; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; if (argp->opcode == DB_ADD_PAGE) goto done; -prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) { +prev: if ((ret = mpf->get(mpf, &argp->prev, 0, &pagep)) != 0) { if (DB_REDO(op)) { - (void)__db_pgerr(file_dbp, argp->prev); + __db_pgerr(file_dbp, argp->prev, ret); goto out; } goto done; @@ -441,13 +464,16 @@ prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) { else pagep->lsn = *lsnp; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = mpf->put(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; } /* @@ -468,8 +494,8 @@ __db_debug_recover(dbenv, dbtp, lsnp, op, info) __db_debug_args *argp; int ret; - COMPQUIET(op, 0); COMPQUIET(dbenv, NULL); + COMPQUIET(op, DB_TXN_ABORT); COMPQUIET(info, NULL); REC_PRINT(__db_debug_print); @@ -504,11 +530,12 @@ __db_noop_recover(dbenv, dbtp, lsnp, op, info) u_int32_t change; int cmp_n, cmp_p, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__db_noop_print); REC_INTRO(__db_noop_read, 0); - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) goto out; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -522,8 +549,349 @@ __db_noop_recover(dbenv, dbtp, lsnp, op, info) LSN(pagep) = argp->prevlsn; change = DB_MPOOL_DIRTY; } - ret = memp_fput(mpf, pagep, change); + ret = mpf->put(mpf, pagep, change); + pagep = NULL; done: *lsnp = argp->prev_lsn; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + REC_CLOSE; +} + +/* + * __db_pg_alloc_recover -- + * Recovery function for pg_alloc. + * + * PUBLIC: int __db_pg_alloc_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_alloc_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_alloc_args *argp; + DB *file_dbp; + DBC *dbc; + DBMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, created, level, modified, ret; + + meta = NULL; + pagep = NULL; + REC_PRINT(__db_pg_alloc_print); + REC_INTRO(__db_pg_alloc_read, 0); + + /* + * Fix up the allocated page. If we're redoing the operation, we have + * to get the page (creating it if it doesn't exist), and update its + * LSN. If we're undoing the operation, we have to reset the page's + * LSN and put it on the free list. 
+ * + * Fix up the metadata page. If we're redoing the operation, we have + * to get the metadata page and update its LSN and its free pointer. + * If we're undoing the operation and the page was ever created, we put + * it on the freelist. + */ + pgno = PGNO_BASE_MD; + if ((ret = mpf->get(mpf, &pgno, 0, &meta)) != 0) { + /* The metadata page must always exist on redo. */ + if (DB_REDO(op)) { + __db_pgerr(file_dbp, pgno, ret); + goto out; + } else + goto done; + } + created = modified = 0; + if ((ret = mpf->get(mpf, &argp->pgno, 0, &pagep)) != 0) { + /* + * We have to be able to identify if a page was newly + * created so we can recover it properly. We cannot simply + * look for an empty header, because hash uses a pgin + * function that will set the header. Instead, we explicitly + * try for the page without CREATE and if that fails, then + * create it. + */ + if ((ret = + mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) { + __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } + created = modified = 1; + } + + /* Fix up the allocated page. */ + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->page_lsn); + + /* + * If an inital allocation is aborted and then reallocated + * during an archival restore the log record will have + * an LSN for the page but the page will be empty. + */ + if (IS_ZERO_LSN(LSN(pagep))) + cmp_p = 0; + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->page_lsn); + /* + * If we we rolled back this allocation previously during an + * archive restore, the page may have the LSN of the meta page + * at the point of the roll back. This will be no more + * than the LSN of the metadata page at the time of this allocation. + * Another special case we have to handle is if we ended up with a + * page of all 0's which can happen if we abort between allocating a + * page in mpool and initializing it. In that case, even if we're + * undoing, we need to re-initialize the page. 
+ */ + if (DB_REDO(op) && + (cmp_p == 0 || + (IS_ZERO_LSN(argp->page_lsn) && + log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) { + /* Need to redo update described. */ + switch (argp->ptype) { + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + level = LEAFLEVEL; + break; + default: + level = 0; + break; + } + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype); + + pagep->lsn = *lsnp; + modified = 1; + } else if (DB_UNDO(op) && (cmp_n == 0 || created)) { + /* + * This is where we handle the case of a 0'd page (pagep->pgno + * is equal to PGNO_INVALID). + * Undo the allocation, reinitialize the page and + * link its next pointer to the free list. + */ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + + pagep->lsn = argp->page_lsn; + modified = 1; + } + + /* + * If the page was newly created, put it on the limbo list. + */ + if (IS_ZERO_LSN(LSN(pagep)) && + IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) { + /* Put the page in limbo.*/ + if ((ret = __db_add_limbo(dbenv, + info, argp->fileid, argp->pgno, 1)) != 0) + goto out; + } + + if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + pagep = NULL; + + /* Fix up the metadata page. */ + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + LSN(meta) = *lsnp; + meta->free = argp->next; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + LSN(meta) = argp->meta_lsn; + + /* + * If the page has a zero LSN then its newly created + * and will go into limbo rather than directly on the + * free list. + */ + if (!IS_ZERO_LSN(argp->page_lsn)) + meta->free = argp->pgno; + modified = 1; + } + if ((ret = mpf->put(mpf, meta, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) + goto out; + meta = NULL; + /* + * This could be the metapage from a subdb which is read from disk + * to recover its creation. + */ + if (F_ISSET(file_dbp, DB_AM_SUBDB)) + switch (argp->type) { + case P_BTREEMETA: + case P_HASHMETA: + case P_QAMMETA: + file_dbp->sync(file_dbp, 0); + break; + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + if (meta != NULL) + (void)mpf->put(mpf, meta, 0); + if (ret == ENOENT && op == DB_TXN_BACKWARD_ALLOC) + ret = 0; + REC_CLOSE; +} + +/* + * __db_pg_free_recover -- + * Recovery function for pg_free. + * + * PUBLIC: int __db_pg_free_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_free_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_free_args *argp; + DB *file_dbp; + DBC *dbc; + DBMETA *meta; + DB_LSN copy_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, modified, ret; + + COMPQUIET(info, NULL); + meta = NULL; + pagep = NULL; + REC_PRINT(__db_pg_free_print); + REC_INTRO(__db_pg_free_read, 1); + + /* + * Fix up the freed page. If we're redoing the operation we get the + * page and explicitly discard its contents, then update its LSN. If + * we're undoing the operation, we get the page and restore its header. + * Create the page if necessary, we may be freeing an aborted + * create. + */ + if ((ret = mpf->get(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + modified = 0; + (void)__ua_memcpy(©_lsn, &LSN(argp->header.data), sizeof(DB_LSN)); + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), ©_lsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), ©_lsn); + if (DB_REDO(op) && + (cmp_p == 0 || + (IS_ZERO_LSN(copy_lsn) && + log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) { + /* Need to redo update described. 
*/ + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + pagep->lsn = *lsnp; + + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + memcpy(pagep, argp->header.data, argp->header.size); + + modified = 1; + } + if ((ret = mpf->put(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + pagep = NULL; + + /* + * Fix up the metadata page. If we're redoing or undoing the operation + * we get the page and update its LSN and free pointer. + */ + pgno = PGNO_BASE_MD; + if ((ret = mpf->get(mpf, &pgno, 0, &meta)) != 0) { + /* The metadata page must always exist. */ + __db_pgerr(file_dbp, pgno, ret); + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(meta)); + cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); + CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo the deallocation. */ + meta->free = argp->pgno; + LSN(meta) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo the deallocation. */ + meta->free = argp->next; + LSN(meta) = argp->meta_lsn; + modified = 1; + } + if ((ret = mpf->put(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + meta = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)mpf->put(mpf, pagep, 0); + if (meta != NULL) + (void)mpf->put(mpf, meta, 0); + REC_CLOSE; +} + +/* + * __db_cksum_recover -- + * Recovery function for checksum failure log record. 
+ * + * PUBLIC: int __db_cksum_recover __P((DB_ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_cksum_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_cksum_args *argp; + + int ret; + + COMPQUIET(info, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, DB_TXN_ABORT); + + REC_PRINT(__db_cksum_print); + + if ((ret = __db_cksum_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + + /* + * We had a checksum failure -- the only option is to run catastrophic + * recovery. + */ + if (F_ISSET(dbenv, DB_ENV_FATAL)) + ret = 0; + else { + __db_err(dbenv, + "Checksum failure requires catastrophic recovery"); + ret = __db_panic(dbenv, DB_RUNRECOVERY); + } + + __os_free(dbenv, argp); + return (ret); } diff --git a/bdb/db/db_reclaim.c b/bdb/db/db_reclaim.c index 739f348407d..9aa39bcfa9b 100644 --- a/bdb/db/db_reclaim.c +++ b/bdb/db/db_reclaim.c @@ -1,74 +1,26 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_reclaim.c,v 11.5 2000/04/07 14:26:58 bostic Exp $"; +static const char revid[] = "$Id: db_reclaim.c,v 11.28 2002/08/06 06:11:17 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <string.h> #endif #include "db_int.h" -#include "db_page.h" -#include "db_am.h" - -/* - * Assume that we enter with a valid pgno. We traverse a set of - * duplicate pages. The format of the callback routine is: - * callback(dbp, page, cookie, did_put). did_put is an output - * value that will be set to 1 by the callback routine if it - * already put the page back. Otherwise, this routine must - * put the page. 
- * - * PUBLIC: int __db_traverse_dup __P((DB *, - * PUBLIC: db_pgno_t, int (*)(DB *, PAGE *, void *, int *), void *)); - */ -int -__db_traverse_dup(dbp, pgno, callback, cookie) - DB *dbp; - db_pgno_t pgno; - int (*callback) __P((DB *, PAGE *, void *, int *)); - void *cookie; -{ - PAGE *p; - int did_put, i, opgno, ret; - - do { - did_put = 0; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &p)) != 0) - return (ret); - pgno = NEXT_PGNO(p); - - for (i = 0; i < NUM_ENT(p); i++) { - if (B_TYPE(GET_BKEYDATA(p, i)->type) == B_OVERFLOW) { - opgno = GET_BOVERFLOW(p, i)->pgno; - if ((ret = __db_traverse_big(dbp, - opgno, callback, cookie)) != 0) - goto err; - } - } - - if ((ret = callback(dbp, p, cookie, &did_put)) != 0) - goto err; - - if (!did_put) - if ((ret = memp_fput(dbp->mpf, p, 0)) != 0) - return (ret); - } while (pgno != PGNO_INVALID); - - if (0) { -err: if (did_put == 0) - (void)memp_fput(dbp->mpf, p, 0); - } - return (ret); -} +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" /* * __db_traverse_big @@ -88,17 +40,20 @@ __db_traverse_big(dbp, pgno, callback, cookie) int (*callback) __P((DB *, PAGE *, void *, int *)); void *cookie; { + DB_MPOOLFILE *mpf; PAGE *p; int did_put, ret; + mpf = dbp->mpf; + do { did_put = 0; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &p)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &p)) != 0) return (ret); pgno = NEXT_PGNO(p); if ((ret = callback(dbp, p, cookie, &did_put)) == 0 && !did_put) - ret = memp_fput(dbp->mpf, p, 0); + ret = mpf->put(mpf, p, 0); } while (ret == 0 && pgno != PGNO_INVALID); return (ret); @@ -132,3 +87,162 @@ __db_reclaim_callback(dbp, p, cookie, putp) return (0); } + +/* + * __db_truncate_callback + * This is the callback routine used during a truncate. + * we are traversing a btree or hash table and trying to free all the + * pages. 
+ * + * PUBLIC: int __db_truncate_callback __P((DB *, PAGE *, void *, int *)); + */ +int +__db_truncate_callback(dbp, p, cookie, putp) + DB *dbp; + PAGE *p; + void *cookie; + int *putp; +{ + DBMETA *meta; + DBT ldbt; + DB_LOCK metalock; + DB_MPOOLFILE *mpf; + db_indx_t indx, len, off, tlen, top; + db_pgno_t pgno; + db_trunc_param *param; + u_int8_t *hk, type; + int ret; + + top = NUM_ENT(p); + mpf = dbp->mpf; + param = cookie; + *putp = 1; + + switch (TYPE(p)) { + case P_LBTREE: + /* Skip for off-page duplicates and deleted items. */ + for (indx = 0; indx < top; indx += P_INDX) { + type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type; + if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE) + ++param->count; + } + /* FALLTHROUGH */ + case P_IBTREE: + case P_IRECNO: + case P_INVALID: + if (dbp->type != DB_HASH && + ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) { + type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE; + goto reinit; + } + break; + case P_OVERFLOW: + if (DBC_LOGGING(param->dbc)) { + if ((ret = __db_ovref_log(dbp, param->dbc->txn, + &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(p)); + if (--OV_REF(p) != 0) + *putp = 0; + break; + case P_LRECNO: + param->count += top; + if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) { + type = P_LRECNO; + goto reinit; + } + break; + case P_LDUP: + /* Correct for deleted items. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type)) + ++param->count; + + break; + case P_HASH: + /* Correct for on-page duplicates and deleted items. 
*/ + for (indx = 0; indx < top; indx += P_INDX) { + switch (*H_PAIRDATA(dbp, p, indx)) { + case H_OFFDUP: + case H_OFFPAGE: + break; + case H_KEYDATA: + ++param->count; + break; + case H_DUPLICATE: + tlen = LEN_HDATA(dbp, p, 0, indx); + hk = H_PAIRDATA(dbp, p, indx); + for (off = 0; off < tlen; + off += len + 2 * sizeof (db_indx_t)) { + ++param->count; + memcpy(&len, + HKEYDATA_DATA(hk) + + off, sizeof(db_indx_t)); + } + } + } + /* Don't free the head of the bucket. */ + if (PREV_PGNO(p) == PGNO_INVALID) { + type = P_HASH; + +reinit: *putp = 0; + if (DBC_LOGGING(param->dbc)) { + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(param->dbc, LCK_ALWAYS, + pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + return (ret); + if ((ret = mpf->get(mpf, + &pgno, 0, (PAGE **)&meta)) != 0) { + goto err; + } + memset(&ldbt, 0, sizeof(ldbt)); + ldbt.data = p; + ldbt.size = P_OVERHEAD(dbp); + if ((ret = __db_pg_free_log(dbp, + param->dbc->txn, &LSN(meta), 0, + p->pgno, &LSN(meta), + PGNO_BASE_MD, &ldbt, meta->free)) != 0) + goto err; + LSN(p) = LSN(meta); + + if ((ret = + __db_pg_alloc_log(dbp, + param->dbc->txn, &LSN(meta), 0, + &LSN(meta), PGNO_BASE_MD, + &p->lsn, p->pgno, type, meta->free)) != 0) { +err: (void)mpf->put(mpf, (PAGE *)meta, 0); + (void)__TLPUT(param->dbc, metalock); + return (ret); + } + LSN(p) = LSN(meta); + + if ((ret = mpf->put(mpf, + (PAGE *)meta, DB_MPOOL_DIRTY)) != 0) { + (void)__TLPUT(param->dbc, metalock); + return (ret); + } + if ((ret = __TLPUT(param->dbc, metalock)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(p)); + + P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID, + PGNO_INVALID, type == P_HASH ? 
0 : 1, type); + } + break; + default: + return (__db_pgfmt(dbp->dbenv, p->pgno)); + } + + if (*putp == 1) { + if ((ret = __db_free(param->dbc, p)) != 0) + return (ret); + } else { + if ((ret = mpf->put(mpf, p, DB_MPOOL_DIRTY)) != 0) + return (ret); + *putp = 1; + } + + return (0); +} diff --git a/bdb/db/db_remove.c b/bdb/db/db_remove.c new file mode 100644 index 00000000000..ef11c342555 --- /dev/null +++ b/bdb/db/db_remove.c @@ -0,0 +1,318 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2002 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_remove.c,v 11.203 2002/08/19 18:34:18 margo Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/fop.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" + +static int __db_subdb_remove __P((DB *, DB_TXN *, const char *, const char *)); +static int __db_dbtxn_remove __P((DB *, DB_TXN *, const char *)); + +/* + * __dbenv_dbremove + * Remove method for DB_ENV. + * + * PUBLIC: int __dbenv_dbremove __P((DB_ENV *, + * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t)); + */ +int +__dbenv_dbremove(dbenv, txn, name, subdb, flags) + DB_ENV *dbenv; + DB_TXN *txn; + const char *name, *subdb; + u_int32_t flags; +{ + DB *dbp; + int ret, t_ret, txn_local; + + txn_local = 0; + + PANIC_CHECK(dbenv); + ENV_ILLEGAL_BEFORE_OPEN(dbenv, "DB_ENV->dbremove"); + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DB->remove", flags, DB_AUTO_COMMIT)) != 0) + return (ret); + + if ((ret = db_create(&dbp, dbenv, 0)) != 0) + return (ret); + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. 
+ */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + } else + if (txn != NULL && !TXN_ON(dbenv)) + return (__db_not_txn_env(dbenv)); + + ret = __db_remove_i(dbp, txn, name, subdb); + + /* Commit for DB_AUTO_COMMIT. */ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + /* + * We created the DBP here and when we committed/aborted, + * we release all the tranasctional locks, which includes + * the handle lock; mark the handle cleared explicitly. + */ + LOCK_INIT(dbp->handle_lock); + dbp->lid = DB_LOCK_INVALIDID; + } + + /* + * We never opened this dbp for real, so don't call the transactional + * version of DB->close, and use NOSYNC to avoid calling into mpool. + */ + if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_remove + * Remove method for DB. + * + * PUBLIC: int __db_remove __P((DB *, const char *, const char *, u_int32_t)); + */ +int +__db_remove(dbp, name, subdb, flags) + DB *dbp; + const char *name, *subdb; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret, t_ret; + + dbenv = dbp->dbenv; + + PANIC_CHECK(dbenv); + + /* + * Validate arguments, continuing to destroy the handle on failure. + * + * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns. + * + * !!! + * We have a serious problem if we're here with a handle used to open + * a database -- we'll destroy the handle, and the application won't + * ever be able to close the database. + */ + if (F_ISSET(dbp, DB_AM_OPEN_CALLED)) { + ret = __db_mi_open(dbenv, "DB->remove", 1); + goto err; + } + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DB->remove", flags, 0)) != 0) + goto err; + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + /* Remove the file. 
*/ + ret = __db_remove_i(dbp, NULL, name, subdb); + + /* + * We never opened this dbp for real, use NOSYNC to avoid calling into + * mpool. + */ +err: if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_remove_i + * Internal remove method for DB. + * + * PUBLIC: int __db_remove_i __P((DB *, DB_TXN *, const char *, const char *)); + */ +int +__db_remove_i(dbp, txn, name, subdb) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb; +{ + DB_ENV *dbenv; + DB_LSN newlsn; + int ret; + char *real_name; + + dbenv = dbp->dbenv; + real_name = NULL; + + /* Handle subdatabase removes separately. */ + if (subdb != NULL) + return (__db_subdb_remove(dbp, txn, name, subdb)); + + /* Handle transactional file removes separately. */ + if (txn != NULL) + return (__db_dbtxn_remove(dbp, txn, name)); + + /* + * The remaining case is a non-transactional file remove. + * + * Find the real name of the file. + */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, name, 0, NULL, &real_name)) != 0) + return (ret); + + if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0) + goto err; + + if (dbp->db_am_remove != NULL && + (ret = dbp->db_am_remove(dbp, NULL, name, subdb, &newlsn)) != 0) + goto err; + + ret = __fop_remove(dbenv, NULL, dbp->fileid, name, DB_APP_DATA); + +err: + if (real_name != NULL) + __os_free(dbenv, real_name); + + return (ret); +} + +/* + * __db_subdb_remove -- + * Remove a subdatabase. + */ +static int +__db_subdb_remove(dbp, txn, name, subdb) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb; +{ + DB *mdbp, *sdbp; + int ret, t_ret; + + mdbp = sdbp = NULL; + + /* Open the subdatabase. */ + if ((ret = db_create(&sdbp, dbp->dbenv, 0)) != 0) + goto err; + if ((ret = __db_open(sdbp, + txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0)) != 0) + goto err; + + DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name); + + /* Free up the pages in the subdatabase. 
*/ + switch (sdbp->type) { + case DB_BTREE: + case DB_RECNO: + if ((ret = __bam_reclaim(sdbp, txn)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __ham_reclaim(sdbp, txn)) != 0) + goto err; + break; + default: + ret = __db_unknown_type( + sdbp->dbenv, "__db_subdb_remove", sdbp->type); + goto err; + } + + /* + * Remove the entry from the main database and free the subdatabase + * metadata page. + */ + if ((ret = __db_master_open(sdbp, txn, name, 0, 0, &mdbp)) != 0) + goto err; + + if ((ret = __db_master_update( + mdbp, sdbp, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0) + goto err; + + DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name); + +DB_TEST_RECOVERY_LABEL +err: + /* Close the main and subdatabases. */ + if ((t_ret = __db_close_i(sdbp, txn, 0)) != 0 && ret == 0) + ret = t_ret; + + if (mdbp != NULL && + (t_ret = __db_close_i(mdbp, txn, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +static int +__db_dbtxn_remove(dbp, txn, name) + DB *dbp; + DB_TXN *txn; + const char *name; +{ + DB_ENV *dbenv; + DB_LSN newlsn; + int ret; + char *tmpname; + + dbenv = dbp->dbenv; + tmpname = NULL; + + /* + * This is a transactional rename, so we have to keep the name + * of the file locked until the transaction commits. As a result, + * we implement remove by renaming the file to some other name + * (which creates a dummy named file as a placeholder for the + * file being rename/dremoved) and then deleting that file as + * a delayed remove at commit. + */ + if ((ret = __db_backup_name(dbenv, name, txn, &tmpname)) != 0) + return (ret); + + DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name); + + if ((ret = __db_rename_i(dbp, txn, name, NULL, tmpname)) != 0) + goto err; + + /* The internal removes will also translate into delayed removes. 
*/ + if (dbp->db_am_remove != NULL && + (ret = dbp->db_am_remove(dbp, txn, tmpname, NULL, &newlsn)) != 0) + goto err; + + ret = __fop_remove(dbenv, txn, dbp->fileid, tmpname, DB_APP_DATA); + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name); + +err: +DB_TEST_RECOVERY_LABEL + if (tmpname != NULL) + __os_free(dbenv, tmpname); + + return (ret); +} diff --git a/bdb/db/db_rename.c b/bdb/db/db_rename.c new file mode 100644 index 00000000000..87f88232cda --- /dev/null +++ b/bdb/db/db_rename.c @@ -0,0 +1,297 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2002 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_rename.c,v 11.203 2002/08/07 16:16:47 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_am.h" +#include "dbinc/fop.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" + +static int __db_subdb_rename __P(( DB *, DB_TXN *, + const char *, const char *, const char *)); + +/* + * __dbenv_dbrename + * Rename method for DB_ENV. + * + * PUBLIC: int __dbenv_dbrename __P((DB_ENV *, DB_TXN *, + * PUBLIC: const char *, const char *, const char *, u_int32_t)); + */ +int +__dbenv_dbrename(dbenv, txn, name, subdb, newname, flags) + DB_ENV *dbenv; + DB_TXN *txn; + const char *name, *subdb, *newname; + u_int32_t flags; +{ + DB *dbp; + int ret, t_ret, txn_local; + + txn_local = 0; + + PANIC_CHECK(dbenv); + ENV_ILLEGAL_BEFORE_OPEN(dbenv, "DB_ENV->dbrename"); + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DB->rename", flags, DB_AUTO_COMMIT)) != 0) + return (ret); + + if ((ret = db_create(&dbp, dbenv, 0)) != 0) + return (ret); + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. 
+ */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + } else + if (txn != NULL && !TXN_ON(dbenv)) + return (__db_not_txn_env(dbenv)); + + ret = __db_rename_i(dbp, txn, name, subdb, newname); + + /* Commit for DB_AUTO_COMMIT. */ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + + /* + * We created the DBP here and when we committed/aborted, + * we release all the tranasctional locks, which includes + * the handle lock; mark the handle cleared explicitly. + */ + LOCK_INIT(dbp->handle_lock); + dbp->lid = DB_LOCK_INVALIDID; + } + + /* + * We never opened this dbp for real, so don't call the transactional + * version of DB->close, and use NOSYNC to avoid calling into mpool. + */ + if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_rename + * Rename method for DB. + * + * PUBLIC: int __db_rename __P((DB *, + * PUBLIC: const char *, const char *, const char *, u_int32_t)); + */ +int +__db_rename(dbp, name, subdb, newname, flags) + DB *dbp; + const char *name, *subdb, *newname; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret, t_ret; + + dbenv = dbp->dbenv; + + PANIC_CHECK(dbenv); + + /* + * Validate arguments, continuing to destroy the handle on failure. + * + * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns. + * + * !!! + * We have a serious problem if we're here with a handle used to open + * a database -- we'll destroy the handle, and the application won't + * ever be able to close the database. + */ + if (F_ISSET(dbp, DB_AM_OPEN_CALLED)) { + ret = __db_mi_open(dbenv, "DB->rename", 1); + goto err; + } + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DB->rename", flags, 0)) != 0) + goto err; + + /* Check for consistent transaction usage. 
*/ + if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + /* Rename the file. */ + ret = __db_rename_i(dbp, NULL, name, subdb, newname); + + /* + * We never opened this dbp for real, use NOSYNC to avoid calling into + * mpool. + */ +err: if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_rename_i + * Internal rename method for DB. + * + * PUBLIC: int __db_rename_i __P((DB *, + * PUBLIC: DB_TXN *, const char *, const char *, const char *)); + */ +int +__db_rename_i(dbp, txn, name, subdb, newname) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb, *newname; +{ + DB_ENV *dbenv; + int ret; + char *real_name; + + dbenv = dbp->dbenv; + real_name = NULL; + + DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name); + + if (subdb != NULL) { + ret = __db_subdb_rename(dbp, txn, name, subdb, newname); + goto err; + } + + /* From here on down, this pertains to files. */ + + /* Find the real name of the file. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, name, 0, NULL, &real_name)) != 0) + goto err; + + if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0) + goto err; + + if (dbp->db_am_rename != NULL && + (ret = dbp->db_am_rename(dbp, txn, name, subdb, newname)) != 0) + goto err; + + /* + * The transactional case and non-transactional case are + * quite different. In the non-transactional case, we simply + * do the rename. In the transactional case, since we need + * the ability to back out and maintain locking, we have to + * create a temporary object as a placeholder. This is all + * taken care of in the fop layer. + */ + if (txn != NULL) { + if ((ret = __fop_dummy(dbp, txn, name, newname, 0)) != 0) + goto err; + } else { + if ((ret = __fop_dbrename(dbp, name, newname)) != 0) + goto err; + } + + /* + * I am pretty sure that we haven't gotten a dbreg id, so calling + * dbreg_filelist_update is not necessary. 
+ */ + DB_ASSERT(dbp->log_filename == NULL || + dbp->log_filename->id == DB_LOGFILEID_INVALID); + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname); + +DB_TEST_RECOVERY_LABEL +err: + if (real_name != NULL) + __os_free(dbenv, real_name); + + return (ret); +} + +/* + * __db_subdb_rename -- + * Rename a subdatabase. + */ +static int +__db_subdb_rename(dbp, txn, name, subdb, newname) + DB *dbp; + DB_TXN *txn; + const char *name, *subdb, *newname; +{ + DB *mdbp; + DB_ENV *dbenv; + PAGE *meta; + int ret, t_ret; + + mdbp = NULL; + meta = NULL; + dbenv = dbp->dbenv; + + /* + * We have not opened this dbp so it isn't marked as a subdb, + * but it ought to be. + */ + F_SET(dbp, DB_AM_SUBDB); + + /* + * Rename the entry in the main database. We need to first + * get the meta-data page number (via MU_OPEN) so that we can + * read the meta-data page and obtain a handle lock. Once we've + * done that, we can proceed to do the rename in the master. + */ + if ((ret = __db_master_open(dbp, txn, name, 0, 0, &mdbp)) != 0) + goto err; + + if ((ret = __db_master_update(mdbp, dbp, txn, subdb, dbp->type, + MU_OPEN, NULL, 0)) != 0) + goto err; + + if ((ret = mdbp->mpf->get(mdbp->mpf, &dbp->meta_pgno, 0, &meta)) != 0) + goto err; + memcpy(&dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN); + if ((ret = __fop_lock_handle(dbenv, + dbp, mdbp->lid, DB_LOCK_WRITE, NULL, 0)) != 0) + goto err; + + ret = mdbp->mpf->put(mdbp->mpf, meta, 0); + meta = NULL; + if (ret != 0) + goto err; + + if ((ret = __db_master_update(mdbp, dbp, txn, + subdb, dbp->type, MU_RENAME, newname, 0)) != 0) + goto err; + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name); + +DB_TEST_RECOVERY_LABEL +err: + if (meta != NULL && + (t_ret = mdbp->mpf->put(mdbp->mpf, meta, 0)) != 0 && ret == 0) + ret = t_ret; + + if (mdbp != NULL && + (t_ret = __db_close_i(mdbp, txn, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} diff --git a/bdb/db/db_ret.c b/bdb/db/db_ret.c index 0782de3e450..b1af7b4ffeb 100644 --- 
a/bdb/db/db_ret.c +++ b/bdb/db/db_ret.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_ret.c,v 11.12 2000/11/30 00:58:33 ubell Exp $"; +static const char revid[] = "$Id: db_ret.c,v 11.21 2002/03/28 19:21:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,9 +18,8 @@ static const char revid[] = "$Id: db_ret.c,v 11.12 2000/11/30 00:58:33 ubell Exp #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "db_am.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" /* * __db_ret -- @@ -47,19 +46,19 @@ __db_ret(dbp, h, indx, dbt, memp, memsize) switch (TYPE(h)) { case P_HASH: - hk = P_ENTRY(h, indx); + hk = P_ENTRY(dbp, h, indx); if (HPAGE_PTYPE(hk) == H_OFFPAGE) { memcpy(&ho, hk, sizeof(HOFFPAGE)); return (__db_goff(dbp, dbt, ho.tlen, ho.pgno, memp, memsize)); } - len = LEN_HKEYDATA(h, dbp->pgsize, indx); + len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx); data = HKEYDATA_DATA(hk); break; case P_LBTREE: case P_LDUP: case P_LRECNO: - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; return (__db_goff(dbp, dbt, @@ -69,33 +68,30 @@ __db_ret(dbp, h, indx, dbt, memp, memsize) data = bk->data; break; default: - return (__db_pgfmt(dbp, h->pgno)); + return (__db_pgfmt(dbp->dbenv, h->pgno)); } - return (__db_retcopy(dbp, dbt, data, len, memp, memsize)); + return (__db_retcopy(dbp->dbenv, dbt, data, len, memp, memsize)); } /* * __db_retcopy -- * Copy the returned data into the user's DBT, handling special flags. 
* - * PUBLIC: int __db_retcopy __P((DB *, DBT *, + * PUBLIC: int __db_retcopy __P((DB_ENV *, DBT *, * PUBLIC: void *, u_int32_t, void **, u_int32_t *)); */ int -__db_retcopy(dbp, dbt, data, len, memp, memsize) - DB *dbp; +__db_retcopy(dbenv, dbt, data, len, memp, memsize) + DB_ENV *dbenv; DBT *dbt; void *data; u_int32_t len; void **memp; u_int32_t *memsize; { - DB_ENV *dbenv; int ret; - dbenv = dbp == NULL ? NULL : dbp->dbenv; - /* If returning a partial record, reset the length. */ if (F_ISSET(dbt, DB_DBT_PARTIAL)) { data = (u_int8_t *)data + dbt->doff; @@ -131,12 +127,10 @@ __db_retcopy(dbp, dbt, data, len, memp, memsize) * memory pointer is allowed to be NULL. */ if (F_ISSET(dbt, DB_DBT_MALLOC)) { - if ((ret = __os_malloc(dbenv, len, - dbp == NULL ? NULL : dbp->db_malloc, &dbt->data)) != 0) + if ((ret = __os_umalloc(dbenv, len, &dbt->data)) != 0) return (ret); } else if (F_ISSET(dbt, DB_DBT_REALLOC)) { - if ((ret = __os_realloc(dbenv, len, - dbp == NULL ? NULL : dbp->db_realloc, &dbt->data)) != 0) + if ((ret = __os_urealloc(dbenv, len, &dbt->data)) != 0) return (ret); } else if (F_ISSET(dbt, DB_DBT_USERMEM)) { if (len != 0 && (dbt->data == NULL || dbt->ulen < len)) @@ -145,7 +139,7 @@ __db_retcopy(dbp, dbt, data, len, memp, memsize) return (EINVAL); } else { if (len != 0 && (*memsize == 0 || *memsize < len)) { - if ((ret = __os_realloc(dbenv, len, NULL, memp)) != 0) { + if ((ret = __os_realloc(dbenv, len, memp)) != 0) { *memsize = 0; return (ret); } diff --git a/bdb/db/db_truncate.c b/bdb/db/db_truncate.c new file mode 100644 index 00000000000..49546ae51b9 --- /dev/null +++ b/bdb/db/db_truncate.c @@ -0,0 +1,95 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2002 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_truncate.c,v 11.185 2002/08/07 16:16:48 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" + +/* + * __db_truncate + * truncate method for DB. + * + * PUBLIC: int __db_truncate __P((DB *, DB_TXN *, u_int32_t *, u_int32_t)); + */ +int +__db_truncate(dbp, txn, countp, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t *countp, flags; +{ + DB_ENV *dbenv; + int ret, t_ret, txn_local; + + dbenv = dbp->dbenv; + ret = txn_local = 0; + + PANIC_CHECK(dbenv); + + /* Check for invalid flags. */ + if ((ret = + __db_fchk(dbenv, "DB->truncate", flags, DB_AUTO_COMMIT)) != 0) + return (ret); + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. + */ + if (IS_AUTO_COMMIT(dbenv, txn, flags)) { + if ((ret = __db_txn_auto(dbp, &txn)) != 0) + return (ret); + txn_local = 1; + } else + if (txn != NULL && !TXN_ON(dbenv)) + return (__db_not_txn_env(dbenv)); + + DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL); + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + if ((ret = __bam_truncate(dbp, txn, countp)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __ham_truncate(dbp, txn, countp)) != 0) + goto err; + break; + case DB_QUEUE: + if ((ret = __qam_truncate(dbp, txn, countp)) != 0) + goto err; + break; + default: + ret = __db_unknown_type( + dbenv, "__db_truncate", dbp->type); + goto err; + } + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL); + +DB_TEST_RECOVERY_LABEL +err: + /* Commit for DB_AUTO_COMMIT. 
*/ + if (txn_local) { + if (ret == 0) + ret = txn->commit(txn, 0); + else + if ((t_ret = txn->abort(txn)) != 0) + ret = __db_panic(dbenv, t_ret); + } + + return (ret); +} diff --git a/bdb/db/db_upg.c b/bdb/db/db_upg.c index d8573146ad6..c0eb72f3713 100644 --- a/bdb/db/db_upg.c +++ b/bdb/db/db_upg.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_upg.c,v 11.20 2000/12/12 17:35:30 bostic Exp $"; +static const char revid[] = "$Id: db_upg.c,v 11.29 2002/03/27 18:59:04 krinsky Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,11 +18,11 @@ static const char revid[] = "$Id: db_upg.c,v 11.20 2000/12/12 17:35:30 bostic Ex #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" static int (* const func_31_list[P_PAGETYPE_MAX]) __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = { @@ -68,7 +68,7 @@ __db_upgrade(dbp, fname, flags) /* Get the real backing file name. */ if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, fname, 0, NULL, &real_name)) != 0) + DB_APP_DATA, fname, 0, NULL, &real_name)) != 0) return (ret); /* Open the file. 
*/ @@ -117,6 +117,7 @@ __db_upgrade(dbp, fname, flags) goto err; /* FALLTHROUGH */ case 8: + case 9: break; default: __db_err(dbenv, "%s: unsupported btree version: %lu", @@ -173,6 +174,7 @@ __db_upgrade(dbp, fname, flags) goto err; /* FALLTHROUGH */ case 7: + case 8: break; default: __db_err(dbenv, "%s: unsupported hash version: %lu", @@ -202,6 +204,7 @@ __db_upgrade(dbp, fname, flags) goto err; /* FALLTHROUGH */ case 3: + case 4: break; default: __db_err(dbenv, "%s: unsupported queue version: %lu", @@ -231,9 +234,9 @@ __db_upgrade(dbp, fname, flags) ret = __os_fsync(dbenv, &fh); -err: if ((t_ret = __os_closehandle(&fh)) != 0 && ret == 0) +err: if ((t_ret = __os_closehandle(dbenv, &fh)) != 0 && ret == 0) ret = t_ret; - __os_freestr(real_name); + __os_free(dbenv, real_name); /* We're done. */ if (dbp->db_feedback != NULL) @@ -268,7 +271,7 @@ __db_page_pass(dbp, real_name, flags, fl, fhp) return (ret); /* Allocate memory for a single page. */ - if ((ret = __os_malloc(dbenv, dbp->pgsize, NULL, &page)) != 0) + if ((ret = __os_malloc(dbenv, dbp->pgsize, &page)) != 0) return (ret); /* Walk the file, calling the underlying conversion functions. */ @@ -294,7 +297,7 @@ __db_page_pass(dbp, real_name, flags, fl, fhp) } } - __os_free(page, dbp->pgsize); + __os_free(dbp->dbenv, page); return (ret); } diff --git a/bdb/db/db_upg_opd.c b/bdb/db/db_upg_opd.c index a7be784afb8..f410b797bff 100644 --- a/bdb/db/db_upg_opd.c +++ b/bdb/db/db_upg_opd.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_upg_opd.c,v 11.9 2000/11/30 00:58:33 ubell Exp $"; +static const char revid[] = "$Id: db_upg_opd.c,v 11.18 2002/08/06 06:11:18 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,11 +18,8 @@ static const char revid[] = "$Id: db_upg_opd.c,v 11.9 2000/11/30 00:58:33 ubell #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *)); static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *)); @@ -71,7 +68,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop) pgno_cur = pgno_next = NULL; /* Allocate room to hold a page. */ - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &page)) != 0) + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &page)) != 0) goto err; /* @@ -85,7 +82,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop) if (pgno_max == cur_cnt) { pgno_max += 20; if ((ret = __os_realloc(dbp->dbenv, pgno_max * - sizeof(db_pgno_t), NULL, &pgno_cur)) != 0) + sizeof(db_pgno_t), &pgno_cur)) != 0) goto err; } pgno_cur[cur_cnt++] = pgno; @@ -112,7 +109,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop) * list while we do so. */ if ((ret = __os_malloc(dbp->dbenv, - cur_cnt * sizeof(db_pgno_t), NULL, &pgno_next)) != 0) + cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0) goto err; /* Figure out where we can start allocating new pages. */ @@ -121,7 +118,7 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop) /* Allocate room for an internal page. 
*/ if ((ret = __os_malloc(dbp->dbenv, - dbp->pgsize, NULL, &ipage)) != 0) + dbp->pgsize, &ipage)) != 0) goto err; PGNO(ipage) = PGNO_INVALID; } @@ -187,13 +184,13 @@ __db_31_offdup(dbp, real_name, fhp, sorted, pgnop) *pgnop = pgno_cur[0]; err: if (pgno_cur != NULL) - __os_free(pgno_cur, 0); + __os_free(dbp->dbenv, pgno_cur); if (pgno_next != NULL) - __os_free(pgno_next, 0); + __os_free(dbp->dbenv, pgno_next); if (ipage != NULL) - __os_free(ipage, dbp->pgsize); + __os_free(dbp->dbenv, ipage); if (page != NULL) - __os_free(page, dbp->pgsize); + __os_free(dbp->dbenv, page); return (ret); } @@ -214,22 +211,24 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp) BKEYDATA *child_bk; u_int8_t *p; int ret; + db_indx_t *inp; + inp = P_INP(dbp, ipage); switch (TYPE(page)) { case P_IBTREE: - child_bi = GET_BINTERNAL(page, 0); - if (P_FREESPACE(ipage) < BINTERNAL_PSIZE(child_bi->len)) { + child_bi = GET_BINTERNAL(dbp, page, 0); + if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) { *nomemp = 1; return (0); } - ipage->inp[indx] = - HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len); - p = P_ENTRY(ipage, indx); + inp[indx] = + HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len); + p = P_ENTRY(dbp, ipage, indx); bi.len = child_bi->len; B_TSET(bi.type, child_bi->type, 0); bi.pgno = PGNO(page); - bi.nrecs = __bam_total(page); + bi.nrecs = __bam_total(dbp, page); memcpy(p, &bi, SSZA(BINTERNAL, data)); p += SSZA(BINTERNAL, data); memcpy(p, child_bi->data, child_bi->len); @@ -241,40 +240,40 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp) return (ret); break; case P_LDUP: - child_bk = GET_BKEYDATA(page, 0); + child_bk = GET_BKEYDATA(dbp, page, 0); switch (B_TYPE(child_bk->type)) { case B_KEYDATA: - if (P_FREESPACE(ipage) < + if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bk->len)) { *nomemp = 1; return (0); } - ipage->inp[indx] = + inp[indx] = HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len); - p = P_ENTRY(ipage, indx); + p = P_ENTRY(dbp, ipage, indx); bi.len = 
child_bk->len; B_TSET(bi.type, child_bk->type, 0); bi.pgno = PGNO(page); - bi.nrecs = __bam_total(page); + bi.nrecs = __bam_total(dbp, page); memcpy(p, &bi, SSZA(BINTERNAL, data)); p += SSZA(BINTERNAL, data); memcpy(p, child_bk->data, child_bk->len); break; case B_OVERFLOW: - if (P_FREESPACE(ipage) < + if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(BOVERFLOW_SIZE)) { *nomemp = 1; return (0); } - ipage->inp[indx] = + inp[indx] = HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE); - p = P_ENTRY(ipage, indx); + p = P_ENTRY(dbp, ipage, indx); bi.len = BOVERFLOW_SIZE; B_TSET(bi.type, child_bk->type, 0); bi.pgno = PGNO(page); - bi.nrecs = __bam_total(page); + bi.nrecs = __bam_total(dbp, page); memcpy(p, &bi, SSZA(BINTERNAL, data)); p += SSZA(BINTERNAL, data); memcpy(p, child_bk, BOVERFLOW_SIZE); @@ -285,11 +284,11 @@ __db_build_bi(dbp, fhp, ipage, page, indx, nomemp) return (ret); break; default: - return (__db_pgfmt(dbp, PGNO(page))); + return (__db_pgfmt(dbp->dbenv, PGNO(page))); } break; default: - return (__db_pgfmt(dbp, PGNO(page))); + return (__db_pgfmt(dbp->dbenv, PGNO(page))); } return (0); @@ -308,19 +307,19 @@ __db_build_ri(dbp, fhp, ipage, page, indx, nomemp) int *nomemp; { RINTERNAL ri; + db_indx_t *inp; - COMPQUIET(dbp, NULL); COMPQUIET(fhp, NULL); - - if (P_FREESPACE(ipage) < RINTERNAL_PSIZE) { + inp = P_INP(dbp, ipage); + if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) { *nomemp = 1; return (0); } ri.pgno = PGNO(page); - ri.nrecs = __bam_total(page); - ipage->inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE; - memcpy(P_ENTRY(ipage, indx), &ri, RINTERNAL_SIZE); + ri.nrecs = __bam_total(dbp, page); + inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE; + memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE); return (0); } @@ -340,14 +339,14 @@ __db_up_ovref(dbp, fhp, pgno) int ret; /* Allocate room to hold a page. 
*/ - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &page)) != 0) + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &page)) != 0) return (ret); GET_PAGE(dbp, fhp, pgno, page); ++OV_REF(page); PUT_PAGE(dbp, fhp, pgno, page); -err: __os_free(page, dbp->pgsize); +err: __os_free(dbp->dbenv, page); return (ret); } diff --git a/bdb/db/db_vrfy.c b/bdb/db/db_vrfy.c index 3509e05e91f..1bbecdbd87a 100644 --- a/bdb/db/db_vrfy.c +++ b/bdb/db/db_vrfy.c @@ -1,16 +1,16 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000 + * Copyright (c) 2000-2002 * Sleepycat Software. All rights reserved. * - * $Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Exp $ + * $Id: db_vrfy.c,v 1.107 2002/09/03 17:27:15 bostic Exp $ */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Exp $"; +static const char revid[] = "$Id: db_vrfy.c,v 1.107 2002/09/03 17:27:15 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -20,19 +20,25 @@ static const char revid[] = "$Id: db_vrfy.c,v 1.53 2001/01/11 18:19:51 bostic Ex #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "db_verify.h" -#include "db_ext.h" -#include "btree.h" -#include "hash.h" -#include "qam.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_swap.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" static int __db_guesspgsize __P((DB_ENV *, DB_FH *)); static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *)); static int __db_is_valid_pagetype __P((u_int32_t)); static int __db_meta2pgset __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *)); +static int __db_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, + PAGE *, void *, int (*)(void *, const void *), u_int32_t)); +static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *, + PAGE *, void *, int (*)(void *, const void 
*), u_int32_t)); static int __db_salvage_subdbs __P((DB *, VRFY_DBINFO *, void *, int(*)(void *, const void *), u_int32_t, int *)); @@ -136,9 +142,7 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags) DB *dbp; DB_ENV *dbenv; DB_FH fh, *fhp; - PAGE *h; VRFY_DBINFO *vdp; - db_pgno_t last; int has, ret, isbad; char *real_name; @@ -153,16 +157,22 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags) PANIC_CHECK(dbenv); DB_ILLEGAL_AFTER_OPEN(dbp_orig, "verify"); -#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_ORDERCHKONLY | DB_SALVAGE) +#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_ORDERCHKONLY | \ + DB_PRINTABLE | DB_SALVAGE) if ((ret = __db_fchk(dbenv, "DB->verify", flags, OKFLAGS)) != 0) return (ret); /* * DB_SALVAGE is mutually exclusive with the other flags except - * DB_AGGRESSIVE. + * DB_AGGRESSIVE and DB_PRINTABLE. */ if (LF_ISSET(DB_SALVAGE) && - (flags & ~DB_AGGRESSIVE) != DB_SALVAGE) + (flags & ~DB_AGGRESSIVE & ~DB_PRINTABLE) != DB_SALVAGE) + return (__db_ferr(dbenv, "__db_verify", 1)); + + /* DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging. */ + if ((LF_ISSET(DB_AGGRESSIVE) || LF_ISSET(DB_PRINTABLE)) && + !LF_ISSET(DB_SALVAGE)) return (__db_ferr(dbenv, "__db_verify", 1)); if (LF_ISSET(DB_ORDERCHKONLY) && flags != DB_ORDERCHKONLY) @@ -232,9 +242,17 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags) if ((ret = __db_vrfy_dbinfo_create(dbenv, 1024, &vdp)) != 0) goto err; + /* + * Note whether the user has requested that we use printable + * chars where possible. We won't get here with this flag if + * we're not salvaging. + */ + if (LF_ISSET(DB_PRINTABLE)) + F_SET(vdp, SALVAGE_PRINTABLE); + /* Find the real name of the file. 
*/ if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, name, 0, NULL, &real_name)) != 0) + DB_APP_DATA, name, 0, NULL, &real_name)) != 0) goto err; /* @@ -271,25 +289,15 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags) * the [safe] part of __db_open that initializes the environment-- * and the mpool--manually. */ - if ((ret = __db_dbenv_setup(dbp, - name, DB_ODDFILESIZE | DB_RDONLY)) != 0) + if ((ret = __db_dbenv_setup(dbp, NULL, + name, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0) return (ret); /* Mark the dbp as opened, so that we correctly handle its close. */ - F_SET(dbp, DB_OPEN_CALLED); - - /* - * Find out the page number of the last page in the database. - * - * XXX: This currently fails if the last page is of bad type, - * because it calls __db_pgin and that pukes. This is bad. - */ - if ((ret = memp_fget(dbp->mpf, &last, DB_MPOOL_LAST, &h)) != 0) - goto err; - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) - goto err; + F_SET(dbp, DB_AM_OPEN_CALLED); - vdp->last_pgno = last; + /* Find out the page number of the last page in the database. */ + dbp->mpf->last_pgno(dbp->mpf, &vdp->last_pgno); /* * DB_ORDERCHKONLY is a special case; our file consists of @@ -373,7 +381,10 @@ __db_verify_internal(dbp_orig, name, subdb, handle, callback, flags) } if (0) { -err: (void)__db_err(dbenv, "%s: %s", name, db_strerror(ret)); + /* Don't try to strerror() DB_VERIFY_FATAL; it's private. 
*/ +err: if (ret == DB_VERIFY_FATAL) + ret = DB_VERIFY_BAD; + (void)__db_err(dbenv, "%s: %s", name, db_strerror(ret)); } if (LF_ISSET(DB_SALVAGE) && @@ -385,13 +396,13 @@ done: if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL) dbp->db_feedback(dbp, DB_VERIFY, 100); if (F_ISSET(fhp, DB_FH_VALID)) - (void)__os_closehandle(fhp); + (void)__os_closehandle(dbenv, fhp); if (dbp) (void)dbp->close(dbp, 0); if (vdp) - (void)__db_vrfy_dbinfo_destroy(vdp); + (void)__db_vrfy_dbinfo_destroy(dbenv, vdp); if (real_name) - __os_freestr(real_name); + __os_free(dbenv, real_name); if ((ret == 0 && isbad == 1) || ret == DB_VERIFY_FATAL) ret = DB_VERIFY_BAD; @@ -417,10 +428,11 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) DB_ENV *dbenv; VRFY_PAGEINFO *pip; db_pgno_t freelist; - int t_ret, ret, nr, swapped; + size_t nr; + int isbad, ret, swapped; u_int8_t mbuf[DBMETASIZE]; - swapped = ret = t_ret = 0; + isbad = ret = swapped = 0; freelist = 0; dbenv = dbp->dbenv; meta = (DBMETA *)mbuf; @@ -432,29 +444,43 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) * may be zero; this is okay, as we want page zero anyway and * 0*0 == 0. 
*/ - if ((ret = __os_seek(dbenv, fhp, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0) - goto err; - - if ((ret = __os_read(dbenv, fhp, mbuf, DBMETASIZE, (size_t *)&nr)) != 0) - goto err; + if ((ret = __os_seek(dbenv, fhp, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0 || + (ret = __os_read(dbenv, fhp, mbuf, DBMETASIZE, &nr)) != 0) { + __db_err(dbenv, + "Metadata page %lu cannot be read: %s", + (u_long)PGNO_BASE_MD, db_strerror(ret)); + return (ret); + } if (nr != DBMETASIZE) { - EPRINT((dbp->dbenv, - "Incomplete metadata page %lu", (u_long)PGNO_BASE_MD)); - t_ret = DB_VERIFY_FATAL; - goto err; + EPRINT((dbenv, + "Page %lu: Incomplete metadata page", + (u_long)PGNO_BASE_MD)); + return (DB_VERIFY_FATAL); + } + + if ((ret = __db_chk_meta(dbenv, dbp, meta, 1)) != 0) { + EPRINT((dbenv, + "Page %lu: metadata page corrupted", (u_long)PGNO_BASE_MD)); + isbad = 1; + if (ret != -1) { + EPRINT((dbenv, + "Page %lu: could not check metadata page", + (u_long)PGNO_BASE_MD)); + return (DB_VERIFY_FATAL); + } } /* * Check all of the fields that we can. + * + * 08-11: Current page number. Must == pgno. + * Note that endianness doesn't matter--it's zero. */ - - /* 08-11: Current page number. Must == pgno. */ - /* Note that endianness doesn't matter--it's zero. */ if (meta->pgno != PGNO_BASE_MD) { - EPRINT((dbp->dbenv, "Bad pgno: was %lu, should be %lu", - (u_long)meta->pgno, (u_long)PGNO_BASE_MD)); - ret = DB_VERIFY_BAD; + isbad = 1; + EPRINT((dbenv, "Page %lu: pgno incorrectly set to %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->pgno)); } /* 12-15: Magic number. Must be one of valid set. 
*/ @@ -466,9 +492,10 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) &dbp->type)) swapped = 1; else { - EPRINT((dbp->dbenv, - "Bad magic number: %lu", (u_long)meta->magic)); - ret = DB_VERIFY_BAD; + isbad = 1; + EPRINT((dbenv, + "Page %lu: bad magic number %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->magic)); } } @@ -478,12 +505,19 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) */ if (swapped) M_32_SWAP(meta->version); - if ((dbp->type == DB_BTREE && meta->version != DB_BTREEVERSION) || - (dbp->type == DB_HASH && meta->version != DB_HASHVERSION) || - (dbp->type == DB_QUEUE && meta->version != DB_QAMVERSION)) { - ret = DB_VERIFY_BAD; - EPRINT((dbp->dbenv, "%s%s", "Old or incorrect DB ", - "version; extraneous errors may result")); + if ((dbp->type == DB_BTREE && + (meta->version > DB_BTREEVERSION || + meta->version < DB_BTREEOLDVER)) || + (dbp->type == DB_HASH && + (meta->version > DB_HASHVERSION || + meta->version < DB_HASHOLDVER)) || + (dbp->type == DB_QUEUE && + (meta->version > DB_QAMVERSION || + meta->version < DB_QAMOLDVER))) { + isbad = 1; + EPRINT((dbenv, + "Page %lu: unsupported DB version %lu; extraneous errors may result", + (u_long)PGNO_BASE_MD, (u_long)meta->version)); } /* @@ -495,9 +529,9 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) if (IS_VALID_PAGESIZE(meta->pagesize)) dbp->pgsize = meta->pagesize; else { - EPRINT((dbp->dbenv, - "Bad page size: %lu", (u_long)meta->pagesize)); - ret = DB_VERIFY_BAD; + isbad = 1; + EPRINT((dbenv, "Page %lu: bad page size %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->pagesize)); /* * Now try to settle on a pagesize to use. 
@@ -516,8 +550,9 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) || (dbp->type == DB_HASH && meta->type != P_HASHMETA) || (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) { - ret = DB_VERIFY_BAD; - EPRINT((dbp->dbenv, "Bad page type: %lu", (u_long)meta->type)); + isbad = 1; + EPRINT((dbenv, "Page %lu: bad page type %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->type)); } /* @@ -547,21 +582,16 @@ __db_vrfy_pagezero(dbp, vdp, fhp, flags) pip->free = freelist; - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) return (ret); /* Set up the dbp's fileid. We don't use the regular open path. */ memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN); - if (0) { -err: __db_err(dbenv, "%s", db_strerror(ret)); - } - if (swapped == 1) F_SET(dbp, DB_AM_SWAP); - if (t_ret != 0) - ret = t_ret; - return (ret); + + return (isbad ? DB_VERIFY_BAD : 0); } /* @@ -578,12 +608,14 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags) u_int32_t flags; { DB_ENV *dbenv; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t i; int ret, t_ret, isbad; - ret = isbad = t_ret = 0; dbenv = dbp->dbenv; + mpf = dbp->mpf; + ret = isbad = t_ret = 0; if ((ret = __db_fchk(dbenv, "__db_vrfy_walkpages", flags, OKFLAGS)) != 0) @@ -598,11 +630,17 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags) if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0)) continue; - /* If an individual page get fails, keep going. */ - if ((t_ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0) { + /* + * If an individual page get fails, keep going if and only + * if we're salvaging. 
+ */ + if ((t_ret = mpf->get(mpf, &i, 0, &h)) != 0) { if (ret == 0) ret = t_ret; - continue; + if (LF_ISSET(DB_SALVAGE)) + continue; + else + return (ret); } if (LF_ISSET(DB_SALVAGE)) { @@ -619,63 +657,75 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags) } } else { /* + * If we are not salvaging, and we get any error + * other than DB_VERIFY_BAD, return immediately; + * it may not be safe to proceed. If we get + * DB_VERIFY_BAD, keep going; listing more errors + * may make it easier to diagnose problems and + * determine the magnitude of the corruption. + */ + + /* * Verify info common to all page * types. */ - if (i != PGNO_BASE_MD) - if ((t_ret = __db_vrfy_common(dbp, - vdp, h, i, flags)) == DB_VERIFY_BAD) + if (i != PGNO_BASE_MD) { + ret = __db_vrfy_common(dbp, vdp, h, i, flags); + if (ret == DB_VERIFY_BAD) isbad = 1; + else if (ret != 0) + goto err; + } switch (TYPE(h)) { case P_INVALID: - t_ret = __db_vrfy_invalid(dbp, - vdp, h, i, flags); + ret = __db_vrfy_invalid(dbp, vdp, h, i, flags); break; case __P_DUPLICATE: isbad = 1; - EPRINT((dbp->dbenv, - "Old-style duplicate page: %lu", + EPRINT((dbenv, + "Page %lu: old-style duplicate page", (u_long)i)); break; case P_HASH: - t_ret = __ham_vrfy(dbp, + ret = __ham_vrfy(dbp, vdp, h, i, flags); break; case P_IBTREE: case P_IRECNO: case P_LBTREE: case P_LDUP: - t_ret = __bam_vrfy(dbp, + ret = __bam_vrfy(dbp, vdp, h, i, flags); break; case P_LRECNO: - t_ret = __ram_vrfy_leaf(dbp, + ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags); break; case P_OVERFLOW: - t_ret = __db_vrfy_overflow(dbp, + ret = __db_vrfy_overflow(dbp, vdp, h, i, flags); break; case P_HASHMETA: - t_ret = __ham_vrfy_meta(dbp, + ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, i, flags); break; case P_BTREEMETA: - t_ret = __bam_vrfy_meta(dbp, + ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, i, flags); break; case P_QAMMETA: - t_ret = __qam_vrfy_meta(dbp, + ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, i, flags); break; case P_QAMDATA: - t_ret = 
__qam_vrfy_data(dbp, + ret = __qam_vrfy_data(dbp, vdp, (QPAGE *)h, i, flags); break; default: - EPRINT((dbp->dbenv, - "Unknown page type: %lu", (u_long)TYPE(h))); + EPRINT((dbenv, + "Page %lu: unknown page type %lu", + (u_long)i, (u_long)TYPE(h))); isbad = 1; break; } @@ -683,12 +733,10 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags) /* * Set up error return. */ - if (t_ret == DB_VERIFY_BAD) + if (ret == DB_VERIFY_BAD) isbad = 1; - else if (t_ret == DB_VERIFY_FATAL) + else if (ret != 0) goto err; - else - ret = t_ret; /* * Provide feedback to the application about our @@ -701,14 +749,21 @@ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags) (i + 1) * 50 / (vdp->last_pgno + 1)); } - if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) - ret = t_ret; + /* + * Just as with the page get, bail if and only if we're + * not salvaging. + */ + if ((t_ret = mpf->put(mpf, h, 0)) != 0) { + if (ret == 0) + ret = t_ret; + if (!LF_ISSET(DB_SALVAGE)) + return (ret); + } } if (0) { -err: if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0) +err: if ((t_ret = mpf->put(mpf, h, 0)) != 0) return (ret == 0 ? t_ret : ret); - return (DB_VERIFY_BAD); } return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); @@ -786,8 +841,8 @@ __db_vrfy_structure(dbp, vdp, dbname, meta_pgno, flags) */ if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0) goto err; - hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS); - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 
1 : 0; + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) goto err; if (isbad == 0 && hassubs) @@ -855,23 +910,23 @@ __db_vrfy_structure(dbp, vdp, dbname, meta_pgno, flags) if ((ret = __db_vrfy_pgset_get(pgset, i, &p)) != 0) goto err; if (p == 0) { - EPRINT((dbp->dbenv, - "Unreferenced page %lu", (u_long)i)); + EPRINT((dbenv, + "Page %lu: unreferenced page", (u_long)i)); isbad = 1; } if (F_ISSET(pip, VRFY_IS_ALLZEROES)) { - EPRINT((dbp->dbenv, - "Totally zeroed page %lu", (u_long)i)); + EPRINT((dbenv, + "Page %lu: totally zeroed page", (u_long)i)); isbad = 1; } - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) goto err; pip = NULL; } err: if (pip != NULL) - (void)__db_vrfy_putpageinfo(vdp, pip); + (void)__db_vrfy_putpageinfo(dbenv, vdp, pip); return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); } @@ -936,10 +991,13 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int ret, t_ret; u_int8_t *p; + dbenv = dbp->dbenv; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -957,8 +1015,8 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags) if (pgno != 0 && PGNO(h) == 0) { for (p = (u_int8_t *)h; p < (u_int8_t *)h + dbp->pgsize; p++) if (*p != 0) { - EPRINT((dbp->dbenv, - "Page %lu should be zeroed and is not", + EPRINT((dbenv, + "Page %lu: partially zeroed page", (u_long)pgno)); ret = DB_VERIFY_BAD; goto err; @@ -976,19 +1034,19 @@ __db_vrfy_common(dbp, vdp, h, pgno, flags) } if (PGNO(h) != pgno) { - EPRINT((dbp->dbenv, - "Bad page number: %lu should be %lu", - (u_long)h->pgno, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: bad page number %lu", + (u_long)pgno, (u_long)h->pgno)); ret = DB_VERIFY_BAD; } if (!__db_is_valid_pagetype(h->type)) { - EPRINT((dbp->dbenv, "Bad page type: %lu", (u_long)h->type)); + EPRINT((dbenv, "Page %lu: bad page type %lu", + (u_long)pgno, (u_long)h->type)); ret = DB_VERIFY_BAD; } pip->type = 
h->type; -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return (ret); @@ -1007,22 +1065,24 @@ __db_vrfy_invalid(dbp, vdp, h, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int ret, t_ret; + dbenv = dbp->dbenv; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); pip->next_pgno = pip->prev_pgno = 0; if (!IS_VALID_PGNO(NEXT_PGNO(h))) { - EPRINT((dbp->dbenv, - "Invalid next_pgno %lu on page %lu", - (u_long)NEXT_PGNO(h), (u_long)pgno)); + EPRINT((dbenv, "Page %lu: invalid next_pgno %lu", + (u_long)pgno, (u_long)NEXT_PGNO(h))); ret = DB_VERIFY_BAD; } else pip->next_pgno = NEXT_PGNO(h); - if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return (ret); } @@ -1048,9 +1108,12 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int isbad, ret, t_ret; + dbenv = dbp->dbenv; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); isbad = 0; @@ -1066,12 +1129,12 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) { if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) { isbad = 1; - EPRINT((dbp->dbenv, "Page %lu: Invalid prev_pgno %lu", + EPRINT((dbenv, "Page %lu: invalid prev_pgno %lu", (u_long)pip->pgno, (u_long)PREV_PGNO(h))); } if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) { isbad = 1; - EPRINT((dbp->dbenv, "Page %lu: Invalid next_pgno %lu", + EPRINT((dbenv, "Page %lu: invalid next_pgno %lu", (u_long)pip->pgno, (u_long)NEXT_PGNO(h))); } pip->prev_pgno = PREV_PGNO(h); @@ -1089,8 +1152,7 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) if (TYPE(h) != P_OVERFLOW) { if (BKEYDATA_PSIZE(0) * NUM_ENT(h) > dbp->pgsize) { isbad = 1; - EPRINT((dbp->dbenv, - "Page %lu: 
Too many entries: %lu", + EPRINT((dbenv, "Page %lu: too many entries: %lu", (u_long)pgno, (u_long)NUM_ENT(h))); } pip->entries = NUM_ENT(h); @@ -1106,8 +1168,8 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) case P_IRECNO: if (LEVEL(h) < LEAFLEVEL + 1 || LEVEL(h) > MAXBTREELEVEL) { isbad = 1; - EPRINT((dbp->dbenv, "Bad btree level %lu on page %lu", - (u_long)LEVEL(h), (u_long)pgno)); + EPRINT((dbenv, "Page %lu: bad btree level %lu", + (u_long)pgno, (u_long)LEVEL(h))); } pip->bt_level = LEVEL(h); break; @@ -1116,17 +1178,17 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) case P_LRECNO: if (LEVEL(h) != LEAFLEVEL) { isbad = 1; - EPRINT((dbp->dbenv, - "Btree leaf page %lu has incorrect level %lu", + EPRINT((dbenv, + "Page %lu: btree leaf page has incorrect level %lu", (u_long)pgno, (u_long)LEVEL(h))); } break; default: if (LEVEL(h) != 0) { isbad = 1; - EPRINT((dbp->dbenv, - "Nonzero level %lu in non-btree database page %lu", - (u_long)LEVEL(h), (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: nonzero level %lu in non-btree database", + (u_long)pgno, (u_long)LEVEL(h))); } break; } @@ -1139,7 +1201,7 @@ __db_vrfy_datapage(dbp, vdp, h, pgno, flags) * by offset and length--cover the right part of the page * without overlaps, gaps, or violations of the page boundary. */ - if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); @@ -1161,11 +1223,14 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; DBTYPE dbtype, magtype; VRFY_PAGEINFO *pip; int isbad, ret, t_ret; isbad = 0; + dbenv = dbp->dbenv; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -1190,31 +1255,37 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) /* magic number valid */ if (!__db_is_valid_magicno(meta->magic, &magtype)) { isbad = 1; - EPRINT((dbp->dbenv, - "Magic number invalid on page %lu", (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: invalid magic number", (u_long)pgno)); } if (magtype != dbtype) { isbad = 1; - EPRINT((dbp->dbenv, - "Magic number does not match type of page %lu", + EPRINT((dbenv, + "Page %lu: magic number does not match database type", (u_long)pgno)); } /* version */ - if ((dbtype == DB_BTREE && meta->version != DB_BTREEVERSION) || - (dbtype == DB_HASH && meta->version != DB_HASHVERSION) || - (dbtype == DB_QUEUE && meta->version != DB_QAMVERSION)) { + if ((dbtype == DB_BTREE && + (meta->version > DB_BTREEVERSION || + meta->version < DB_BTREEOLDVER)) || + (dbtype == DB_HASH && + (meta->version > DB_HASHVERSION || + meta->version < DB_HASHOLDVER)) || + (dbtype == DB_QUEUE && + (meta->version > DB_QAMVERSION || + meta->version < DB_QAMOLDVER))) { isbad = 1; - EPRINT((dbp->dbenv, "%s%s", "Old of incorrect DB ", - "version; extraneous errors may result")); + EPRINT((dbenv, + "Page %lu: unsupported database version %lu; extraneous errors may result", + (u_long)pgno, (u_long)meta->version)); } /* pagesize */ if (meta->pagesize != dbp->pgsize) { isbad = 1; - EPRINT((dbp->dbenv, - "Invalid pagesize %lu on page %lu", - (u_long)meta->pagesize, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: invalid pagesize %lu", + (u_long)pgno, (u_long)meta->pagesize)); } /* free list */ @@ -1224,9 +1295,9 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) */ if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) { isbad = 1; - 
EPRINT((dbp->dbenv, - "Nonempty free list on subdatabase metadata page %lu", - pgno)); + EPRINT((dbenv, + "Page %lu: nonempty free list on subdatabase metadata page", + (u_long)pgno)); } /* Can correctly be PGNO_INVALID--that's just the end of the list. */ @@ -1234,9 +1305,9 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) pip->free = meta->free; else if (!IS_VALID_PGNO(meta->free)) { isbad = 1; - EPRINT((dbp->dbenv, - "Nonsensical free list pgno %lu on page %lu", - (u_long)meta->free, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: nonsensical free list pgno %lu", + (u_long)pgno, (u_long)meta->free)); } /* @@ -1245,7 +1316,7 @@ __db_vrfy_meta(dbp, vdp, meta, pgno, flags) */ F_CLR(pip, VRFY_INCOMPLETE); -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); @@ -1264,51 +1335,56 @@ __db_vrfy_freelist(dbp, vdp, meta, flags) u_int32_t flags; { DB *pgset; + DB_ENV *dbenv; VRFY_PAGEINFO *pip; - db_pgno_t pgno; + db_pgno_t cur_pgno, next_pgno; int p, ret, t_ret; pgset = vdp->pgset; DB_ASSERT(pgset != NULL); + dbenv = dbp->dbenv; if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0) return (ret); - for (pgno = pip->free; pgno != PGNO_INVALID; pgno = pip->next_pgno) { - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + for (next_pgno = pip->free; + next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) { + cur_pgno = pip->pgno; + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) return (ret); /* This shouldn't happen, but just in case. */ - if (!IS_VALID_PGNO(pgno)) { - EPRINT((dbp->dbenv, - "Invalid next_pgno on free list page %lu", - (u_long)pgno)); + if (!IS_VALID_PGNO(next_pgno)) { + EPRINT((dbenv, + "Page %lu: invalid next_pgno %lu on free list page", + (u_long)cur_pgno, (u_long)next_pgno)); return (DB_VERIFY_BAD); } /* Detect cycles. 
*/ - if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0) + if ((ret = __db_vrfy_pgset_get(pgset, next_pgno, &p)) != 0) return (ret); if (p != 0) { - EPRINT((dbp->dbenv, - "Page %lu encountered a second time on free list", - (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: page %lu encountered a second time on free list", + (u_long)cur_pgno, (u_long)next_pgno)); return (DB_VERIFY_BAD); } - if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0) + if ((ret = __db_vrfy_pgset_inc(pgset, next_pgno)) != 0) return (ret); - if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0) return (ret); if (pip->type != P_INVALID) { - EPRINT((dbp->dbenv, - "Non-invalid page %lu on free list", (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: non-invalid page %lu on free list", + (u_long)cur_pgno, (u_long)next_pgno)); ret = DB_VERIFY_BAD; /* unsafe to continue */ break; } } - if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) ret = t_ret; return (ret); } @@ -1328,6 +1404,7 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags) DB *mdbp; DBC *dbc; DBT key, data; + DB_ENV *dbenv; VRFY_PAGEINFO *pip; db_pgno_t meta_pgno; int ret, t_ret, isbad; @@ -1335,19 +1412,22 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags) isbad = 0; dbc = NULL; + dbenv = dbp->dbenv; - if ((ret = __db_master_open(dbp, dbname, DB_RDONLY, 0, &mdbp)) != 0) + if ((ret = + __db_master_open(dbp, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0) return (ret); - if ((ret = - __db_icursor(mdbp, NULL, DB_BTREE, PGNO_INVALID, 0, &dbc)) != 0) + if ((ret = __db_icursor(mdbp, + NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) goto err; memset(&key, 0, sizeof(key)); memset(&data, 0, sizeof(data)); while ((ret = dbc->c_get(dbc, &key, &data, DB_NEXT)) == 0) { if (data.size != sizeof(db_pgno_t)) { - EPRINT((dbp->dbenv, "Database entry of invalid size")); + EPRINT((dbenv, + "Subdatabase entry not page-number size")); isbad 
= 1; goto err; } @@ -1358,8 +1438,8 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags) */ DB_NTOHL(&meta_pgno); if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) { - EPRINT((dbp->dbenv, - "Database entry references invalid page %lu", + EPRINT((dbenv, + "Subdatabase entry references invalid page %lu", (u_long)meta_pgno)); isbad = 1; goto err; @@ -1367,7 +1447,7 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags) if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0) goto err; type = pip->type; - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) goto err; switch (type) { case P_BTREEMETA: @@ -1390,8 +1470,8 @@ __db_vrfy_subdbs(dbp, vdp, dbname, flags) break; case P_QAMMETA: default: - EPRINT((dbp->dbenv, - "Database entry references page %lu of invalid type %lu", + EPRINT((dbenv, + "Subdatabase entry references page %lu of invalid type %lu", (u_long)meta_pgno, (u_long)type)); ret = DB_VERIFY_BAD; goto err; @@ -1416,9 +1496,9 @@ err: if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0) * Provide feedback during top-down database structure traversal. * (See comment at the beginning of __db_vrfy_structure.) * - * PUBLIC: int __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *)); + * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *)); */ -int +void __db_vrfy_struct_feedback(dbp, vdp) DB *dbp; VRFY_DBINFO *vdp; @@ -1426,7 +1506,7 @@ __db_vrfy_struct_feedback(dbp, vdp) int progress; if (dbp->db_feedback == NULL) - return (0); + return; if (vdp->pgs_remaining > 0) vdp->pgs_remaining--; @@ -1434,8 +1514,6 @@ __db_vrfy_struct_feedback(dbp, vdp) /* Don't allow a feedback call of 100 until we're really done. */ progress = 100 - (vdp->pgs_remaining * 50 / (vdp->last_pgno + 1)); dbp->db_feedback(dbp, DB_VERIFY, progress == 100 ? 
99 : progress); - - return (0); } /* @@ -1453,6 +1531,8 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) DB *mdbp, *pgset; DBC *pgsc; DBT key, data; + DB_ENV *dbenv; + DB_MPOOLFILE *mpf; HASH *h_internal; HMETA *hmeta; PAGE *h, *currpg; @@ -1460,36 +1540,45 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) u_int32_t bucket; int t_ret, ret; - currpg = h = NULL; - pgsc = NULL; pgset = NULL; + pgsc = NULL; + dbenv = dbp->dbenv; + mpf = dbp->mpf; + currpg = h = NULL; LF_CLR(DB_NOORDERCHK); /* Open the master database and get the meta_pgno for the subdb. */ if ((ret = db_create(&mdbp, NULL, 0)) != 0) return (ret); - if ((ret = __db_master_open(dbp, name, DB_RDONLY, 0, &mdbp)) != 0) + if ((ret = __db_master_open(dbp, NULL, name, DB_RDONLY, 0, &mdbp)) != 0) goto err; memset(&key, 0, sizeof(key)); key.data = (void *)subdb; + key.size = (u_int32_t)strlen(subdb); memset(&data, 0, sizeof(data)); - if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) != 0) + if ((ret = mdbp->get(mdbp, NULL, &key, &data, 0)) != 0) goto err; if (data.size != sizeof(db_pgno_t)) { - EPRINT((dbp->dbenv, "Database entry of invalid size")); + EPRINT((dbenv, "Subdatabase entry of invalid size")); ret = DB_VERIFY_BAD; goto err; } memcpy(&meta_pgno, data.data, data.size); - if ((ret = memp_fget(dbp->mpf, &meta_pgno, 0, &h)) != 0) + /* + * Subdatabase meta pgnos are stored in network byte + * order for cross-endian compatibility. Swap if appropriate. 
+ */ + DB_NTOHL(&meta_pgno); + + if ((ret = mpf->get(mpf, &meta_pgno, 0, &h)) != 0) goto err; - if ((ret = __db_vrfy_pgset(dbp->dbenv, dbp->pgsize, &pgset)) != 0) + if ((ret = __db_vrfy_pgset(dbenv, dbp->pgsize, &pgset)) != 0) goto err; switch (TYPE(h)) { @@ -1506,18 +1595,24 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0) goto err; while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) { - if ((ret = memp_fget(dbp->mpf, &p, 0, &currpg)) != 0) + if ((ret = mpf->get(mpf, &p, 0, &currpg)) != 0) goto err; if ((ret = __bam_vrfy_itemorder(dbp, NULL, currpg, p, NUM_ENT(currpg), 1, F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0) goto err; - if ((ret = memp_fput(dbp->mpf, currpg, 0)) != 0) + if ((ret = mpf->put(mpf, currpg, 0)) != 0) goto err; currpg = NULL; } - if ((ret = pgsc->c_close(pgsc)) != 0) - goto err; + + /* + * The normal exit condition for the loop above is DB_NOTFOUND. + * If we see that, zero it and continue on to cleanup. + * Otherwise, it's a real error and will be returned. + */ + if (ret == DB_NOTFOUND) + ret = 0; break; case P_HASHMETA: hmeta = (HMETA *)h; @@ -1525,16 +1620,21 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) /* * Make sure h_charkey is right. */ - if (h_internal == NULL || h_internal->h_hash == NULL) { - EPRINT((dbp->dbenv, - "DB_ORDERCHKONLY requires that a hash function be set")); + if (h_internal == NULL) { + EPRINT((dbenv, + "Page %lu: DB->h_internal field is NULL", + (u_long)meta_pgno)); ret = DB_VERIFY_BAD; goto err; } + if (h_internal->h_hash == NULL) + h_internal->h_hash = hmeta->dbmeta.version < 5 + ? 
__ham_func4 : __ham_func5; if (hmeta->h_charkey != h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) { - EPRINT((dbp->dbenv, - "Incorrect hash function for database")); + EPRINT((dbenv, + "Page %lu: incorrect hash function for database", + (u_long)meta_pgno)); ret = DB_VERIFY_BAD; goto err; } @@ -1546,34 +1646,35 @@ __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) { pgno = BS_TO_PAGE(bucket, hmeta->spares); while (pgno != PGNO_INVALID) { - if ((ret = memp_fget(dbp->mpf, + if ((ret = mpf->get(mpf, &pgno, 0, &currpg)) != 0) goto err; if ((ret = __ham_vrfy_hashing(dbp, - NUM_ENT(currpg),hmeta, bucket, pgno, + NUM_ENT(currpg), hmeta, bucket, pgno, flags, h_internal->h_hash)) != 0) goto err; pgno = NEXT_PGNO(currpg); - if ((ret = memp_fput(dbp->mpf, currpg, 0)) != 0) + if ((ret = mpf->put(mpf, currpg, 0)) != 0) goto err; currpg = NULL; } } break; default: - EPRINT((dbp->dbenv, "Database meta page %lu of bad type %lu", + EPRINT((dbenv, "Page %lu: database metapage of bad type %lu", (u_long)meta_pgno, (u_long)TYPE(h))); ret = DB_VERIFY_BAD; break; } -err: if (pgsc != NULL) - (void)pgsc->c_close(pgsc); - if (pgset != NULL) - (void)pgset->close(pgset, 0); - if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0) +err: if (pgsc != NULL && (t_ret = pgsc->c_close(pgsc)) != 0 && ret == 0) + ret = t_ret; + if (pgset != NULL && + (t_ret = pgset->close(pgset, 0)) != 0 && ret == 0) ret = t_ret; - if (currpg != NULL && (t_ret = memp_fput(dbp->mpf, currpg, 0)) != 0) + if (h != NULL && (t_ret = mpf->put(mpf, h, 0)) != 0) + ret = t_ret; + if (currpg != NULL && (t_ret = mpf->put(mpf, currpg, 0)) != 0) ret = t_ret; if ((t_ret = mdbp->close(mdbp, 0)) != 0) ret = t_ret; @@ -1584,11 +1685,8 @@ err: if (pgsc != NULL) * __db_salvage -- * Walk through a page, salvaging all likely or plausible (w/ * DB_AGGRESSIVE) key/data pairs. 
- * - * PUBLIC: int __db_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, - * PUBLIC: void *, int (*)(void *, const void *), u_int32_t)); */ -int +static int __db_salvage(dbp, vdp, pgno, h, handle, callback, flags) DB *dbp; VRFY_DBINFO *vdp; @@ -1659,24 +1757,29 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags) u_int32_t flags; { DBT unkdbt, key, *dbt; + DB_ENV *dbenv; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t pgno; u_int32_t pgtype; int ret, err_ret; void *ovflbuf; + dbenv = dbp->dbenv; + mpf = dbp->mpf; + memset(&unkdbt, 0, sizeof(DBT)); - unkdbt.size = strlen("UNKNOWN") + 1; + unkdbt.size = (u_int32_t)strlen("UNKNOWN") + 1; unkdbt.data = "UNKNOWN"; - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, 0, &ovflbuf)) != 0) + if ((ret = __os_malloc(dbenv, dbp->pgsize, &ovflbuf)) != 0) return (ret); err_ret = 0; while ((ret = __db_salvage_getnext(vdp, &pgno, &pgtype)) == 0) { dbt = NULL; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) { err_ret = ret; continue; } @@ -1699,17 +1802,11 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags) * a database with no dups. What to do? 
*/ if ((ret = __db_safe_goff(dbp, - vdp, pgno, &key, &ovflbuf, flags)) != 0) { - err_ret = ret; - continue; - } - if ((ret = __db_prdbt(&key, - 0, " ", handle, callback, 0, NULL)) != 0) { - err_ret = ret; - continue; - } - if ((ret = __db_prdbt(&unkdbt, - 0, " ", handle, callback, 0, NULL)) != 0) + vdp, pgno, &key, &ovflbuf, flags)) != 0 || + (ret = __db_prdbt(&key, + 0, " ", handle, callback, 0, vdp)) != 0 || + (ret = __db_prdbt(&unkdbt, + 0, " ", handle, callback, 0, vdp)) != 0) err_ret = ret; break; case SALVAGE_HASH: @@ -1727,11 +1824,11 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags) DB_ASSERT(0); break; } - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = mpf->put(mpf, h, 0)) != 0) err_ret = ret; } - __os_free(ovflbuf, 0); + __os_free(dbenv, ovflbuf); if (err_ret != 0 && ret == 0) ret = err_ret; @@ -1743,8 +1840,8 @@ __db_salvage_unknowns(dbp, vdp, handle, callback, flags) * Offset of the ith inp array entry, which we can compare to the offset * the entry stores. */ -#define INP_OFFSET(h, i) \ - ((db_indx_t)((u_int8_t *)(h)->inp + (i) - (u_int8_t *)(h))) +#define INP_OFFSET(dbp, h, i) \ + ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h))) /* * __db_vrfy_inpitem -- @@ -1770,33 +1867,35 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp) u_int32_t flags, *himarkp, *offsetp; { BKEYDATA *bk; - db_indx_t offset, len; + DB_ENV *dbenv; + db_indx_t *inp, offset, len; + + dbenv = dbp->dbenv; DB_ASSERT(himarkp != NULL); + inp = P_INP(dbp, h); /* * Check that the inp array, which grows from the beginning of the * page forward, has not collided with the data, which grow from the * end of the page backward. */ - if (h->inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) { + if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) { /* We've collided with the data. We need to bail. 
*/ - EPRINT((dbp->dbenv, - "Page %lu entries listing %lu overlaps data", + EPRINT((dbenv, "Page %lu: entries listing %lu overlaps data", (u_long)pgno, (u_long)i)); return (DB_VERIFY_FATAL); } - offset = h->inp[i]; + offset = inp[i]; /* * Check that the item offset is reasonable: it points somewhere * after the inp array and before the end of the page. */ - if (offset <= INP_OFFSET(h, i) || offset > dbp->pgsize) { - EPRINT((dbp->dbenv, - "Bad offset %lu at page %lu index %lu", - (u_long)offset, (u_long)pgno, (u_long)i)); + if (offset <= INP_OFFSET(dbp, h, i) || offset > dbp->pgsize) { + EPRINT((dbenv, "Page %lu: bad offset %lu at page index %lu", + (u_long)pgno, (u_long)offset, (u_long)i)); return (DB_VERIFY_BAD); } @@ -1808,7 +1907,7 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp) /* * Check that the item length remains on-page. */ - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); /* * We need to verify the type of the item here; @@ -1826,16 +1925,16 @@ __db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp) len = BOVERFLOW_SIZE; break; default: - EPRINT((dbp->dbenv, - "Item %lu on page %lu of unrecognizable type", - i, pgno)); + EPRINT((dbenv, + "Page %lu: item %lu of unrecognizable type", + (u_long)pgno, (u_long)i)); return (DB_VERIFY_BAD); } if ((size_t)(offset + len) > dbp->pgsize) { - EPRINT((dbp->dbenv, - "Item %lu on page %lu extends past page boundary", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: item %lu extends past page boundary", + (u_long)pgno, (u_long)i)); return (DB_VERIFY_BAD); } } @@ -1861,9 +1960,11 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int ret, isbad; + dbenv = dbp->dbenv; isbad = 0; if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) @@ -1873,8 +1974,8 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags) case P_IBTREE: case P_LDUP: if (!LF_ISSET(ST_DUPSORT)) { - EPRINT((dbp->dbenv, - "Sorted duplicate set at 
page %lu in unsorted-dup database", + EPRINT((dbenv, + "Page %lu: sorted duplicate set in unsorted-dup database", (u_long)pgno)); isbad = 1; } @@ -1882,21 +1983,29 @@ __db_vrfy_duptype(dbp, vdp, pgno, flags) case P_IRECNO: case P_LRECNO: if (LF_ISSET(ST_DUPSORT)) { - EPRINT((dbp->dbenv, - "Unsorted duplicate set at page %lu in sorted-dup database", + EPRINT((dbenv, + "Page %lu: unsorted duplicate set in sorted-dup database", (u_long)pgno)); isbad = 1; } break; default: - EPRINT((dbp->dbenv, - "Duplicate page %lu of inappropriate type %lu", - (u_long)pgno, (u_long)pip->type)); + /* + * If the page is entirely zeroed, its pip->type will be a lie + * (we assumed it was a hash page, as they're allowed to be + * zeroed); handle this case specially. + */ + if (F_ISSET(pip, VRFY_IS_ALLZEROES)) + ZEROPG_ERR_PRINT(dbenv, pgno, "duplicate page"); + else + EPRINT((dbenv, + "Page %lu: duplicate page of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); isbad = 1; break; } - if ((ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) + if ((ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) return (ret); return (isbad == 1 ? DB_VERIFY_BAD : 0); } @@ -1934,14 +2043,17 @@ __db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags) int (*callback) __P((void *, const void *)); u_int32_t flags; { + DB_MPOOLFILE *mpf; PAGE *h; int ret, t_ret; + mpf = dbp->mpf; + if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno)) return (DB_VERIFY_BAD); /* We have a plausible page. Try it. 
*/ - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) return (ret); switch (TYPE(h)) { @@ -1972,7 +2084,7 @@ __db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags) /* NOTREACHED */ } -err: if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) +err: if ((t_ret = mpf->put(mpf, h, 0)) != 0 && ret == 0) ret = t_ret; return (ret); } @@ -1994,16 +2106,18 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp) BTMETA *btmeta; DB *pgset; DBC *pgsc; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t p, meta_pgno; int ret, err_ret; - err_ret = 0; - pgsc = NULL; pgset = NULL; + pgsc = NULL; + mpf = dbp->mpf; + err_ret = 0; meta_pgno = PGNO_BASE_MD; - if ((ret = memp_fget(dbp->mpf, &meta_pgno, 0, &h)) != 0) + if ((ret = mpf->get(mpf, &meta_pgno, 0, &h)) != 0) return (ret); if (TYPE(h) == P_BTREEMETA) @@ -2028,7 +2142,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp) /* We think we've got subdbs. Mark it so. */ *hassubsp = 1; - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = mpf->put(mpf, h, 0)) != 0) return (ret); /* @@ -2048,7 +2162,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp) if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0) goto err; while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) { - if ((ret = memp_fget(dbp->mpf, &p, 0, &h)) != 0) { + if ((ret = mpf->get(mpf, &p, 0, &h)) != 0) { err_ret = ret; continue; } @@ -2061,7 +2175,7 @@ __db_salvage_subdbs(dbp, vdp, handle, callback, flags, hassubsp) else if ((ret = __db_salvage_subdbpg( dbp, vdp, h, handle, callback, flags)) != 0) err_ret = ret; -nextpg: if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) +nextpg: if ((ret = mpf->put(mpf, h, 0)) != 0) err_ret = ret; } @@ -2079,7 +2193,7 @@ err: if (pgsc != NULL) (void)pgsc->c_close(pgsc); if (pgset != NULL) (void)pgset->close(pgset, 0); - (void)memp_fput(dbp->mpf, h, 0); + (void)mpf->put(mpf, h, 0); return (ret); } @@ -2087,12 +2201,8 @@ err: if (pgsc != NULL) 
* __db_salvage_subdbpg -- * Given a known-good leaf page in the master database, salvage all * leaf pages corresponding to each subdb. - * - * PUBLIC: int __db_salvage_subdbpg - * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, void *, - * PUBLIC: int (*)(void *, const void *), u_int32_t)); */ -int +static int __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) DB *dbp; VRFY_DBINFO *vdp; @@ -2106,16 +2216,20 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) DB *pgset; DBC *pgsc; DBT key; + DB_ENV *dbenv; + DB_MPOOLFILE *mpf; PAGE *subpg; db_indx_t i; db_pgno_t meta_pgno, p; int ret, err_ret, t_ret; char *subdbname; + dbenv = dbp->dbenv; + mpf = dbp->mpf; ret = err_ret = 0; subdbname = NULL; - if ((ret = __db_vrfy_pgset(dbp->dbenv, dbp->pgsize, &pgset)) != 0) + if ((ret = __db_vrfy_pgset(dbenv, dbp->pgsize, &pgset)) != 0) return (ret); /* @@ -2123,8 +2237,8 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) * corresponding to that entry. */ for (i = 0; i < NUM_ENT(master); i += P_INDX) { - bkkey = GET_BKEYDATA(master, i); - bkdata = GET_BKEYDATA(master, i + O_INDX); + bkkey = GET_BKEYDATA(dbp, master, i); + bkdata = GET_BKEYDATA(dbp, master, i + O_INDX); /* Get the subdatabase name. */ if (B_TYPE(bkkey->type) == B_OVERFLOW) { @@ -2140,13 +2254,13 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) } /* Nul-terminate it. 
*/ - if ((ret = __os_realloc(dbp->dbenv, - key.size + 1, NULL, &subdbname)) != 0) + if ((ret = __os_realloc(dbenv, + key.size + 1, &subdbname)) != 0) goto err; subdbname[key.size] = '\0'; } else if (B_TYPE(bkkey->type == B_KEYDATA)) { - if ((ret = __os_realloc(dbp->dbenv, - bkkey->len + 1, NULL, &subdbname)) != 0) + if ((ret = __os_realloc(dbenv, + bkkey->len + 1, &subdbname)) != 0) goto err; memcpy(subdbname, bkkey->data, bkkey->len); subdbname[bkkey->len] = '\0'; @@ -2159,9 +2273,15 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) } memcpy(&meta_pgno, bkdata->data, sizeof(db_pgno_t)); + /* + * Subdatabase meta pgnos are stored in network byte + * order for cross-endian compatibility. Swap if appropriate. + */ + DB_NTOHL(&meta_pgno); + /* If we can't get the subdb meta page, just skip the subdb. */ if (!IS_VALID_PGNO(meta_pgno) || - (ret = memp_fget(dbp->mpf, &meta_pgno, 0, &subpg)) != 0) { + (ret = mpf->get(mpf, &meta_pgno, 0, &subpg)) != 0) { err_ret = ret; continue; } @@ -2177,7 +2297,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) if ((ret = __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) { err_ret = ret; - (void)memp_fput(dbp->mpf, subpg, 0); + (void)mpf->put(mpf, subpg, 0); continue; } switch (TYPE(subpg)) { @@ -2185,7 +2305,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) if ((ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) { err_ret = ret; - (void)memp_fput(dbp->mpf, subpg, 0); + (void)mpf->put(mpf, subpg, 0); continue; } break; @@ -2193,7 +2313,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) if ((ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) { err_ret = ret; - (void)memp_fput(dbp->mpf, subpg, 0); + (void)mpf->put(mpf, subpg, 0); continue; } break; @@ -2204,7 +2324,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) /* NOTREACHED */ } - if ((ret = memp_fput(dbp->mpf, subpg, 0)) != 0) { + if ((ret = 
mpf->put(mpf, subpg, 0)) != 0) { err_ret = ret; continue; } @@ -2223,14 +2343,14 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) if ((ret = pgset->cursor(pgset, NULL, &pgsc, 0)) != 0) goto err; while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) { - if ((ret = memp_fget(dbp->mpf, &p, 0, &subpg)) != 0) { + if ((ret = mpf->get(mpf, &p, 0, &subpg)) != 0) { err_ret = ret; continue; } if ((ret = __db_salvage(dbp, vdp, p, subpg, handle, callback, flags)) != 0) err_ret = ret; - if ((ret = memp_fput(dbp->mpf, subpg, 0)) != 0) + if ((ret = mpf->put(mpf, subpg, 0)) != 0) err_ret = ret; } @@ -2243,7 +2363,7 @@ __db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) goto err; } err: if (subdbname) - __os_free(subdbname, 0); + __os_free(dbenv, subdbname); if ((t_ret = pgset->close(pgset, 0)) != 0) ret = t_ret; @@ -2268,10 +2388,13 @@ __db_meta2pgset(dbp, vdp, pgno, flags, pgset) u_int32_t flags; DB *pgset; { + DB_MPOOLFILE *mpf; PAGE *h; int ret, t_ret; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + mpf = dbp->mpf; + + if ((ret = mpf->get(mpf, &pgno, 0, &h)) != 0) return (ret); switch (TYPE(h)) { @@ -2286,7 +2409,7 @@ __db_meta2pgset(dbp, vdp, pgno, flags, pgset) break; } - if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((t_ret = mpf->put(mpf, h, 0)) != 0) return (t_ret); return (ret); } @@ -2305,7 +2428,6 @@ __db_guesspgsize(dbenv, fhp) size_t nr; u_int32_t guess; u_int8_t type; - int ret; for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) { /* @@ -2321,11 +2443,11 @@ __db_guesspgsize(dbenv, fhp) * our previous guess; that last one was probably the page size. 
*/ for (i = 1; i <= 3; i++) { - if ((ret = __os_seek(dbenv, fhp, guess, - i, SSZ(DBMETA, type), 0, DB_OS_SEEK_SET)) != 0) + if (__os_seek(dbenv, fhp, guess, + i, SSZ(DBMETA, type), 0, DB_OS_SEEK_SET) != 0) break; - if ((ret = __os_read(dbenv, - fhp, &type, 1, &nr)) != 0 || nr == 0) + if (__os_read(dbenv, + fhp, &type, 1, &nr) != 0 || nr == 0) break; if (type == P_INVALID || type >= P_PAGETYPE_MAX) return (guess << 1); diff --git a/bdb/db/db_vrfyutil.c b/bdb/db/db_vrfyutil.c index 89dccdcc760..44344ceed11 100644 --- a/bdb/db/db_vrfyutil.c +++ b/bdb/db/db_vrfyutil.c @@ -1,16 +1,16 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2000 + * Copyright (c) 2000-2002 * Sleepycat Software. All rights reserved. * - * $Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bostic Exp $ + * $Id: db_vrfyutil.c,v 11.29 2002/08/08 03:57:50 bostic Exp $ */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bostic Exp $"; +static const char revid[] = "$Id: db_vrfyutil.c,v 11.29 2002/08/08 03:57:50 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -20,10 +20,11 @@ static const char revid[] = "$Id: db_vrfyutil.c,v 11.11 2000/11/28 21:36:04 bost #endif #include "db_int.h" -#include "db_page.h" -#include "db_verify.h" -#include "db_ext.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/db_am.h" +static int __db_vrfy_pageinfo_create __P((DB_ENV *, VRFY_PAGEINFO **)); static int __db_vrfy_pgset_iinc __P((DB *, db_pgno_t, int)); /* @@ -34,7 +35,7 @@ static int __db_vrfy_pgset_iinc __P((DB *, db_pgno_t, int)); * PUBLIC: __P((DB_ENV *, u_int32_t, VRFY_DBINFO **)); */ int -__db_vrfy_dbinfo_create (dbenv, pgsize, vdpp) +__db_vrfy_dbinfo_create(dbenv, pgsize, vdpp) DB_ENV *dbenv; u_int32_t pgsize; VRFY_DBINFO **vdpp; @@ -53,14 +54,14 @@ __db_vrfy_dbinfo_create (dbenv, pgsize, vdpp) if ((ret = db_create(&cdbp, dbenv, 0)) != 0) goto err; - if ((ret = 
cdbp->set_flags(cdbp, DB_DUP | DB_DUPSORT)) != 0) + if ((ret = cdbp->set_flags(cdbp, DB_DUP)) != 0) goto err; if ((ret = cdbp->set_pagesize(cdbp, pgsize)) != 0) goto err; if ((ret = - cdbp->open(cdbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0) + cdbp->open(cdbp, NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0) goto err; if ((ret = db_create(&pgdbp, dbenv, 0)) != 0) @@ -69,8 +70,8 @@ __db_vrfy_dbinfo_create (dbenv, pgsize, vdpp) if ((ret = pgdbp->set_pagesize(pgdbp, pgsize)) != 0) goto err; - if ((ret = - pgdbp->open(pgdbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0) + if ((ret = pgdbp->open(pgdbp, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) != 0) goto err; if ((ret = __db_vrfy_pgset(dbenv, pgsize, &pgset)) != 0) @@ -90,7 +91,7 @@ err: if (cdbp != NULL) if (pgdbp != NULL) (void)pgdbp->close(pgdbp, 0); if (vdp != NULL) - __os_free(vdp, sizeof(VRFY_DBINFO)); + __os_free(dbenv, vdp); return (ret); } @@ -99,10 +100,11 @@ err: if (cdbp != NULL) * Destructor for VRFY_DBINFO. Destroys VRFY_PAGEINFOs and deallocates * structure. 
* - * PUBLIC: int __db_vrfy_dbinfo_destroy __P((VRFY_DBINFO *)); + * PUBLIC: int __db_vrfy_dbinfo_destroy __P((DB_ENV *, VRFY_DBINFO *)); */ int -__db_vrfy_dbinfo_destroy(vdp) +__db_vrfy_dbinfo_destroy(dbenv, vdp) + DB_ENV *dbenv; VRFY_DBINFO *vdp; { VRFY_CHILDINFO *c, *d; @@ -112,7 +114,7 @@ __db_vrfy_dbinfo_destroy(vdp) for (c = LIST_FIRST(&vdp->subdbs); c != NULL; c = d) { d = LIST_NEXT(c, links); - __os_free(c, 0); + __os_free(NULL, c); } if ((t_ret = vdp->pgdbp->close(vdp->pgdbp, 0)) != 0) @@ -126,7 +128,7 @@ __db_vrfy_dbinfo_destroy(vdp) DB_ASSERT(LIST_FIRST(&vdp->activepips) == NULL); - __os_free(vdp, sizeof(VRFY_DBINFO)); + __os_free(dbenv, vdp); return (ret); } @@ -192,7 +194,7 @@ __db_vrfy_getpageinfo(vdp, pgno, pipp) return (ret); /* Case 3 */ - if ((ret = __db_vrfy_pageinfo_create(&pip)) != 0) + if ((ret = __db_vrfy_pageinfo_create(pgdbp->dbenv, &pip)) != 0) return (ret); LIST_INSERT_HEAD(&vdp->activepips, pip, links); @@ -208,10 +210,12 @@ found: pip->pi_refcount++; * __db_vrfy_putpageinfo -- * Put back a VRFY_PAGEINFO that we're done with. 
* - * PUBLIC: int __db_vrfy_putpageinfo __P((VRFY_DBINFO *, VRFY_PAGEINFO *)); + * PUBLIC: int __db_vrfy_putpageinfo __P((DB_ENV *, + * PUBLIC: VRFY_DBINFO *, VRFY_PAGEINFO *)); */ int -__db_vrfy_putpageinfo(vdp, pip) +__db_vrfy_putpageinfo(dbenv, vdp, pip) + DB_ENV *dbenv; VRFY_DBINFO *vdp; VRFY_PAGEINFO *pip; { @@ -255,7 +259,7 @@ __db_vrfy_putpageinfo(vdp, pip) #endif DB_ASSERT(pip->pi_refcount == 0); - __os_free(pip, 0); + __os_ufree(dbenv, pip); return (0); } @@ -280,7 +284,8 @@ __db_vrfy_pgset(dbenv, pgsize, dbpp) return (ret); if ((ret = dbp->set_pagesize(dbp, pgsize)) != 0) goto err; - if ((ret = dbp->open(dbp, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) == 0) + if ((ret = dbp->open(dbp, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600)) == 0) *dbpp = dbp; else err: (void)dbp->close(dbp, 0); @@ -382,7 +387,7 @@ __db_vrfy_pgset_iinc(dbp, pgno, i) F_SET(&data, DB_DBT_USERMEM); if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) == 0) { - DB_ASSERT(data.size = sizeof(int)); + DB_ASSERT(data.size == sizeof(int)); memcpy(&val, data.data, sizeof(int)); } else if (ret != DB_NOTFOUND) return (ret); @@ -463,8 +468,10 @@ __db_vrfy_childput(vdp, pgno, cip) db_pgno_t pgno; VRFY_CHILDINFO *cip; { - DBT key, data; DB *cdbp; + DBC *cc; + DBT key, data; + VRFY_CHILDINFO *oldcip; int ret; cdbp = vdp->cdbp; @@ -474,17 +481,44 @@ __db_vrfy_childput(vdp, pgno, cip) key.data = &pgno; key.size = sizeof(db_pgno_t); + /* + * We want to avoid adding multiple entries for a single child page; + * we only need to verify each child once, even if a child (such + * as an overflow key) is multiply referenced. + * + * However, we also need to make sure that when walking the list + * of children, we encounter them in the order they're referenced + * on a page. (This permits us, for example, to verify the + * prev_pgno/next_pgno chain of Btree leaf pages.) + * + * Check the child database to make sure that this page isn't + * already a child of the specified page number. 
If it's not, + * put it at the end of the duplicate set. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + return (ret); + for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0; + ret = __db_vrfy_ccnext(cc, &oldcip)) + if (oldcip->pgno == cip->pgno) { + /* + * Found a matching child. Return without + * putting it again. + */ + if ((ret = __db_vrfy_ccclose(cc)) != 0) + return (ret); + return (0); + } + if (ret != DB_NOTFOUND) { + (void)__db_vrfy_ccclose(cc); + return (ret); + } + if ((ret = __db_vrfy_ccclose(cc)) != 0) + return (ret); + data.data = cip; data.size = sizeof(VRFY_CHILDINFO); - /* - * Don't add duplicate (data) entries for a given child, and accept - * DB_KEYEXIST as a successful return; we only need to verify - * each child once, even if a child (such as an overflow key) is - * multiply referenced. - */ - ret = cdbp->put(cdbp, NULL, &key, &data, DB_NODUPDATA); - return (ret == DB_KEYEXIST ? 0 : ret); + return (cdbp->put(cdbp, NULL, &key, &data, 0)); } /* @@ -568,19 +602,26 @@ __db_vrfy_ccclose(dbc) /* * __db_vrfy_pageinfo_create -- * Constructor for VRFY_PAGEINFO; allocates and initializes. - * - * PUBLIC: int __db_vrfy_pageinfo_create __P((VRFY_PAGEINFO **)); */ -int -__db_vrfy_pageinfo_create(pgipp) +static int +__db_vrfy_pageinfo_create(dbenv, pgipp) + DB_ENV *dbenv; VRFY_PAGEINFO **pgipp; { VRFY_PAGEINFO *pgip; int ret; - if ((ret = __os_calloc(NULL, - 1, sizeof(VRFY_PAGEINFO), (void **)&pgip)) != 0) + /* + * pageinfo structs are sometimes allocated here and sometimes + * allocated by fetching them from a database with DB_DBT_MALLOC. + * There's no easy way for the destructor to tell which was + * used, and so we always allocate with __os_umalloc so we can free + * with __os_ufree. 
+ */ + if ((ret = __os_umalloc(dbenv, + sizeof(VRFY_PAGEINFO), (void **)&pgip)) != 0) return (ret); + memset(pgip, 0, sizeof(VRFY_PAGEINFO)); DB_ASSERT(pgip->pi_refcount == 0); @@ -607,7 +648,8 @@ __db_salvage_init(vdp) if ((ret = dbp->set_pagesize(dbp, 1024)) != 0) goto err; - if ((ret = dbp->open(dbp, NULL, NULL, DB_BTREE, DB_CREATE, 0)) != 0) + if ((ret = dbp->open(dbp, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0)) != 0) goto err; vdp->salvage_pages = dbp; |